Commit: Rebase to main

repo:   https://github.com/ralsina/tartrazine
commit: 58fd42d936
parent: 5a88a51f3e
TODO.md (5 changed lines)

@@ -9,4 +9,7 @@
 * ✅ Implement lexer loader by file extension
 * ✅ Add --line-numbers to terminal formatter
 * Implement lexer loader by mime type
-* Implement Pygment's "DelegateLexer"
+* ✅ Implement Delegating lexers
+* ✅ Add RstLexer
+* Add Mako template lexer
+* Implement heuristic lexer detection
src/actions.cr

@@ -23,14 +23,17 @@ module Tartrazine
   struct Action
     property actions : Array(Action) = [] of Action
 
-    property token_type : String = ""
-    property states_to_push : Array(String) = [] of String
-    property depth = 0
-    property lexer_name : String = ""
-    property states_to_combine : Array(String) = [] of String
+    @content_index : Array(Int32) = [] of Int32
+    @depth : Int32 = 0
+    @lexer_index : Int32 = 0
+    @lexer_name : String = ""
+    @states : Array(String) = [] of String
+    @states_to_push : Array(String) = [] of String
+    @token_type : String = ""
+    @type : ActionType = ActionType::Token
 
-    def initialize(@type : String, @xml : XML::Node?)
-      # Extract information from the XML node we will use later
+    def initialize(t : String, xml : XML::Node?)
+      @type = ActionType.parse(t.capitalize)
 
       # Some actions may have actions in them, like this:
       # <bygroups>
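The string-typed `@type` becomes an `ActionType` enum parsed once in the constructor, so an unknown action name now fails at parse time instead of via the old `known_types` runtime check (removed in the next hunk). The enum definition is outside this diff; a minimal sketch of its likely shape, with members inferred from the `case` branches below (hypothetical reconstruction, not the committed code):

enum ActionType
  Bygroups
  Combined
  Include
  Pop
  Push
  Token
  Using
  Usingbygroup
  Usingself
end

# Crystal's Enum.parse matches member names case-insensitively, so the
# constructor's ActionType.parse("token".capitalize) yields ActionType::Token.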
@@ -41,31 +44,30 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      known_types = %w(token push pop bygroups using usingself include combined)
-      raise Exception.new(
-        "Unknown action type: #{@type}") unless known_types.includes? @type
-
-      @xml.children.each do |node|
+      xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
 
+      # Prefetch the attributes we need from the XML and keep them
       case @type
-      when "token"
-        @token_type = xml["type"]? || ""
-      when "push"
+      when ActionType::Token
+        @token_type = xml["type"]
+      when ActionType::Push
         @states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"
-        }.map &.content || [] of String
-      when "pop"
-        @depth = xml["depth"]?.try &.to_i || 0
-      when "using"
-        @lexer_name = xml["lexer"]?.try &.downcase || ""
-      when "combined"
-        @states_to_combine = xml.attributes.select { |attrib|
+        }.map &.content
+      when ActionType::Pop
+        @depth = xml["depth"].to_i
+      when ActionType::Using
+        @lexer_name = xml["lexer"].downcase
+      when ActionType::Combined
+        @states = xml.attributes.select { |attrib|
           attrib.name == "state"
         }.map &.content
+      when ActionType::Usingbygroup
+        @lexer_index = xml["lexer"].to_i
+        @content_index = xml["content"].split(",").map(&.to_i)
       end
     end
 
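Each branch now reads its XML attributes exactly once at construction instead of on every emit. A hedged sketch of feeding one such lexer-definition node through the new constructor (standalone illustration; `Action` is assumed reachable as in the module above):

require "xml"

node = XML.parse(%(<token type="Text"/>)).first_element_child.not_nil!
action = Action.new(node.name, node)
# @type is now ActionType::Token and @token_type is "Text"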
@@ -75,25 +77,21 @@ module Tartrazine
       when ActionType::Token
         raise Exception.new "Can't have a token without a match" if match.empty?
         [Token.new(type: @token_type, value: String.new(match[match_group].value))]
-      when "push"
-        if @states_to_push.empty?
-          # Push without a state means push the current state
-          @states_to_push = [lexer.state_stack.last]
-        end
-        @states_to_push.each do |state|
-          if state == "#pop"
+      when ActionType::Push
+        to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
+        to_push.each do |state|
+          if state == "#pop" && tokenizer.state_stack.size > 1
             # Pop the state
-            lexer.state_stack.pop
+            tokenizer.state_stack.pop
           else
             # Really push
-            lexer.state_stack << state
+            tokenizer.state_stack << state
           end
         end
         [] of Token
-      when "pop"
-        if lexer.state_stack.size > @depth
-          lexer.state_stack.pop(@depth)
-        end
+      when ActionType::Pop
+        to_pop = [@depth, tokenizer.state_stack.size - 1].min
+        tokenizer.state_stack.pop(to_pop)
         [] of Token
       when ActionType::Bygroups
         # FIXME: handle
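Two behavior changes are folded in here: pushing `"#pop"` now only pops when something sits above the root state, and `ActionType::Pop` clamps the requested depth so the stack can never be emptied (the old `if`-guard skipped the pop entirely when the depth was too large). A standalone sketch of the clamping arithmetic, using plain arrays rather than the project's types:

state_stack = ["root", "string", "interp"]
depth = 5                                   # depth requested by the rule
to_pop = [depth, state_stack.size - 1].min  # => 2, the root state is protected
state_stack.pop(to_pop)
state_stack # => ["root"]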
@@ -104,7 +102,7 @@ module Tartrazine
         #
         # where that None means skipping a group
         #
-        raise Exception.new "Can't have a token without a match" if match.empty?
+        raise Exception.new "Can't have a token without a match" if match.nil?
 
         # Each group matches an action. If the group match is empty,
         # the action is skipped.
@@ -113,7 +111,8 @@ module Tartrazine
           begin
             next if match[i + 1].size == 0
           rescue IndexError
-            # No match for the last group
+            # FIXME: This should not actually happen
+            # No match for this group
             next
           end
           result += e.emit(match, tokenizer, i + 1)
@@ -122,16 +121,19 @@ module Tartrazine
       when ActionType::Using
         # Shunt to another lexer entirely
        return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
-      when "usingself"
+        Tartrazine.lexer(@lexer_name).tokenizer(
+          String.new(match[match_group].value),
+          secondary: true).to_a
+      when ActionType::Usingself
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = Lexer.from_xml(lexer.xml)
-        new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
-      when "combined"
-        # Combine two states into one anonymous state
-        new_state = @states_to_combine.map { |name|
-          lexer.states[name]
+        tokenizer.lexer.tokenizer(
+          String.new(match[match_group].value),
+          secondary: true).to_a
+      when ActionType::Combined
+        # Combine two or more states into one anonymous state
+        new_state = @states.map { |name|
+          tokenizer.lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
         }
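The old eager `tokenize(..., usingself: true)` calls are replaced by the lexer's `tokenizer(text, secondary: true)` iterator, collected with `to_a` only where a full token array is needed. A hedged usage sketch of the same call shape (assumes a lexer named "sql" is registered; only the method and argument names come from this diff):

tokens = Tartrazine.lexer("sql")
  .tokenizer("SELECT 1;", secondary: true)
  .to_a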
@@ -149,7 +151,7 @@ module Tartrazine
           content,
           secondary: true).to_a
       else
-        raise Exception.new("Unhandled action type: #{type}")
+        raise Exception.new("Unknown action type: #{@type}")
       end
     end
   end
src/bytes_regex.cr

@@ -31,7 +31,6 @@ module BytesRegex
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
-      match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
@@ -40,9 +39,9 @@ module BytesRegex
         LibPCRE2::NO_UTF_CHECK,
         @match_data,
         nil)
-      if rc >= 0
+      if rc > 0
         ovector = LibPCRE2.get_ovector_pointer(@match_data)
-        (0...rc).each do |i|
+        (0...rc).map do |i|
           m_start = ovector[2 * i]
           m_end = ovector[2 * i + 1]
           if m_start == m_end
@@ -55,7 +54,6 @@ module BytesRegex
       else
         [] of Match
       end
-      match
     end
   end
 
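The `rc >= 0` to `rc > 0` change matters because PCRE2's match return value is the number of captured pairs on success, 0 when the ovector was too small, and a negative code on no match or error; only a strictly positive `rc` has usable offsets. Switching the loop from `each` to `map` makes the mapped array the expression's value, which is why the `match` accumulator and its trailing return became dead code. For intuition, each ovector pair holds the [start, end) byte offsets of one group, the same offsets Crystal's own PCRE2-backed `Regex` reports:

md = /a(b)(c)/.match("abc").not_nil!
{md.begin(0), md.begin(1), md.begin(2)} # => {0, 1, 2}
# rc for this match would be 3: the whole match plus two groups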
src/main.cr (13 changed lines)

@@ -1,18 +1,6 @@
 require "docopt"
 require "./**"
 
-# Performance data (in milliseconds):
-#
-# Docopt parsing: 0.5
-# Instantiating a theme: 0.1
-# Instantiating a formatter: 1.0
-# Instantiating a lexer: 2.0
-# Tokenizing crycco.cr: 16.0
-# Formatting: 0.5
-# I/O: 1.5
-# ---------------------------------
-# Total: 21.6
-
 HELP = <<-HELP
 tartrazine: a syntax highlighting tool
 
@@ -96,6 +84,7 @@ if options["-f"]
 end
+
 lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
 
 input = File.open(options["FILE"].as(String)).gets_to_end
 
 if options["-o"].nil?
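The deleted block was a stale performance breakdown; the surviving lines show the CLI's core pipeline. The same two steps as a library call, hedged (the lexer name and file are placeholders; resolving by explicit name with the file name as a fallback is the apparent intent of the named arguments, not confirmed by this diff):

lexer = Tartrazine.lexer(name: "crystal", filename: "hello.cr")
input = File.open("hello.cr").gets_to_end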
src/rules.cr (31 changed lines)

@@ -15,28 +15,11 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  class Rule
-    property pattern : Regex = Regex.new ""
-    property actions : Array(Action) = [] of Action
+  abstract struct BaseRule
+    abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+    abstract def initialize(node : XML::Node)
 
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # We don't match if the match doesn't move the cursor
-      # because that causes infinite loops
-      return false, pos, [] of Token if match.empty? || match[0].size == 0
-      tokens = [] of Token
-      actions.each do |action|
-        tokens += action.emit(match, lexer)
-      end
-      return true, pos + match[0].size, tokens
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      pattern = node["pattern"]
-      pattern = "(?m)" + pattern if multiline
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
+    @actions : Array(Action) = [] of Action
 
     def add_actions(node : XML::Node)
       node.children.each do |child|
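The concrete `Rule` with a baked-in regex becomes an `abstract struct BaseRule` that only fixes the `match`/`initialize` contract and the shared `@actions` list; the deleted regex-matching body presumably moves into a concrete subclass outside this excerpt. A hypothetical concrete rule against that interface, rebuilt from the deleted body (every name not present in the diff is an assumption):

struct PatternRule < BaseRule
  @pattern : Regex = Regex.new("")

  def initialize(node : XML::Node)
    @pattern = Regex.new(node["pattern"]? || "")
    add_actions(node)
  end

  def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
    match = @pattern.match(text, pos)
    # A match that doesn't move the cursor would loop forever
    return false, pos, [] of Token if match.empty? || match[0].size == 0
    tokens = [] of Token
    @actions.each { |action| tokens += action.emit(match, tokenizer) }
    return true, pos + match[0].size, tokens
  end
end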
@@ -73,9 +56,9 @@ module Tartrazine
   struct IncludeStateRule < BaseRule
     @state : String = ""
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      lexer.states[state].rules.each do |rule|
-        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+    def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+      tokenizer.@lexer.states[@state].rules.each do |rule|
+        matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
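One detail worth flagging: `tokenizer.@lexer` is Crystal's direct instance-variable access syntax, which reads `@lexer` off another object without a getter. A self-contained demonstration:

class Box
  def initialize(@value : Int32)
  end
end

p Box.new(42).@value # => 42, no getter defined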