Implement sub-emitters

Roberto Alsina 2024-08-03 21:37:22 -03:00
parent 937b9d50e0
commit 6c22222f0a


@@ -32,9 +32,9 @@ module Tartrazine
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       tokens = [] of Token
       match = pattern.match(text, pos)
-      # We are matched, move post to after the match
-      return false, pos, [] of Token if match.nil?
+      # We don't match if the match doesn't move the cursor
+      # because that causes infinite loops
+      return false, pos, [] of Token if match.nil? || match.end == pos
       # Emit the tokens
       emitters.each do |emitter|
         # Emit the token
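
The `|| match.end == pos` guard added here rejects matches that do not advance the cursor: a rule whose regex can match the empty string would otherwise "succeed" at the same position on every iteration, and the tokenize loop further down would never terminate. A minimal standalone illustration with the stdlib Regex API (not Tartrazine code):

# A zero-width match succeeds without moving the cursor.
pos = 1
match = /a*/.match("bbb", pos)
puts match.nil?                # => false, the empty string matched
puts match.not_nil!.end        # => 1, same as pos
puts match.not_nil!.end == pos # => true, so the rule must be rejected
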
@@ -88,15 +88,29 @@ module Tartrazine
   class Emitter
     property type : String
     property xml : XML::Node
+    property emitters : Array(Emitter) = [] of Emitter
 
     def initialize(@type : String, @xml : XML::Node?)
+      # Some emitters may have emitters in them, like this:
+      # <bygroups>
+      # <token type="GenericPrompt"/>
+      # <token type="Text"/>
+      # <using lexer="bash"/>
+      # </bygroups>
+      #
+      # The token emitters match with the first 2 groups in the regex
+      # the using emitter matches the 3rd and shunts it to another lexer
+      @xml.children.each do |node|
+        next unless node.element?
+        @emitters << Emitter.new(node.name, node)
+      end
     end
 
-    def emit(match : Regex::MatchData?, lexer : Lexer) : Array(Token)
+    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[0])]
+        [Token.new(type: xml["type"], value: match[match_group])]
       when "push"
         states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
         if states_to_push.empty?
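
The constructor change above is what actually introduces sub-emitters: every element nested inside an emitter's XML node becomes a child Emitter, so a bygroups node ends up holding one emitter per capture group and can delegate to them later. A rough standalone sketch of that recursive construction, using a hypothetical MiniEmitter class in place of the real Tartrazine::Emitter:

require "xml"

# Hypothetical MiniEmitter, only to illustrate the recursive construction.
class MiniEmitter
  property type : String
  property emitters : Array(MiniEmitter) = [] of MiniEmitter

  def initialize(@type : String, node : XML::Node)
    node.children.each do |child|
      next unless child.element?
      @emitters << MiniEmitter.new(child.name, child)
    end
  end
end

doc = XML.parse(<<-XML)
  <bygroups>
    <token type="GenericPrompt"/>
    <token type="Text"/>
    <using lexer="bash"/>
  </bygroups>
  XML

bygroups = doc.children.find(&.element?).not_nil!
emitter = MiniEmitter.new(bygroups.name, bygroups)
puts emitter.type                 # => bygroups
puts emitter.emitters.map(&.type) # => ["token", "token", "using"]
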
@@ -110,8 +124,8 @@ module Tartrazine
             lexer.state_stack.pop
           else
             # Really push
-            puts "Pushing state #{state}"
             lexer.state_stack << state
+            puts "Pushed #{lexer.state_stack}"
           end
         end
         [] of Token
@@ -125,24 +139,35 @@ module Tartrazine
         end
         [] of Token
       when "bygroups"
-        # This takes the groups in the regex and emits them as tokens
-        # Get all the token nodes
+        # FIXME: handle
+        # ><bygroups>
+        # <token type="Punctuation"/>
+        # None
+        # <token type="LiteralStringRegex"/>
+        #
+        # where that None means skipping a group
+        #
         raise Exception.new "Can't have a token without a match" if match.nil?
-        tokens = xml.children.select { |n| n.name == "token" }.map(&.["type"].to_s)
+        # Each group matches an emitter
         result = [] of Token
-        tokens.each_with_index do |t, i|
-          result << {type: t, value: match[i + 1]}
+        @emitters.each_with_index do |e, i|
+          next if match[i + 1]?.nil?
+          result += e.emit(match, lexer, i + 1)
         end
         result
-      # TODO: Implement usingself
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.nil?
         lexer_name = xml["lexer"].downcase
-        LEXERS[lexer_name].tokenize(match[0])
+        pp! "to tokenize:", match[match_group]
+        LEXERS[lexer_name].tokenize(match[match_group])
       when "combined"
         # Combine two states into one anonymous state
         states = xml.attributes.select { |a| a.name == "state" }.map &.content
-        new_state = states.map {|name| lexer.states[name]}.reduce { |s1, s2| s1 + s2 }
+        new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
         lexer.states[new_state.name] = new_state
         lexer.state_stack << new_state.name
         [] of Token
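
In the rewritten bygroups branch, capture group i + 1 is dispatched to sub-emitter i, and match[i + 1]? returns nil for a group that did not participate, so the "None means skipping a group" case from the FIXME largely falls out of the nil check. A standalone sketch of that index mapping (the pattern is made up for illustration):

# Group 2 is optional and does not participate when there is no comment.
md = /(\$\s*)(#.*)?(\w+)/.match("$ ls").not_nil!

(1..3).each do |group|
  if md[group]?.nil?
    # A non-participating group is skipped, like the bygroups loop does.
    puts "group #{group}: skipped"
  else
    puts "group #{group}: #{md[group].inspect}"
  end
end
# group 1: "$ "
# group 2: skipped
# group 3: "ls"
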
@@ -180,16 +205,18 @@ module Tartrazine
     # Turn the text into a list of tokens.
     def tokenize(text) : Array(Token)
+      @state_stack = ["root"]
       tokens = [] of Token
       pos = 0
       matched = false
       while pos < text.size
-        state = states[state_stack.last]
+        state = states[@state_stack.last]
+        puts "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}"
         p! state_stack.last, pos
         state.rules.each do |rule|
           matched, new_pos, new_tokens = rule.match(text, pos, self)
           next unless matched
-          p! rule.xml
+          puts "MATCHED: #{rule.xml}"
           pos = new_pos
           tokens += new_tokens
@@ -197,7 +224,7 @@ module Tartrazine
         end
         # If no rule matches, emit an error token
         unless matched
-          tokens << {type: "Error", value: "?"}
+          tokens << {type: "Error", value: "#{text[pos]}"}
           pos += 1
         end
       end
@@ -266,15 +293,7 @@ module Tartrazine
       # the state stack.
       rule_node.children.each do |node|
         next unless node.element?
-        # case node.name
-        # when "pop", "push", "multi", "combine" # "include",
-        #   transformer = Transformer.new
-        #   transformer.type = node.name
-        #   transformer.xml = node.to_s
-        #   rule.transformers << transformer
-        # else
         rule.emitters << Emitter.new(node.name, node)
-        # end
       end
     end
   end
@@ -327,11 +346,11 @@ def test_file(testname, lexer)
   test = File.read(testname).split("---input---\n").last.split("---tokens---").first
   pp! test
   begin
-    tokens = lexer.tokenize(test)
+    tokens = collapse_tokens(lexer.tokenize(test))
   rescue ex : Exception
     puts ">>>ERROR"
     p! ex
-    exit 1
+    return
   end
   outp = IO::Memory.new
   i = IO::Memory.new(test)
@@ -340,7 +359,7 @@ def test_file(testname, lexer)
     "chroma",
     ["-f", "json", "-l", lname], input: i, output: outp
   )
-  chroma_tokens = Array(Tartrazine::Token).from_json(outp.to_s)
+  chroma_tokens = collapse_tokens(Array(Tartrazine::Token).from_json(outp.to_s))
   if chroma_tokens != tokens
     pp! tokens
     pp! chroma_tokens
@@ -350,9 +369,32 @@ def test_file(testname, lexer)
   end
 end
 
-# test_file("tests/c/test_preproc_file2.txt", lexers["c"])
-# exit 0
+def collapse_tokens(tokens : Array(Tartrazine::Token))
+  result = [] of Tartrazine::Token
+  tokens.each do |token|
+    if result.empty?
+      result << token
+      next
+    end
+    last = result.last
+    if last[:type] == token[:type]
+      new_token = {type: last[:type], value: last[:value] + token[:value]}
+      result.pop
+      result << new_token
+    else
+      result << token
+    end
+  end
+  result
+end
+
+# test_file(
+#   "tests/console/test_newline_in_ls_no_ps2.txt",
+#   lexers["console"])
+# exit 0
 total = 0
 Dir.glob("tests/*/") do |lexername|
   key = File.basename(lexername).downcase
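
collapse_tokens makes the comparison with chroma's output insensitive to how finely either tokenizer splits runs of the same token type: adjacent tokens of equal type are merged before comparing. A standalone sketch of the same merge, assuming Token is the NamedTuple(type: String, value: String) shape used throughout the diff (the collapse name here is just for the example):

alias Token = NamedTuple(type: String, value: String)

def collapse(tokens : Array(Token)) : Array(Token)
  result = [] of Token
  tokens.each do |token|
    last = result.last?
    if last && last[:type] == token[:type]
      # Same type as the previous token: merge the values.
      result[-1] = {type: last[:type], value: last[:value] + token[:value]}
    else
      result << token
    end
  end
  result
end

pp collapse([
  {type: "Text", value: "foo"},
  {type: "Text", value: " bar"},
  {type: "Punctuation", value: ";"},
])
# => [{type: "Text", value: "foo bar"}, {type: "Punctuation", value: ";"}]
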