Make it fail when running into an unknown emitter

This commit is contained in:
Roberto Alsina 2024-08-03 07:21:21 -03:00
parent b82a535928
commit 2c49457ca3

View File

@ -19,22 +19,18 @@ module Tartrazine
property emitters : Array(Emitter) = [] of Emitter property emitters : Array(Emitter) = [] of Emitter
property transformers : Array(Transformer) = [] of Transformer property transformers : Array(Transformer) = [] of Transformer
def match(text, pos) : Tuple(Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token tokens = [] of Token
match = pattern.match(text, pos) match = pattern.match(text, pos)
# We are matched, move pos to after the match # We are matched, move pos to after the match
return pos, [] of Token if match.nil? return false, pos, [] of Token if match.nil?
# Emit the tokens # Emit the tokens
emitters.each do |emitter| emitters.each do |emitter|
# Emit the token # Emit the token
tokens += emitter.emit(match) tokens += emitter.emit(match)
end end
# Transform the state return true, match.end, tokens
transformers.each do |transformer|
transformer.transform
end
return match.end, tokens
end end
end end
@ -51,9 +47,28 @@ module Tartrazine
class IncludeStateRule < Rule class IncludeStateRule < Rule
property state : String = "" property state : String = ""
def match(text, pos) : Tuple(Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
puts "Including state #{state}" puts "Including state #{state} from #{lexer.state_stack.last}"
return pos, [] of Token lexer.states[state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
return true, new_pos, new_tokens if matched
end
return false, pos, [] of Token
end
end
# These rules look like this:
# <rule>
# <pop depth="1"/>
# </rule>
# They match, don't move pos, and alter the stack
class Always < Rule
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token
emitters.each do |emitter|
tokens += emitter.emit(nil)
end
return true, pos, tokens
end end
end end
@ -64,9 +79,10 @@ module Tartrazine
def initialize(@type : String, @xml : XML::Node?) def initialize(@type : String, @xml : XML::Node?)
end end
def emit(match : Regex::MatchData) : Array(Token) def emit(match : Regex::MatchData?) : Array(Token)
case type case type
when "token" when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[0])] [Token.new(type: xml["type"], value: match[0])]
else else
raise Exception.new("Unknown emitter type: #{type}") raise Exception.new("Unknown emitter type: #{type}")
@ -102,18 +118,18 @@ module Tartrazine
def tokenize(text) : Array(Token) def tokenize(text) : Array(Token)
tokens = [] of Token tokens = [] of Token
pos = 0 pos = 0
matched = false
while pos < text.size while pos < text.size
state = states[state_stack.last] state = states[state_stack.last]
matched = false
state.rules.each do |rule| state.rules.each do |rule|
new_pos, new_tokens = rule.match(text, pos) matched, new_pos, new_tokens = rule.match(text, pos, self)
next unless matched
pos = new_pos pos = new_pos
tokens += new_tokens tokens += new_tokens
matched = true
break # We go back to processing with current state break # We go back to processing with current state
end end
# If no rule matches, emit an error token # If no rule matches, emit an error token
if !matched unless matched
tokens << {type: "Error", value: ""} tokens << {type: "Error", value: ""}
pos += 1 pos += 1
end end
@ -149,36 +165,43 @@ module Tartrazine
end end
# And states contain rules 🤷 # And states contain rules 🤷
state_node.children.select { |n| n.name == "rule" }.each do |rule_node| state_node.children.select { |n| n.name == "rule" }.each do |rule_node|
if rule_node["pattern"]? case rule_node["pattern"]?
# We have pattern rules when nil
rule = Rule.new if rule_node.first_element_child.try &.name == "include"
begin
rule.pattern = /#{rule_node["pattern"]}/m
rescue ex : Exception
puts "Bad regex in #{l.config[:name]}: #{ex}"
end
else
# And rules that include a state
rule = IncludeStateRule.new rule = IncludeStateRule.new
include_node = rule_node.children.find { |n| n.name == "include" } include_node = rule_node.children.find { |n| n.name == "include" }
rule.state = include_node["state"] if include_node rule.state = include_node["state"] if include_node
end
state.rules << rule state.rules << rule
else
rule = Always.new
state.rules << rule
end
else
rule = Rule.new
begin
rule.pattern = /#{rule_node["pattern"]}/m
state.rules << rule
rescue ex : Exception
puts "Bad regex in #{l.config[:name]}: #{ex}"
next
end
end
next if rule.nil?
# Rules contain maybe an emitter and maybe a transformer # Rules contain maybe an emitter and maybe a transformer
# emitters emit tokens, transformers do things to # emitters emit tokens, transformers do things to
# the state stack. # the state stack.
rule_node.children.each do |node| rule_node.children.each do |node|
next unless node.element? next unless node.element?
case node.name # case node.name
when "pop", "push", "multi", "combine" # "include", # when "pop", "push", "multi", "combine" # "include",
transformer = Transformer.new # transformer = Transformer.new
transformer.type = node.name # transformer.type = node.name
transformer.xml = node.to_s # transformer.xml = node.to_s
rule.transformers << transformer # rule.transformers << transformer
when "bygroups", "combined", "mutators", "token", "using", "usingbygroup", "usingself" # else
rule.emitters << Emitter.new(node.name, node) rule.emitters << Emitter.new(node.name, node)
end # end
end end
end end
end end
@ -200,8 +223,9 @@ end
# Parse some plaintext # Parse some plaintext
puts lexers["plaintext"].tokenize("Hello, world!\n") puts lexers["plaintext"].tokenize("Hello, world!\n")
pp! lexers["pepe"]
# Now some bash # Now some bash
puts lexers["Bash"].tokenize("echo 'Hello, world!'\n") puts lexers["pepe"].tokenize("echo 'Hello, world!'\n")
# Convenience macros to parse XML # Convenience macros to parse XML
macro xml_to_s(node, name) macro xml_to_s(node, name)