Parsing plaintext works!

Roberto Alsina 2024-08-03 06:05:29 -03:00
parent 270f51a811
commit d47c946e8b


@@ -15,9 +15,9 @@ module Tartrazine
   end

   class Rule
-    property pattern : Regex? = nil
+    property pattern : Regex = Regex.new ""
     property emitters : Array(Emitter) = [] of Emitter
-    property transformers : Array(TRansformer) = [] of TRansformer
+    property transformers : Array(Transformer) = [] of Transformer
   end

   # This rule includes another state like this:
@@ -35,14 +35,32 @@ module Tartrazine
   end

   class Emitter
-    property type : String = ""
-    property xml : String = ""
+    property type : String
+    property xml : XML::Node
+
+    def initialize(@type : String, @xml : XML::Node?)
+    end
+
+    def emit(match : Regex::MatchData) : Array(Token)
+      case type
+      when "token"
+        return [Token.new(type: xml["type"], value: match[0])]
+      end
+      [] of Token
+    end
   end

   class Transformer
-    property xml: String = ""
+    property type : String = ""
+    property xml : String = ""
+
+    def transform
+      puts "Transforming #{type} #{xml}"
+    end
   end

+  alias Token = NamedTuple(type: String, value: String)
+
   class Lexer
     property config = {
       name: "",
@@ -54,6 +72,43 @@ module Tartrazine
     property states = {} of String => State
+    property state_stack = ["root"]
+
+    # Turn the text into a list of tokens.
+    def tokenize(text) : Array(Token)
+      tokens = [] of Token
+      pos = 0
+      while pos < text.size
+        state = states[state_stack.last]
+        matched = false
+        state.rules.each do |rule|
+          case rule
+          when Rule # A normal regex rule
+            match = rule.pattern.match(text, pos)
+            # We are matched, move pos to after the match
+            next if match.nil?
+            matched = true
+            pos = match.end
+            # Emit the tokens
+            rule.emitters.each do |emitter|
+              # Emit the token
+              tokens += emitter.emit(match)
+            end
+            # Transform the state
+            rule.transformers.each do |transformer|
+              transformer.transform
+            end
+          when IncludeStateRule
+            # TODO: something
+          end
+          # TODO: Emit error if no rule matched
+        end
+      end
+      tokens
+    end
+
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
       lexer = XML.parse(xml).first_element_child
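
The tokenize loop above leans on two stdlib calls: Regex#match(text, pos) to start the search at the current offset, and MatchData#end to advance past what was just consumed. A tiny sketch of just that scanning mechanism, with a made-up pattern standing in for rule.pattern:

text = "Hello, world!\n"
pos  = 0
while pos < text.size
  match = /[^\n]+|\n/.match(text, pos) # stand-in for rule.pattern
  break if match.nil?
  puts({type: "text", value: match[0]})
  pos = match.end # continue scanning right after this match
end
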
@@ -86,7 +141,7 @@ module Tartrazine
               # We have patter rules
               rule = Rule.new
               begin
-                rule.pattern = /#{rule_node["pattern"]}/
+                rule.pattern = /#{rule_node["pattern"]}/m
               rescue ex : Exception
                 puts "Bad regex in #{l.config[:name]}: #{ex}"
               end
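
The only code change in this hunk is the m option on the compiled pattern. In Crystal's Regex that option enables multiline matching in which "." also matches a newline; a quick illustrative check (the pattern here is just an example, not from a lexer file):

puts /a.b/.matches?("a\nb")  # => false: "." stops at the newline
puts /a.b/m.matches?("a\nb") # => true: with the m option "." crosses it
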
@@ -106,12 +161,11 @@ module Tartrazine
               case node.name
               when "pop", "push", "include", "multi", "combine"
                 transformer = Transformer.new
+                transformer.type = node.name
                 transformer.xml = node.to_s
                 rule.transformers << transformer
               when "bygroups", "combined", "mutators", "token", "using", "usingbygroup", "usingself"
-                emitter = Emitter.new
-                emitter.xml = node.to_s
-                rule.emitters << emitter
+                rule.emitters << Emitter.new(node.name, node)
               end
             end
           end
@@ -123,10 +177,17 @@ module Tartrazine
   end
 end

+# Try loading all lexers
+lexers = {} of String => Tartrazine::Lexer
 Dir.glob("lexers/*.xml").each do |fname|
-  Tartrazine::Lexer.from_xml(File.read(fname))
+  l = Tartrazine::Lexer.from_xml(File.read(fname))
+  lexers[l.config[:name]] = l
 end

+# Parse some plaintext
+puts lexers["plaintext"].tokenize("Hello, world!\n")
+
 # Convenience macros to parse XML
 macro xml_to_s(node, name)
   {{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s