mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-13 23:12:24 +00:00
Parsing plaintext works!
This commit is contained in:
parent
270f51a811
commit
d47c946e8b
@ -15,9 +15,9 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
class Rule
|
class Rule
|
||||||
property pattern : Regex? = nil
|
property pattern : Regex = Regex.new ""
|
||||||
property emitters : Array(Emitter) = [] of Emitter
|
property emitters : Array(Emitter) = [] of Emitter
|
||||||
property transformers : Array(TRansformer) = [] of TRansformer
|
property transformers : Array(Transformer) = [] of Transformer
|
||||||
end
|
end
|
||||||
|
|
||||||
# This rule includes another state like this:
|
# This rule includes another state like this:
|
||||||
@ -35,14 +35,32 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
class Emitter
|
class Emitter
|
||||||
property type : String = ""
|
property type : String
|
||||||
property xml : String = ""
|
property xml : XML::Node
|
||||||
|
|
||||||
|
def initialize(@type : String, @xml : XML::Node?)
|
||||||
|
end
|
||||||
|
|
||||||
|
def emit(match : Regex::MatchData) : Array(Token)
|
||||||
|
case type
|
||||||
|
when "token"
|
||||||
|
return [Token.new(type: xml["type"], value: match[0])]
|
||||||
|
end
|
||||||
|
[] of Token
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Transformer
|
class Transformer
|
||||||
property xml: String = ""
|
property type : String = ""
|
||||||
|
property xml : String = ""
|
||||||
|
|
||||||
|
def transform
|
||||||
|
puts "Transforming #{type} #{xml}"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
alias Token = NamedTuple(type: String, value: String)
|
||||||
|
|
||||||
class Lexer
|
class Lexer
|
||||||
property config = {
|
property config = {
|
||||||
name: "",
|
name: "",
|
||||||
@ -54,6 +72,43 @@ module Tartrazine
|
|||||||
|
|
||||||
property states = {} of String => State
|
property states = {} of String => State
|
||||||
|
|
||||||
|
property state_stack = ["root"]
|
||||||
|
|
||||||
|
# Turn the text into a list of tokens.
#
# Scans `text` from left to right. At every position the rules of the state
# named by the top of `state_stack` are tried in order, and the first `Rule`
# whose pattern matches exactly at the current position wins: its emitters
# contribute tokens, its transformers run, and scanning resumes right after
# the matched text.
#
# Returns the tokens in the order they were emitted. A character that no
# rule matches is reported as a token of type "Error" and skipped, so the
# scan always reaches the end of the text.
def tokenize(text) : Array(Token)
  tokens = [] of Token
  pos = 0
  while pos < text.size
    state = states[state_stack.last]
    matched = false
    state.rules.each do |rule|
      case rule
      when Rule # A normal regex rule
        match = rule.pattern.match(text, pos)
        next if match.nil?

        # `Regex#match(str, pos)` searches forward from pos, so a hit that
        # begins further along would silently drop the text in between.
        # Only accept a match that starts exactly at the cursor.
        next unless match.begin == pos

        # We are matched, move pos to after the match
        # NOTE(review): a rule that matches the empty string leaves pos
        # unchanged; while transformers do not alter the state stack this
        # loop would not terminate for such a rule - confirm no lexer
        # definition relies on zero-width patterns.
        matched = true
        pos = match.end

        # Emit the tokens (append in place instead of reallocating)
        rule.emitters.each do |emitter|
          tokens.concat(emitter.emit(match))
        end

        # Transform the state
        rule.transformers.each do |transformer|
          transformer.transform
        end

        # First matching rule wins: stop trying the remaining rules and
        # start again from the first rule of the (possibly new) state.
        break
      when IncludeStateRule
        # TODO: something
      end
    end

    # No rule matched here: report the offending character and step over
    # it, otherwise pos would never advance and the loop would spin forever.
    unless matched
      tokens << {type: "Error", value: text[pos].to_s}
      pos += 1
    end
  end
  tokens
end
|
||||||
|
|
||||||
def self.from_xml(xml : String) : Lexer
|
def self.from_xml(xml : String) : Lexer
|
||||||
l = Lexer.new
|
l = Lexer.new
|
||||||
lexer = XML.parse(xml).first_element_child
|
lexer = XML.parse(xml).first_element_child
|
||||||
@ -86,7 +141,7 @@ module Tartrazine
|
|||||||
# We have pattern rules
|
# We have pattern rules
|
||||||
rule = Rule.new
|
rule = Rule.new
|
||||||
begin
|
begin
|
||||||
rule.pattern = /#{rule_node["pattern"]}/
|
rule.pattern = /#{rule_node["pattern"]}/m
|
||||||
rescue ex : Exception
|
rescue ex : Exception
|
||||||
puts "Bad regex in #{l.config[:name]}: #{ex}"
|
puts "Bad regex in #{l.config[:name]}: #{ex}"
|
||||||
end
|
end
|
||||||
@ -106,12 +161,11 @@ module Tartrazine
|
|||||||
case node.name
|
case node.name
|
||||||
when "pop", "push", "include", "multi", "combine"
|
when "pop", "push", "include", "multi", "combine"
|
||||||
transformer = Transformer.new
|
transformer = Transformer.new
|
||||||
|
transformer.type = node.name
|
||||||
transformer.xml = node.to_s
|
transformer.xml = node.to_s
|
||||||
rule.transformers << transformer
|
rule.transformers << transformer
|
||||||
when "bygroups", "combined", "mutators", "token", "using", "usingbygroup", "usingself"
|
when "bygroups", "combined", "mutators", "token", "using", "usingbygroup", "usingself"
|
||||||
emitter = Emitter.new
|
rule.emitters << Emitter.new(node.name, node)
|
||||||
emitter.xml = node.to_s
|
|
||||||
rule.emitters << emitter
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@ -123,10 +177,17 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Try loading all lexers
|
||||||
|
|
||||||
|
lexers = {} of String => Tartrazine::Lexer
|
||||||
Dir.glob("lexers/*.xml").each do |fname|
|
Dir.glob("lexers/*.xml").each do |fname|
|
||||||
Tartrazine::Lexer.from_xml(File.read(fname))
|
l = Tartrazine::Lexer.from_xml(File.read(fname))
|
||||||
|
lexers[l.config[:name]] = l
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Parse some plaintext
|
||||||
|
puts lexers["plaintext"].tokenize("Hello, world!\n")
|
||||||
|
|
||||||
# Convenience macros to parse XML
|
# Convenience macros to parse XML
|
||||||
macro xml_to_s(node, name)
|
macro xml_to_s(node, name)
|
||||||
{{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s
|
{{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s
|
||||||
|
Loading…
Reference in New Issue
Block a user