From 70cfbef572f10da2ab7551c832a949cda8a5096e Mon Sep 17 00:00:00 2001
From: Roberto Alsina
Date: Sun, 4 Aug 2024 19:54:25 -0300
Subject: [PATCH] refactor

---
 src/actions.cr    | 102 ++++++++++++++++++++++++++++++++++++++++++++++
 src/rules.cr      |  27 ++++++------
 src/tartrazine.cr | 102 ++--------------------------------------------
 3 files changed, 119 insertions(+), 112 deletions(-)
 create mode 100644 src/actions.cr

diff --git a/src/actions.cr b/src/actions.cr
new file mode 100644
index 0000000..a77f4f9
--- /dev/null
+++ b/src/actions.cr
@@ -0,0 +1,102 @@
+# These are Lexer actions. When a rule matches, it will
+# perform a list of actions. These actions can emit tokens
+# or change the state machine.
+module Tartrazine
+  class Action
+    property type : String
+    property xml : XML::Node
+    property actions : Array(Action) = [] of Action
+
+    def initialize(@type : String, @xml : XML::Node?)
+      # Some actions may have actions in them, like this:
+      # <bygroups>
+      # <token type="GenericPrompt"/>
+      # <token type="Text"/>
+      # <using lexer="bash"/>
+      # </bygroups>
+      # The token actions match with the first 2 groups in the regex
+      # the using action matches the 3rd and shunts it to another lexer
+      @xml.children.each do |node|
+        next unless node.element?
+        @actions << Action.new(node.name, node)
+      end
+    end
+
+    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+      case type
+      when "token"
+        raise Exception.new "Can't have a token without a match" if match.nil?
+        [Token.new(type: xml["type"], value: match[match_group])]
+      when "push"
+        states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
+        if states_to_push.empty?
+          # Push without a state means push the current state
+          states_to_push = [lexer.state_stack.last]
+        end
+        states_to_push.each do |state|
+          if state == "#pop"
+            # Pop the state
+            # puts "Popping state"
+            lexer.state_stack.pop
+          else
+            # Really push
+            lexer.state_stack << state
+            # puts "Pushed #{lexer.state_stack}"
+          end
+        end
+        [] of Token
+      when "pop"
+        depth = xml["depth"].to_i
+        # puts "Popping #{depth} states"
+        if lexer.state_stack.size <= depth
+          # puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}"
+        else
+          lexer.state_stack.pop(depth)
+        end
+        [] of Token
+      when "bygroups"
+        # FIXME: handle
+        # <bygroups>
+        # <token type="..."/>
+        # None
+        # <token type="..."/>
+        # </bygroups>
+        # where that None means skipping a group
+        #
+        raise Exception.new "Can't have a token without a match" if match.nil?
+
+        # Each group matches an action
+
+        result = [] of Token
+        @actions.each_with_index do |e, i|
+          next if match[i + 1]?.nil?
+          result += e.emit(match, lexer, i + 1)
+        end
+        result
+      when "using"
+        # Shunt to another lexer entirely
+        return [] of Token if match.nil?
+        lexer_name = xml["lexer"].downcase
+        # pp! "to tokenize:", match[match_group]
+        LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
+      when "usingself"
+        # Shunt to another copy of this lexer
+        return [] of Token if match.nil?
+
+        new_lexer = Lexer.from_xml(lexer.xml)
+        # pp! "to tokenize:", match[match_group]
"to tokenize:", match[match_group] + new_lexer.tokenize(match[match_group], usingself: true) + when "combined" + # Combine two states into one anonymous state + states = xml.attributes.select { |a| a.name == "state" }.map &.content + new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 } + lexer.states[new_state.name] = new_state + lexer.state_stack << new_state.name + [] of Token + else + raise Exception.new("Unknown action type: #{type}: #{xml}") + end + end + end +end diff --git a/src/rules.cr b/src/rules.cr index a318f3e..eb59114 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -1,3 +1,5 @@ +require "./actions" + # These are lexer rules. They match with the text being parsed # and perform actions, either emitting tokens or changing the # state of the lexer. @@ -5,7 +7,7 @@ module Tartrazine # This rule matches via a regex pattern class Rule property pattern : Regex = Regex.new "" - property emitters : Array(Emitter) = [] of Emitter + property actions : Array(Action) = [] of Action property xml : String = "foo" def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) @@ -16,9 +18,9 @@ module Tartrazine # pp! match, pattern.inspect, text, pos return false, pos, [] of Token if match.nil? || match.end == 0 # Emit the tokens - emitters.each do |emitter| + actions.each do |action| # Emit the token - tokens += emitter.emit(match, lexer) + tokens += action.emit(match, lexer) end # p! xml, match.end, tokens return true, match.end, tokens @@ -27,13 +29,13 @@ module Tartrazine def initialize(node : XML::Node, flags) @xml = node.to_s @pattern = Regex.new(node["pattern"], flags) - add_emitters(node) + add_actions(node) end - def add_emitters(node : XML::Node) + def add_actions(node : XML::Node) node.children.each do |node| next unless node.element? - @emitters << Emitter.new(node.name, node) + @actions << Action.new(node.name, node) end end end @@ -57,24 +59,23 @@ module Tartrazine @xml = node.to_s include_node = node.children.find { |n| n.name == "include" } @state = include_node["state"] if include_node - add_emitters(node) + add_actions(node) end end # This rule always matches, unconditionally - class Always < Rule + class UnconditionalRule < Rule def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) tokens = [] of Token - emitters.each do |emitter| - tokens += emitter.emit(nil, lexer) + actions.each do |action| + tokens += action.emit(nil, lexer) end return true, pos, tokens end def initialize(node : XML::Node) @xml = node.to_s - add_emitters(node) + add_actions(node) end end - -end \ No newline at end of file +end diff --git a/src/tartrazine.cr b/src/tartrazine.cr index 1621462..65a752b 100644 --- a/src/tartrazine.cr +++ b/src/tartrazine.cr @@ -2,13 +2,14 @@ require "base58" require "json" require "xml" require "./rules" +require "./actions" module Tartrazine VERSION = "0.1.0" # This implements a lexer for Pygments RegexLexers as expressed # in Chroma's XML serialization. # - # For explanations on what emitters, transformers, etc do + # For explanations on what actions, transformers, etc do # the Pygments documentation is a good place to start. # https://pygments.org/docs/lexerdevelopment/ class State @@ -24,103 +25,6 @@ module Tartrazine end - class Emitter - property type : String - property xml : XML::Node - property emitters : Array(Emitter) = [] of Emitter - - def initialize(@type : String, @xml : XML::Node?) 
-      # Some emitters may have emitters in them, like this:
-      # <bygroups>
-      # <token type="GenericPrompt"/>
-      # <token type="Text"/>
-      # <using lexer="bash"/>
-      # </bygroups>
-      # The token emitters match with the first 2 groups in the regex
-      # the using emitter matches the 3rd and shunts it to another lexer
-      @xml.children.each do |node|
-        next unless node.element?
-        @emitters << Emitter.new(node.name, node)
-      end
-    end
-
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
-      case type
-      when "token"
-        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
-      when "push"
-        states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
-        if states_to_push.empty?
-          # Push without a state means push the current state
-          states_to_push = [lexer.state_stack.last]
-        end
-        states_to_push.each do |state|
-          if state == "#pop"
-            # Pop the state
-            # puts "Popping state"
-            lexer.state_stack.pop
-          else
-            # Really push
-            lexer.state_stack << state
-            # puts "Pushed #{lexer.state_stack}"
-          end
-        end
-        [] of Token
-      when "pop"
-        depth = xml["depth"].to_i
-        # puts "Popping #{depth} states"
-        if lexer.state_stack.size <= depth
-          # puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}"
-        else
-          lexer.state_stack.pop(depth)
-        end
-        [] of Token
-      when "bygroups"
-        # FIXME: handle
-        # <bygroups>
-        # <token type="..."/>
-        # None
-        # <token type="..."/>
-        # </bygroups>
-        # where that None means skipping a group
-        #
-        raise Exception.new "Can't have a token without a match" if match.nil?
-
-        # Each group matches an emitter
-
-        result = [] of Token
-        @emitters.each_with_index do |e, i|
-          next if match[i + 1]?.nil?
-          result += e.emit(match, lexer, i + 1)
-        end
-        result
-      when "using"
-        # Shunt to another lexer entirely
-        return [] of Token if match.nil?
-        lexer_name = xml["lexer"].downcase
-        # pp! "to tokenize:", match[match_group]
-        LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
-      when "usingself"
-        # Shunt to another copy of this lexer
-        return [] of Token if match.nil?
-
-        new_lexer = Lexer.from_xml(lexer.xml)
-        # pp! "to tokenize:", match[match_group]
-        new_lexer.tokenize(match[match_group], usingself: true)
-      when "combined"
-        # Combine two states into one anonymous state
-        states = xml.attributes.select { |a| a.name == "state" }.map &.content
-        new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
-        lexer.states[new_state.name] = new_state
-        lexer.state_stack << new_state.name
-        [] of Token
-      else
-        raise Exception.new("Unknown emitter type: #{type}: #{xml}")
-      end
-    end
-  end

   alias Token = NamedTuple(type: String, value: String)
@@ -215,7 +119,7 @@ module Tartrazine
         if rule_node.first_element_child.try &.name == "include"
           rule = IncludeStateRule.new(rule_node)
         else
-          rule = Always.new(rule_node)
+          rule = UnconditionalRule.new(rule_node)
         end
       else
         flags = Regex::Options::ANCHORED
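
A quick illustration of the shape this refactor settles on: an Action tree
mirrors the XML tree, and "bygroups" dispatches capture group i+1 to child
action i. Below is a minimal, self-contained Crystal sketch of that dispatch,
not part of the patch: MiniAction, the sample XML, and the sample regex are
hypothetical stand-ins, while the XML::Node and Regex::MatchData calls are the
same ones the patch relies on.

require "xml"

# Same alias the patch uses in tartrazine.cr.
alias Token = NamedTuple(type: String, value: String)

# Hypothetical, trimmed-down stand-in for Action: only the "token" and
# "bygroups" branches, enough to show the recursive construction above.
class MiniAction
  property type : String
  property xml : XML::Node
  property actions : Array(MiniAction) = [] of MiniAction

  def initialize(@type : String, @xml : XML::Node)
    # Child elements become nested actions, as in Action#initialize.
    @xml.children.each do |node|
      next unless node.element?
      @actions << MiniAction.new(node.name, node)
    end
  end

  def emit(match : Regex::MatchData, match_group = 0) : Array(Token)
    case type
    when "token"
      # Emit one token for the capture group this action is bound to.
      [Token.new(type: xml["type"], value: match[match_group])]
    when "bygroups"
      # The i-th child action handles capture group i + 1.
      result = [] of Token
      @actions.each_with_index do |action, i|
        next if match[i + 1]?.nil?
        result += action.emit(match, i + 1)
      end
      result
    else
      [] of Token
    end
  end
end

xml = XML.parse(%(<bygroups><token type="Name"/><token type="Punctuation"/></bygroups>))
root = xml.first_element_child.not_nil!
action = MiniAction.new(root.name, root)
match = "foo:".match(/(\w+)(:)/).not_nil!
pp action.emit(match)
# => [{type: "Name", value: "foo"}, {type: "Punctuation", value: ":"}]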