require "base58" require "json" require "xml" module Tartrazine VERSION = "0.1.0" # This implements a lexer for Pygments RegexLexers as expressed # in Chroma's XML serialization. # # For explanations on what emitters, transformers, etc do # the Pygments documentation is a good place to start. # https://pygments.org/docs/lexerdevelopment/ class State property name : String = "" property rules = [] of Rule def +(other : State) new_state = State.new new_state.name = Random.base58(8) new_state.rules = rules + other.rules new_state end end class Rule property pattern : Regex = Regex.new "" property emitters : Array(Emitter) = [] of Emitter property transformers : Array(Transformer) = [] of Transformer property xml : String = "foo" def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) tokens = [] of Token match = pattern.match(text, pos) # We don't match if the match doesn't move the cursor # because that causes infinite loops # pp! match, pattern.inspect, text, pos return false, pos, [] of Token if match.nil? || match.end == 0 # Emit the tokens emitters.each do |emitter| # Emit the token tokens += emitter.emit(match, lexer) end # p! xml, match.end, tokens return true, match.end, tokens end end # This rule includes another state like this: # # # # # # # # ... class IncludeStateRule < Rule property state : String = "" def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) puts "Including state #{state} from #{lexer.state_stack.last}" lexer.states[state].rules.each do |rule| matched, new_pos, new_tokens = rule.match(text, pos, lexer) # p! xml, new_pos, new_tokens if matched return true, new_pos, new_tokens if matched end return false, pos, [] of Token end end # These rules look like this: # # # # They match, don't move pos, probably alter # the stack, probably not generate tokens class Always < Rule def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) tokens = [] of Token emitters.each do |emitter| tokens += emitter.emit(nil, lexer) end return true, pos, tokens end end class Emitter property type : String property xml : XML::Node property emitters : Array(Emitter) = [] of Emitter def initialize(@type : String, @xml : XML::Node?) # Some emitters may have emitters in them, like this: # # # # # # # The token emitters match with the first 2 groups in the regex # the using emitter matches the 3rd and shunts it to another lexer @xml.children.each do |node| next unless node.element? @emitters << Emitter.new(node.name, node) end end def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) case type when "token" raise Exception.new "Can't have a token without a match" if match.nil? [Token.new(type: xml["type"], value: match[match_group])] when "push" states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content if states_to_push.empty? # Push without a state means push the current state states_to_push = [lexer.state_stack.last] end states_to_push.each do |state| if state == "#pop" # Pop the state puts "Popping state" lexer.state_stack.pop else # Really push lexer.state_stack << state puts "Pushed #{lexer.state_stack}" end end [] of Token when "pop" depth = xml["depth"].to_i puts "Popping #{depth} states" if lexer.state_stack.size <= depth puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}" else lexer.state_stack.pop(depth) end [] of Token when "bygroups" # FIXME: handle # > # # None # # # where that None means skipping a group # raise Exception.new "Can't have a token without a match" if match.nil? 
        # Each group matches an emitter
        result = [] of Token
        @emitters.each_with_index do |e, i|
          next if match[i + 1]?.nil?
          result += e.emit(match, lexer, i + 1)
        end
        result
      when "using"
        # Shunt to another lexer entirely
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        # pp! "to tokenize:", match[match_group]
        LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
      when "usingself"
        # Shunt to another copy of this lexer
        return [] of Token if match.nil?
        new_lexer = Lexer.from_xml(lexer.xml)
        # pp! "to tokenize:", match[match_group]
        new_lexer.tokenize(match[match_group], usingself: true)
      when "combined"
        # Combine two or more states into one anonymous state
        states = xml.attributes.select { |a| a.name == "state" }.map &.content
        new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
        lexer.states[new_state.name] = new_state
        lexer.state_stack << new_state.name
        [] of Token
      else
        raise Exception.new("Unknown emitter type: #{type}: #{xml}")
      end
    end
  end

  class Transformer
    property type : String = ""
    property xml : String = ""

    def transform
      puts "Transforming #{type} #{xml}"
    end
  end

  alias Token = NamedTuple(type: String, value: String)

  LEXERS = {} of String => Tartrazine::Lexer

  class Lexer
    property config = {
      name:             "",
      aliases:          [] of String,
      filenames:        [] of String,
      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }
    property xml : String = ""
    property states = {} of String => State
    property state_stack = ["root"]

    # Turn the text into a list of tokens.
    def tokenize(text, usingself = false) : Array(Token)
      @state_stack = ["root"]
      tokens = [] of Token
      pos = 0
      matched = false
      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
        text += "\n"
      end
      while pos < text.size
        state = states[@state_stack.last]
        puts "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}"
        state.rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(text, pos, self)
          unless matched
            puts "NOT MATCHED: #{rule.xml}"
            next
          end
          puts "MATCHED: #{rule.xml}"
          pos = new_pos
          tokens += new_tokens
          break # We go back to processing with the current state
        end
        # If no rule matched, emit an error token and move on
        unless matched
          # p! "Error at #{pos}"
          tokens << {type: "Error", value: "#{text[pos]}"}
          pos += 1
        end
      end
      tokens.reject { |t| t[:value] == "" }
    end

    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      l.xml = xml
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |n| n.name == "config" }
        if config
          l.config = {
            name:          xml_to_s(config, name) || "",
            aliases:       xml_to_a(config, _alias) || [] of String,
            filenames:     xml_to_a(config, filename) || [] of String,
            mime_types:    xml_to_a(config, mime_type) || [] of String,
            priority:      xml_to_f(config, priority) || 0.0,
            not_multiline: xml_to_s(config, not_multiline) == "true",
            # FIXME: This has no effect yet
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end
        rules = lexer.children.find { |n| n.name == "rules" }
        if rules
          # Rules contains states 🤷
          rules.children.select { |n| n.name == "state" }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              puts "Duplicate state: #{state.name}"
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |n| n.name == "rule" }.each do |rule_node|
              case rule_node["pattern"]?
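              # A sketch of the two rule shapes handled below: a rule with a
              # pattern attribute, like the (hypothetical)
              #
              #   <rule pattern="\s+"><token type="Text"/></rule>
              #
              # becomes a regex Rule, while a pattern-less rule is either an
              # <include> of another state or an Always rule that only
              # manipulates the stack.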
              when nil
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new
                  rule.xml = rule_node.to_s
                  include_node = rule_node.children.find { |n| n.name == "include" }
                  rule.state = include_node["state"] if include_node
                  state.rules << rule
                else
                  rule = Always.new
                  rule.xml = rule_node.to_s
                  state.rules << rule
                end
              else
                flags = Regex::Options::ANCHORED
                flags |= Regex::Options::MULTILINE unless l.config[:not_multiline]
                flags |= Regex::Options::IGNORE_CASE if l.config[:case_insensitive]
                flags |= Regex::Options::DOTALL if l.config[:dot_all]
                rule = Rule.new
                rule.xml = rule_node.to_s
                rule.pattern = Regex.new(rule_node["pattern"], flags)
                state.rules << rule
              end
              next if rule.nil?
              # Rules may contain emitters and transformers:
              # emitters emit tokens, transformers do things to
              # the state stack.
              rule_node.children.each do |node|
                next unless node.element?
                rule.emitters << Emitter.new(node.name, node)
              end
            end
          end
        end
      end
      l
    end
  end
end

# Try loading all lexers
lexers = Tartrazine::LEXERS
Dir.glob("lexers/*.xml").each do |fname|
  begin
    l = Tartrazine::Lexer.from_xml(File.read(fname))
  rescue ex : Exception
    # p! ex
    next
  end
  lexers[l.config[:name].downcase] = l
  l.config[:aliases].each do |key|
    lexers[key.downcase] = l
  end
end

# Convenience macros to parse XML
macro xml_to_s(node, name)
  {{node}}.children.find { |n| n.name == "{{name}}".lstrip("_") }.try &.content.to_s
end

macro xml_to_f(node, name)
  ({{node}}.children.find { |n| n.name == "{{name}}".lstrip("_") }.try &.content.to_s.to_f)
end

macro xml_to_a(node, name)
  {{node}}.children.select { |n| n.name == "{{name}}".lstrip("_") }.map { |n| n.content.to_s }
end

# Let's run some tests

def chroma_tokenize(lexer, text)
  output = IO::Memory.new
  input = IO::Memory.new(text)
  Process.run(
    "chroma",
    ["-f", "json", "-l", lexer],
    input: input, output: output
  )
  Array(Tartrazine::Token).from_json(output.to_s)
end

def test_file(testname, lexer)
  test = File.read(testname).split("---input---\n").last.split("---tokens---").first
  begin
    tokens = collapse_tokens(lexer.tokenize(test))
  rescue ex : Exception
    puts ">>>ERROR"
    raise ex
  end
  outp = IO::Memory.new
  i = IO::Memory.new(test)
  lname = lexer.config[:name]
  Process.run(
    "chroma",
    ["-f", "json", "-l", lname],
    input: i, output: outp
  )
  chroma_tokens = collapse_tokens(Array(Tartrazine::Token).from_json(outp.to_s))
  if chroma_tokens != tokens
    puts ">>>BAD - #{testname}"
  else
    puts ">>>GOOD"
  end
end

def collapse_tokens(tokens : Array(Tartrazine::Token))
  result = [] of Tartrazine::Token
  tokens.each do |token|
    if result.empty?
      result << token
      next
    end
    last = result.last
    if last[:type] == token[:type]
      new_token = {type: last[:type], value: last[:value] + token[:value]}
      result.pop
      result << new_token
    else
      result << token
    end
  end
  result
end

total = 0
Dir.glob("tests/*/") do |lexername|
  key = File.basename(lexername).downcase
  # next if key == "console"
  next unless lexers.has_key? key
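  # Each tests/<name>/ directory is matched against the lexer registered
  # under <name> (a config name or alias), so e.g. tests/java/ runs
  # against the "java" lexer loaded from lexers/*.xml above.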
  lexer = lexers[key]
  Dir.glob("#{lexername}*.txt") do |testname|
    # next if testname == "tests/fortran/test_string_cataback.txt"

    # Difference is a different unicode representation of a string literal
    next if testname == "tests/java/test_string_literals.txt"
    next if testname == "tests/systemd/example1.txt"
    next if testname == "tests/json/test_strings.txt"

    # Tartrazine agrees with pygments, disagrees with chroma
    next if testname == "tests/java/test_default.txt"
    next if testname == "tests/java/test_numeric_literals.txt"
    next if testname == "tests/java/test_multiline_string.txt"

    # Tartrazine disagrees with pygments and chroma, but it's fine
    next if testname == "tests/php/test_string_escaping_run.txt"

    # Chroma's output is bad, but so is Tartrazine's
    next if testname == "tests/html/javascript_unclosed.txt"

    # KNOWN BAD -- TO FIX
    next if testname == "tests/html/css_backtracking.txt"
    next if testname == "tests/php/anonymous_class.txt"
    next if testname == "tests/c/test_string_resembling_decl_end.txt"
    next if testname == "tests/mcfunction/data.txt"
    next if testname == "tests/mcfunction/selectors.txt"

    # I disagree with these tests
    # next if testname.starts_with? "tests/console"

    puts "Testing #{key} with #{testname}"
    total += 1
    test_file(testname, lexer)
  end
end
puts ">>>TOTAL #{total}"
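# A minimal usage sketch (assuming lexers/ contains a definition whose
# name or alias is "crystal"; adjust the key and input to taste):
#
#   lexer = lexers["crystal"]
#   lexer.tokenize(%(puts "hello")).each do |token|
#     puts "#{token[:type]} -> #{token[:value].inspect}"
#   end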