require "base58"
require "json"
require "xml"

module Tartrazine
  VERSION = "0.1.0"

  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations of what emitters, transformers, etc. do,
  # the Pygments documentation is a good place to start:
  # https://pygments.org/docs/lexerdevelopment/
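  #
  # As a rough illustration (the element values below are made up, and real
  # lexer files are much longer), the XML this module consumes looks like:
  #
  # <lexer>
  #   <config>
  #     <name>MyLang</name>
  #     <alias>mylang</alias>
  #     <filename>*.mylang</filename>
  #   </config>
  #   <rules>
  #     <state name="root">
  #       <rule pattern="\s+">
  #         <token type="Text"/>
  #       </rule>
  #     </state>
  #   </rules>
  # </lexer>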
  class State
    property name : String = ""
    property rules = [] of Rule

    def +(other : State)
      new_state = State.new
      new_state.name = Random.base58(8)
      new_state.rules = rules + other.rules
      new_state
    end
  end
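
  # State#+ is what the "combined" emitter below relies on: it builds an
  # anonymous state whose rules are the concatenation of both operands.
  # A rough sketch (the state names here are made up):
  #
  #   combined = lexer.states["string"] + lexer.states["interp"]
  #   combined.name  # => a random base58 string, e.g. "4f8Zq2Lx"
  #   combined.rules # => rules of "string" followed by the rules of "interp"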

  class Rule
    property pattern : Regex = Regex.new ""
    property emitters : Array(Emitter) = [] of Emitter
    property transformers : Array(Transformer) = [] of Transformer
    property xml : String = "foo"

    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
      tokens = [] of Token
      match = pattern.match(text, pos)
      # We don't match if the match doesn't move the cursor
      # because that causes infinite loops
      # pp! match, pattern.inspect, text, pos
      return false, pos, [] of Token if match.nil? || match.end == 0
      # Emit the tokens
      emitters.each do |emitter|
        # Emit the token
        tokens += emitter.emit(match, lexer)
      end
      # p! xml, match.end, tokens
      return true, match.end, tokens
    end
  end

  # This rule includes another state like this:
  #
  # <rule>
  #   <include state="interp"/>
  # </rule>
  # </state>
  # <state name="interp">
  #   <rule pattern="\$\(\(">
  #     <token type="Keyword"/>
  # ...
  class IncludeStateRule < Rule
    property state : String = ""

    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
      puts "Including state #{state} from #{lexer.state_stack.last}"
      lexer.states[state].rules.each do |rule|
        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
        # p! xml, new_pos, new_tokens if matched
        return true, new_pos, new_tokens if matched
      end
      return false, pos, [] of Token
    end
  end

  # These rules look like this:
  #
  # <rule>
  #   <pop depth="1"/>
  # </rule>
  #
  # They always match, don't move pos, usually alter
  # the state stack, and usually don't generate tokens.
  class Always < Rule
    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
      tokens = [] of Token
      emitters.each do |emitter|
        tokens += emitter.emit(nil, lexer)
      end
      return true, pos, tokens
    end
  end

  class Emitter
    property type : String
    property xml : XML::Node
    property emitters : Array(Emitter) = [] of Emitter

    def initialize(@type : String, @xml : XML::Node)
      # Some emitters may have emitters in them, like this:
      #
      # <bygroups>
      #   <token type="GenericPrompt"/>
      #   <token type="Text"/>
      #   <using lexer="bash"/>
      # </bygroups>
      #
      # The token emitters match with the first 2 groups in the regex,
      # the using emitter matches the 3rd and shunts it to another lexer.
      @xml.children.each do |node|
        next unless node.element?
        @emitters << Emitter.new(node.name, node)
      end
    end

    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
      case type
      when "token"
        raise Exception.new "Can't have a token without a match" if match.nil?
        [Token.new(type: xml["type"], value: match[match_group])]
      when "push"
        states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
        if states_to_push.empty?
          # Push without a state means push the current state
          states_to_push = [lexer.state_stack.last]
        end
        states_to_push.each do |state|
          if state == "#pop"
            # Pop the state
            puts "Popping state"
            lexer.state_stack.pop
          else
            # Really push
            lexer.state_stack << state
            puts "Pushed #{lexer.state_stack}"
          end
        end
        [] of Token
      when "pop"
        depth = xml["depth"].to_i
        puts "Popping #{depth} states"
        if lexer.state_stack.size <= depth
          puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}"
        else
          lexer.state_stack.pop(depth)
        end
        [] of Token
      when "bygroups"
        # FIXME: handle
        #
        # <bygroups>
        #   <token type="Punctuation"/>
        #   None
        #   <token type="LiteralStringRegex"/>
        #
        # where that None means skipping a group
        #
        raise Exception.new "Can't have a token without a match" if match.nil?

        # Each group matches an emitter
        result = [] of Token
        @emitters.each_with_index do |e, i|
          next if match[i + 1]?.nil?
          result += e.emit(match, lexer, i + 1)
        end
        result
      when "using"
        # Shunt to another lexer entirely
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        # pp! "to tokenize:", match[match_group]
        LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
      when "usingself"
        # Shunt to another copy of this lexer
        return [] of Token if match.nil?

        new_lexer = Lexer.from_xml(lexer.xml)
        # pp! "to tokenize:", match[match_group]
        new_lexer.tokenize(match[match_group], usingself: true)
      when "combined"
        # Combine two states into one anonymous state
        states = xml.attributes.select { |a| a.name == "state" }.map &.content
        new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
        lexer.states[new_state.name] = new_state
        lexer.state_stack << new_state.name
        [] of Token
      else
        raise Exception.new("Unknown emitter type: #{type}: #{xml}")
      end
    end
  end
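
  # To illustrate the stack-manipulating emitters (the pattern and state name
  # below are made up): a rule such as
  #
  #   <rule pattern="'">
  #     <token type="LiteralString"/>
  #     <push state="string"/>
  #   </rule>
  #
  # emits a LiteralString token for the quote and leaves state_stack as
  # ["root", "string"]; a later <pop depth="1"/> drops it back to ["root"].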

  class Transformer
    property type : String = ""
    property xml : String = ""

    def transform
      puts "Transforming #{type} #{xml}"
    end
  end

  alias Token = NamedTuple(type: String, value: String)
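
  # A Token is just a type/value pair, e.g. {type: "Keyword", value: "def"};
  # the type names follow chroma's JSON output so the two can be compared
  # directly in the tests below.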

  LEXERS = {} of String => Tartrazine::Lexer

  class Lexer
    property config = {
      name:             "",
      aliases:          [] of String,
      filenames:        [] of String,
      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }
    property xml : String = ""

    property states = {} of String => State

    property state_stack = ["root"]

    # Turn the text into a list of tokens.
    def tokenize(text, usingself = false) : Array(Token)
      @state_stack = ["root"]
      tokens = [] of Token
      pos = 0
      matched = false
      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
        text += "\n"
      end
      while pos < text.size
        state = states[@state_stack.last]
        puts "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}"
        state.rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(text, pos, self)
          unless matched
            puts "NOT MATCHED: #{rule.xml}"
            next
          end
          puts "MATCHED: #{rule.xml}"

          pos = new_pos
          tokens += new_tokens
          break # We go back to processing with current state
        end
        # If no rule matches, emit an error token
        unless matched
          # p! "Error at #{pos}"
          tokens << {type: "Error", value: "#{text[pos]}"}
          pos += 1
        end
      end
      tokens.reject { |t| t[:value] == "" }
    end

    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      l.xml = xml
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |n| n.name == "config" }
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
            aliases:          xml_to_a(config, _alias) || [] of String,
            filenames:        xml_to_a(config, filename) || [] of String,
            mime_types:       xml_to_a(config, mime_type) || [] of String,
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            # FIXME: This has no effect yet (see )
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end

        rules = lexer.children.find { |n| n.name == "rules" }
        if rules
          # Rules contains states 🤷
          rules.children.select { |n| n.name == "state" }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              puts "Duplicate state: #{state.name}"
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |n| n.name == "rule" }.each do |rule_node|
              case rule_node["pattern"]?
              when nil
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new
                  rule.xml = rule_node.to_s
                  include_node = rule_node.children.find { |n| n.name == "include" }
                  rule.state = include_node["state"] if include_node
                  state.rules << rule
                else
                  rule = Always.new
                  rule.xml = rule_node.to_s
                  state.rules << rule
                end
              else
                flags = Regex::Options::ANCHORED
                flags |= Regex::Options::MULTILINE unless l.config[:not_multiline]
                flags |= Regex::Options::IGNORE_CASE if l.config[:case_insensitive]
                flags |= Regex::Options::DOTALL if l.config[:dot_all]
                rule = Rule.new
                rule.xml = rule_node.to_s
                rule.pattern = Regex.new(rule_node["pattern"], flags)
                state.rules << rule
              end

              next if rule.nil?
              # Rules contain maybe an emitter and maybe a transformer
              # emitters emit tokens, transformers do things to
              # the state stack.
              rule_node.children.each do |node|
                next unless node.element?
                rule.emitters << Emitter.new(node.name, node)
              end
            end
          end
        end
      end
      l
    end
  end
end
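
# A rough usage sketch (the lexer file name is illustrative; any XML file
# under lexers/ works the same way):
#
#   lexer = Tartrazine::Lexer.from_xml(File.read("lexers/c.xml"))
#   pp lexer.tokenize("int main() { return 0; }")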

# Try loading all lexers

lexers = Tartrazine::LEXERS

Dir.glob("lexers/*.xml").each do |fname|
  begin
    l = Tartrazine::Lexer.from_xml(File.read(fname))
  rescue ex : Exception
    # p! ex
    next
  end
  lexers[l.config[:name].downcase] = l
  l.config[:aliases].each do |key|
    lexers[key.downcase] = l
  end
end

# Convenience macros to parse XML
macro xml_to_s(node, name)
  {{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s
end

macro xml_to_f(node, name)
  ({{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s.to_f)
end

macro xml_to_a(node, name)
  {{node}}.children.select{|n| n.name == "{{name}}".lstrip("_")}.map {|n| n.content.to_s}
end
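
# For instance, `xml_to_a(config, _alias)` expands to roughly
#
#   config.children.select { |n| n.name == "_alias".lstrip("_") }.map { |n| n.content.to_s }
#
# i.e. it collects the content of every <alias> child; the leading underscore
# is only there because `alias` is a Crystal keyword, and lstrip("_") removes
# it before comparing against the element name.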

# Let's run some tests

def chroma_tokenize(lexer, text)
  output = IO::Memory.new
  input = IO::Memory.new(text)
  Process.run(
    "chroma",
    ["-f", "json", "-l", lexer],
    input: input, output: output
  )
  Array(Tartrazine::Token).from_json(output.to_s)
end

def test_file(testname, lexer)
  test = File.read(testname).split("---input---\n").last.split("---tokens---").first
  begin
    tokens = collapse_tokens(lexer.tokenize(test))
  rescue ex : Exception
    puts ">>>ERROR"
    raise ex
  end
  outp = IO::Memory.new
  i = IO::Memory.new(test)
  lname = lexer.config[:name]
  Process.run(
    "chroma",
    ["-f", "json", "-l", lname], input: i, output: outp
  )
  chroma_tokens = collapse_tokens(Array(Tartrazine::Token).from_json(outp.to_s))
  if chroma_tokens != tokens
    puts ">>>BAD - #{testname}"
  else
    puts ">>>GOOD"
  end
end

def collapse_tokens(tokens : Array(Tartrazine::Token))
  result = [] of Tartrazine::Token

  tokens.each do |token|
    if result.empty?
      result << token
      next
    end
    last = result.last
    if last[:type] == token[:type]
      new_token = {type: last[:type], value: last[:value] + token[:value]}
      result.pop
      result << new_token
    else
      result << token
    end
  end
  result
end
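
# For example (a made-up token stream), collapse_tokens turns
#
#   [{type: "Text", value: "  "}, {type: "Text", value: "\n"}, {type: "Keyword", value: "def"}]
#
# into
#
#   [{type: "Text", value: "  \n"}, {type: "Keyword", value: "def"}]
#
# so differences in how Tartrazine and chroma split runs of the same token
# type don't show up as spurious mismatches.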

total = 0
Dir.glob("tests/*/") do |lexername|
  key = File.basename(lexername).downcase
  # next if key == "console"
  next unless lexers.has_key? key
  lexer = lexers[key]

  Dir.glob("#{lexername}*.txt") do |testname|
    # #<Regex::Error:Regex match error: match limit exceeded>
    next if testname == "tests/fortran/test_string_cataback.txt"

    # Difference is different unicode representation of a string literal
    next if testname == "tests/java/test_string_literals.txt"
    next if testname == "tests/systemd/example1.txt"
    next if testname == "tests/json/test_strings.txt"

    # Tartrazine agrees with pygments, disagrees with chroma
    next if testname == "tests/java/test_default.txt"
    next if testname == "tests/java/test_numeric_literals.txt"
    next if testname == "tests/java/test_multiline_string.txt"

    # Tartrazine disagrees with pygments and chroma, but it's fine
    next if testname == "tests/php/test_string_escaping_run.txt"

    # Chroma's output is bad, but so is Tartrazine's
    next if testname == "tests/html/javascript_unclosed.txt"

    # KNOWN BAD -- TO FIX
    next if testname == "tests/html/css_backtracking.txt"
    next if testname == "tests/php/anonymous_class.txt"
    next if testname == "tests/c/test_string_resembling_decl_end.txt"
    next if testname == "tests/mcfunction/data.txt"
    next if testname == "tests/mcfunction/selectors.txt"

    # I disagree with these tests
    # next if testname.starts_with? "tests/console"

    puts "Testing #{key} with #{testname}"
    total += 1
    test_file(testname, lexer)
  end
end
puts ">>>TOTAL #{total}"