2024-08-03 19:25:09 -03:00
|
|
|
require "base58"
|
2024-08-03 18:46:45 -03:00
|
|
|
require "json"
|
2024-08-03 19:25:09 -03:00
|
|
|
require "xml"
|
2024-08-04 19:47:54 -03:00
|
|
|
require "./rules"
|
2024-08-04 19:54:25 -03:00
|
|
|
require "./actions"
|
2024-08-02 17:03:39 -03:00
|
|
|
module Tartrazine
  VERSION = "0.1.0"

  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations on what actions, transformers, etc do
  # the Pygments documentation is a good place to start.
  # https://pygments.org/docs/lexerdevelopment/

  # A named group of rules. The lexer keeps a stack of state names and
  # matches the rules of whichever state is on top.
  class State
    property name : String = ""
    property rules = [] of Rule

    # Combine two states into a new one holding the rules of both.
    # The result gets a random base58 name so it can be registered
    # without clashing with an existing state.
    def +(other : State)
      new_state = State.new
      new_state.name = Random.base58(8)
      new_state.rules = rules + other.rules
      new_state
    end
  end

  alias Token = NamedTuple(type: String, value: String)

  # Registry of all loaded lexers, keyed by lowercased name and aliases.
  LEXERS = {} of String => Tartrazine::Lexer

  class Lexer
    property config = {
      name:             "",
      aliases:          [] of String,
      filenames:        [] of String,
      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }
    # The raw XML this lexer was built from (kept for debugging/delegation).
    property xml : String = ""

    property states = {} of String => State

    property state_stack = ["root"]

    # Turn the text into a list of tokens.
    #
    # `usingself` is set when this lexer is invoked from within another
    # lexer; in that case the trailing-newline fixup is skipped.
    def tokenize(text, usingself = false) : Array(Token)
      @state_stack = ["root"]
      tokens = [] of Token
      pos = 0
      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
        text += "\n"
      end
      while pos < text.size
        # FIX: reset per iteration. Previously `matched` was initialized
        # once before the loop, so a stale `true` (e.g. when the current
        # state has no rules to try) skipped the error branch below and
        # `pos` never advanced, looping forever.
        matched = false
        state = states[@state_stack.last]
        # puts "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}"
        state.rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(text, pos, self)
          # puts "NOT MATCHED: #{rule.xml}"
          next unless matched
          # puts "MATCHED: #{rule.xml}"
          pos = new_pos
          tokens += new_tokens
          break # We go back to processing with current state
        end
        # If no rule matches, emit an error token and advance one char
        unless matched
          # p! "Error at #{pos}"
          tokens << {type: "Error", value: "#{text[pos]}"}
          pos += 1
        end
      end
      tokens.reject { |t| t[:value] == "" }
    end

    # Build a Lexer from Chroma's XML serialization: a <lexer> element
    # containing a <config> (metadata/flags) and <rules> (states, each
    # holding rules). Raises on duplicate state names.
    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      l.xml = xml
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |n| n.name == "config" }
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
            aliases:          xml_to_a(config, _alias) || [] of String,
            filenames:        xml_to_a(config, filename) || [] of String,
            mime_types:       xml_to_a(config, mime_type) || [] of String,
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            # FIXME: dot_all is parsed here but not yet honored everywhere
            # during matching — confirm before relying on it.
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end

        rules = lexer.children.find { |n| n.name == "rules" }
        if rules
          # Rules contains states 🤷
          rules.children.select { |n| n.name == "state" }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              raise Exception.new("Duplicate state: #{state.name}")
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |n| n.name == "rule" }.each do |rule_node|
              case rule_node["pattern"]?
              when nil
                # No pattern: either an include of another state, or an
                # unconditional rule that always fires.
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new(rule_node)
                else
                  rule = UnconditionalRule.new(rule_node)
                end
              else
                # Regex flags come from the lexer-wide config.
                flags = Regex::Options::ANCHORED
                flags |= Regex::Options::MULTILINE unless l.config[:not_multiline]
                flags |= Regex::Options::IGNORE_CASE if l.config[:case_insensitive]
                flags |= Regex::Options::DOTALL if l.config[:dot_all]
                rule = Rule.new(rule_node, flags)
              end
              state.rules << rule
            end
          end
        end
      end
      l
    end
  end
end
|
|
|
|
|
2024-08-03 06:05:29 -03:00
|
|
|
# Try loading all lexers shipped alongside the binary. Files that fail
# to parse are skipped; this is deliberately best-effort.
lexers = Tartrazine::LEXERS
Dir.glob("lexers/*.xml").each do |path|
  begin
    lexer = Tartrazine::Lexer.from_xml(File.read(path))
  rescue ex : Exception
    # p! ex
    next
  end
  # Register under the canonical name and under every alias, lowercased.
  lexers[lexer.config[:name].downcase] = lexer
  lexer.config[:aliases].each do |alias_name|
    lexers[alias_name.downcase] = lexer
  end
end
|
2024-08-02 17:09:05 -03:00
|
|
|
|
|
|
|
# Convenience macros to parse XML
|
|
|
|
# Convenience macro: text content of the first child element of {{node}}
# named {{name}}, or nil if absent. A leading "_" on the macro argument
# is stripped so Crystal keywords can be passed (e.g. `_alias` → "alias").
macro xml_to_s(node, name)
  {{node}}.children.find { |child| child.name == "{{name}}".lstrip("_") }.try &.content.to_s
end
|
|
|
|
|
2024-08-03 05:05:01 -03:00
|
|
|
# Like xml_to_s but coerces the element's text content to Float64;
# yields nil when no matching child element exists.
macro xml_to_f(node, name)
  ({{node}}.children.find { |child| child.name == "{{name}}".lstrip("_") }.try &.content.to_s.to_f)
end
|
|
|
|
|
|
|
|
# Collect the text content of ALL child elements of {{node}} named
# {{name}} into an Array(String) (leading "_" stripped, as in xml_to_s).
macro xml_to_a(node, name)
  {{node}}.children.select { |child| child.name == "{{name}}".lstrip("_") }.map { |child| child.content.to_s }
end
|
2024-08-03 09:42:32 -03:00
|
|
|
|
2024-08-03 18:46:45 -03:00
|
|
|
|
2024-08-04 19:18:43 -03:00
|
|
|
# # #<Regex::Error:Regex match error: match limit exceeded>
|
|
|
|
# next if testname == "tests/fortran/test_string_cataback.txt"
|
2024-08-03 18:46:45 -03:00
|
|
|
|
2024-08-04 19:18:43 -03:00
|
|
|
# # Difference is different unicode representation of a string literal
|
|
|
|
# next if testname == "tests/java/test_string_literals.txt"
|
|
|
|
# next if testname == "tests/systemd/example1.txt"
|
|
|
|
# next if testname == "tests/json/test_strings.txt"
|
2024-08-03 21:37:22 -03:00
|
|
|
|
2024-08-04 19:18:43 -03:00
|
|
|
# # Tartrazine agrees with pygments, disagrees with chroma
|
|
|
|
# next if testname == "tests/java/test_default.txt"
|
|
|
|
# next if testname == "tests/java/test_numeric_literals.txt"
|
|
|
|
# next if testname == "tests/java/test_multiline_string.txt"
|
|
|
|
|
|
|
|
# # Tartrazine disagrees with pygments and chroma, but it's fine
|
|
|
|
# next if testname == "tests/php/test_string_escaping_run.txt"
|
|
|
|
|
|
|
|
# # Chroma's output is bad, but so is Tartrazine's
|
|
|
|
# next if "tests/html/javascript_unclosed.txt" == testname
|
|
|
|
|
|
|
|
# # KNOWN BAD -- TO FIX
|
|
|
|
# next if "tests/html/css_backtracking.txt" == testname
|
|
|
|
# next if "tests/php/anonymous_class.txt" == testname
|
|
|
|
# next if "tests/c/test_string_resembling_decl_end.txt" == testname
|
|
|
|
# next if "tests/mcfunction/data.txt" == testname
|
|
|
|
# next if "tests/mcfunction/selectors.txt" == testname
|
2024-08-03 21:37:22 -03:00
|
|
|
|