This commit is contained in:
Roberto Alsina 2024-08-04 20:00:48 -03:00
parent 696227a935
commit 935e72c18e
3 changed files with 11 additions and 47 deletions

View File

@ -74,7 +74,7 @@ end
# Helper that creates lexer and tokenizes # Helper that creates lexer and tokenizes
def tokenize(lexer_name, text) def tokenize(lexer_name, text)
lexer = Tartrazine::Lexer.from_xml(File.read("lexers/#{lexer_name}.xml")) lexer = Tartrazine.get_lexer(lexer_name)
collapse_tokens(lexer.tokenize(text)) collapse_tokens(lexer.tokenize(text))
end end

View File

@ -66,8 +66,8 @@ module Tartrazine
# #
raise Exception.new "Can't have a token without a match" if match.nil? raise Exception.new "Can't have a token without a match" if match.nil?
# Each group matches an action # Each group matches an action. If a group did not take part
# in the match (its capture is nil), its action is skipped.
result = [] of Token result = [] of Token
@actions.each_with_index do |e, i| @actions.each_with_index do |e, i|
next if match[i + 1]?.nil? next if match[i + 1]?.nil?
@ -79,7 +79,7 @@ module Tartrazine
return [] of Token if match.nil? return [] of Token if match.nil?
lexer_name = xml["lexer"].downcase lexer_name = xml["lexer"].downcase
# pp! "to tokenize:", match[match_group] # pp! "to tokenize:", match[match_group]
LEXERS[lexer_name].tokenize(match[match_group], usingself: true) Tartrazine.get_lexer(lexer_name).tokenize(match[match_group], usingself: true)
when "usingself" when "usingself"
# Shunt to another copy of this lexer # Shunt to another copy of this lexer
return [] of Token if match.nil? return [] of Token if match.nil?

View File

@ -13,6 +13,10 @@ module Tartrazine
# For explanations on what actions, transformers, etc do # For explanations on what actions, transformers, etc do
# the Pygments documentation is a good place to start. # the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/ # https://pygments.org/docs/lexerdevelopment/
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
class State class State
property name : String = "" property name : String = ""
property rules = [] of Rule property rules = [] of Rule
@ -25,10 +29,9 @@ module Tartrazine
end end
end end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String) alias Token = NamedTuple(type: String, value: String)
LEXERS = {} of String => Tartrazine::Lexer
class Lexer class Lexer
property config = { property config = {
name: "", name: "",
@ -135,22 +138,9 @@ module Tartrazine
l l
end end
end end
end
# Try loading all lexers def self.get_lexer(name : String) : Lexer
Lexer.from_xml(File.read("lexers/#{name}.xml"))
lexers = Tartrazine::LEXERS
Dir.glob("lexers/*.xml").each do |fname|
begin
l = Tartrazine::Lexer.from_xml(File.read(fname))
rescue ex : Exception
# p! ex
next
end
lexers[l.config[:name].downcase] = l
l.config[:aliases].each do |key|
lexers[key.downcase] = l
end end
end end
@ -166,29 +156,3 @@ end
macro xml_to_a(node, name) macro xml_to_a(node, name)
{{node}}.children.select{|n| n.name == "{{name}}".lstrip("_")}.map {|n| n.content.to_s} {{node}}.children.select{|n| n.name == "{{name}}".lstrip("_")}.map {|n| n.content.to_s}
end end
# # #<Regex::Error:Regex match error: match limit exceeded>
# next if testname == "tests/fortran/test_string_cataback.txt"
# # Difference is different unicode representation of a string literal
# next if testname == "tests/java/test_string_literals.txt"
# next if testname == "tests/systemd/example1.txt"
# next if testname == "tests/json/test_strings.txt"
# # Tartrazine agrees with pygments, disagrees with chroma
# next if testname == "tests/java/test_default.txt"
# next if testname == "tests/java/test_numeric_literals.txt"
# next if testname == "tests/java/test_multiline_string.txt"
# # Tartrazine disagrees with pygments and chroma, but it's fine
# next if testname == "tests/php/test_string_escaping_run.txt"
# # Chroma's output is bad, but so is Tartrazine's
# next if "tests/html/javascript_unclosed.txt" == testname
# # KNOWN BAD -- TO FIX
# next if "tests/html/css_backtracking.txt" == testname
# next if "tests/php/anonymous_class.txt" == testname
# next if "tests/c/test_string_resembling_decl_end.txt" == testname
# next if "tests/mcfunction/data.txt" == testname
# next if "tests/mcfunction/selectors.txt" == testname