# tartrazine/src/tartrazine.cr
require "./actions"
require "./rules"

require "base58"
require "json"
require "log"
require "xml"
module Tartrazine
VERSION = "0.1.0"
2024-08-04 20:24:48 -03:00
Log = ::Log.for("tartrazine")
2024-08-03 05:26:32 -03:00
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
2024-08-03 06:05:29 -03:00
#
2024-08-04 19:54:25 -03:00
# For explanations on what actions, transformers, etc do
2024-08-03 05:26:32 -03:00
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
2024-08-04 20:00:48 -03:00
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
2024-08-02 17:23:40 -03:00
class State
property name : String = ""
2024-08-02 17:33:01 -03:00
property rules = [] of Rule
2024-08-03 19:25:09 -03:00
def +(other : State)
new_state = State.new
new_state.name = Random.base58(8)
new_state.rules = rules + other.rules
new_state
end
2024-08-02 17:33:01 -03:00
end
2024-08-04 20:00:48 -03:00
# A token, the output of the tokenizer
2024-08-03 06:05:29 -03:00
alias Token = NamedTuple(type: String, value: String)
2024-08-02 17:03:39 -03:00
class Lexer
property config = {
2024-08-04 17:45:32 -03:00
name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
2024-08-02 17:03:39 -03:00
}
property xml : String = ""
2024-08-02 17:03:39 -03:00
2024-08-02 19:48:58 -03:00
property states = {} of String => State
2024-08-02 17:23:40 -03:00
2024-08-03 06:05:29 -03:00
property state_stack = ["root"]
# Turn the text into a list of tokens.
2024-08-04 17:45:32 -03:00
def tokenize(text, usingself = false) : Array(Token)
2024-08-03 21:37:22 -03:00
@state_stack = ["root"]
2024-08-03 06:05:29 -03:00
tokens = [] of Token
pos = 0
matched = false
2024-08-04 17:45:32 -03:00
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
2024-08-03 06:05:29 -03:00
while pos < text.size
2024-08-03 21:37:22 -03:00
state = states[@state_stack.last]
2024-08-04 20:24:48 -03:00
Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
2024-08-03 06:05:29 -03:00
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, self)
2024-08-05 07:54:14 -03:00
if matched
Log.trace { "MATCHED: #{rule.xml}" }
pos = new_pos
tokens += new_tokens
break # We go back to processing with current state
end
2024-08-04 20:25:12 -03:00
Log.trace { "NOT MATCHED: #{rule.xml}" }
2024-08-03 06:37:15 -03:00
end
# If no rule matches, emit an error token
unless matched
2024-08-04 20:24:48 -03:00
Log.trace { "Error at #{pos}" }
2024-08-03 21:37:22 -03:00
tokens << {type: "Error", value: "#{text[pos]}"}
2024-08-03 06:37:15 -03:00
pos += 1
2024-08-03 06:05:29 -03:00
end
end
2024-08-04 20:09:15 -03:00
tokens.reject { |token| token[:value] == "" }
2024-08-03 06:05:29 -03:00
end
2024-08-04 20:09:15 -03:00
# ameba:disable Metrics/CyclomaticComplexity
2024-08-02 17:03:39 -03:00
def self.from_xml(xml : String) : Lexer
l = Lexer.new
l.xml = xml
2024-08-02 17:03:39 -03:00
lexer = XML.parse(xml).first_element_child
if lexer
2024-08-04 20:09:15 -03:00
config = lexer.children.find { |node|
node.name == "config"
}
2024-08-02 17:03:39 -03:00
if config
l.config = {
2024-08-05 10:01:23 -03:00
name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
2024-08-04 17:45:32 -03:00
dot_all: xml_to_s(config, dot_all) == "true",
case_insensitive: xml_to_s(config, case_insensitive) == "true",
ensure_nl: xml_to_s(config, ensure_nl) == "true",
2024-08-02 17:03:39 -03:00
}
end
2024-08-02 17:23:40 -03:00
2024-08-04 20:09:15 -03:00
rules = lexer.children.find { |node|
node.name == "rules"
}
2024-08-02 17:23:40 -03:00
if rules
# Rules contains states 🤷
2024-08-04 20:09:15 -03:00
rules.children.select { |node|
node.name == "state"
}.each do |state_node|
2024-08-02 17:23:40 -03:00
state = State.new
2024-08-02 20:01:53 -03:00
state.name = state_node["name"]
2024-08-02 19:48:58 -03:00
if l.states.has_key?(state.name)
raise Exception.new("Duplicate state: #{state.name}")
2024-08-02 19:48:58 -03:00
else
l.states[state.name] = state
end
2024-08-02 17:33:01 -03:00
# And states contain rules 🤷
2024-08-04 20:09:15 -03:00
state_node.children.select { |node|
node.name == "rule"
}.each do |rule_node|
case rule_node["pattern"]?
when nil
if rule_node.first_element_child.try &.name == "include"
2024-08-04 19:47:54 -03:00
rule = IncludeStateRule.new(rule_node)
else
2024-08-04 19:54:25 -03:00
rule = UnconditionalRule.new(rule_node)
end
else
2024-08-04 21:38:00 -03:00
rule = Rule.new(rule_node,
multiline: !l.config[:not_multiline],
dotall: l.config[:dot_all],
ignorecase: l.config[:case_insensitive])
2024-08-02 20:32:15 -03:00
end
2024-08-04 19:47:54 -03:00
state.rules << rule
2024-08-02 17:33:01 -03:00
end
2024-08-02 17:23:40 -03:00
end
end
2024-08-02 17:03:39 -03:00
end
l
end
end
2024-08-04 20:00:48 -03:00
def self.get_lexer(name : String) : Lexer
Lexer.from_xml(File.read("lexers/#{name}.xml"))
end
2024-08-04 21:38:00 -03:00
# This is a hack to workaround that Crystal seems to disallow
# having regexes multiline but not dot_all
class Re2 < Regex
@source = "fa"
@options = Regex::Options::None
@jit = true
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP | LibPCRE2::ANCHORED
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
end
2024-08-03 05:05:01 -03:00
end
2024-08-02 17:09:05 -03:00
# Convenience macros to parse XML
macro xml_to_s(node, name)
2024-08-02 17:23:40 -03:00
{{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s
end
2024-08-03 05:05:01 -03:00
macro xml_to_f(node, name)
({{node}}.children.find{|n| n.name == "{{name}}".lstrip("_")}.try &.content.to_s.to_f)
2024-08-02 17:09:05 -03:00
end
macro xml_to_a(node, name)
{{node}}.children.select{|n| n.name == "{{name}}".lstrip("_")}.map {|n| n.content.to_s}
end