mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00
Workaround for the regex problem
This commit is contained in:
parent
be93132f4c
commit
dc5b982a0b
11
README.md
11
README.md
@ -9,12 +9,6 @@ to turn your files into a pile of json describing its
|
|||||||
constituent tokens, because I have not implemented any
|
constituent tokens, because I have not implemented any
|
||||||
formatters, yet, only the part that parses the code (the lexers).
|
formatters, yet, only the part that parses the code (the lexers).
|
||||||
|
|
||||||
To make this even more "not there yet", Crystal doesn't allow
|
|
||||||
for creating regular expressions that are MULTILINE but not
|
|
||||||
DOT_ALL, which means I can't reproduce the behaviour of the
|
|
||||||
golang (or Python's) regular expressions, so that causes
|
|
||||||
issues unless you go and patch Crystal itself (don't patch Crystal itself, please)
|
|
||||||
|
|
||||||
# A port of what? Why "kind of"?
|
# A port of what? Why "kind of"?
|
||||||
|
|
||||||
Because I did not read the Pygments code. And this is actually
|
Because I did not read the Pygments code. And this is actually
|
||||||
@ -27,9 +21,8 @@ and a pile of test cases from Pygments, and I slapped them together
|
|||||||
until the tests passed and my code produced the same output as
|
until the tests passed and my code produced the same output as
|
||||||
Chroma. Think of it as *extreme TDD*.
|
Chroma. Think of it as *extreme TDD*.
|
||||||
|
|
||||||
With a patched Crystal regex engine [see here](https://forum.crystal-lang.org/t/regex-that-is-multiline-but-not-dotall-how/7054)
|
Currently the pass rate for tests in the supported languages
|
||||||
the pass rate for tests in the supported languages is 96.8%, which
|
is `96.8%`, which is *not bad for a couple days hacking*.
|
||||||
is *not bad for a couple days hacking*.
|
|
||||||
|
|
||||||
This only covers the RegexLexers, which are the most common ones,
|
This only covers the RegexLexers, which are the most common ones,
|
||||||
but it means the supported languages are a subset of Chroma's, which
|
but it means the supported languages are a subset of Chroma's, which
|
||||||
|
@ -6,7 +6,7 @@ require "./actions"
|
|||||||
module Tartrazine
|
module Tartrazine
|
||||||
# This rule matches via a regex pattern
|
# This rule matches via a regex pattern
|
||||||
class Rule
|
class Rule
|
||||||
property pattern : Regex = Regex.new ""
|
property pattern : Regex = Re2.new ""
|
||||||
property actions : Array(Action) = [] of Action
|
property actions : Array(Action) = [] of Action
|
||||||
property xml : String = "foo"
|
property xml : String = "foo"
|
||||||
|
|
||||||
@ -26,9 +26,9 @@ module Tartrazine
|
|||||||
return true, match.end, tokens
|
return true, match.end, tokens
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(node : XML::Node, flags)
|
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||||
@xml = node.to_s
|
@xml = node.to_s
|
||||||
@pattern = Regex.new(node["pattern"], flags)
|
@pattern = Re2.new(node["pattern"], multiline, dotall, ignorecase)
|
||||||
add_actions(node)
|
add_actions(node)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -137,11 +137,10 @@ module Tartrazine
|
|||||||
rule = UnconditionalRule.new(rule_node)
|
rule = UnconditionalRule.new(rule_node)
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
flags = Regex::Options::ANCHORED
|
rule = Rule.new(rule_node,
|
||||||
flags |= Regex::Options::MULTILINE unless l.config[:not_multiline]
|
multiline: !l.config[:not_multiline],
|
||||||
flags |= Regex::Options::IGNORE_CASE if l.config[:case_insensitive]
|
dotall: l.config[:dot_all],
|
||||||
flags |= Regex::Options::DOTALL if l.config[:dot_all]
|
ignorecase: l.config[:case_insensitive])
|
||||||
rule = Rule.new(rule_node, flags)
|
|
||||||
end
|
end
|
||||||
state.rules << rule
|
state.rules << rule
|
||||||
end
|
end
|
||||||
@ -155,6 +154,25 @@ module Tartrazine
|
|||||||
def self.get_lexer(name : String) : Lexer
|
def self.get_lexer(name : String) : Lexer
|
||||||
Lexer.from_xml(File.read("lexers/#{name}.xml"))
|
Lexer.from_xml(File.read("lexers/#{name}.xml"))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# This is a hack to work around the fact that Crystal seems to disallow
|
||||||
|
# having regexes multiline but not dot_all
|
||||||
|
class Re2 < Regex
|
||||||
|
@source = "fa"
|
||||||
|
@options = Regex::Options::None
|
||||||
|
@jit = true
|
||||||
|
|
||||||
|
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false)
|
||||||
|
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
|
||||||
|
LibPCRE2::UCP | LibPCRE2::ANCHORED
|
||||||
|
flags |= LibPCRE2::MULTILINE if multiline
|
||||||
|
flags |= LibPCRE2::DOTALL if dotall
|
||||||
|
flags |= LibPCRE2::CASELESS if ignorecase
|
||||||
|
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
|
||||||
|
raise Exception.new(error_message)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Convenience macros to parse XML
|
# Convenience macros to parse XML
|
||||||
|
Loading…
Reference in New Issue
Block a user