From dc5b982a0b458e2d81cb308680aaeb69fc780a27 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Sun, 4 Aug 2024 21:38:00 -0300 Subject: [PATCH] Workaround for the regex problem --- README.md | 11 ++--------- src/rules.cr | 6 +++--- src/tartrazine.cr | 28 +++++++++++++++++++++++----- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7e955bb..1fbd7f4 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,6 @@ to turn your files into a pile of json describing its constituent tokens, because I have not implemented any formatters, yet, only the part that parses the code (the lexers). -TO make this even more "not there yet", Crystal doesn't allow -for creating regular expressions that are MULTILINE but not -DOT_ALL, whcih means I can't reproduce the behaviour of the -golang (or Python's) regular expressions, so that causes -issues unless you go and patch Crystal itself (don't patch Crystal itself, please) - # A port of what? Why "kind of"? Because I did not read the Pygments code. And this is actually @@ -27,9 +21,8 @@ and a pile of test cases from Pygments, and I slapped them together until the tests passed and my code produced the same output as Chroma. Think of it as *extreme TDD*. -With a patched Crystal regex engine [see here](https://forum.crystal-lang.org/t/regex-that-is-multiline-but-not-dotall-how/7054) -the pass rate for tests in the supported languages is 96.8%, which -is *not bad for a couple days hacking*. +Currently the pass rate for tests in the supported languages +is `96.8%`, which is *not bad for a couple days hacking*. 
This only covers the RegexLexers, which are the most common ones, but it means the supported languages are a subset of Chroma's, which diff --git a/src/rules.cr b/src/rules.cr index bc57b6f..37900db 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -6,7 +6,7 @@ require "./actions" module Tartrazine # This rule matches via a regex pattern class Rule - property pattern : Regex = Regex.new "" + property pattern : Regex = Re2.new "" property actions : Array(Action) = [] of Action property xml : String = "foo" @@ -26,9 +26,9 @@ module Tartrazine return true, match.end, tokens end - def initialize(node : XML::Node, flags) + def initialize(node : XML::Node, multiline, dotall, ignorecase) @xml = node.to_s - @pattern = Regex.new(node["pattern"], flags) + @pattern = Re2.new(node["pattern"], multiline, dotall, ignorecase) add_actions(node) end diff --git a/src/tartrazine.cr b/src/tartrazine.cr index c153999..67b884e 100644 --- a/src/tartrazine.cr +++ b/src/tartrazine.cr @@ -137,11 +137,10 @@ module Tartrazine rule = UnconditionalRule.new(rule_node) end else - flags = Regex::Options::ANCHORED - flags |= Regex::Options::MULTILINE unless l.config[:not_multiline] - flags |= Regex::Options::IGNORE_CASE if l.config[:case_insensitive] - flags |= Regex::Options::DOTALL if l.config[:dot_all] - rule = Rule.new(rule_node, flags) + rule = Rule.new(rule_node, + multiline: !l.config[:not_multiline], + dotall: l.config[:dot_all], + ignorecase: l.config[:case_insensitive]) end state.rules << rule end @@ -155,6 +154,25 @@ module Tartrazine def self.get_lexer(name : String) : Lexer Lexer.from_xml(File.read("lexers/#{name}.xml")) end + + # Regex subclass compiled directly through PCRE2 so that MULTILINE can be enabled without DOTALL, + # a combination Crystal's Regex::Options does not seem to allow. Patterns are always ANCHORED; invalid ones raise ArgumentError. + class Re2 < Regex + @options = Regex::Options::None # NOTE(review): does not reflect the PCRE2 flags chosen in initialize + + def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false) + @source = pattern # keep the real pattern text so #source and #inspect report it instead of a placeholder + flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | + LibPCRE2::UCP | 
LibPCRE2::ANCHORED + flags |= LibPCRE2::MULTILINE if multiline + flags |= LibPCRE2::DOTALL if dotall + flags |= LibPCRE2::CASELESS if ignorecase + @re = Regex::PCRE2.compile(pattern, flags) do |error_message| + raise ArgumentError.new(error_message) # ArgumentError is still an Exception, so existing rescues keep working + end + @jit = jit_compile # mirror Regex::PCRE2#initialize instead of hard-coding true + end + end end # Convenience macros to parse XML