From dc5b982a0b458e2d81cb308680aaeb69fc780a27 Mon Sep 17 00:00:00 2001
From: Roberto Alsina <roberto.alsina@gmail.com>
Date: Sun, 4 Aug 2024 21:38:00 -0300
Subject: [PATCH] Workaround for the regex problem

---
 README.md         | 11 ++---------
 src/rules.cr      |  6 +++---
 src/tartrazine.cr | 28 +++++++++++++++++++++++-----
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 7e955bb..1fbd7f4 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,6 @@ to turn your files into a pile of json describing its
 constituent tokens, because I have not implemented any
 formatters, yet, only the part that parses the code (the lexers).
 
-TO make this even more "not there yet", Crystal doesn't allow
-for creating regular expressions that are MULTILINE but not
-DOT_ALL, whcih means I can't reproduce the behaviour of the
-golang (or Python's) regular expressions, so that causes
-issues unless you go and patch Crystal itself (don't patch Crystal itself, please)
-
 # A port of what? Why "kind of"?
 
 Because I did not read the Pygments code. And this is actually
@@ -27,9 +21,8 @@ and a pile of test cases from Pygments, and I slapped them together
 until the tests passed and my code produced the same output as
 Chroma. Think of it as *extreme TDD*.
 
-With a patched Crystal regex engine [see here](https://forum.crystal-lang.org/t/regex-that-is-multiline-but-not-dotall-how/7054)
-the pass rate for tests in the supported languages is 96.8%, which
-is *not bad for a couple days hacking*.
+Currently, the pass rate for tests in the supported languages
+is `96.8%`, which is *not bad for a couple of days of hacking*.
 
 This only covers the RegexLexers, which are the most common ones,
 but it means the supported languages are a subset of Chroma's, which 
diff --git a/src/rules.cr b/src/rules.cr
index bc57b6f..37900db 100644
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -6,7 +6,7 @@ require "./actions"
 module Tartrazine
   # This rule matches via a regex pattern
   class Rule
-    property pattern : Regex = Regex.new ""
+    property pattern : Regex = Re2.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"
 
@@ -26,9 +26,9 @@ module Tartrazine
       return true, match.end, tokens
     end
 
-    def initialize(node : XML::Node, flags)
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
       @xml = node.to_s
-      @pattern = Regex.new(node["pattern"], flags)
+      @pattern = Re2.new(node["pattern"], multiline, dotall, ignorecase)
       add_actions(node)
     end
 
diff --git a/src/tartrazine.cr b/src/tartrazine.cr
index c153999..67b884e 100644
--- a/src/tartrazine.cr
+++ b/src/tartrazine.cr
@@ -137,11 +137,10 @@ module Tartrazine
                   rule = UnconditionalRule.new(rule_node)
                 end
               else
-                flags = Regex::Options::ANCHORED
-                flags |= Regex::Options::MULTILINE unless l.config[:not_multiline]
-                flags |= Regex::Options::IGNORE_CASE if l.config[:case_insensitive]
-                flags |= Regex::Options::DOTALL if l.config[:dot_all]
-                rule = Rule.new(rule_node, flags)
+                rule = Rule.new(rule_node,
+                  multiline: !l.config[:not_multiline],
+                  dotall: l.config[:dot_all],
+                  ignorecase: l.config[:case_insensitive])
               end
               state.rules << rule
             end
@@ -155,6 +154,25 @@ module Tartrazine
   def self.get_lexer(name : String) : Lexer
     Lexer.from_xml(File.read("lexers/#{name}.xml"))
   end
+
+  # Workaround: Crystal seems to disallow regexes that are multiline but not
+  # dot_all, so this subclass hands raw PCRE2 flags (always ANCHORED) to PCRE2.
+  class Re2 < Regex
+    @options = Regex::Options::None # placeholder; the effective flags are built below
+    @jit = true # NOTE(review): no JIT compile step is visible in this initializer - confirm
+
+    def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false)
+      @source = pattern # was a fixed "fa", so every Re2 reported the same source
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
+              LibPCRE2::UCP | LibPCRE2::ANCHORED
+      flags |= LibPCRE2::MULTILINE if multiline
+      flags |= LibPCRE2::DOTALL if dotall
+      flags |= LibPCRE2::CASELESS if ignorecase
+      @re = Regex::PCRE2.compile(pattern, flags) do |error_message|
+        raise Exception.new(error_message) # surface the compile error reported by PCRE2
+      end
+    end
+  end
 end
 
 # Convenience macros to parse XML