From 0626c8619f81b219b271fa0e92f1aba517852327 Mon Sep 17 00:00:00 2001
From: Roberto Alsina <roberto.alsina@gmail.com>
Date: Wed, 14 Aug 2024 11:06:53 -0300
Subject: [PATCH] Working bytes-regexes, faster, MORE tests pass

---
 spec/tartrazine_spec.cr | 14 ++++----
 src/actions.cr          | 22 ++++++++-----
 src/bytes_regex.cr      | 72 +++++++++++++++++++++++++++++++++++++++++
 src/lexer.cr            | 14 ++++----
 src/rules.cr            | 26 +++++++++------
 5 files changed, 116 insertions(+), 32 deletions(-)
 create mode 100644 src/bytes_regex.cr

diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr
index 4a06dc2..36662bc 100644
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -20,9 +20,11 @@ unicode_problems = {
 # but tartrazine is correct
 bad_in_chroma = {
   "#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
+  "#{__DIR__}/tests/html/javascript_backtracking.txt",
   "#{__DIR__}/tests/java/test_default.txt",
   "#{__DIR__}/tests/java/test_multiline_string.txt",
   "#{__DIR__}/tests/java/test_numeric_literals.txt",
+  "#{__DIR__}/tests/octave/test_multilinecomment.txt",
   "#{__DIR__}/tests/php/test_string_escaping_run.txt",
   "#{__DIR__}/tests/python_2/test_cls_builtin.txt",
 }
@@ -30,18 +32,14 @@ bad_in_chroma = {
 known_bad = {
   "#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
   "#{__DIR__}/tests/bash_session/prompt_in_output.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
   "#{__DIR__}/tests/bash_session/ps2_prompt.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
-  "#{__DIR__}/tests/bash_session/test_virtualenv.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
   "#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
-  "#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
-  "#{__DIR__}/tests/html/css_backtracking.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
+  "#{__DIR__}/tests/bash_session/test_virtualenv.txt",
   "#{__DIR__}/tests/mcfunction/data.txt",
   "#{__DIR__}/tests/mcfunction/selectors.txt",
-  "#{__DIR__}/tests/php/anonymous_class.txt",
-  "#{__DIR__}/tests/html/javascript_unclosed.txt",
 }
 
 # Tests that fail because of a limitation in PCRE2
diff --git a/src/actions.cr b/src/actions.cr
index 4ed4008..858ca40 100644
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -30,11 +30,11 @@ module Tartrazine
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
       case type
       when "token"
-        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        raise Exception.new "Can't have a token without a match" if match.empty?
+        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
       when "push"
         states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"
@@ -79,23 +79,29 @@ module Tartrazine
         # the action is skipped.
         result = [] of Token
         @actions.each_with_index do |e, i|
-          next if match[i + 1]?.nil?
+          begin
+            next if match[i + 1].size == 0
+          rescue IndexError
+            # FIXME: This should not actually happen
+            # No match for this group
+            next
+          end
           result += e.emit(match, lexer, i + 1)
         end
         result
       when "using"
         # Shunt to another lexer entirely
-        return [] of Token if match.nil?
+        return [] of Token if match.empty?
         lexer_name = xml["lexer"].downcase
         Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
-        return [] of Token if match.nil?
+        return [] of Token if match.empty?
 
         new_lexer = Lexer.from_xml(lexer.xml)
         Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
         states = xml.attributes.select { |attrib|
diff --git a/src/bytes_regex.cr b/src/bytes_regex.cr
new file mode 100644
index 0000000..737ac62
--- /dev/null
+++ b/src/bytes_regex.cr
@@ -0,0 +1,72 @@
+module BytesRegex
+  extend self
+
+  class Regex
+    def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+      flags |= LibPCRE2::MULTILINE if multiline
+      flags |= LibPCRE2::DOTALL if dotall
+      flags |= LibPCRE2::CASELESS if ignorecase
+      flags |= LibPCRE2::ANCHORED if anchored
+      if @re = LibPCRE2.compile(
+           pattern,
+           pattern.bytesize,
+           flags,
+           out errorcode,
+           out erroroffset,
+           nil)
+      else
+        # FIXME: show actual error message
+        raise Exception.new "Error compiling regex"
+      end
+    end
+
+    def finalize
+      LibPCRE2.code_free(@re)
+    end
+
+    def match(str : Bytes, pos = 0) : Array(Match)
+      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
+      match = [] of Match
+      rc = LibPCRE2.match(
+        @re,
+        str,
+        str.size,
+        pos,
+        0,
+        match_data,
+        nil)
+      if rc < 0
+        # FIXME: handle actual errors
+      else
+        ovector = LibPCRE2.get_ovector_pointer(match_data)
+        (0...rc).each do |i|
+          m_start = ovector[2 * i]
+          m_size = ovector[2 * i + 1] - m_start
+          if m_size == 0
+            m_value = Bytes.new(0)
+          else
+            m_value = str[m_start...m_start + m_size]
+          end
+          match << Match.new(m_value, m_start, m_size)
+        end
+      end
+      LibPCRE2.match_data_free(match_data)
+      match
+    end
+  end
+
+  class Match
+    property value : Bytes
+    property start : UInt64
+    property size : UInt64
+
+    def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
+    end
+  end
+end
+
+# pattern = "foo"
+# str = "foo bar"
+# re = BytesRegex::Regex.new(pattern)
+# p! String.new(re.match(str.to_slice)[0].value)
diff --git a/src/lexer.cr b/src/lexer.cr
index a22cecb..31fc5e7 100644
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -1,3 +1,4 @@
+require "baked_file_system"
 require "./constants/lexers"
 
 module Tartrazine
@@ -65,7 +66,7 @@ module Tartrazine
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
     # So, when it's true, we don't modify the text.
-    def tokenize(text, usingself = false) : Array(Token)
+    def tokenize(text : String, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0
@@ -76,12 +77,13 @@ module Tartrazine
         text += "\n"
       end
 
+      text_bytes = text.to_slice
       # Loop through the text, applying rules
-      while pos < text.size
+      while pos < text_bytes.size
         state = states[@state_stack.last]
         # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text, pos, self)
+          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
           if matched
             # Move position forward, save the tokens,
             # tokenize from the new position
@@ -94,12 +96,12 @@ module Tartrazine
         end
         # If no rule matches, emit an error token
         unless matched
-          if text[pos] == "\n"
+          if text_bytes[pos] == 10u8
             # at EOL, reset state to "root"
-            tokens << {type: "TextWhitespace", value: "\n"}
+            tokens << {type: "Text", value: "\n"}
             @state_stack = ["root"]
           else
-            tokens << {type: "Error", value: text[pos..pos]}
+            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
           end
           pos += 1
         end
diff --git a/src/rules.cr b/src/rules.cr
index a7dc872..e88b9bd 100644
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -1,8 +1,9 @@
 require "./actions"
+require "./bytes_regex"
 require "./formatter"
+require "./lexer"
 require "./rules"
 require "./styles"
-require "./lexer"
 
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -10,16 +11,21 @@ require "./lexer"
 module Tartrazine
   # This rule matches via a regex pattern
 
+  alias Regex = BytesRegex::Regex
+  alias Match = BytesRegex::Match
+  alias MatchData = Array(Match)
+
   class Rule
     property pattern : Regex = Regex.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.size == 0
+      return false, pos, [] of Token if match.empty? || match[0].size == 0
+      # p! match, String.new(text[pos..pos+20])
       # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens
@@ -27,21 +33,21 @@ module Tartrazine
         # Emit the token
         tokens += action.emit(match, lexer)
       end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
+      return true, pos + match[0].size, tokens
     end
 
     def initialize(node : XML::Node, multiline, dotall, ignorecase)
       @xml = node.to_s
       pattern = node["pattern"]
-      flags = Regex::Options::ANCHORED
+      # flags = Regex::Options::ANCHORED
       # MULTILINE implies DOTALL which we don't want, so we
       # use in-pattern flag (?m) instead
       # flags |= Regex::Options::MULTILINE if multiline
       pattern = "(?m)" + pattern if multiline
-      flags |= Regex::Options::DOTALL if dotall
-      flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, flags)
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
       add_actions(node)
     end
 
@@ -83,7 +89,7 @@ module Tartrazine
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       tokens = [] of Token
       actions.each do |action|
-        tokens += action.emit(nil, lexer)
+        tokens += action.emit([] of Match, lexer)
       end
       return true, pos, tokens
     end