CLose to 100% tests working, but slooooooow

More tests pass
Some tests pass!
2025-09-17 10:48:12 +00:00 · 2024-08-13 20:45:46 -03:00 · 2024-08-13 20:09:36 -03:00 · 2024-08-13 19:19:12 -03:00 · 2024-08-13 14:02:13 -03:00
7 changed files with 229 additions and 93 deletions
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -42,6 +42,9 @@ known_bad = {
  "#{__DIR__}/tests/mcfunction/selectors.txt",
  "#{__DIR__}/tests/php/anonymous_class.txt",
  "#{__DIR__}/tests/html/javascript_unclosed.txt",
+# BAD FOR ONIGMO
+"#{__DIR__}/tests/json/test_backtracking.txt",
+

 }

@@ -58,6 +61,7 @@ describe Tartrazine do
        end
      else
        it "parses #{testcase}".split("/")[-2...].join("/") do
+          p! testcase
          text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
          lexer_name = File.basename(File.dirname(testcase)).downcase
          unless failing_lexers.includes?(lexer_name) ||
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -30,11 +30,11 @@ module Tartrazine
    end

    # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
      case type
      when "token"
        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        [Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
      when "push"
        states_to_push = xml.attributes.select { |attrib|
          attrib.name == "state"
@@ -88,14 +88,14 @@ module Tartrazine
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
      when "usingself"
        # Shunt to another copy of this lexer
        return [] of Token if match.nil?

        new_lexer = Lexer.from_xml(lexer.xml)
        Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
      when "combined"
        # Combine two states into one anonymous state
        states = xml.attributes.select { |attrib|
--- a/src/formatters/html.cr
+++ b/src/formatters/html.cr
@@ -73,7 +73,7 @@ module Tartrazine
          # These are true/false/nil
          outp << "border: none;" if style.border == false
          outp << "font-weight: bold;" if style.bold
-          outp << "font-weight: #{weight_of_bold};" if style.bold == false
+          outp << "font-weight: #{@weight_of_bold};" if style.bold == false
          outp << "font-style: italic;" if style.italic
          outp << "font-style: normal;" if style.italic == false
          outp << "text-decoration: underline;" if style.underline
--- a/src/onigmo.cr
+++ b/src/onigmo.cr
@@ -1,88 +1,85 @@
@[Link("onigmo")]
+@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]

-lib Onigmo
-  type OnigOptionType = UInt32
-  type OnigCaseFoldType = UInt32
-  type OnigDistance = LibC::SizeT
+lib LibOnigmo
+  type Regex = Pointer(Void)
+  type Region = Pointer(Void)

-  struct OnigRegex
-    p : LibC::UChar*
-    used : UInt32
-    alloc : UInt32
+  fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
+                               ignoreCase : Int32,
+                               multiline : Int32,
+                               dotall : Int32) : Regex
+  fun free = onigwrap_free(re : Regex)
+  fun region_free = onigwrap_region_free(region : Region)

-    num_mem : Int32
-    num_repeat : Int32
-    num_null_check : Int32
-    num_comb_exp_check : Int32
-    num_call : Int32
-    capture_history : UInt32
-    bt_mem_start : UInt32
-    bt_mem_end : UInt32
-    stack_pop_level : Int32
-    repeat_range_alloc : Int32
-    options : OnigOptionType
-    syntax : OnigSyntaxType*
-    name_table : Void*
-    case_fold_flag : OnigCaseFoldType
-    optimize : Int32
-    threshold_len : Int32
-    anchor : Int32
-    anchor_dmin : OnigDistance
-    anchor_dmax : OnigDistance
-    sub_anchor : Int32
-    exact : LibC::UChar*
-    exact_end : LibC::UChar*
-    map : LibC::UChar*
-    int_map : Int32*
-    int_map_backward : Int32*
-    dmin : OnigDistance
-    dmax : OnigDistance
-    chain : OnigRegex*
+  fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
+  fun num_regs = onigwrap_num_regs(region : Region) : Int32
+  fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
+  fun len = onigwrap_len(region : Region, index : Int32) : Int32
 end

-  type OnigRegexType = OnigRegex*
-  type OnigCodePoint = UInt32
-  type OnigUChar = LibC::UChar
-  type OnigEncoding = Void*
+module Onigmo
+  class Match
+    property begin : Int32
+    property end : Int32
+    property value : String

-  struct OnigMetaCharTableType
-    esc : OnigCodePoint
-    anychar : OnigCodePoint
-    anytime : OnigCodePoint
-    zero_or_one_time : OnigCodePoint
-    one_or_one_time : OnigCodePoint
-    anychar_anytime : OnigCodePoint
+    def initialize(@begin, @end, @value)
    end

-  struct OnigSyntaxType
-    op : UInt32
-    op2 : UInt32
-    behavior : UInt32
-    options : OnigOptionType
-    meta_char_table : OnigMetaCharTableType
+    def to_s
+      @value
+    end
  end

-  struct OnigErrorInfo
-    enc : OnigEncoding
-    par : OnigUChar*
-    par_end : OnigUChar*
+  class Regex
+    def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
+      @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
    end

-  ONIG_OPTION_NONE = 0u32
-  ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE
-
-
-  fun new = onig_new(OnigRegex*, OnigUChar*, OnigUChar*, OnigOptionType, OnigEncoding, OnigSyntaxType*, OnigErrorInfo*)
+    def finalize
+      LibOnigmo.free(@re)
    end

-pattern = "a(.*)b|[e-f]+"
-str = "zzzzaffffffffb"
+    def match(str : String, offset = 0)
+      # The offset argument is a character index, but Onigmo expects a byte index
+      offset = str.char_index_to_byte_index(offset)
+      if offset.nil?
+        raise Exception.new "Invalid offset"
+      end

-einfo = Onigmo::OnigErrorInfo.new
+      region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
+      result = [] of Match?
+      num_regs = LibOnigmo.num_regs(region)
+      if num_regs > 0
+        (0...num_regs).each do |i|
+          pos = LibOnigmo.pos(region, i)
+          l = LibOnigmo.len(region, i)
+          if pos == -1 || l == -1
+            result << nil
+          else
+            b = str.byte_index_to_char_index(pos)
+            e = str.byte_index_to_char_index(pos + l)
+            # p! pos, l, b, e, str[pos..]
+            if b.nil? || e.nil?
+              raise Exception.new "Invalid substring"
+            end

-Onigmo.new(out reg, 
-pattern.to_unsafe, 
-pattern.to_unsafe + pattern.size,
-Onigmo::ONIG_OPTION_DEFAULT, 
-0, 
-Onigmo::ONIG_SYNTAX_DEFAULT, pointerof(einfo))
+            v = str[b...e]
+            result << Match.new(b, b + v.size, v)
+          end
+        end
+      else
+        return [] of Match
+      end
+      LibOnigmo.region_free(region)
+      result
+    end
+  end
+end
+
+# pattern = "\\w"
+# str = "α"
+
+# re = Onigmo::Regex.new(pattern, false, false, false)
+# p! re.match(str)
--- a/src/onigmo/onigwrap.c
+++ b/src/onigmo/onigwrap.c
@@ -0,0 +1,94 @@
+#include "onigmo.h"
+
+regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
+{
+	/* Compiles `len` bytes of `pattern` (UTF-8, Python syntax); returns NULL on failure. */
+	regex_t *reg = NULL;
+	OnigErrorInfo einfo;
+	OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
+	/* Flags are C booleans: any non-zero value enables the option, not just 1. */
+	if (ignoreCase)
+		onigOptions |= ONIG_OPTION_IGNORECASE;
+
+	if (multiline)
+		onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
+
+	if (dotall)
+		onigOptions |= ONIG_OPTION_DOTALL;
+
+	OnigUChar *stringStart = (OnigUChar*) pattern;
+	OnigUChar *stringEnd   = stringStart + len;
+	/* Check the status explicitly so a failed compile never hands back a stale pointer. */
+	if (onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo) != ONIG_NORMAL)
+		return NULL;
+	return reg;
+}
+
+void onigwrap_region_free(OnigRegion *region)	/* releases a region obtained from onigwrap_search */
+{
+	onig_region_free(region, 1);
+}
+
+void onigwrap_free(regex_t *reg) /* releases a regex obtained from onigwrap_create */
+{
+	onig_free(reg);
+}
+
+int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
+{
+	OnigUChar *stringStart  = (OnigUChar*) charPtr;
+	OnigUChar *stringEnd    = (OnigUChar*) (charPtr + length);
+	OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
+	OnigUChar *stringRange  = (OnigUChar*) stringEnd;
+	/* Returns the byte offset of the first match at or after `offset`, -1 on no match, -2 on error. */
+	OnigRegion *region = onig_region_new();
+	int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
+	onig_region_free(region, 1);
+	/* The subject is addressed as bytes (char*), so the offset is returned as-is, not halved. */
+	if (result >= 0)
+		return result;
+	if (result == ONIG_MISMATCH)
+		return -1;
+	return -2;
+}
+
+OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
+{
+	/* Searches from byte `offset` to the end of the subject and returns the filled region. */
+	OnigUChar *subject    = (OnigUChar*) charPtr;
+	OnigUChar *subjectEnd = subject + length;
+	OnigUChar *startAt    = subject + offset;
+	OnigRegion *captures  = onig_region_new();
+
+	/* The status code is not examined here; callers inspect the returned region. */
+	(void) onig_search(reg, subject, subjectEnd, startAt, subjectEnd, captures, ONIG_OPTION_NONE);
+	return captures;
+}
+
+int onigwrap_num_regs(OnigRegion *region) /* exposes region->num_regs to callers that hold the region as an opaque pointer */
+{
+	return region->num_regs;
+}
+
+int onigwrap_pos(OnigRegion *region, int nth)
+{
+	/* Start byte offset of group `nth`, or -1 when `nth` is out of range or the group is unset. */
+	if (nth < 0 || nth >= region->num_regs)
+		return -1; /* a negative index would otherwise read before region->beg */
+
+	int result = region->beg[nth];
+	if (result < 0)
+		return -1;
+	return result;
+}
+
+int onigwrap_len(OnigRegion *region, int nth)
+{
+	/* Byte length of group `nth`, or -1 when `nth` is out of range or the group is unset. */
+	if (nth < 0 || nth >= region->num_regs)
+		return -1;
+	if (region->beg[nth] < 0)
+		return -1; /* unset group: report -1 like onigwrap_pos instead of a length of 0 */
+	return region->end[nth] - region->beg[nth];
+}
+
--- a/src/onigmo/onigwrap.h
+++ b/src/onigmo/onigwrap.h
@@ -0,0 +1,32 @@
+#include "onigmo.h"
+
+#if defined(_WIN32)
+#define ONIGWRAP_EXTERN extern __declspec(dllexport)
+#else
+#define ONIGWRAP_EXTERN extern
+#endif
+
+ONIGWRAP_EXTERN
+regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall); /* keep in sync with the 5-argument definition in onigwrap.c */
+
+ONIGWRAP_EXTERN
+void onigwrap_region_free(OnigRegion *region);
+
+ONIGWRAP_EXTERN
+void onigwrap_free(regex_t *reg);
+
+ONIGWRAP_EXTERN
+int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);
+
+ONIGWRAP_EXTERN
+OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);
+
+ONIGWRAP_EXTERN
+int onigwrap_num_regs(OnigRegion *region);
+
+ONIGWRAP_EXTERN
+int onigwrap_pos(OnigRegion *region, int nth);
+
+ONIGWRAP_EXTERN
+int onigwrap_len(OnigRegion *region, int nth);
+
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -3,6 +3,7 @@ require "./formatter"
 require "./rules"
 require "./styles"
 require "./lexer"
+require "./onigmo"

 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -10,16 +11,22 @@ require "./lexer"
 module Tartrazine
  # This rule matches via a regex pattern

+  alias Regex = Onigmo::Regex
+
  class Rule
    property pattern : Regex = Regex.new ""
+    property pattern2 : ::Regex = ::Regex.new ""
    property actions : Array(Action) = [] of Action
    property xml : String = "foo"

    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
      match = pattern.match(text, pos)
+      match2 = pattern2.match(text, pos)
      # We don't match if the match doesn't move the cursor
      # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.end == 0
+      # The `match.begin > pos` is the same as the ANCHORED option
+      return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
+      # p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
      tokens = [] of Token
      # Emit the tokens
@@ -27,21 +34,23 @@ module Tartrazine
        # Emit the token
        tokens += action.emit(match, lexer)
      end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      # Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
+      return true, pos + match[0].as(Onigmo::Match).value.size, tokens
    end

    def initialize(node : XML::Node, multiline, dotall, ignorecase)
      @xml = node.to_s
      pattern = node["pattern"]
-      flags = Regex::Options::ANCHORED
+      # flags = Regex::Options::ANCHORED
+      flags = ::Regex::Options::NO_UTF_CHECK
      # MULTILINE implies DOTALL which we don't want, so we
      # use in-pattern flag (?m) instead
-      # flags |= Regex::Options::MULTILINE if multiline
+      flags |= ::Regex::Options::MULTILINE if multiline
      pattern = "(?m)" + pattern if multiline
-      flags |= Regex::Options::DOTALL if dotall
-      flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, flags)
+      flags |= ::Regex::Options::DOTALL if dotall
+      flags |= ::Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, ignorecase, multiline, dotall)
+      @pattern2 = ::Regex.new(pattern, flags)
      add_actions(node)
    end
Author	SHA1	Message	Date
Roberto Alsina	32816eb207	CLose to 100% tests working, but slooooooow	2024-08-13 20:45:46 -03:00
Roberto Alsina	d2b61fdc6c	More tests pass	2024-08-13 20:09:36 -03:00
Roberto Alsina	a704c59fa9	Some tests pass!	2024-08-13 19:19:12 -03:00
Roberto Alsina	2a9e7fde0d	Working onigmo wrapper, but onigmo doesn't support anchored regexes	2024-08-13 14:02:13 -03:00