Mirror of https://github.com/ralsina/tartrazine.git (synced 2025-06-08 12:40:25 -03:00)

Compare commits: 32816eb207 ... d49d0969a9

No commits in common. "32816eb2070982287163229bcfb43f9504727edb" and "d49d0969a9bbef99f1b297b019acfdc5b9a75cec" have entirely different histories.
@@ -42,9 +42,6 @@ known_bad = {
   "#{__DIR__}/tests/mcfunction/selectors.txt",
   "#{__DIR__}/tests/php/anonymous_class.txt",
   "#{__DIR__}/tests/html/javascript_unclosed.txt",
-  # BAD FOR ONIGMO
-  "#{__DIR__}/tests/json/test_backtracking.txt",
 }
@@ -61,7 +58,6 @@ describe Tartrazine do
       end
     else
       it "parses #{testcase}".split("/")[-2...].join("/") do
-        p! testcase
         text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
         lexer_name = File.basename(File.dirname(testcase)).downcase
         unless failing_lexers.includes?(lexer_name) ||
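A minimal sketch of the fixture splitting the spec above performs on each testcase file; the fixture content here is hypothetical, only its shape matters.

# Minimal sketch of the testcase splitting used in the spec above.
# The fixture content is hypothetical; only its shape matters.
fixture = <<-FIXTURE
---input---
puts "hello"
---tokens---
(token list omitted in this sketch)
FIXTURE

text = fixture.split("---input---\n").last.split("---tokens---").first
puts text # prints only the input portion: puts "hello"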
@@ -30,11 +30,11 @@ module Tartrazine
    end

    # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
      case type
      when "token"
        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
+        [Token.new(type: xml["type"], value: match[match_group])]
      when "push"
        states_to_push = xml.attributes.select { |attrib|
          attrib.name == "state"
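In the hunk above, the '-' side reads a capture group's text through the Onigmo wrapper (.as(Onigmo::Match).value), while the '+' side relies on Crystal's built-in Regex::MatchData, where indexing a group already yields a String. A standalone sketch of the stdlib behaviour, not the project's emit method:

# Standalone sketch: with Crystal's built-in regex engine, a capture
# group's text is read directly from MatchData by index.
match = /(?<word>\w+)/.match("hello world")
unless match.nil?
  puts match[0] # whole match: "hello"
  puts match[1] # first capture group, already a String: "hello"
end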
@@ -88,14 +88,14 @@ module Tartrazine
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
      when "usingself"
        # Shunt to another copy of this lexer
        return [] of Token if match.nil?

        new_lexer = Lexer.from_xml(lexer.xml)
        Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
+        new_lexer.tokenize(match[match_group], usingself: true)
      when "combined"
        # Combine two states into one anonymous state
        states = xml.attributes.select { |attrib|
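The "using" branch above hands the matched slice of text to another lexer and splices the resulting tokens back in. A sketch of that delegation; the require path, lexer name, and snippet are arbitrary assumptions, not taken from the hunk:

# Sketch of the "using" delegation above; lexer name and snippet are
# arbitrary examples, and the require path assumes the shard's name.
require "tartrazine"

captured = "console.log(42);"
tokens = Tartrazine.lexer("javascript").tokenize(captured, usingself: true)
puts tokens.size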
@@ -73,7 +73,7 @@ module Tartrazine
      # These are true/false/nil
      outp << "border: none;" if style.border == false
      outp << "font-weight: bold;" if style.bold
-      outp << "font-weight: #{@weight_of_bold};" if style.bold == false
+      outp << "font-weight: #{weight_of_bold};" if style.bold == false
      outp << "font-style: italic;" if style.italic
      outp << "font-style: normal;" if style.italic == false
      outp << "text-decoration: underline;" if style.underline
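Styles above carry tri-state booleans (true/false/nil), so "explicitly not bold" emits a numeric font-weight while "unset" emits nothing. A small sketch of that branch; the weight value 400 is an assumed example, not taken from the formatter:

# Sketch of the bold branch above. Styles use true/false/nil, so only an
# explicit false emits the numeric weight; 400 is an assumed example value.
weight_of_bold = 400
bold : Bool? = false

outp = String::Builder.new
outp << "font-weight: bold;" if bold
outp << "font-weight: #{weight_of_bold};" if bold == false
puts outp.to_s # font-weight: 400;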
src/onigmo.cr (157 lines changed)
@@ -1,85 +1,88 @@
 @[Link("onigmo")]
-@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
-lib LibOnigmo
-  type Regex = Pointer(Void)
-  type Region = Pointer(Void)
-
-  fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
-                               ignoreCase : Int32,
-                               multiline : Int32,
-                               dotall : Int32) : Regex
-  fun free = onigwrap_free(re : Regex)
-  fun region_free = onigwrap_region_free(region : Region)
-
-  fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
-  fun num_regs = onigwrap_num_regs(region : Region) : Int32
-  fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
-  fun len = onigwrap_len(region : Region, index : Int32) : Int32
+lib Onigmo
+  type OnigOptionType = UInt32
+  type OnigCaseFoldType = UInt32
+  type OnigDistance = LibC::SizeT
+
+  struct OnigRegex
+    p : LibC::UChar*
+    used : UInt32
+    alloc : UInt32
+    num_mem : Int32
+    num_repeat : Int32
+    num_null_check : Int32
+    num_comb_exp_check : Int32
+    num_call : Int32
+    capture_history : UInt32
+    bt_mem_start : UInt32
+    bt_mem_end : UInt32
+    stack_pop_level : Int32
+    repeat_range_alloc : Int32
+    options : OnigOptionType
+    syntax : OnigSyntaxType*
+    name_table : Void*
+    case_fold_flag : OnigCaseFoldType
+    optimize : Int32
+    threshold_len : Int32
+    anchor : Int32
+    anchor_dmin : OnigDistance
+    anchor_dmax : OnigDistance
+    sub_anchor : Int32
+    exact : LibC::UChar*
+    exact_end : LibC::UChar*
+    map : LibC::UChar*
+    int_map : Int32*
+    int_map_backward : Int32*
+    dmin : OnigDistance
+    dmax : OnigDistance
+    chain : OnigRegex*
+  end
+
+  type OnigRegexType = OnigRegex*
+  type OnigCodePoint = UInt32
+  type OnigUChar = LibC::UChar
+  type OnigEncoding = Void*
+
+  struct OnigMetaCharTableType
+    esc : OnigCodePoint
+    anychar : OnigCodePoint
+    anytime : OnigCodePoint
+    zero_or_one_time : OnigCodePoint
+    one_or_one_time : OnigCodePoint
+    anychar_anytime : OnigCodePoint
+  end
+
+  struct OnigSyntaxType
+    op : UInt32
+    op2 : UInt32
+    behavior : UInt32
+    options : OnigOptionType
+    meta_char_table : OnigMetaCharTableType
+  end
+
+  struct OnigErrorInfo
+    enc : OnigEncoding
+    par : OnigUChar*
+    par_end : OnigUChar*
+  end
+
+  ONIG_OPTION_NONE    = 0u32
+  ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE
+
+  fun new = onig_new(OnigRegex*, OnigUChar*, OnigUChar*, OnigOptionType, OnigEncoding, OnigSyntaxType*, OnigErrorInfo*)
 end

-module Onigmo
-  class Match
-    property begin : Int32
-    property end : Int32
-    property value : String
-
-    def initialize(@begin, @end, @value)
-    end
-
-    def to_s
-      @value
-    end
-  end
-
-  class Regex
-    def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
-      @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
-    end
-
-    def finalize
-      LibOnigmo.free(@re)
-    end
-
-    def match(str : String, offset = 0)
-      # The offset argument is a character index, but Onigmo expects a byte index
-      offset = str.char_index_to_byte_index(offset)
-      if offset.nil?
-        raise Exception.new "Invalid offset"
-      end
-
-      region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
-      result = [] of Match?
-      num_regs = LibOnigmo.num_regs(region)
-      if num_regs > 0
-        (0...num_regs).each do |i|
-          pos = LibOnigmo.pos(region, i)
-          l = LibOnigmo.len(region, i)
-          if pos == -1 || l == -1
-            result << nil
-          else
-            b = str.byte_index_to_char_index(pos)
-            e = str.byte_index_to_char_index(pos + l)
-            # p! pos, l, b, e, str[pos..]
-            if b.nil? || e.nil?
-              raise Exception.new "Invalid substring"
-            end
-
-            v = str[b...e]
-            result << Match.new(b, b + v.size, v)
-          end
-        end
-      else
-        return [] of Match
-      end
-      LibOnigmo.region_free(region)
-      result
-    end
-  end
-end
-
-# pattern = "\\w"
-# str = "α"
-
-# re = Onigmo::Regex.new(pattern, false, false, false)
-# p! re.match(str)
+pattern = "a(.*)b|[e-f]+"
+str = "zzzzaffffffffb"
+
+einfo = Onigmo::OnigErrorInfo.new
+
+Onigmo.new(out reg,
+  pattern.to_unsafe,
+  pattern.to_unsafe + pattern.size,
+  Onigmo::ONIG_OPTION_DEFAULT,
+  0,
+  Onigmo::ONIG_SYNTAX_DEFAULT, pointerof(einfo))
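The commented example at the end of the '-' side hints at how the LibOnigmo-backed wrapper was driven. A slightly expanded sketch, assuming onigwrap.o has been compiled and the old src/onigmo.cr is available at that path; pattern and input string are the ones from the '+' side's test snippet:

# Sketch of driving the '-' side's LibOnigmo-backed wrapper; assumes
# onigwrap.o is compiled and the old src/onigmo.cr is on this path.
require "./src/onigmo"

re = Onigmo::Regex.new("a(.*)b", ignorecase: false, multiline: false, dotall: false)
groups = re.match("zzzzaffffffffb") # one Match? per capture group
groups.each_with_index do |m, i|
  puts "group #{i}: #{m.try &.value}"
end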
@@ -1,94 +0,0 @@
-#include "onigmo.h"
-
-regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
-{
-    regex_t *reg;
-    OnigErrorInfo einfo;
-
-    OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
-
-    if (ignoreCase == 1)
-        onigOptions |= ONIG_OPTION_IGNORECASE;
-
-    if (multiline == 1)
-        onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
-
-    if (dotall == 1)
-        onigOptions |= ONIG_OPTION_DOTALL;
-
-    OnigUChar *stringStart = (OnigUChar*) pattern;
-    OnigUChar *stringEnd = (OnigUChar*) pattern + len;
-    int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
-
-    return reg;
-}
-
-void onigwrap_region_free(OnigRegion *region)
-{
-    onig_region_free(region, 1);
-}
-
-void onigwrap_free(regex_t *reg)
-{
-    onig_free(reg);
-}
-
-int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
-{
-    OnigUChar *stringStart = (OnigUChar*) charPtr;
-    OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
-    OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
-    OnigUChar *stringRange = (OnigUChar*) stringEnd;
-
-    OnigRegion *region = onig_region_new();
-    int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
-    onig_region_free(region, 1);
-
-    if (result >= 0)
-        return result >> 1;
-    if (result == ONIG_MISMATCH)
-        return -1;
-    return -2;
-}
-
-OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
-{
-    OnigUChar *stringStart = (OnigUChar*) charPtr;
-    OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
-    OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
-    OnigUChar *stringRange = (OnigUChar*) stringEnd;
-
-    OnigRegion *region = onig_region_new();
-
-    int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
-    return region;
-}
-
-int onigwrap_num_regs(OnigRegion *region)
-{
-    return region->num_regs;
-}
-
-int onigwrap_pos(OnigRegion *region, int nth)
-{
-    if (nth < region->num_regs)
-    {
-        int result = region->beg[nth];
-        if (result < 0)
-            return -1;
-        return result;
-    }
-    return -1;
-}
-
-int onigwrap_len(OnigRegion *region, int nth)
-{
-    if (nth < region->num_regs)
-    {
-        int result = region->end[nth] - region->beg[nth];
-        return result;
-    }
-    return -1;
-}
@@ -1,32 +0,0 @@
-#include "onigmo.h"
-
-#if defined(_WIN32)
-#define ONIGWRAP_EXTERN extern __declspec(dllexport)
-#else
-#define ONIGWRAP_EXTERN extern
-#endif
-
-ONIGWRAP_EXTERN
-regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline);
-
-ONIGWRAP_EXTERN
-void onigwrap_region_free(OnigRegion *region);
-
-ONIGWRAP_EXTERN
-void onigwrap_free(regex_t *reg);
-
-ONIGWRAP_EXTERN
-int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);
-
-ONIGWRAP_EXTERN
-OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);
-
-ONIGWRAP_EXTERN
-int onigwrap_num_regs(OnigRegion *region);
-
-ONIGWRAP_EXTERN
-int onigwrap_pos(OnigRegion *region, int nth);
-
-ONIGWRAP_EXTERN
-int onigwrap_len(OnigRegion *region, int nth);
src/rules.cr (25 lines changed)
@@ -3,7 +3,6 @@ require "./formatter"
 require "./rules"
 require "./styles"
 require "./lexer"
-require "./onigmo"

 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -11,22 +10,16 @@ require "./onigmo"
 module Tartrazine
   # This rule matches via a regex pattern

-  alias Regex = Onigmo::Regex
-
   class Rule
     property pattern : Regex = Regex.new ""
-    property pattern2 : ::Regex = ::Regex.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"

     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
-      match2 = pattern2.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      # The `match.begin > pos` is the same as the ANCHORED option
-      return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
-      # p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
+      return false, pos, [] of Token if match.nil? || match.end == 0
       # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens
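Both versions of Rule#match above refuse a match that does not advance the cursor, since a zero-width match at the current position would make the tokenizer loop forever. A minimal sketch of that guard with the stdlib regex, using an arbitrary pattern:

# Minimal sketch of the "must move the cursor" guard above: a zero-width
# match at pos is rejected so the tokenizer cannot loop forever.
pattern = /x*/ # may legitimately match the empty string
text = "abc"
pos = 0

m = pattern.match(text, pos)
if m.nil? || m.end == pos
  puts "no progress at #{pos}; the rule does not fire"
else
  pos = m.end
end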
@@ -34,23 +27,21 @@ module Tartrazine
           # Emit the token
           tokens += action.emit(match, lexer)
         end
-        # Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
-        return true, pos + match[0].as(Onigmo::Match).value.size, tokens
+        Log.trace { "#{xml}, #{match.end}, #{tokens}" }
+        return true, match.end, tokens
       end

       def initialize(node : XML::Node, multiline, dotall, ignorecase)
         @xml = node.to_s
         pattern = node["pattern"]
-        # flags = Regex::Options::ANCHORED
-        flags = ::Regex::Options::NO_UTF_CHECK
+        flags = Regex::Options::ANCHORED
         # MULTILINE implies DOTALL which we don't want, so we
         # use in-pattern flag (?m) instead
-        flags |= ::Regex::Options::MULTILINE if multiline
+        # flags |= Regex::Options::MULTILINE if multiline
         pattern = "(?m)" + pattern if multiline
-        flags |= ::Regex::Options::DOTALL if dotall
-        flags |= ::Regex::Options::IGNORE_CASE if ignorecase
-        @pattern = Regex.new(pattern, ignorecase, multiline, dotall)
-        @pattern2 = ::Regex.new(pattern, flags)
+        flags |= Regex::Options::DOTALL if dotall
+        flags |= Regex::Options::IGNORE_CASE if ignorecase
+        @pattern = Regex.new(pattern, flags)
         add_actions(node)
       end
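The initializer above shows the two approaches to flags: the '-' side passes booleans straight to the Onigmo wrapper (keeping a parallel ::Regex for comparison), while the '+' side assembles stdlib Regex::Options and leans on the in-pattern (?m) flag rather than MULTILINE. A sketch of that stdlib-side plumbing; the pattern and flag values are arbitrary examples:

# Sketch of the stdlib-side option plumbing above; the pattern and the
# multiline/dotall/ignorecase values are arbitrary example inputs.
pattern = "^\\s+"
multiline = true
dotall = false
ignorecase = false

flags = Regex::Options::None
pattern = "(?m)" + pattern if multiline # in-pattern flag instead of MULTILINE
flags |= Regex::Options::DOTALL if dotall
flags |= Regex::Options::IGNORE_CASE if ignorecase

re = Regex.new(pattern, flags)
puts re.match("  indented\nnext line") # matches the leading whitespace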