CLose to 100% tests working, but slooooooow

More tests pass
Some tests pass!
2025-09-17 10:48:12 +00:00 · 2024-08-13 20:45:46 -03:00 · 2024-08-13 20:09:36 -03:00 · 2024-08-13 19:19:12 -03:00 · 2024-08-13 14:02:13 -03:00 · 2024-08-12 20:10:50 -03:00
8 changed files with 348 additions and 39 deletions
--- a/.ameba.yml
+++ b/.ameba.yml
@@ -1,5 +1,5 @@
 # This configuration file was generated by `ameba --gen-config`
-# on 2024-08-04 23:09:09 UTC using Ameba version 1.6.1.
+# on 2024-08-12 22:00:49 UTC using Ameba version 1.6.1.
 # The point is for the user to remove these configuration records
 # one by one as the reported problems are removed from the code base.

@@ -9,7 +9,7 @@ Documentation/DocumentationAdmonition:
  Description: Reports documentation admonitions
  Timezone: UTC
  Excluded:
-  - src/tartrazine.cr
+  - src/lexer.cr
  - src/actions.cr
  Admonitions:
  - TODO
@@ -17,3 +17,105 @@ Documentation/DocumentationAdmonition:
  - BUG
  Enabled: true
  Severity: Warning
+
+# Problems found: 22
+# Run `ameba --only Lint/MissingBlockArgument` for details
+Lint/MissingBlockArgument:
+  Description: Disallows yielding method definitions without block argument
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  Enabled: true
+  Severity: Warning
+
+# Problems found: 1
+# Run `ameba --only Lint/NotNil` for details
+Lint/NotNil:
+  Description: Identifies usage of `not_nil!` calls
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  Enabled: true
+  Severity: Warning
+
+# Problems found: 34
+# Run `ameba --only Lint/ShadowingOuterLocalVar` for details
+Lint/ShadowingOuterLocalVar:
+  Description: Disallows the usage of the same name as outer local variables for block
+    or proc arguments
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  Enabled: true
+  Severity: Warning
+
+# Problems found: 1
+# Run `ameba --only Lint/UnreachableCode` for details
+Lint/UnreachableCode:
+  Description: Reports unreachable code
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  Enabled: true
+  Severity: Warning
+
+# Problems found: 6
+# Run `ameba --only Lint/UselessAssign` for details
+Lint/UselessAssign:
+  Description: Disallows useless variable assignments
+  ExcludeTypeDeclarations: false
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  Enabled: true
+  Severity: Warning
+
+# Problems found: 3
+# Run `ameba --only Naming/BlockParameterName` for details
+Naming/BlockParameterName:
+  Description: Disallows non-descriptive block parameter names
+  MinNameLength: 3
+  AllowNamesEndingInNumbers: true
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  AllowedNames:
+  - _
+  - e
+  - i
+  - j
+  - k
+  - v
+  - x
+  - y
+  - ex
+  - io
+  - ws
+  - op
+  - tx
+  - id
+  - ip
+  - k1
+  - k2
+  - v1
+  - v2
+  ForbiddenNames: []
+  Enabled: true
+  Severity: Convention
+
+# Problems found: 1
+# Run `ameba --only Naming/RescuedExceptionsVariableName` for details
+Naming/RescuedExceptionsVariableName:
+  Description: Makes sure that rescued exceptions variables are named as expected
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  AllowedNames:
+  - e
+  - ex
+  - exception
+  - error
+  Enabled: true
+  Severity: Convention
+
+# Problems found: 6
+# Run `ameba --only Naming/TypeNames` for details
+Naming/TypeNames:
+  Description: Enforces type names in camelcase manner
+  Excluded:
+  - pygments/tests/examplefiles/cr/test.cr
+  Enabled: true
+  Severity: Convention
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -42,6 +42,9 @@ known_bad = {
  "#{__DIR__}/tests/mcfunction/selectors.txt",
  "#{__DIR__}/tests/php/anonymous_class.txt",
  "#{__DIR__}/tests/html/javascript_unclosed.txt",
+# BAD FOR ONIGMO
+"#{__DIR__}/tests/json/test_backtracking.txt",
+

 }

@@ -58,6 +61,7 @@ describe Tartrazine do
        end
      else
        it "parses #{testcase}".split("/")[-2...].join("/") do
+          p! testcase
          text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
          lexer_name = File.basename(File.dirname(testcase)).downcase
          unless failing_lexers.includes?(lexer_name) ||
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -30,11 +30,11 @@ module Tartrazine
    end

    # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
      case type
      when "token"
        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        [Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
      when "push"
        states_to_push = xml.attributes.select { |attrib|
          attrib.name == "state"
@@ -88,14 +88,14 @@ module Tartrazine
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
      when "usingself"
        # Shunt to another copy of this lexer
        return [] of Token if match.nil?

        new_lexer = Lexer.from_xml(lexer.xml)
        Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
      when "combined"
        # Combine two states into one anonymous state
        states = xml.attributes.select { |attrib|
--- a/src/formatters/html.cr
+++ b/src/formatters/html.cr
@@ -15,6 +15,7 @@ module Tartrazine
    property? standalone : Bool = false
    property? surrounding_pre : Bool = true
    property? wrap_long_lines : Bool = false
+    property? weight_of_bold : Int32 = 600

    def format(text : String, lexer : Lexer, theme : Theme) : String
      text = format_text(text, lexer, theme)
@@ -43,7 +44,7 @@ module Tartrazine
          pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
          outp << "<pre class=\"#{get_css_class("Background", theme)}\" #{pre_style}>"
        end
-        "<code class=\"#{get_css_class("Background", theme)}\">"
+        outp << "<code class=\"#{get_css_class("Background", theme)}\">"
        lines.each_with_index(offset: line_number_start - 1) do |line, i|
          line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
          line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight", theme)}\"" : ""
@@ -72,7 +73,7 @@ module Tartrazine
          # These are true/false/nil
          outp << "border: none;" if style.border == false
          outp << "font-weight: bold;" if style.bold
-          outp << "font-weight: 400;" if style.bold == false
+          outp << "font-weight: #{@weight_of_bold};" if style.bold == false
          outp << "font-style: italic;" if style.italic
          outp << "font-style: normal;" if style.italic == false
          outp << "text-decoration: underline;" if style.underline
--- a/src/onigmo.cr
+++ b/src/onigmo.cr
@@ -0,0 +1,85 @@
+@[Link("onigmo")]
+@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
+
+lib LibOnigmo
+  type Regex = Pointer(Void)
+  type Region = Pointer(Void)
+
+  fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
+                               ignoreCase : Int32,
+                               multiline : Int32,
+                               dotall : Int32) : Regex
+  fun free = onigwrap_free(re : Regex)
+  fun region_free = onigwrap_region_free(region : Region)
+
+  fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
+  fun num_regs = onigwrap_num_regs(region : Region) : Int32
+  fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
+  fun len = onigwrap_len(region : Region, index : Int32) : Int32
+end
+
+module Onigmo
+  class Match
+    property begin : Int32
+    property end : Int32
+    property value : String
+
+    def initialize(@begin, @end, @value)
+    end
+
+    def to_s
+      @value
+    end
+  end
+
+  class Regex
+    def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
+      @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
+    end
+
+    def finalize
+      LibOnigmo.free(@re)
+    end
+
+    def match(str : String, offset = 0)
+      # The offset argument is a character index, but Onigmo expects a byte index
+      offset = str.char_index_to_byte_index(offset)
+      if offset.nil?
+        raise Exception.new "Invalid offset"
+      end
+
+      region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
+      result = [] of Match?
+      num_regs = LibOnigmo.num_regs(region)
+      if num_regs > 0
+        (0...num_regs).each do |i|
+          pos = LibOnigmo.pos(region, i)
+          l = LibOnigmo.len(region, i)
+          if pos == -1 || l == -1
+            result << nil
+          else
+            b = str.byte_index_to_char_index(pos)
+            e = str.byte_index_to_char_index(pos + l)
+            # p! pos, l, b, e, str[pos..]
+            if b.nil? || e.nil?
+              raise Exception.new "Invalid substring"
+            end
+
+            v = str[b...e]
+            result << Match.new(b, b + v.size, v)
+          end
+        end
+      else
+        return [] of Match
+      end
+      LibOnigmo.region_free(region)
+      result
+    end
+  end
+end
+
+# pattern = "\\w"
+# str = "α"
+
+# re = Onigmo::Regex.new(pattern, false, false, false)
+# p! re.match(str)
--- a/src/onigmo/onigwrap.c
+++ b/src/onigmo/onigwrap.c
@@ -0,0 +1,94 @@
+#include "onigmo.h"
+
+regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
+{
+	regex_t *reg;
+
+	OnigErrorInfo einfo;
+
+	OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
+
+	if (ignoreCase == 1)
+		onigOptions |= ONIG_OPTION_IGNORECASE;
+
+	if (multiline == 1)
+		onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
+
+	if (dotall == 1)
+		onigOptions |= ONIG_OPTION_DOTALL;
+
+	OnigUChar *stringStart = (OnigUChar*) pattern;
+	OnigUChar *stringEnd   = (OnigUChar*) pattern + len;
+	int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
+
+	return reg;
+}
+
+void onigwrap_region_free(OnigRegion *region)	
+{
+	onig_region_free(region, 1);
+}
+
+void onigwrap_free(regex_t *reg)
+{
+	onig_free(reg);
+}
+
+int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
+{
+	OnigUChar *stringStart  = (OnigUChar*) charPtr;
+	OnigUChar *stringEnd    = (OnigUChar*) (charPtr + length);
+	OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
+	OnigUChar *stringRange  = (OnigUChar*) stringEnd;
+
+	OnigRegion *region = onig_region_new();
+	int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
+	onig_region_free(region, 1);
+
+	if (result >= 0)
+		return result >> 1;
+	if (result == ONIG_MISMATCH)
+		return -1;
+	return -2;
+}
+
+OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
+{
+	OnigUChar *stringStart  = (OnigUChar*) charPtr;
+	OnigUChar *stringEnd    = (OnigUChar*) (charPtr + length);
+	OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
+	OnigUChar *stringRange  = (OnigUChar*) stringEnd;
+
+	OnigRegion *region = onig_region_new();
+
+	int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
+	return region;
+}
+
+int onigwrap_num_regs(OnigRegion *region)
+{
+	return region->num_regs;
+}
+
+int onigwrap_pos(OnigRegion *region, int nth)
+{
+	if (nth < region->num_regs)
+	{
+		int result = region->beg[nth];
+		if (result < 0)
+			return -1;
+		return result;
+	}
+	return -1;
+}
+
+int onigwrap_len(OnigRegion *region, int nth)
+{
+	if (nth < region->num_regs)
+	{
+		int result = region->end[nth] - region->beg[nth];
+		return result;
+	}
+	return -1;
+}
+
--- a/src/onigmo/onigwrap.h
+++ b/src/onigmo/onigwrap.h
@@ -0,0 +1,32 @@
+#include "onigmo.h"
+
+#if defined(_WIN32)
+#define ONIGWRAP_EXTERN extern __declspec(dllexport)
+#else
+#define ONIGWRAP_EXTERN extern
+#endif
+
+ONIGWRAP_EXTERN
+regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline);
+
+ONIGWRAP_EXTERN
+void onigwrap_region_free(OnigRegion *region);
+
+ONIGWRAP_EXTERN
+void onigwrap_free(regex_t *reg);
+
+ONIGWRAP_EXTERN
+int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);
+
+ONIGWRAP_EXTERN
+OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);
+
+ONIGWRAP_EXTERN
+int onigwrap_num_regs(OnigRegion *region);
+
+ONIGWRAP_EXTERN
+int onigwrap_pos(OnigRegion *region, int nth);
+
+ONIGWRAP_EXTERN
+int onigwrap_len(OnigRegion *region, int nth);
+
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -3,6 +3,7 @@ require "./formatter"
 require "./rules"
 require "./styles"
 require "./lexer"
+require "./onigmo"

 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -10,16 +11,22 @@ require "./lexer"
 module Tartrazine
  # This rule matches via a regex pattern

+  alias Regex = Onigmo::Regex
+
  class Rule
-    property pattern : Regex = Re2.new ""
+    property pattern : Regex = Regex.new ""
+    property pattern2 : ::Regex = ::Regex.new ""
    property actions : Array(Action) = [] of Action
    property xml : String = "foo"

    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
      match = pattern.match(text, pos)
+      match2 = pattern2.match(text, pos)
      # We don't match if the match doesn't move the cursor
      # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.end == 0
+      # The `match.begin > pos` is the same as the ANCHORED option
+      return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
+      # p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
      tokens = [] of Token
      # Emit the tokens
@@ -27,18 +34,23 @@ module Tartrazine
        # Emit the token
        tokens += action.emit(match, lexer)
      end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      # Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
+      return true, pos + match[0].as(Onigmo::Match).value.size, tokens
    end

    def initialize(node : XML::Node, multiline, dotall, ignorecase)
      @xml = node.to_s
-      @pattern = Re2.new(
-        node["pattern"],
-        multiline,
-        dotall,
-        ignorecase,
-        anchored: true)
+      pattern = node["pattern"]
+      # flags = Regex::Options::ANCHORED
+      flags = ::Regex::Options::NO_UTF_CHECK
+      # MULTILINE implies DOTALL which we don't want, so we
+      # use in-pattern flag (?m) instead
+      flags |= ::Regex::Options::MULTILINE if multiline
+      pattern = "(?m)" + pattern if multiline
+      flags |= ::Regex::Options::DOTALL if dotall
+      flags |= ::Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, ignorecase, multiline, dotall)
+      @pattern2 = ::Regex.new(pattern, flags)
      add_actions(node)
    end

@@ -90,25 +102,4 @@ module Tartrazine
      add_actions(node)
    end
  end
-
-  # This is a hack to workaround that Crystal seems to disallow
-  # having regexes multiline but not dot_all
-  class Re2 < Regex
-    @source = "fa"
-    @options = Regex::Options::None
-    @jit = true
-
-    def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
-              LibPCRE2::UCP
-      flags |= LibPCRE2::MULTILINE if multiline
-      flags |= LibPCRE2::DOTALL if dotall
-      flags |= LibPCRE2::CASELESS if ignorecase
-      flags |= LibPCRE2::ANCHORED if anchored
-      flags |= LibPCRE2::NO_UTF_CHECK
-      @re = Regex::PCRE2.compile(pattern, flags) do |error_message|
-        raise Exception.new(error_message)
-      end
-    end
-  end
 end
Author	SHA1	Message	Date
Roberto Alsina	32816eb207	CLose to 100% tests working, but slooooooow	2024-08-13 20:45:46 -03:00
Roberto Alsina	d2b61fdc6c	More tests pass	2024-08-13 20:09:36 -03:00
Roberto Alsina	a704c59fa9	Some tests pass!	2024-08-13 19:19:12 -03:00
Roberto Alsina	2a9e7fde0d	Working onigmo wrapper, but onigmo doesn't support anchored regexes	2024-08-13 14:02:13 -03:00
Roberto Alsina	d49d0969a9	Started binding, ran into things I don't know how to bind	2024-08-12 20:10:50 -03:00
Roberto Alsina	ce6f3d29b5	Remove Re2 hack	2024-08-12 19:01:13 -03:00
Roberto Alsina	46d6d3f467	Make how-heavy-is-bold configurable	2024-08-12 10:55:58 -03:00
Roberto Alsina	78ddc69937	Merge branch 'main' of github.com:ralsina/tartrazine	2024-08-12 10:11:03 -03:00
Roberto Alsina	b1ad7b64c0	oops	2024-08-12 10:10:51 -03:00