From a704c59fa9fb220d52756c8118a8da26dba439bd Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Tue, 13 Aug 2024 19:19:12 -0300 Subject: [PATCH] Some tests pass! --- spec/tartrazine_spec.cr | 1 - src/actions.cr | 8 +++--- src/formatters/html.cr | 2 +- src/onigmo.cr | 62 +++++++++++++++++++++++++++++++---------- src/onigmo/onigwrap.c | 2 +- src/rules.cr | 20 ++++++++----- 6 files changed, 67 insertions(+), 28 deletions(-) diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr index 4615a25..4a06dc2 100644 --- a/spec/tartrazine_spec.cr +++ b/spec/tartrazine_spec.cr @@ -42,7 +42,6 @@ known_bad = { "#{__DIR__}/tests/mcfunction/selectors.txt", "#{__DIR__}/tests/php/anonymous_class.txt", "#{__DIR__}/tests/html/javascript_unclosed.txt", - } # Tests that fail because of a limitation in PCRE2 diff --git a/src/actions.cr b/src/actions.cr index 4ed4008..633583d 100644 --- a/src/actions.cr +++ b/src/actions.cr @@ -30,11 +30,11 @@ module Tartrazine end # ameba:disable Metrics/CyclomaticComplexity - def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) + def emit(match, lexer : Lexer, match_group = 0) : Array(Token) case type when "token" raise Exception.new "Can't have a token without a match" if match.nil? - [Token.new(type: xml["type"], value: match[match_group])] + [Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)] when "push" states_to_push = xml.attributes.select { |attrib| attrib.name == "state" @@ -88,14 +88,14 @@ module Tartrazine return [] of Token if match.nil? lexer_name = xml["lexer"].downcase Log.trace { "to tokenize: #{match[match_group]}" } - Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true) + Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true) when "usingself" # Shunt to another copy of this lexer return [] of Token if match.nil? new_lexer = Lexer.from_xml(lexer.xml) Log.trace { "to tokenize: #{match[match_group]}" } - new_lexer.tokenize(match[match_group], usingself: true) + new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true) when "combined" # Combine two states into one anonymous state states = xml.attributes.select { |attrib| diff --git a/src/formatters/html.cr b/src/formatters/html.cr index 20f41a2..c8e7611 100644 --- a/src/formatters/html.cr +++ b/src/formatters/html.cr @@ -73,7 +73,7 @@ module Tartrazine # These are true/false/nil outp << "border: none;" if style.border == false outp << "font-weight: bold;" if style.bold - outp << "font-weight: #{weight_of_bold};" if style.bold == false + outp << "font-weight: #{@weight_of_bold};" if style.bold == false outp << "font-style: italic;" if style.italic outp << "font-style: normal;" if style.italic == false outp << "text-decoration: underline;" if style.underline diff --git a/src/onigmo.cr b/src/onigmo.cr index 255cde2..4277fce 100644 --- a/src/onigmo.cr +++ b/src/onigmo.cr @@ -1,15 +1,14 @@ @[Link("onigmo")] @[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")] -lib Onigmo +lib LibOnigmo type Regex = Pointer(Void) type Region = Pointer(Void) fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32, ignoreCase : Int32, multiline : Int32, - dotall : Int32, - anchored : Int32) : Regex + dotall : Int32) : Regex fun free = onigwrap_free(re : Regex) fun region_free = onigwrap_region_free(region : Region) @@ -19,16 +18,51 @@ lib Onigmo fun len = onigwrap_len(region : Region, index : Int32) : Int32 end -pattern = "a" +module Onigmo + class Match + property begin : Int32 + property end : Int32 + property value : String + + def initialize(@begin, @end, @value) + end + end + + class Regex + def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false) + @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0) + end + + def finalize + LibOnigmo.free(@re) + end + + def match(str : String, offset = 0) + region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize) + result = [] of Match? + num_regs = LibOnigmo.num_regs(region) + if num_regs > 0 + (0...num_regs).each do |i| + b = LibOnigmo.pos(region, i) + e = b + LibOnigmo.len(region, i) + if b == -1 || e == -1 + result << nil + else + v = str[b...e] + result << Match.new(b, e, v) + end + end + else + return [] of Match + end + LibOnigmo.region_free(region) + result + end + end +end + +pattern = "#.*x" str = "# foobar" -re = Onigmo.create(pattern, pattern.size, false, true, false, false) -region = Onigmo.search(re, str.to_unsafe, 0, str.bytesize) -num_regs = Onigmo.num_regs(region) -(0...num_regs).each do |i| - pos = Onigmo.pos(region, i) - len = Onigmo.len(region, i) - puts "match #{i}: #{str[pos, len]}" -end -Onigmo.region_free(region) -Onigmo.free(re) +re = Onigmo::Regex.new(pattern, false, false, false) +p! re.match(str) diff --git a/src/onigmo/onigwrap.c b/src/onigmo/onigwrap.c index 75bc4ae..68da0bb 100644 --- a/src/onigmo/onigwrap.c +++ b/src/onigmo/onigwrap.c @@ -1,6 +1,6 @@ #include "onigmo.h" -regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall, int anchored ) +regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall) { regex_t *reg; diff --git a/src/rules.cr b/src/rules.cr index 1f4a7eb..57b1dd0 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -3,6 +3,7 @@ require "./formatter" require "./rules" require "./styles" require "./lexer" +require "./onigmo" # These are lexer rules. They match with the text being parsed # and perform actions, either emitting tokens or changing the @@ -10,6 +11,8 @@ require "./lexer" module Tartrazine # This rule matches via a regex pattern + alias Regex = Onigmo::Regex + class Rule property pattern : Regex = Regex.new "" property actions : Array(Action) = [] of Action @@ -19,7 +22,9 @@ module Tartrazine match = pattern.match(text, pos) # We don't match if the match doesn't move the cursor # because that causes infinite loops - return false, pos, [] of Token if match.nil? || match.end == 0 + # The `match.begin > pos` is the same as the ANCHORED option + return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos } + # p! match.map(&.value), text[pos..pos + 20] # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } tokens = [] of Token # Emit the tokens @@ -27,21 +32,22 @@ module Tartrazine # Emit the token tokens += action.emit(match, lexer) end - Log.trace { "#{xml}, #{match.end}, #{tokens}" } - return true, match.end, tokens + # Log.trace { "#{xml}, #{match[0].end}, #{tokens}" } + return true, match[0].as(Onigmo::Match).end, tokens end def initialize(node : XML::Node, multiline, dotall, ignorecase) @xml = node.to_s pattern = node["pattern"] - flags = Regex::Options::ANCHORED + # flags = Regex::Options::ANCHORED + # flags = Regex::Options::NO_UTF_CHECK # MULTILINE implies DOTALL which we don't want, so we # use in-pattern flag (?m) instead # flags |= Regex::Options::MULTILINE if multiline pattern = "(?m)" + pattern if multiline - flags |= Regex::Options::DOTALL if dotall - flags |= Regex::Options::IGNORE_CASE if ignorecase - @pattern = Regex.new(pattern, flags) + # flags |= Regex::Options::DOTALL if dotall + # flags |= Regex::Options::IGNORE_CASE if ignorecase + @pattern = Regex.new(pattern, ignorecase, multiline, dotall) add_actions(node) end