From ff1c0012eca9af9f0ed806bb6943fd9ab0b6d458 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Wed, 7 Aug 2024 13:11:19 -0300 Subject: [PATCH] Exploring re2, doesn't really work --- src/actions.cr | 30 +++++++----- src/cre2/Makefile | 5 ++ src/cre2/cre2.cpp | 121 ++++++++++++++++++++++++++++++++++++++++++++++ src/cre2/cre2.cr | 69 ++++++++++++++++++++++++++ src/cre2/cre2.h | 66 +++++++++++++++++++++++++ src/cre2/cre2.o | Bin 0 -> 5832 bytes src/re2.cr | 0 src/rules.cr | 54 ++++++++++++++++++--- src/tartrazine.cr | 2 - 9 files changed, 324 insertions(+), 23 deletions(-) create mode 100644 src/cre2/Makefile create mode 100644 src/cre2/cre2.cpp create mode 100644 src/cre2/cre2.cr create mode 100644 src/cre2/cre2.h create mode 100644 src/cre2/cre2.o create mode 100644 src/re2.cr diff --git a/src/actions.cr b/src/actions.cr index b899859..bd46a38 100644 --- a/src/actions.cr +++ b/src/actions.cr @@ -1,3 +1,5 @@ +require "xml" + # These are Lexer actions. When a rule matches, it will # perform a list of actions. These actions can emit tokens # or change the state machine. @@ -24,11 +26,11 @@ module Tartrazine end # ameba:disable Metrics/CyclomaticComplexity - def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) + def emit(matches : Pointer(LibCre2::StringPiece), lexer : Lexer, match_group = 0) : Array(Token) case type when "token" - raise Exception.new "Can't have a token without a match" if match.nil? - [Token.new(type: xml["type"], value: match[match_group])] + raise Exception.new "Can't have a token without a match" if matches[0].length == 0 + [Token.new(type: xml["type"], value: String.new(Slice.new(matches[0].data, matches[0].length)))] when "push" states_to_push = xml.attributes.select { |attrib| attrib.name == "state" @@ -61,35 +63,37 @@ module Tartrazine when "bygroups" # FIXME: handle # > - # + # https://github.com/google/re2/wiki/Syntax # None # # # where that None means skipping a group # - raise Exception.new "Can't have a token without a match" if match.nil? + raise Exception.new "Can't have a bygroups without a match" if matches[0].length == 0 # Each group matches an action. If the group match is empty, # the action is skipped. result = [] of Token @actions.each_with_index do |e, i| - next if match[i + 1]?.nil? - result += e.emit(match, lexer, i + 1) + next if matches[i].length == 0 + result += e.emit(matches, lexer, i) end result when "using" # Shunt to another lexer entirely - return [] of Token if match.nil? + return [] of Token if matches[0].length == 0 lexer_name = xml["lexer"].downcase - Log.trace { "to tokenize: #{match[match_group]}" } - Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true) + # Log.trace { "to tokenize: #{match[match_group]}" } + to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length)) + Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true) when "usingself" # Shunt to another copy of this lexer - return [] of Token if match.nil? + return [] of Token if matches[0].length == 0 new_lexer = Lexer.from_xml(lexer.xml) - Log.trace { "to tokenize: #{match[match_group]}" } - new_lexer.tokenize(match[match_group], usingself: true) + # Log.trace { "to tokenize: #{match[match_group]}" } + to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length)) + new_lexer.tokenize(to_tokenize, usingself: true) when "combined" # Combine two states into one anonymous state states = xml.attributes.select { |attrib| diff --git a/src/cre2/Makefile b/src/cre2/Makefile new file mode 100644 index 0000000..4667226 --- /dev/null +++ b/src/cre2/Makefile @@ -0,0 +1,5 @@ +all: cre2.o +clean: + rm -f cre2.o +cre2.o: cre2.cpp cre2.h + g++ -O3 -c -o cre2.o cre2.cpp diff --git a/src/cre2/cre2.cpp b/src/cre2/cre2.cpp new file mode 100644 index 0000000..51d0906 --- /dev/null +++ b/src/cre2/cre2.cpp @@ -0,0 +1,121 @@ +#include +#include "cre2.h" + +#define TO_OPT(opt) (reinterpret_cast(opt)) + +cre2_options *cre2_opt_new(void) { + return reinterpret_cast(new RE2::Options()); +} + +void cre2_opt_delete(cre2_options *opt) { + delete TO_OPT(opt); +} + + +#define OPT_bool(name) \ +void cre2_opt_##name(cre2_options *opt, int flag) { \ + TO_OPT(opt)->set_##name(bool(flag)); \ +} + +OPT_bool(posix_syntax) +OPT_bool(longest_match) +OPT_bool(log_errors) +OPT_bool(literal) +OPT_bool(never_nl) +OPT_bool(case_sensitive) +OPT_bool(perl_classes) +OPT_bool(word_boundary) +OPT_bool(one_line) + +#undef OPT_BOOL + + +void cre2_opt_encoding(cre2_options *opt, encoding_t enc) { + switch (enc) { + case CRE2_UTF8: + TO_OPT(opt)->set_encoding(RE2::Options::EncodingUTF8); + break; + case CRE2_Latin1: + TO_OPT(opt)->set_encoding(RE2::Options::EncodingLatin1); + break; + } +} + +void cre2_opt_max_mem(cre2_options *opt, int m) { + TO_OPT(opt)->set_max_mem(m); +} + + +#define TO_RE2(re) (reinterpret_cast(re)) +#define TO_CONST_RE2(re) (reinterpret_cast(re)) + +cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt) { + re2::StringPiece pattern_re2(pattern, patternlen); + return reinterpret_cast( + new RE2(pattern_re2, *reinterpret_cast(opt))); +} + +void cre2_delete(cre2 *re) { + delete TO_RE2(re); +} + + +int cre2_error_code(const cre2 *re) { + return int(TO_CONST_RE2(re)->error_code()); +} + +const char *cre2_error_string(const cre2 *re) { + return TO_CONST_RE2(re)->error().c_str(); +} + +void cre2_error_arg(const cre2 *re, struct string_piece *arg) { + const std::string &argstr = TO_CONST_RE2(re)->error_arg(); + arg->data = argstr.data(); + arg->length = argstr.length(); +} + +int cre2_num_capturing_groups(const cre2 *re) { + return TO_CONST_RE2(re)->NumberOfCapturingGroups(); +} + +int cre2_program_size(const cre2 *re) { + return TO_CONST_RE2(re)->ProgramSize(); +} + + +int cre2_match( + const cre2 *re + , const char *text + , int textlen + , int startpos + , int endpos + , anchor_t anchor + , struct string_piece *match + , int nmatch) { + + re2::StringPiece text_re2(text, textlen); + // FIXME: exceptions? + re2::StringPiece *match_re2 = new re2::StringPiece[nmatch]; + + RE2::Anchor anchor_re2 = RE2::UNANCHORED; + switch (anchor) { + case CRE2_ANCHOR_START: + anchor_re2 = RE2::ANCHOR_START; break; + case CRE2_ANCHOR_BOTH: + anchor_re2 = RE2::ANCHOR_BOTH; break; + } + + bool ret = TO_CONST_RE2(re) + ->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch); + + if (ret) { + for (int i=0; i_1KO1dKH-I5-a^v#mqBk7Mz z`j(bn|2X9K)jtlq>tF^M##V+SVF(qJAZE}p1#7xy-qpN4t^Vp5keqk_?F(R~9?f}w z9n-wujba-84CcKL^WGmd?<37y(Y)m#tB^0vyZPJ+^|*RsS@W*=se+Wi_?W-GQO)~T zVClX07CL^;x4`_#`D_elo>*O7-L{th#&A+=2=O+C2en32YouYwE#861gAQ0@I4zQ; zAe7dcsmIi3)UllRH}&Wk&Z=KC-FPA2I29k)yi@Urmybs^?@q?c#Cx<$xAt|PhHzQV zyQ`VkHP4OrX1w3zE`8XU^KRt5zXy|Q0ES%T&iAa-PTzIo!WLQbcNnCh-i&H1>3^Uz zZSOlXx70Ur8S3}L?#68DYW;qP+WpF?DVy}zINK=a%^PPAx$_Dlb@x*Ce~busqEt|12X&M4F$km3FNpNjPlA0GQ$EVU~! zkVwS_k^{SvgUJEktP|cbc(5B7u`mlYk#Px;9-LFgdZ;0E@(2sfM?-xZ)<>405&5U7 zZ7-G?CF&{>h6i~U@lQ~ROr!dGfgNwds%=hYu;e=Q=V<>`Z4y z>92QXl;nv=m3{!K(hCMdN<@7u$nm7#o3Y1veK(A#+erso*A9a{&rquw@42vGk86U0 z>wui=JU_U1HVNbUX9d`k@L)b=*#`ja)6~Gv&_U4K*^iR_IN9@kStMSS_#$yEQMmmp zP{(nXeA?pQC;qy`e@6VG#D7b?De(`8Uz0eVF&y`Oi7T*>$nOxxJfg&iV~N7!PZ3WN z$Fqj=6zJQ-eD}lem;ke9fJdn=7Oy>o{}?#Nvj`>1_Ml*RjkNP~mFzE)J&)iI#GAx< zJbxm7jW|Dt|0I5gI6sR$u$3_${6pdUZ3pP>;u#|QDB1gG75W`|i2dk8_!|%5KYR#R zA?|kZFw?OH5@xmPS9IH&OPH0?WT2G|*DMHnrRt&u)1S5H%7F=@NvmYJ7SPGk$gIGt zm0EFLuPxZFG2gN-RqSc2=IXw45l`!u<5ZlQz>2Qr7^N1lty#;_ZGo6Z&C+X@T`Ri9 zS*vAPwVaY}mW*1>5**G|oJoD6GGk8~&O*z$Vq1EtXbasf+pJ6$?dg_H*_hYM7R?Tm zkU5rctf_>qPtVWmFkG!-8>OPVpwA{D6^D01UUweJ4(v~5N8Qw(38Pjt^_uH|Q+>8* z&E-tMDYIZWy6YH4x0W-r*&_!Jk0$j!U#+^uie1YxeaxOK+LQVbt2R@n@Yb@uHbf?s zoecu@6R1PzWE<$gsG^18(9G-kMD3mok?P=_A~9Q`knep%xD{=6yiCh4~&&hJmY3D)vJ zc6e{1aL(=eCC=~3MS=VQGI74#|hkI@0SAT`MS ze*%5mt^d1}zbf^|{2}7~dH&js4QA};v>}@`4?N>IPq{y?NxM|d(+?y5I|a;qk@JUs R+w~vB29RF`#oAl$|8HAHO-ld( literal 0 HcmV?d00001 diff --git a/src/re2.cr b/src/re2.cr new file mode 100644 index 0000000..e69de29 diff --git a/src/rules.cr b/src/rules.cr index ea0ddc0..5efb81a 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -1,3 +1,4 @@ +require "./cre2/cre2" require "./actions" # These are lexer rules. They match with the text being parsed @@ -7,29 +8,32 @@ module Tartrazine # This rule matches via a regex pattern class Rule - property pattern : Regex = Re2.new "" + property pattern : Re3 = Re3.new "" property actions : Array(Action) = [] of Action property xml : String = "foo" def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) - match = pattern.match(text, pos) + return false, pos, [] of Token + matched, matches = pattern.match(text, pos) # We don't match if the match doesn't move the cursor # because that causes infinite loops - return false, pos, [] of Token if match.nil? || match.end == 0 + + return false, pos, [] of Token unless matched # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } tokens = [] of Token # Emit the tokens actions.each do |action| # Emit the token - tokens += action.emit(match, lexer) + tokens += action.emit(matches, lexer) end - Log.trace { "#{xml}, #{match.end}, #{tokens}" } - return true, match.end, tokens + # Log.trace { "#{xml}, #{match.end}, #{tokens}" } + return true, matches[0].length, tokens end def initialize(node : XML::Node, multiline, dotall, ignorecase) @xml = node.to_s - @pattern = Re2.new( + p! node["pattern"] + @pattern = Re3.new( node["pattern"], multiline, dotall, @@ -76,7 +80,7 @@ module Tartrazine def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) tokens = [] of Token actions.each do |action| - tokens += action.emit(nil, lexer) + tokens += action.emit(Pointer(LibCre2::StringPiece).malloc(1), lexer) end return true, pos, tokens end @@ -106,4 +110,38 @@ module Tartrazine end end end + + class Re3 + @matches = Pointer(LibCre2::StringPiece).malloc(50) + @opts : LibCre2::Options + + @re : LibCre2::CRe2 + + def group_count + LibCre2.num_capturing_groups(@re) + end + + def initialize(pattern : String, multiline = false, dotall = false, + ignorecase = false, anchored = false) + @opts = LibCre2.opt_new + LibCre2.opt_posix_syntax(@opts, false) + LibCre2.opt_longest_match(@opts, true) + LibCre2.opt_perl_classes(@opts, true) + LibCre2.opt_encoding(@opts, 1) + LibCre2.opt_one_line(@opts, !multiline) + LibCre2.opt_case_sensitive(@opts, !ignorecase) + pattern = "(m?)#{pattern}" if multiline + @re = LibCre2.new(pattern, pattern.size, @opts) + end + + def match(text, pos) + matched = LibCre2.match(@re, text, text.size, pos, text.size, + LibCre2::CRE2_ANCHOR_START, @matches, 50) + return {matched != 0, @matches} + end + end end + +re = Tartrazine::Re3.new("(require-instance|fraction-digits|error-app-tag|error-message|min-elements|max-elements|yin-element|ordered-by|position|modifier|default|pattern|length|status|units|value|range|type|path|enum|base|bit)(?=[^\w\-\:])") +p! re.match("value ", 0) + diff --git a/src/tartrazine.cr b/src/tartrazine.cr index a6a1f03..82c471f 100644 --- a/src/tartrazine.cr +++ b/src/tartrazine.cr @@ -63,8 +63,6 @@ module Tartrazine tokens = [] of Token pos = 0 matched = false - time = 0 - count = 0 # Respect the `ensure_nl` config option if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself