Exploring re2, doesn't really work

This commit is contained in:
Roberto Alsina 2024-08-07 13:11:19 -03:00
parent e0f697f1f9
commit ff1c0012ec
9 changed files with 324 additions and 23 deletions

View File

@ -1,3 +1,5 @@
require "xml"
# These are Lexer actions. When a rule matches, it will # These are Lexer actions. When a rule matches, it will
# perform a list of actions. These actions can emit tokens # perform a list of actions. These actions can emit tokens
# or change the state machine. # or change the state machine.
@ -24,11 +26,11 @@ module Tartrazine
end end
# ameba:disable Metrics/CyclomaticComplexity # ameba:disable Metrics/CyclomaticComplexity
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) def emit(matches : Pointer(LibCre2::StringPiece), lexer : Lexer, match_group = 0) : Array(Token)
case type case type
when "token" when "token"
raise Exception.new "Can't have a token without a match" if match.nil? raise Exception.new "Can't have a token without a match" if matches[0].length == 0
[Token.new(type: xml["type"], value: match[match_group])] [Token.new(type: xml["type"], value: String.new(Slice.new(matches[0].data, matches[0].length)))]
when "push" when "push"
states_to_push = xml.attributes.select { |attrib| states_to_push = xml.attributes.select { |attrib|
attrib.name == "state" attrib.name == "state"
@ -61,35 +63,37 @@ module Tartrazine
when "bygroups" when "bygroups"
# FIXME: handle # FIXME: handle
# ><bygroups> # ><bygroups>
# <token type="Punctuation"/> # <token type="Punctuation"/>https://github.com/google/re2/wiki/Syntax
# None # None
# <token type="LiteralStringRegex"/> # <token type="LiteralStringRegex"/>
# #
# where that None means skipping a group # where that None means skipping a group
# #
raise Exception.new "Can't have a token without a match" if match.nil? raise Exception.new "Can't have a bygroups without a match" if matches[0].length == 0
# Each group matches an action. If the group match is empty, # Each group matches an action. If the group match is empty,
# the action is skipped. # the action is skipped.
result = [] of Token result = [] of Token
@actions.each_with_index do |e, i| @actions.each_with_index do |e, i|
next if match[i + 1]?.nil? next if matches[i].length == 0
result += e.emit(match, lexer, i + 1) result += e.emit(matches, lexer, i)
end end
result result
when "using" when "using"
# Shunt to another lexer entirely # Shunt to another lexer entirely
return [] of Token if match.nil? return [] of Token if matches[0].length == 0
lexer_name = xml["lexer"].downcase lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" } # Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true) to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length))
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
when "usingself" when "usingself"
# Shunt to another copy of this lexer # Shunt to another copy of this lexer
return [] of Token if match.nil? return [] of Token if matches[0].length == 0
new_lexer = Lexer.from_xml(lexer.xml) new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" } # Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group], usingself: true) to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length))
new_lexer.tokenize(to_tokenize, usingself: true)
when "combined" when "combined"
# Combine two states into one anonymous state # Combine two states into one anonymous state
states = xml.attributes.select { |attrib| states = xml.attributes.select { |attrib|

5
src/cre2/Makefile Normal file
View File

@ -0,0 +1,5 @@
# Builds cre2.o, the C++ shim that exposes RE2 through a plain C ABI so the
# Crystal bindings can link against it.
#
# CXX defaults to make's built-in C++ compiler (g++ with GNU make); CXXFLAGS
# may be overridden from the command line or the environment.
CXXFLAGS ?= -O3

# all/clean do not produce files with those names.
.PHONY: all clean

all: cre2.o

clean:
	rm -f cre2.o

cre2.o: cre2.cpp cre2.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<

121
src/cre2/cre2.cpp Normal file
View File

@ -0,0 +1,121 @@
#include <vector>

#include <re2/re2.h>

#include "cre2.h"
// Reinterprets an opaque cre2_options handle as a pointer to RE2::Options.
#define TO_OPT(opt) (reinterpret_cast<RE2::Options *>(opt))
// Heap-allocates a default-constructed RE2::Options and hands it back as an
// opaque handle. Release it with cre2_opt_delete().
cre2_options *cre2_opt_new(void) {
  RE2::Options *options = new RE2::Options();
  return static_cast<cre2_options *>(options);
}
// Destroys the RE2::Options object behind the opaque handle `opt`.
void cre2_opt_delete(cre2_options *opt) {
  RE2::Options *options = TO_OPT(opt);
  delete options;
}
// Generates one C entry point per boolean RE2 option:
//   void cre2_opt_<name>(cre2_options *opt, int flag)
// which forwards to RE2::Options::set_<name>(), treating any non-zero `flag`
// as true.
#define OPT_bool(name) \
  void cre2_opt_##name(cre2_options *opt, int flag) { \
    TO_OPT(opt)->set_##name(bool(flag)); \
  }
OPT_bool(posix_syntax)
OPT_bool(longest_match)
OPT_bool(log_errors)
OPT_bool(literal)
OPT_bool(never_nl)
OPT_bool(case_sensitive)
OPT_bool(perl_classes)
OPT_bool(word_boundary)
OPT_bool(one_line)
// Macro names are case-sensitive: undefine the helper under the exact name it
// was defined with so it does not leak into the rest of the file.
#undef OPT_bool
// Selects the text encoding used by regexes built from `opt`.
// `enc` is CRE2_UTF8 or CRE2_Latin1; any other value leaves the options
// untouched.
void cre2_opt_encoding(cre2_options *opt, encoding_t enc) {
  RE2::Options *options = TO_OPT(opt);
  if (enc == CRE2_UTF8) {
    options->set_encoding(RE2::Options::EncodingUTF8);
  } else if (enc == CRE2_Latin1) {
    options->set_encoding(RE2::Options::EncodingLatin1);
  }
}
// Forwards `m` to RE2::Options::set_max_mem() for the options behind `opt`.
void cre2_opt_max_mem(cre2_options *opt, int m) {
  RE2::Options *options = TO_OPT(opt);
  options->set_max_mem(m);
}
// Reinterpret an opaque cre2 handle as a mutable / const pointer to RE2.
#define TO_RE2(re) (reinterpret_cast<RE2 *>(re))
#define TO_CONST_RE2(re) (reinterpret_cast<const RE2 *>(re))
// Compiles `pattern` (`patternlen` bytes, not necessarily NUL-terminated) into
// a new RE2 object and returns it as an opaque handle. Release it with
// cre2_delete().
//
// `opt` may be NULL, in which case RE2's default options are used. A NULL
// pattern or a negative length is treated as the empty pattern instead of
// being converted to a huge unsigned length.
cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt) {
  if (!pattern || patternlen < 0) {
    pattern = "";
    patternlen = 0;
  }
  re2::StringPiece pattern_re2(pattern, patternlen);
  if (!opt) {
    RE2::Options defaults;
    return reinterpret_cast<void *>(new RE2(pattern_re2, defaults));
  }
  return reinterpret_cast<void *>(
      new RE2(pattern_re2, *reinterpret_cast<const RE2::Options *>(opt)));
}
// Destroys the RE2 object behind the opaque handle `re`.
void cre2_delete(cre2 *re) {
  RE2 *regex = TO_RE2(re);
  delete regex;
}
int cre2_error_code(const cre2 *re) {
return int(TO_CONST_RE2(re)->error_code());
}
const char *cre2_error_string(const cre2 *re) {
return TO_CONST_RE2(re)->error().c_str();
}
void cre2_error_arg(const cre2 *re, struct string_piece *arg) {
const std::string &argstr = TO_CONST_RE2(re)->error_arg();
arg->data = argstr.data();
arg->length = argstr.length();
}
int cre2_num_capturing_groups(const cre2 *re) {
return TO_CONST_RE2(re)->NumberOfCapturingGroups();
}
int cre2_program_size(const cre2 *re) {
return TO_CONST_RE2(re)->ProgramSize();
}
int cre2_match(
const cre2 *re
, const char *text
, int textlen
, int startpos
, int endpos
, anchor_t anchor
, struct string_piece *match
, int nmatch) {
re2::StringPiece text_re2(text, textlen);
// FIXME: exceptions?
re2::StringPiece *match_re2 = new re2::StringPiece[nmatch];
RE2::Anchor anchor_re2 = RE2::UNANCHORED;
switch (anchor) {
case CRE2_ANCHOR_START:
anchor_re2 = RE2::ANCHOR_START; break;
case CRE2_ANCHOR_BOTH:
anchor_re2 = RE2::ANCHOR_BOTH; break;
}
bool ret = TO_CONST_RE2(re)
->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch);
if (ret) {
for (int i=0; i<nmatch; i++) {
match[i].data = match_re2[i].data();
match[i].length = match_re2[i].length();
}
}
delete [] match_re2;
return int(ret);
}

69
src/cre2/cre2.cr Normal file
View File

@ -0,0 +1,69 @@
# Crystal bindings for the C shim declared in cre2.h (implemented in
# cre2.cpp), which wraps Google's RE2. Links the prebuilt cre2.o that sits
# next to this file plus whatever `pkg-config --libs re2` reports.
@[Link(ldflags: "#{__DIR__}/cre2.o -Wl,--copy-dt-needed-entries `pkg-config --libs re2`")]
lib LibCre2
  # Opaque handle to an RE2::Options object.
  type Options = Void*

  fun opt_new = cre2_opt_new : Options
  fun opt_delete = cre2_opt_delete(op : Options) : Nil

  # NOTE(review): the C prototypes take `int flag`; `Bool` is kept here so
  # existing callers that pass true/false keep compiling. Confirm the ABI
  # behaves as expected on the target platform.
  fun opt_posix_syntax = cre2_opt_posix_syntax(op : Options, flag : Bool) : Nil
  fun opt_longest_match = cre2_opt_longest_match(op : Options, flag : Bool) : Nil
  fun opt_log_errors = cre2_opt_log_errors(op : Options, flag : Bool) : Nil
  fun opt_literal = cre2_opt_literal(op : Options, flag : Bool) : Nil
  fun opt_never_nl = cre2_opt_never_nl(op : Options, flag : Bool) : Nil
  fun opt_case_sensitive = cre2_opt_case_sensitive(op : Options, flag : Bool) : Nil
  fun opt_perl_classes = cre2_opt_perl_classes(op : Options, flag : Bool) : Nil
  fun opt_word_boundary = cre2_opt_word_boundary(op : Options, flag : Bool) : Nil
  fun opt_one_line = cre2_opt_one_line(op : Options, flag : Bool) : Nil

  # `encoding` is 1 for UTF-8 or 2 for Latin-1 (CRE2_UTF8 / CRE2_Latin1 in cre2.h).
  fun opt_encoding = cre2_opt_encoding(op : Options, encoding : Int32) : Nil
  # The C side takes an integer memory budget (`int m`), not a flag.
  fun opt_max_mem = cre2_opt_max_mem(op : Options, max_mem : Int32) : Nil

  # Mirrors `struct string_piece`: a pointer + length view, not NUL-terminated.
  struct StringPiece
    data : LibC::Char*
    length : Int32
  end

  # Opaque handle to a compiled RE2 object.
  type CRe2 = Void*

  # Lengths and positions are `int` in cre2.h, hence Int32 here.
  fun new = cre2_new(pattern : LibC::Char*, patternlen : Int32, opt : Options) : CRe2
  fun del = cre2_delete(re : CRe2) : Nil
  fun error_code = cre2_error_code(re : CRe2) : Int32
  # Without an explicit C name these would bind to unprefixed symbols that
  # the shim does not export.
  fun num_capturing_groups = cre2_num_capturing_groups(re : CRe2) : Int32
  fun program_size = cre2_program_size(re : CRe2) : Int32
  # Invalidated by further re use
  fun error_string = cre2_error_string(re : CRe2) : LibC::Char*
  fun error_arg = cre2_error_arg(re : CRe2, arg : StringPiece*) : Nil

  # Anchor modes accepted by `match` (same values as in cre2.h).
  CRE2_UNANCHORED   = 1
  CRE2_ANCHOR_START = 2
  CRE2_ANCHOR_BOTH  = 3

  # Returns non-zero on a match and fills up to `nmatch` entries of `match`;
  # entry 0 is the whole match.
  fun match = cre2_match(
    re : CRe2,
    text : LibC::Char*,
    textlen : Int32,
    startpos : Int32,
    endpos : Int32,
    anchor : Int32,
    match : StringPiece*,
    nmatch : Int32
  ) : Int32
end
# match = Pointer(LibCre2::StringPiece).malloc(10)
# opts = LibCre2.opt_new
# LibCre2.opt_posix_syntax(opts, true)
# LibCre2.opt_longest_match(opts, true)
# LibCre2.opt_perl_classes(opts, true)
# LibCre2.opt_encoding(opts, 1)
# # LibCre2.opt_one_line(opts, false)
# # LibCre2.opt_never_nl(opts, false)
# pattern = "(\\s+)(foo)"
# text = " foo"
# re = LibCre2.new(pattern, pattern.size, opts)
# p! LibCre2.match(re, text, text.size, 0, text.size,
# LibCre2::CRE2_ANCHOR_START, match, 10)
# (0...10).each do |i|
# p! String.new(Slice.new(match[i].data, match[i].length))
# end

66
src/cre2/cre2.h Normal file
View File

@ -0,0 +1,66 @@
/*
 * Minimal C interface to RE2, implemented in cre2.cpp.
 *
 * The include guard makes the header safe to include more than once;
 * without it a second inclusion redefines struct string_piece.
 */
#ifndef TARTRAZINE_CRE2_H
#define TARTRAZINE_CRE2_H

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handle to an options object; create with cre2_opt_new(). */
typedef void cre2_options;

typedef int encoding_t;
#define CRE2_UTF8 1
#define CRE2_Latin1 2

cre2_options *cre2_opt_new(void);
void cre2_opt_delete(cre2_options *opt);

/* Boolean options: any non-zero flag enables the option. */
void cre2_opt_posix_syntax(cre2_options *opt, int flag);
void cre2_opt_longest_match(cre2_options *opt, int flag);
void cre2_opt_log_errors(cre2_options *opt, int flag);
void cre2_opt_literal(cre2_options *opt, int flag);
void cre2_opt_never_nl(cre2_options *opt, int flag);
void cre2_opt_case_sensitive(cre2_options *opt, int flag);
void cre2_opt_perl_classes(cre2_options *opt, int flag);
void cre2_opt_word_boundary(cre2_options *opt, int flag);
void cre2_opt_one_line(cre2_options *opt, int flag);

void cre2_opt_encoding(cre2_options *opt, encoding_t enc);
void cre2_opt_max_mem(cre2_options *opt, int m);

/* Pointer + length view of a byte range; not NUL-terminated. */
struct string_piece {
  const char *data;
  int length;
};

/* Opaque handle to a compiled regex; create with cre2_new(). */
typedef void cre2;

cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt);
void cre2_delete(cre2 *re);
int cre2_error_code(const cre2 *re);
int cre2_num_capturing_groups(const cre2 *re);
int cre2_program_size(const cre2 *re);
// invalidated by further re use
const char *cre2_error_string(const cre2 *re);
void cre2_error_arg(const cre2 *re, struct string_piece *arg);

typedef int anchor_t;
#define CRE2_UNANCHORED 1
#define CRE2_ANCHOR_START 2
#define CRE2_ANCHOR_BOTH 3

/*
 * Matches `re` against `text`. Returns non-zero on a match and writes up to
 * `nmatch` submatches into `match` (entry 0 is the whole match).
 */
int cre2_match(
    const cre2 *re
  , const char *text
  , int textlen
  , int startpos
  , int endpos
  , anchor_t anchor
  , struct string_piece *match
  , int nmatch);

#ifdef __cplusplus
} // extern "C"
#endif

#endif /* TARTRAZINE_CRE2_H */

BIN
src/cre2/cre2.o Normal file

Binary file not shown.

0
src/re2.cr Normal file
View File

View File

@ -1,3 +1,4 @@
require "./cre2/cre2"
require "./actions" require "./actions"
# These are lexer rules. They match with the text being parsed # These are lexer rules. They match with the text being parsed
@ -7,29 +8,32 @@ module Tartrazine
# This rule matches via a regex pattern # This rule matches via a regex pattern
class Rule class Rule
property pattern : Regex = Re2.new "" property pattern : Re3 = Re3.new ""
property actions : Array(Action) = [] of Action property actions : Array(Action) = [] of Action
property xml : String = "foo" property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos) return false, pos, [] of Token
matched, matches = pattern.match(text, pos)
# We don't match if the match doesn't move the cursor # We don't match if the match doesn't move the cursor
# because that causes infinite loops # because that causes infinite loops
return false, pos, [] of Token if match.nil? || match.end == 0
return false, pos, [] of Token unless matched
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token tokens = [] of Token
# Emit the tokens # Emit the tokens
actions.each do |action| actions.each do |action|
# Emit the token # Emit the token
tokens += action.emit(match, lexer) tokens += action.emit(matches, lexer)
end end
Log.trace { "#{xml}, #{match.end}, #{tokens}" } # Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match.end, tokens return true, matches[0].length, tokens
end end
def initialize(node : XML::Node, multiline, dotall, ignorecase) def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s @xml = node.to_s
@pattern = Re2.new( p! node["pattern"]
@pattern = Re3.new(
node["pattern"], node["pattern"],
multiline, multiline,
dotall, dotall,
@ -76,7 +80,7 @@ module Tartrazine
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token tokens = [] of Token
actions.each do |action| actions.each do |action|
tokens += action.emit(nil, lexer) tokens += action.emit(Pointer(LibCre2::StringPiece).malloc(1), lexer)
end end
return true, pos, tokens return true, pos, tokens
end end
@ -106,4 +110,38 @@ module Tartrazine
end end
end end
end end
# Thin wrapper around a regex compiled by RE2 through the LibCre2 shim.
#
# Every match is anchored at the requested start position. The pointer
# returned by `#match` refers to a buffer owned by this object and is
# overwritten by the next call to `#match`.
class Re3
  # Capacity of the submatch buffer (entry 0 is the whole match).
  MAX_MATCHES = 50

  @matches = Pointer(LibCre2::StringPiece).malloc(MAX_MATCHES)
  @opts : LibCre2::Options
  @re : LibCre2::CRe2

  # Number of capturing groups in the compiled pattern.
  def group_count
    LibCre2.num_capturing_groups(@re)
  end

  # Compiles *pattern*.
  #
  # *multiline* and *dotall* are applied as the inline flags `(?m)` / `(?s)`
  # prepended to the pattern. (The previous `(m?)` prefix was a capture group
  # matching an optional "m", which shifted every group index.)
  # *ignorecase* is applied through the case_sensitive option.
  # *anchored* is accepted for interface compatibility and is not used here.
  def initialize(pattern : String, multiline = false, dotall = false,
                 ignorecase = false, anchored = false)
    @opts = LibCre2.opt_new
    LibCre2.opt_posix_syntax(@opts, false)
    LibCre2.opt_longest_match(@opts, true)
    LibCre2.opt_perl_classes(@opts, true)
    LibCre2.opt_encoding(@opts, 1) # 1 == CRE2_UTF8
    LibCre2.opt_one_line(@opts, !multiline)
    LibCre2.opt_case_sensitive(@opts, !ignorecase)

    flags = String.build do |io|
      io << 'm' if multiline
      io << 's' if dotall
    end
    pattern = "(?#{flags})#{pattern}" unless flags.empty?

    # The C API takes a length in bytes; String#size counts characters and
    # would truncate any pattern containing multi-byte UTF-8.
    @re = LibCre2.new(pattern, pattern.bytesize, @opts)
  end

  # Releases the native regex and options when this object is collected.
  def finalize
    LibCre2.del(@re)
    LibCre2.opt_delete(@opts)
  end

  # Matches *text* anchored at *pos*, which is passed to RE2 as the start
  # position (a byte offset on the C side).
  #
  # Returns `{matched, matches}` where `matches` points at `MAX_MATCHES`
  # `LibCre2::StringPiece` entries; entry 0 is the whole match.
  def match(text, pos)
    matched = LibCre2.match(@re, text, text.bytesize, pos, text.bytesize,
      LibCre2::CRE2_ANCHOR_START, @matches, MAX_MATCHES)
    {matched != 0, @matches}
  end
end
end end
# Ad-hoc smoke test that runs whenever this file is loaded: compiles a keyword
# pattern with Re3 and prints the result of matching it against "value ".
# NOTE(review): the pattern ends in a lookahead `(?=...)`, which the RE2
# syntax reference lists as unsupported — presumably why this experiment
# "doesn't really work"; confirm before relying on it.
re = Tartrazine::Re3.new("(require-instance|fraction-digits|error-app-tag|error-message|min-elements|max-elements|yin-element|ordered-by|position|modifier|default|pattern|length|status|units|value|range|type|path|enum|base|bit)(?=[^\w\-\:])")
p! re.match("value ", 0)

View File

@ -63,8 +63,6 @@ module Tartrazine
tokens = [] of Token tokens = [] of Token
pos = 0 pos = 0
matched = false matched = false
time = 0
count = 0
# Respect the `ensure_nl` config option # Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself