diff --git a/src/actions.cr b/src/actions.cr index b899859..bd46a38 100644 --- a/src/actions.cr +++ b/src/actions.cr @@ -1,3 +1,5 @@ +require "xml" + # These are Lexer actions. When a rule matches, it will # perform a list of actions. These actions can emit tokens # or change the state machine. @@ -24,11 +26,11 @@ module Tartrazine end # ameba:disable Metrics/CyclomaticComplexity - def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) + def emit(matches : Pointer(LibCre2::StringPiece), lexer : Lexer, match_group = 0) : Array(Token) case type when "token" - raise Exception.new "Can't have a token without a match" if match.nil? - [Token.new(type: xml["type"], value: match[match_group])] + raise Exception.new "Can't have a token without a match" if matches[0].length == 0 + [Token.new(type: xml["type"], value: String.new(Slice.new(matches[0].data, matches[0].length)))] when "push" states_to_push = xml.attributes.select { |attrib| attrib.name == "state" @@ -61,35 +63,37 @@ module Tartrazine when "bygroups" # FIXME: handle # > - # + # https://github.com/google/re2/wiki/Syntax # None # # # where that None means skipping a group # - raise Exception.new "Can't have a token without a match" if match.nil? + raise Exception.new "Can't have a bygroups without a match" if matches[0].length == 0 # Each group matches an action. If the group match is empty, # the action is skipped. result = [] of Token @actions.each_with_index do |e, i| - next if match[i + 1]?.nil? - result += e.emit(match, lexer, i + 1) + next if matches[i].length == 0 + result += e.emit(matches, lexer, i) end result when "using" # Shunt to another lexer entirely - return [] of Token if match.nil? 
+ return [] of Token if matches[0].length == 0 lexer_name = xml["lexer"].downcase - Log.trace { "to tokenize: #{match[match_group]}" } - Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true) + # Log.trace { "to tokenize: #{match[match_group]}" } + to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length)) + Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true) when "usingself" # Shunt to another copy of this lexer - return [] of Token if match.nil? + return [] of Token if matches[0].length == 0 new_lexer = Lexer.from_xml(lexer.xml) - Log.trace { "to tokenize: #{match[match_group]}" } - new_lexer.tokenize(match[match_group], usingself: true) + # Log.trace { "to tokenize: #{match[match_group]}" } + to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length)) + new_lexer.tokenize(to_tokenize, usingself: true) when "combined" # Combine two states into one anonymous state states = xml.attributes.select { |attrib| diff --git a/src/cre2/Makefile b/src/cre2/Makefile new file mode 100644 index 0000000..4667226 --- /dev/null +++ b/src/cre2/Makefile @@ -0,0 +1,5 @@ +all: cre2.o +clean: + rm -f cre2.o +cre2.o: cre2.cpp cre2.h + g++ -O3 -c -o cre2.o cre2.cpp diff --git a/src/cre2/cre2.cpp b/src/cre2/cre2.cpp new file mode 100644 index 0000000..51d0906 --- /dev/null +++ b/src/cre2/cre2.cpp @@ -0,0 +1,121 @@ +#include <re2/re2.h> +#include "cre2.h" + +#define TO_OPT(opt) (reinterpret_cast<RE2::Options *>(opt)) + +cre2_options *cre2_opt_new(void) { + return reinterpret_cast<cre2_options *>(new RE2::Options()); +} + +void cre2_opt_delete(cre2_options *opt) { + delete TO_OPT(opt); +} + + +#define OPT_bool(name) \ +void cre2_opt_##name(cre2_options *opt, int flag) { \ + TO_OPT(opt)->set_##name(bool(flag)); \ +} + +OPT_bool(posix_syntax) +OPT_bool(longest_match) +OPT_bool(log_errors) +OPT_bool(literal) +OPT_bool(never_nl) +OPT_bool(case_sensitive) +OPT_bool(perl_classes) +OPT_bool(word_boundary) +OPT_bool(one_line) + +#undef
OPT_bool + + +void cre2_opt_encoding(cre2_options *opt, encoding_t enc) { + switch (enc) { + case CRE2_UTF8: + TO_OPT(opt)->set_encoding(RE2::Options::EncodingUTF8); + break; + case CRE2_Latin1: + TO_OPT(opt)->set_encoding(RE2::Options::EncodingLatin1); + break; + } +} + +void cre2_opt_max_mem(cre2_options *opt, int m) { + TO_OPT(opt)->set_max_mem(m); +} + + +#define TO_RE2(re) (reinterpret_cast<RE2 *>(re)) +#define TO_CONST_RE2(re) (reinterpret_cast<const RE2 *>(re)) + +cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt) { + re2::StringPiece pattern_re2(pattern, patternlen); + return reinterpret_cast<cre2 *>( + new RE2(pattern_re2, *reinterpret_cast<const RE2::Options *>(opt))); +} + +void cre2_delete(cre2 *re) { + delete TO_RE2(re); +} + + +int cre2_error_code(const cre2 *re) { + return int(TO_CONST_RE2(re)->error_code()); +} + +const char *cre2_error_string(const cre2 *re) { + return TO_CONST_RE2(re)->error().c_str(); +} + +void cre2_error_arg(const cre2 *re, struct string_piece *arg) { + const std::string &argstr = TO_CONST_RE2(re)->error_arg(); + arg->data = argstr.data(); + arg->length = argstr.length(); +} + +int cre2_num_capturing_groups(const cre2 *re) { + return TO_CONST_RE2(re)->NumberOfCapturingGroups(); +} + +int cre2_program_size(const cre2 *re) { + return TO_CONST_RE2(re)->ProgramSize(); +} + + +int cre2_match( + const cre2 *re + , const char *text + , int textlen + , int startpos + , int endpos + , anchor_t anchor + , struct string_piece *match + , int nmatch) { + + re2::StringPiece text_re2(text, textlen); + // FIXME: exceptions?
+ re2::StringPiece *match_re2 = new re2::StringPiece[nmatch]; + + RE2::Anchor anchor_re2 = RE2::UNANCHORED; + switch (anchor) { + case CRE2_ANCHOR_START: + anchor_re2 = RE2::ANCHOR_START; break; + case CRE2_ANCHOR_BOTH: + anchor_re2 = RE2::ANCHOR_BOTH; break; + } + + bool ret = TO_CONST_RE2(re) + ->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch); + + if (ret) { + for (int i=0; i 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself