mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-07 20:20:26 -03:00
Lots of stuff ... compiler crashes
This commit is contained in:
parent
2a19f3889f
commit
fb54b08841
@ -14,6 +14,8 @@ dependencies:
|
||||
sixteen:
|
||||
github: ralsina/sixteen
|
||||
branch: main
|
||||
cre2:
|
||||
git: "https://git.ralsina.me/ralsina/cre2.git"
|
||||
|
||||
crystal: ">= 1.13.0"
|
||||
|
||||
|
@ -26,11 +26,12 @@ module Tartrazine
|
||||
end
|
||||
|
||||
# ameba:disable Metrics/CyclomaticComplexity
|
||||
def emit(matches : Pointer(LibCre2::StringPiece), lexer : Lexer, match_group = 0) : Array(Token)
|
||||
def emit(match : Regex::MatchData | CRe2::MatchDataLike | Nil,
|
||||
lexer : Lexer, match_group = 0) : Array(Token)
|
||||
case type
|
||||
when "token"
|
||||
raise Exception.new "Can't have a token without a match" if matches[0].length == 0
|
||||
[Token.new(type: xml["type"], value: String.new(Slice.new(matches[0].data, matches[0].length)))]
|
||||
raise Exception.new "Can't have a token without a match" if (match.nil? || match[0].size == 0)
|
||||
[Token.new(type: xml["type"], value: match[0])]
|
||||
when "push"
|
||||
states_to_push = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
@ -69,31 +70,30 @@ module Tartrazine
|
||||
#
|
||||
# where that None means skipping a group
|
||||
#
|
||||
raise Exception.new "Can't have a bygroups without a match" if matches[0].length == 0
|
||||
raise Exception.new "Can't have a bygroups without a match" if (match.nil? || match[0].size == 0)
|
||||
|
||||
# Each group matches an action. If the group match is empty,
|
||||
# the action is skipped.
|
||||
result = [] of Token
|
||||
@actions.each_with_index do |e, i|
|
||||
next if matches[i].length == 0
|
||||
result += e.emit(matches, lexer, i)
|
||||
next if match[i].size == 0
|
||||
result += e.emit(match, lexer, i)
|
||||
end
|
||||
result
|
||||
when "using"
|
||||
# Shunt to another lexer entirely
|
||||
return [] of Token if matches[0].length == 0
|
||||
return [] of Token if (match.nil? || match[0].size == 0)
|
||||
lexer_name = xml["lexer"].downcase
|
||||
# Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length))
|
||||
to_tokenize = match[match_group]
|
||||
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
|
||||
when "usingself"
|
||||
# Shunt to another copy of this lexer
|
||||
return [] of Token if matches[0].length == 0
|
||||
return [] of Token if (match.nil? || match[0].size == 0)
|
||||
|
||||
new_lexer = Lexer.from_xml(lexer.xml)
|
||||
# Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length))
|
||||
new_lexer.tokenize(to_tokenize, usingself: true)
|
||||
new_lexer.tokenize(match[match_group], usingself: true)
|
||||
when "combined"
|
||||
# Combine two states into one anonymous state
|
||||
states = xml.attributes.select { |attrib|
|
||||
|
@ -1,5 +0,0 @@
|
||||
# Builds the C++ shim object (cre2.o) that the Crystal bindings link against.

# `all` and `clean` are commands, not files: mark them phony so a stray file
# with either name cannot make the target look up to date.
.PHONY: all clean

all: cre2.o

clean:
	rm -f cre2.o

cre2.o: cre2.cpp cre2.h
	g++ -O3 -c -o cre2.o cre2.cpp
|
@ -1,122 +0,0 @@
|
||||
#include <re2/re2.h>
|
||||
#include "cre2.h"
|
||||
|
||||
#define TO_OPT(opt) (reinterpret_cast<RE2::Options *>(opt))
|
||||
|
||||
// Allocates a default-constructed RE2::Options and returns it as an opaque
// cre2_options handle. The caller releases it with cre2_opt_delete().
cre2_options *cre2_opt_new(void) {
  RE2::Options *options = new RE2::Options();
  return reinterpret_cast<void *>(options);
}
|
||||
|
||||
// Destroys an options handle previously returned by cre2_opt_new().
void cre2_opt_delete(cre2_options *opt) {
  RE2::Options *options = TO_OPT(opt);
  delete options;
}
|
||||
|
||||
|
||||
#define OPT_bool(name) \
|
||||
void cre2_opt_##name(cre2_options *opt, int flag) { \
|
||||
TO_OPT(opt)->set_##name(bool(flag)); \
|
||||
}
|
||||
|
||||
OPT_bool(posix_syntax)
|
||||
OPT_bool(longest_match)
|
||||
OPT_bool(log_errors)
|
||||
OPT_bool(literal)
|
||||
OPT_bool(never_nl)
|
||||
OPT_bool(dot_nl)
|
||||
OPT_bool(case_sensitive)
|
||||
OPT_bool(perl_classes)
|
||||
OPT_bool(word_boundary)
|
||||
OPT_bool(one_line)
|
||||
|
||||
#undef OPT_BOOL
|
||||
|
||||
|
||||
// Selects the text encoding on an options handle. CRE2_UTF8 and CRE2_Latin1
// map to the matching RE2::Options encodings; any other value leaves the
// options untouched.
void cre2_opt_encoding(cre2_options *opt, encoding_t enc) {
  if (enc == CRE2_UTF8) {
    TO_OPT(opt)->set_encoding(RE2::Options::EncodingUTF8);
  } else if (enc == CRE2_Latin1) {
    TO_OPT(opt)->set_encoding(RE2::Options::EncodingLatin1);
  }
}
|
||||
|
||||
// Forwards `m` to RE2::Options::set_max_mem() on the given options handle.
void cre2_opt_max_mem(cre2_options *opt, int m) {
  RE2::Options *options = TO_OPT(opt);
  options->set_max_mem(m);
}
|
||||
|
||||
|
||||
#define TO_RE2(re) (reinterpret_cast<RE2 *>(re))
|
||||
#define TO_CONST_RE2(re) (reinterpret_cast<const RE2 *>(re))
|
||||
|
||||
// Compiles `pattern` (`patternlen` bytes, not necessarily NUL-terminated)
// into a new RE2 object and returns it as an opaque cre2 handle.
//
// `opt` may be NULL, in which case RE2's default options are used; the
// previous code dereferenced it unconditionally. The handle is released
// with cre2_delete(). Compilation problems are reported through
// cre2_error_code() / cre2_error_string() on the returned handle.
cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt) {
  re2::StringPiece pattern_re2(pattern, patternlen);
  if (opt == 0) {
    return reinterpret_cast<void *>(new RE2(pattern_re2));
  }
  const RE2::Options *options = reinterpret_cast<const RE2::Options *>(opt);
  return reinterpret_cast<void *>(new RE2(pattern_re2, *options));
}
|
||||
|
||||
// Destroys a regex handle previously returned by cre2_new().
void cre2_delete(cre2 *re) {
  RE2 *regex = TO_RE2(re);
  delete regex;
}
|
||||
|
||||
|
||||
int cre2_error_code(const cre2 *re) {
|
||||
return int(TO_CONST_RE2(re)->error_code());
|
||||
}
|
||||
|
||||
const char *cre2_error_string(const cre2 *re) {
|
||||
return TO_CONST_RE2(re)->error().c_str();
|
||||
}
|
||||
|
||||
void cre2_error_arg(const cre2 *re, struct string_piece *arg) {
|
||||
const std::string &argstr = TO_CONST_RE2(re)->error_arg();
|
||||
arg->data = argstr.data();
|
||||
arg->length = argstr.length();
|
||||
}
|
||||
|
||||
int cre2_num_capturing_groups(const cre2 *re) {
|
||||
return TO_CONST_RE2(re)->NumberOfCapturingGroups();
|
||||
}
|
||||
|
||||
int cre2_program_size(const cre2 *re) {
|
||||
return TO_CONST_RE2(re)->ProgramSize();
|
||||
}
|
||||
|
||||
|
||||
int cre2_match(
|
||||
const cre2 *re
|
||||
, const char *text
|
||||
, int textlen
|
||||
, int startpos
|
||||
, int endpos
|
||||
, anchor_t anchor
|
||||
, struct string_piece *match
|
||||
, int nmatch) {
|
||||
|
||||
re2::StringPiece text_re2(text, textlen);
|
||||
// FIXME: exceptions?
|
||||
re2::StringPiece *match_re2 = new re2::StringPiece[nmatch];
|
||||
|
||||
RE2::Anchor anchor_re2 = RE2::UNANCHORED;
|
||||
switch (anchor) {
|
||||
case CRE2_ANCHOR_START:
|
||||
anchor_re2 = RE2::ANCHOR_START; break;
|
||||
case CRE2_ANCHOR_BOTH:
|
||||
anchor_re2 = RE2::ANCHOR_BOTH; break;
|
||||
}
|
||||
|
||||
bool ret = TO_CONST_RE2(re)
|
||||
->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch);
|
||||
|
||||
if (ret) {
|
||||
for (int i=0; i<nmatch; i++) {
|
||||
match[i].data = match_re2[i].data();
|
||||
match[i].length = match_re2[i].length();
|
||||
}
|
||||
}
|
||||
|
||||
delete [] match_re2;
|
||||
|
||||
return int(ret);
|
||||
}
|
@ -1,70 +0,0 @@
|
||||
# Crystal bindings for the small C shim around RE2 (cre2.cpp / cre2.h).
# Links the prebuilt cre2.o next to this file plus the system re2 library.
#
# NOTE(review): the C functions take `int` for boolean flags and lengths,
# while several parameters here are declared `Bool` / `UInt32`. They are kept
# as they were so existing callers still compile -- confirm the ABI on the
# target platform.
@[Link(ldflags: "#{__DIR__}/cre2.o -Wl,--copy-dt-needed-entries `pkg-config --libs re2`")]
lib LibCre2
  # Opaque handle to an RE2::Options object.
  type Options = Void*

  fun opt_new = cre2_opt_new : Options
  fun opt_delete = cre2_opt_delete(op : Options) : Nil

  fun opt_posix_syntax = cre2_opt_posix_syntax(op : Options, flag : Bool) : Nil
  fun opt_longest_match = cre2_opt_longest_match(op : Options, flag : Bool) : Nil
  fun opt_log_errors = cre2_opt_log_errors(op : Options, flag : Bool) : Nil
  fun opt_literal = cre2_opt_literal(op : Options, flag : Bool) : Nil
  fun opt_never_nl = cre2_opt_never_nl(op : Options, flag : Bool) : Nil
  fun opt_case_sensitive = cre2_opt_case_sensitive(op : Options, flag : Bool) : Nil
  fun opt_perl_classes = cre2_opt_perl_classes(op : Options, flag : Bool) : Nil
  fun opt_word_boundary = cre2_opt_word_boundary(op : Options, flag : Bool) : Nil
  fun opt_one_line = cre2_opt_one_line(op : Options, flag : Bool) : Nil
  fun opt_dot_nl = cre2_opt_dot_nl(op : Options, flag : Bool) : Nil
  # `encoding` is 1 for UTF-8 and 2 for Latin-1 (CRE2_UTF8 / CRE2_Latin1 in cre2.h).
  fun opt_encoding = cre2_opt_encoding(op : Options, encoding : Int32) : Nil
  # The C side takes an integer memory budget, not a flag.
  fun opt_max_mem = cre2_opt_max_mem(op : Options, m : Int32) : Nil

  # Pointer/length view into a buffer owned by someone else; not NUL-terminated.
  struct StringPiece
    data : LibC::Char*
    length : Int32
  end

  # Opaque handle to a compiled RE2 object.
  type CRe2 = Void*

  fun new = cre2_new(pattern : LibC::Char*, patternlen : UInt32, opt : Options) : CRe2
  fun del = cre2_delete(re : CRe2) : Nil
  # Bound to cre2_error_code; the old name `cre2_error_core` does not exist.
  fun error_code = cre2_error_code(re : CRe2) : Int32
  # The exported symbols carry the cre2_ prefix, so they must be named explicitly.
  fun num_capturing_groups = cre2_num_capturing_groups(re : CRe2) : Int32
  fun program_size = cre2_program_size(re : CRe2) : Int32

  # Invalidated by further re use
  fun error_string = cre2_error_string(re : CRe2) : LibC::Char*
  fun error_arg = cre2_error_arg(re : CRe2, arg : StringPiece*) : Nil

  # Anchor modes accepted by `match`; values mirror the #defines in cre2.h.
  CRE2_UNANCHORED   = 1
  CRE2_ANCHOR_START = 2
  CRE2_ANCHOR_BOTH  = 3

  # Returns non-zero on a match. `match` must point to at least `nmatch`
  # StringPiece slots; they are filled only when the match succeeds.
  fun match = cre2_match(
    re : CRe2,
    text : LibC::Char*,
    textlen : UInt32,
    startpos : UInt32,
    endpos : UInt32,
    anchor : Int32,
    match : StringPiece*,
    nmatch : Int32
  ) : Int32
end
|
||||
|
||||
# match = Pointer(LibCre2::StringPiece).malloc(10)
|
||||
# opts = LibCre2.opt_new
|
||||
# LibCre2.opt_posix_syntax(opts, true)
|
||||
# LibCre2.opt_longest_match(opts, true)
|
||||
# LibCre2.opt_perl_classes(opts, true)
|
||||
# LibCre2.opt_encoding(opts, 1)
|
||||
# # LibCre2.opt_one_line(opts, false)
|
||||
# # LibCre2.opt_never_nl(opts, false)
|
||||
|
||||
# pattern = "(\\s+)(foo)"
|
||||
# text = " foo"
|
||||
# re = LibCre2.new(pattern, pattern.size, opts)
|
||||
# p! LibCre2.match(re, text, text.size, 0, text.size,
|
||||
# LibCre2::CRE2_ANCHOR_START, match, 10)
|
||||
# (0...10).each do |i|
|
||||
# p! String.new(Slice.new(match[i].data, match[i].length))
|
||||
# end
|
@ -1,67 +0,0 @@
|
||||
// Minimal C interface to the RE2 regular-expression library.
// Implemented in cre2.cpp; usable from both C and C++.

// Include guard: the typedefs and struct below must not be seen twice.
#ifndef CRE2_H
#define CRE2_H

#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle to an options object (an RE2::Options on the C++ side).
typedef void cre2_options;

// Text encodings accepted by cre2_opt_encoding().
typedef int encoding_t;
#define CRE2_UTF8   1
#define CRE2_Latin1 2

// Create / destroy an options object.
cre2_options *cre2_opt_new(void);
void cre2_opt_delete(cre2_options *opt);

// Boolean option setters: `flag` is converted to bool and forwarded to the
// RE2::Options setter of the same name.
void cre2_opt_posix_syntax(cre2_options *opt, int flag);
void cre2_opt_longest_match(cre2_options *opt, int flag);
void cre2_opt_log_errors(cre2_options *opt, int flag);
void cre2_opt_literal(cre2_options *opt, int flag);
void cre2_opt_never_nl(cre2_options *opt, int flag);
void cre2_opt_case_sensitive(cre2_options *opt, int flag);
void cre2_opt_perl_classes(cre2_options *opt, int flag);
void cre2_opt_word_boundary(cre2_options *opt, int flag);
void cre2_opt_one_line(cre2_options *opt, int flag);
void cre2_opt_dot_nl(cre2_options *opt, int flag);
// `enc` is CRE2_UTF8 or CRE2_Latin1; other values are ignored.
void cre2_opt_encoding(cre2_options *opt, encoding_t enc);
// Forwards `m` to RE2::Options::set_max_mem().
void cre2_opt_max_mem(cre2_options *opt, int m);

// Pointer/length view of a byte range. The data is owned elsewhere and is
// not NUL-terminated.
struct string_piece {
  const char *data;
  int length;
};

// Opaque handle to a compiled regular expression (an RE2 on the C++ side).
typedef void cre2;

// Compile `patternlen` bytes of `pattern` with the given options; release
// the result with cre2_delete().
cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt);
void cre2_delete(cre2 *re);

int cre2_error_code(const cre2 *re);
int cre2_num_capturing_groups(const cre2 *re);
int cre2_program_size(const cre2 *re);

// invalidated by further re use
const char *cre2_error_string(const cre2 *re);
void cre2_error_arg(const cre2 *re, struct string_piece *arg);

// Anchoring modes accepted by cre2_match().
typedef int anchor_t;
#define CRE2_UNANCHORED   1
#define CRE2_ANCHOR_START 2
#define CRE2_ANCHOR_BOTH  3

// Match `re` against text[startpos, endpos). Returns non-zero on a match,
// in which case match[0 .. nmatch-1] are filled with views into `text`
// (entry 0 is the whole match); `match` is left untouched otherwise.
int cre2_match(
    const cre2 *re
  , const char *text
  , int textlen
  , int startpos
  , int endpos
  , anchor_t anchor
  , struct string_piece *match
  , int nmatch);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // CRE2_H
|
BIN
src/cre2/cre2.o
BIN
src/cre2/cre2.o
Binary file not shown.
86
src/rules.cr
86
src/rules.cr
@ -1,5 +1,5 @@
|
||||
require "./cre2/cre2"
|
||||
require "./actions"
|
||||
require "cre2"
|
||||
|
||||
# These are lexer rules. They match with the text being parsed
|
||||
# and perform actions, either emitting tokens or changing the
|
||||
@ -8,35 +8,35 @@ module Tartrazine
|
||||
# This rule matches via a regex pattern
|
||||
|
||||
class Rule
|
||||
property pattern : Re3 = Re3.new ""
|
||||
property pattern : CRe2::Regex = CRe2::Regex.new ""
|
||||
property actions : Array(Action) = [] of Action
|
||||
property xml : String = "foo"
|
||||
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
matched, matches = pattern.match(text, pos)
|
||||
match = pattern.match(text, pos)
|
||||
# We don't match if the match doesn't move the cursor
|
||||
# because that causes infinite loops
|
||||
|
||||
return false, pos, [] of Token unless matched
|
||||
return false, pos, [] of Token if match.nil?
|
||||
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
|
||||
tokens = [] of Token
|
||||
# Emit the tokens
|
||||
actions.each do |action|
|
||||
# Emit the token
|
||||
tokens += action.emit(matches, lexer)
|
||||
tokens += action.emit(match, lexer)
|
||||
end
|
||||
# Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
||||
return true, matches[0].length, tokens
|
||||
return true, match[0].size, tokens
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||
@xml = node.to_s
|
||||
@pattern = Re3.new(
|
||||
node["pattern"],
|
||||
multiline,
|
||||
dotall,
|
||||
ignorecase,
|
||||
anchored: true)
|
||||
options = Regex::Options::ANCHORED
|
||||
options |= Regex::Options::MULTILINE if multiline
|
||||
options |= Regex::Options::DOTALL if dotall
|
||||
options |= Regex::Options::IGNORE_CASE if ignorecase
|
||||
@pattern = CRe2::Regex.new(
|
||||
node["pattern"], options)
|
||||
add_actions(node)
|
||||
end
|
||||
|
||||
@ -78,7 +78,7 @@ module Tartrazine
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
tokens = [] of Token
|
||||
actions.each do |action|
|
||||
tokens += action.emit(Pointer(LibCre2::StringPiece).malloc(1), lexer)
|
||||
tokens += action.emit(nil, lexer)
|
||||
end
|
||||
return true, pos, tokens
|
||||
end
|
||||
@ -88,64 +88,4 @@ module Tartrazine
|
||||
add_actions(node)
|
||||
end
|
||||
end
|
||||
|
||||
# This is a hack to workaround that Crystal seems to disallow
|
||||
# having regexes multiline but not dot_all
|
||||
class Re2 < Regex
|
||||
@source = "fa"
|
||||
@options = Regex::Options::None
|
||||
@jit = true
|
||||
|
||||
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
|
||||
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
|
||||
LibPCRE2::UCP
|
||||
flags |= LibPCRE2::MULTILINE if multiline
|
||||
flags |= LibPCRE2::DOTALL if dotall
|
||||
flags |= LibPCRE2::CASELESS if ignorecase
|
||||
flags |= LibPCRE2::ANCHORED if anchored
|
||||
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
|
||||
raise Exception.new(error_message)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class Re3
|
||||
@matches = Pointer(LibCre2::StringPiece).malloc(50)
|
||||
@opts : LibCre2::Options
|
||||
|
||||
@re : LibCre2::CRe2
|
||||
|
||||
def group_count
|
||||
LibCre2.num_capturing_groups(@re)
|
||||
end
|
||||
|
||||
def initialize(pattern : String, multiline = false, dotall = false,
|
||||
ignorecase = false, anchored = false)
|
||||
@opts = LibCre2.opt_new
|
||||
LibCre2.opt_posix_syntax(@opts, false)
|
||||
LibCre2.opt_longest_match(@opts, false)
|
||||
# These 3 are ignored when posix_syntax is false
|
||||
# LibCre2.opt_one_line(@opts, !multiline)
|
||||
# LibCre2.opt_perl_classes(@opts, true)
|
||||
# LibCre2.opt_word_boundary(@opts, true)
|
||||
LibCre2.opt_encoding(@opts, 1)
|
||||
LibCre2.opt_case_sensitive(@opts, !ignorecase)
|
||||
LibCre2.opt_dot_nl(@opts, dotall)
|
||||
pattern = "(?m)#{pattern}" if multiline
|
||||
@re = LibCre2.new(pattern, pattern.size, @opts)
|
||||
end
|
||||
|
||||
def match(text, pos)
|
||||
matched = LibCre2.match(@re, text, text.size, pos, text.size,
|
||||
LibCre2::CRE2_ANCHOR_START, @matches, 50)
|
||||
return {matched != 0, @matches}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
# re2 doesn't support this (should match "x")
|
||||
# re = Tartrazine::Re3.new("x(?!foo)", multiline: true, dotall: false)
|
||||
# m = re.match("xfoo", 0)
|
||||
# p m[0], m[1][0]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user