Compare commits

...

4 Commits

7 changed files with 229 additions and 93 deletions

View File

@ -42,6 +42,9 @@ known_bad = {
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
# BAD FOR ONIGMO
"#{__DIR__}/tests/json/test_backtracking.txt",
}
@ -58,6 +61,7 @@ describe Tartrazine do
end
else
it "parses #{testcase}".split("/")[-2...].join("/") do
p! testcase
text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
lexer_name = File.basename(File.dirname(testcase)).downcase
unless failing_lexers.includes?(lexer_name) ||

View File

@ -30,11 +30,11 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[match_group])]
[Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
@ -88,14 +88,14 @@ module Tartrazine
return [] of Token if match.nil?
lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.nil?
new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group], usingself: true)
new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|

View File

@ -73,7 +73,7 @@ module Tartrazine
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: #{weight_of_bold};" if style.bold == false
outp << "font-weight: #{@weight_of_bold};" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline

View File

@ -1,88 +1,85 @@
@[Link("onigmo")]
@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
lib Onigmo
type OnigOptionType = UInt32
type OnigCaseFoldType = UInt32
type OnigDistance = LibC::SizeT
lib LibOnigmo
type Regex = Pointer(Void)
type Region = Pointer(Void)
struct OnigRegex
p : LibC::UChar*
used : UInt32
alloc : UInt32
fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
ignoreCase : Int32,
multiline : Int32,
dotall : Int32) : Regex
fun free = onigwrap_free(re : Regex)
fun region_free = onigwrap_region_free(region : Region)
num_mem : Int32
num_repeat : Int32
num_null_check : Int32
num_comb_exp_check : Int32
num_call : Int32
capture_history : UInt32
bt_mem_start : UInt32
bt_mem_end : UInt32
stack_pop_level : Int32
repeat_range_alloc : Int32
options : OnigOptionType
syntax : OnigSyntaxType*
name_table : Void*
case_fold_flag : OnigCaseFoldType
optimize : Int32
threshold_len : Int32
anchor : Int32
anchor_dmin : OnigDistance
anchor_dmax : OnigDistance
sub_anchor : Int32
exact : LibC::UChar*
exact_end : LibC::UChar*
map : LibC::UChar*
int_map : Int32*
int_map_backward : Int32*
dmin : OnigDistance
dmax : OnigDistance
chain : OnigRegex*
fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
fun num_regs = onigwrap_num_regs(region : Region) : Int32
fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
fun len = onigwrap_len(region : Region, index : Int32) : Int32
end
type OnigRegexType = OnigRegex*
type OnigCodePoint = UInt32
type OnigUChar = LibC::UChar
type OnigEncoding = Void*
module Onigmo
class Match
property begin : Int32
property end : Int32
property value : String
struct OnigMetaCharTableType
esc : OnigCodePoint
anychar : OnigCodePoint
anytime : OnigCodePoint
zero_or_one_time : OnigCodePoint
one_or_one_time : OnigCodePoint
anychar_anytime : OnigCodePoint
def initialize(@begin, @end, @value)
end
struct OnigSyntaxType
op : UInt32
op2 : UInt32
behavior : UInt32
options : OnigOptionType
meta_char_table : OnigMetaCharTableType
def to_s
@value
end
end
struct OnigErrorInfo
enc : OnigEncoding
par : OnigUChar*
par_end : OnigUChar*
class Regex
def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
@re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
end
ONIG_OPTION_NONE = 0u32
ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE
fun new = onig_new(OnigRegex*, OnigUChar*, OnigUChar*, OnigOptionType, OnigEncoding, OnigSyntaxType*, OnigErrorInfo*)
def finalize
LibOnigmo.free(@re)
end
pattern = "a(.*)b|[e-f]+"
str = "zzzzaffffffffb"
def match(str : String, offset = 0)
# The offset argument is a character index, but Onigmo expects a byte index
offset = str.char_index_to_byte_index(offset)
if offset.nil?
raise Exception.new "Invalid offset"
end
einfo = Onigmo::OnigErrorInfo.new
region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
result = [] of Match?
num_regs = LibOnigmo.num_regs(region)
if num_regs > 0
(0...num_regs).each do |i|
pos = LibOnigmo.pos(region, i)
l = LibOnigmo.len(region, i)
if pos == -1 || l == -1
result << nil
else
b = str.byte_index_to_char_index(pos)
e = str.byte_index_to_char_index(pos + l)
# p! pos, l, b, e, str[pos..]
if b.nil? || e.nil?
raise Exception.new "Invalid substring"
end
Onigmo.new(out reg,
pattern.to_unsafe,
pattern.to_unsafe + pattern.size,
Onigmo::ONIG_OPTION_DEFAULT,
0,
Onigmo::ONIG_SYNTAX_DEFAULT, pointerof(einfo))
v = str[b...e]
result << Match.new(b, b + v.size, v)
end
end
else
return [] of Match
end
LibOnigmo.region_free(region)
result
end
end
end
# pattern = "\\w"
# str = "α"
# re = Onigmo::Regex.new(pattern, false, false, false)
# p! re.match(str)

94
src/onigmo/onigwrap.c Normal file
View File

@ -0,0 +1,94 @@
#include <stddef.h>
#include "onigmo.h"
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
{
regex_t *reg;
OnigErrorInfo einfo;
OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
if (ignoreCase == 1)
onigOptions |= ONIG_OPTION_IGNORECASE;
if (multiline == 1)
onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
if (dotall == 1)
onigOptions |= ONIG_OPTION_DOTALL;
OnigUChar *stringStart = (OnigUChar*) pattern;
OnigUChar *stringEnd = (OnigUChar*) pattern + len;
int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
return reg;
}
/*
 * Release a match region such as the one returned by onigwrap_search().
 * Forwards to onig_region_free() with its second argument set to 1.
 * NOTE(review): per the Onigmo API that flag frees the OnigRegion struct
 * itself in addition to its offset arrays -- confirm against onigmo.h.
 */
void onigwrap_region_free(OnigRegion *region)
{
onig_region_free(region, 1);
}
/*
 * Release a compiled regex created by onigwrap_create() by forwarding it to
 * onig_free(). The pointer must not be used after this call.
 */
void onigwrap_free(regex_t *reg)
{
onig_free(reg);
}
/*
 * Search the `length`-byte buffer at `charPtr` for `reg`, starting at byte
 * `offset`, and report only where the match begins.
 *
 * Returns:
 *   >= 0  byte offset of the start of the match,
 *   -1    no match (ONIG_MISMATCH),
 *   -2    any other failure (NULL regex, region allocation failure, or an
 *         Onigmo error code).
 */
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
{
    if (reg == NULL)
        return -2;

    OnigUChar *stringStart = (OnigUChar*) charPtr;
    OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
    OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
    OnigUChar *stringRange = stringEnd;

    OnigRegion *region = onig_region_new();
    if (region == NULL)
        return -2;

    int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
    /* Only the start position is reported, so the region is not needed. */
    onig_region_free(region, 1);

    /* Regexes here are compiled with ONIG_ENCODING_UTF8, so the value from
     * onig_search() is already a byte offset. The previous `result >> 1`
     * (a conversion that only makes sense for 2-byte code units) halved it. */
    if (result >= 0)
        return result;
    if (result == ONIG_MISMATCH)
        return -1;
    return -2;
}
/*
 * Search the `length`-byte buffer at `charPtr` for `reg`, starting at byte
 * `offset`, and return the capture-group region filled in by onig_search().
 *
 * The caller owns the returned region and must release it with
 * onigwrap_region_free(). Inspect it with onigwrap_num_regs(),
 * onigwrap_pos() and onigwrap_len().
 *
 * Returns NULL only if the region could not be allocated. When `reg` is NULL
 * (for example after a failed onigwrap_create()) the freshly allocated region
 * is returned untouched instead of passing a NULL regex to onig_search().
 * NOTE(review): a fresh region is expected to report num_regs == 0 -- confirm
 * against onig_region_new() in the linked Onigmo version.
 */
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
{
    OnigRegion *region = onig_region_new();
    if (region == NULL || reg == NULL)
        return region;

    OnigUChar *stringStart = (OnigUChar*) charPtr;
    OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
    OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
    OnigUChar *stringRange = stringEnd;

    /* The numeric result is deliberately discarded: callers learn whether a
     * group matched from the positions stored in the region. */
    (void) onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);

    return region;
}
/*
 * Number of capture registers recorded in `region` (its num_regs field).
 * Returns 0 for a NULL region so that a failed onigwrap_search() reads as
 * "no groups" instead of dereferencing a NULL pointer.
 */
int onigwrap_num_regs(OnigRegion *region)
{
    if (region == NULL)
        return 0;
    return region->num_regs;
}
/*
 * Start offset (in bytes) of capture group `nth` in `region`.
 *
 * Returns -1 when the region is NULL, when `nth` is outside
 * [0, num_regs), or when the stored start offset is negative (the group did
 * not take part in the match).
 */
int onigwrap_pos(OnigRegion *region, int nth)
{
    /* The lower bound matters too: a negative index used to read before the
     * start of the beg[] array. */
    if (region == NULL || nth < 0 || nth >= region->num_regs)
        return -1;

    int result = region->beg[nth];
    if (result < 0)
        return -1;
    return result;
}
/*
 * Length (in bytes) of capture group `nth` in `region`, computed as
 * end[nth] - beg[nth].
 *
 * Returns -1 when the region is NULL, when `nth` is outside
 * [0, num_regs), or when the group has a negative start or end offset
 * (it did not take part in the match). The last case used to yield 0,
 * which made an unmatched group indistinguishable from an empty match.
 */
int onigwrap_len(OnigRegion *region, int nth)
{
    /* Reject negative indices as well; they used to index before the arrays. */
    if (region == NULL || nth < 0 || nth >= region->num_regs)
        return -1;

    if (region->beg[nth] < 0 || region->end[nth] < 0)
        return -1;

    int result = region->end[nth] - region->beg[nth];
    return result;
}

32
src/onigmo/onigwrap.h Normal file
View File

@ -0,0 +1,32 @@
/*
 * Public declarations for the thin C wrapper around Onigmo implemented in
 * onigwrap.c.
 */
#ifndef ONIGWRAP_H
#define ONIGWRAP_H

#include "onigmo.h"

#if defined(_WIN32)
#define ONIGWRAP_EXTERN extern __declspec(dllexport)
#else
#define ONIGWRAP_EXTERN extern
#endif

/* Compile a pattern; the parameter list must match the definition in
 * onigwrap.c, which also takes `dotall`. */
ONIGWRAP_EXTERN
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall);

/* Release a region returned by onigwrap_search(). */
ONIGWRAP_EXTERN
void onigwrap_region_free(OnigRegion *region);

/* Release a regex returned by onigwrap_create(). */
ONIGWRAP_EXTERN
void onigwrap_free(regex_t *reg);

/* Search and return only a status / start position. */
ONIGWRAP_EXTERN
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);

/* Search and return the capture-group region (caller frees it). */
ONIGWRAP_EXTERN
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);

/* Accessors for a region returned by onigwrap_search(). */
ONIGWRAP_EXTERN
int onigwrap_num_regs(OnigRegion *region);
ONIGWRAP_EXTERN
int onigwrap_pos(OnigRegion *region, int nth);
ONIGWRAP_EXTERN
int onigwrap_len(OnigRegion *region, int nth);

#endif /* ONIGWRAP_H */

View File

@ -3,6 +3,7 @@ require "./formatter"
require "./rules"
require "./styles"
require "./lexer"
require "./onigmo"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@ -10,16 +11,22 @@ require "./lexer"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = Onigmo::Regex
class Rule
property pattern : Regex = Regex.new ""
property pattern2 : ::Regex = ::Regex.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
match2 = pattern2.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
return false, pos, [] of Token if match.nil? || match.end == 0
# The `match.begin > pos` is the same as the ANCHORED option
return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
# p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
@ -27,21 +34,23 @@ module Tartrazine
# Emit the token
tokens += action.emit(match, lexer)
end
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match.end, tokens
# Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
return true, pos + match[0].as(Onigmo::Match).value.size, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
pattern = node["pattern"]
flags = Regex::Options::ANCHORED
# flags = Regex::Options::ANCHORED
flags = ::Regex::Options::NO_UTF_CHECK
# MULTILINE implies DOTALL which we don't want, so we
# use in-pattern flag (?m) instead
# flags |= Regex::Options::MULTILINE if multiline
flags |= ::Regex::Options::MULTILINE if multiline
pattern = "(?m)" + pattern if multiline
flags |= Regex::Options::DOTALL if dotall
flags |= Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, flags)
flags |= ::Regex::Options::DOTALL if dotall
flags |= ::Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, ignorecase, multiline, dotall)
@pattern2 = ::Regex.new(pattern, flags)
add_actions(node)
end