Mirror of https://github.com/ralsina/tartrazine.git (synced 2025-06-08 12:40:25 -03:00)

Compare commits: 32816eb207 ... d49d0969a9

No commits in common. "32816eb2070982287163229bcfb43f9504727edb" and "d49d0969a9bbef99f1b297b019acfdc5b9a75cec" have entirely different histories.
@@ -42,9 +42,6 @@ known_bad = {
   "#{__DIR__}/tests/mcfunction/selectors.txt",
   "#{__DIR__}/tests/php/anonymous_class.txt",
   "#{__DIR__}/tests/html/javascript_unclosed.txt",
-  # BAD FOR ONIGMO
-  "#{__DIR__}/tests/json/test_backtracking.txt",
 }
@@ -61,7 +58,6 @@ describe Tartrazine do
       end
     else
       it "parses #{testcase}".split("/")[-2...].join("/") do
-        p! testcase
         text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
         lexer_name = File.basename(File.dirname(testcase)).downcase
         unless failing_lexers.includes?(lexer_name) ||
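A minimal sketch of the fixture splitting the spec above performs on each testcase file; the fixture content here is hypothetical, only its shape matters.

# Minimal sketch of the testcase splitting used in the spec above.
# The fixture content is hypothetical; only its shape matters.
fixture = <<-FIXTURE
---input---
puts "hello"
---tokens---
(token list omitted in this sketch)
FIXTURE

text = fixture.split("---input---\n").last.split("---tokens---").first
puts text # prints only the input portion: puts "hello"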
@@ -30,11 +30,11 @@ module Tartrazine
    end

    # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
      case type
      when "token"
        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
+        [Token.new(type: xml["type"], value: match[match_group])]
      when "push"
        states_to_push = xml.attributes.select { |attrib|
          attrib.name == "state"
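In the hunk above, the '-' side reads a capture group's text through the Onigmo wrapper (.as(Onigmo::Match).value), while the '+' side relies on Crystal's built-in Regex::MatchData, where indexing a group already yields a String. A standalone sketch of the stdlib behaviour, not the project's emit method:

# Standalone sketch: with Crystal's built-in regex engine, a capture
# group's text is read directly from MatchData by index.
match = /(?<word>\w+)/.match("hello world")
unless match.nil?
  puts match[0] # whole match: "hello"
  puts match[1] # first capture group, already a String: "hello"
end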
@@ -88,14 +88,14 @@ module Tartrazine
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
      when "usingself"
        # Shunt to another copy of this lexer
        return [] of Token if match.nil?

        new_lexer = Lexer.from_xml(lexer.xml)
        Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
+        new_lexer.tokenize(match[match_group], usingself: true)
      when "combined"
        # Combine two states into one anonymous state
        states = xml.attributes.select { |attrib|
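The "using" branch above hands the matched slice of text to another lexer and splices the resulting tokens back in. A sketch of that delegation; the require path, lexer name, and snippet are arbitrary assumptions, not taken from the hunk:

# Sketch of the "using" delegation above; lexer name and snippet are
# arbitrary examples, and the require path assumes the shard's name.
require "tartrazine"

captured = "console.log(42);"
tokens = Tartrazine.lexer("javascript").tokenize(captured, usingself: true)
puts tokens.size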
@@ -73,7 +73,7 @@ module Tartrazine
      # These are true/false/nil
      outp << "border: none;" if style.border == false
      outp << "font-weight: bold;" if style.bold
-      outp << "font-weight: #{@weight_of_bold};" if style.bold == false
+      outp << "font-weight: #{weight_of_bold};" if style.bold == false
      outp << "font-style: italic;" if style.italic
      outp << "font-style: normal;" if style.italic == false
      outp << "text-decoration: underline;" if style.underline
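Styles above carry tri-state booleans (true/false/nil), so "explicitly not bold" emits a numeric font-weight while "unset" emits nothing. A small sketch of that branch; the weight value 400 is an assumed example, not taken from the formatter:

# Sketch of the bold branch above. Styles use true/false/nil, so only an
# explicit false emits the numeric weight; 400 is an assumed example value.
weight_of_bold = 400
bold : Bool? = false

outp = String::Builder.new
outp << "font-weight: bold;" if bold
outp << "font-weight: #{weight_of_bold};" if bold == false
puts outp.to_s # font-weight: 400;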
src/onigmo.cr (157 lines changed)
@@ -1,85 +1,88 @@
 @[Link("onigmo")]
-@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
-lib LibOnigmo
-  type Regex = Pointer(Void)
-  type Region = Pointer(Void)
-
-  fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
-                               ignoreCase : Int32,
-                               multiline : Int32,
-                               dotall : Int32) : Regex
-  fun free = onigwrap_free(re : Regex)
-  fun region_free = onigwrap_region_free(region : Region)
-
-  fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
-  fun num_regs = onigwrap_num_regs(region : Region) : Int32
-  fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
-  fun len = onigwrap_len(region : Region, index : Int32) : Int32
+lib Onigmo
+  type OnigOptionType = UInt32
+  type OnigCaseFoldType = UInt32
+  type OnigDistance = LibC::SizeT
+
+  struct OnigRegex
+    p : LibC::UChar*
+    used : UInt32
+    alloc : UInt32
+    num_mem : Int32
+    num_repeat : Int32
+    num_null_check : Int32
+    num_comb_exp_check : Int32
+    num_call : Int32
+    capture_history : UInt32
+    bt_mem_start : UInt32
+    bt_mem_end : UInt32
+    stack_pop_level : Int32
+    repeat_range_alloc : Int32
+    options : OnigOptionType
+    syntax : OnigSyntaxType*
+    name_table : Void*
+    case_fold_flag : OnigCaseFoldType
+    optimize : Int32
+    threshold_len : Int32
+    anchor : Int32
+    anchor_dmin : OnigDistance
+    anchor_dmax : OnigDistance
+    sub_anchor : Int32
+    exact : LibC::UChar*
+    exact_end : LibC::UChar*
+    map : LibC::UChar*
+    int_map : Int32*
+    int_map_backward : Int32*
+    dmin : OnigDistance
+    dmax : OnigDistance
+    chain : OnigRegex*
+  end
+
+  type OnigRegexType = OnigRegex*
+  type OnigCodePoint = UInt32
+  type OnigUChar = LibC::UChar
+  type OnigEncoding = Void*
+
+  struct OnigMetaCharTableType
+    esc : OnigCodePoint
+    anychar : OnigCodePoint
+    anytime : OnigCodePoint
+    zero_or_one_time : OnigCodePoint
+    one_or_one_time : OnigCodePoint
+    anychar_anytime : OnigCodePoint
+  end
+
+  struct OnigSyntaxType
+    op : UInt32
+    op2 : UInt32
+    behavior : UInt32
+    options : OnigOptionType
+    meta_char_table : OnigMetaCharTableType
+  end
+
+  struct OnigErrorInfo
+    enc : OnigEncoding
+    par : OnigUChar*
+    par_end : OnigUChar*
+  end
+
+  ONIG_OPTION_NONE    = 0u32
+  ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE
+
+  fun new = onig_new(OnigRegex*, OnigUChar*, OnigUChar*, OnigOptionType, OnigEncoding, OnigSyntaxType*, OnigErrorInfo*)
 end

-module Onigmo
-  class Match
-    property begin : Int32
-    property end : Int32
-    property value : String
-
-    def initialize(@begin, @end, @value)
-    end
-
-    def to_s
-      @value
-    end
-  end
-
-  class Regex
-    def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
-      @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
-    end
-
-    def finalize
-      LibOnigmo.free(@re)
-    end
-
-    def match(str : String, offset = 0)
-      # The offset argument is a character index, but Onigmo expects a byte index
-      offset = str.char_index_to_byte_index(offset)
-      if offset.nil?
-        raise Exception.new "Invalid offset"
-      end
-
-      region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
-      result = [] of Match?
-      num_regs = LibOnigmo.num_regs(region)
-      if num_regs > 0
-        (0...num_regs).each do |i|
-          pos = LibOnigmo.pos(region, i)
-          l = LibOnigmo.len(region, i)
-          if pos == -1 || l == -1
-            result << nil
-          else
-            b = str.byte_index_to_char_index(pos)
-            e = str.byte_index_to_char_index(pos + l)
-            # p! pos, l, b, e, str[pos..]
-            if b.nil? || e.nil?
-              raise Exception.new "Invalid substring"
-            end
-
-            v = str[b...e]
-            result << Match.new(b, b + v.size, v)
-          end
-        end
-      else
-        return [] of Match
-      end
-      LibOnigmo.region_free(region)
-      result
-    end
-  end
-end
-
-# pattern = "\\w"
-# str = "α"
-
-# re = Onigmo::Regex.new(pattern, false, false, false)
-# p! re.match(str)
+pattern = "a(.*)b|[e-f]+"
+str = "zzzzaffffffffb"
+
+einfo = Onigmo::OnigErrorInfo.new
+
+Onigmo.new(out reg,
+  pattern.to_unsafe,
+  pattern.to_unsafe + pattern.size,
+  Onigmo::ONIG_OPTION_DEFAULT,
+  0,
+  Onigmo::ONIG_SYNTAX_DEFAULT, pointerof(einfo))
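The commented example at the end of the '-' side hints at how the LibOnigmo-backed wrapper was driven. A slightly expanded sketch, assuming onigwrap.o has been compiled and the old src/onigmo.cr is available at that path; pattern and input string are the ones from the '+' side's test snippet:

# Sketch of driving the '-' side's LibOnigmo-backed wrapper; assumes
# onigwrap.o is compiled and the old src/onigmo.cr is on this path.
require "./src/onigmo"

re = Onigmo::Regex.new("a(.*)b", ignorecase: false, multiline: false, dotall: false)
groups = re.match("zzzzaffffffffb") # one Match? per capture group
groups.each_with_index do |m, i|
  puts "group #{i}: #{m.try &.value}"
end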
@@ -1,94 +0,0 @@
-#include "onigmo.h"
-
-regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
-{
-    regex_t *reg;
-    OnigErrorInfo einfo;
-
-    OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
-
-    if (ignoreCase == 1)
-        onigOptions |= ONIG_OPTION_IGNORECASE;
-
-    if (multiline == 1)
-        onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
-
-    if (dotall == 1)
-        onigOptions |= ONIG_OPTION_DOTALL;
-
-    OnigUChar *stringStart = (OnigUChar*) pattern;
-    OnigUChar *stringEnd = (OnigUChar*) pattern + len;
-    int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
-
-    return reg;
-}
-
-void onigwrap_region_free(OnigRegion *region)
-{
-    onig_region_free(region, 1);
-}
-
-void onigwrap_free(regex_t *reg)
-{
-    onig_free(reg);
-}
-
-int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
-{
-    OnigUChar *stringStart = (OnigUChar*) charPtr;
-    OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
-    OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
-    OnigUChar *stringRange = (OnigUChar*) stringEnd;
-
-    OnigRegion *region = onig_region_new();
-    int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
-    onig_region_free(region, 1);
-
-    if (result >= 0)
-        return result >> 1;
-    if (result == ONIG_MISMATCH)
-        return -1;
-    return -2;
-}
-
-OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
-{
-    OnigUChar *stringStart = (OnigUChar*) charPtr;
-    OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
-    OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
-    OnigUChar *stringRange = (OnigUChar*) stringEnd;
-
-    OnigRegion *region = onig_region_new();
-
-    int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
-    return region;
-}
-
-int onigwrap_num_regs(OnigRegion *region)
-{
-    return region->num_regs;
-}
-
-int onigwrap_pos(OnigRegion *region, int nth)
-{
-    if (nth < region->num_regs)
-    {
-        int result = region->beg[nth];
-        if (result < 0)
-            return -1;
-        return result;
-    }
-    return -1;
-}
-
-int onigwrap_len(OnigRegion *region, int nth)
-{
-    if (nth < region->num_regs)
-    {
-        int result = region->end[nth] - region->beg[nth];
-        return result;
-    }
-    return -1;
-}
@@ -1,32 +0,0 @@
-#include "onigmo.h"
-
-#if defined(_WIN32)
-#define ONIGWRAP_EXTERN extern __declspec(dllexport)
-#else
-#define ONIGWRAP_EXTERN extern
-#endif
-
-ONIGWRAP_EXTERN
-regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline);
-
-ONIGWRAP_EXTERN
-void onigwrap_region_free(OnigRegion *region);
-
-ONIGWRAP_EXTERN
-void onigwrap_free(regex_t *reg);
-
-ONIGWRAP_EXTERN
-int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);
-
-ONIGWRAP_EXTERN
-OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);
-
-ONIGWRAP_EXTERN
-int onigwrap_num_regs(OnigRegion *region);
-
-ONIGWRAP_EXTERN
-int onigwrap_pos(OnigRegion *region, int nth);
-
-ONIGWRAP_EXTERN
-int onigwrap_len(OnigRegion *region, int nth);
src/rules.cr (25 lines changed)
@@ -3,7 +3,6 @@ require "./formatter"
 require "./rules"
 require "./styles"
 require "./lexer"
-require "./onigmo"

 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -11,22 +10,16 @@ require "./onigmo"
 module Tartrazine
   # This rule matches via a regex pattern

-  alias Regex = Onigmo::Regex
-
   class Rule
     property pattern : Regex = Regex.new ""
-    property pattern2 : ::Regex = ::Regex.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"

     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
-      match2 = pattern2.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      # The `match.begin > pos` is the same as the ANCHORED option
-      return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
-      # p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
+      return false, pos, [] of Token if match.nil? || match.end == 0
       # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens
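Both versions of Rule#match above refuse a match that does not advance the cursor, since a zero-width match at the current position would make the tokenizer loop forever. A minimal sketch of that guard with the stdlib regex, using an arbitrary pattern:

# Minimal sketch of the "must move the cursor" guard above: a zero-width
# match at pos is rejected so the tokenizer cannot loop forever.
pattern = /x*/ # may legitimately match the empty string
text = "abc"
pos = 0

m = pattern.match(text, pos)
if m.nil? || m.end == pos
  puts "no progress at #{pos}; the rule does not fire"
else
  pos = m.end
end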
@@ -34,23 +27,21 @@ module Tartrazine
           # Emit the token
           tokens += action.emit(match, lexer)
         end
-        # Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
-        return true, pos + match[0].as(Onigmo::Match).value.size, tokens
+        Log.trace { "#{xml}, #{match.end}, #{tokens}" }
+        return true, match.end, tokens
       end

       def initialize(node : XML::Node, multiline, dotall, ignorecase)
         @xml = node.to_s
         pattern = node["pattern"]
-        # flags = Regex::Options::ANCHORED
-        flags = ::Regex::Options::NO_UTF_CHECK
+        flags = Regex::Options::ANCHORED
         # MULTILINE implies DOTALL which we don't want, so we
         # use in-pattern flag (?m) instead
-        flags |= ::Regex::Options::MULTILINE if multiline
+        # flags |= Regex::Options::MULTILINE if multiline
         pattern = "(?m)" + pattern if multiline
-        flags |= ::Regex::Options::DOTALL if dotall
-        flags |= ::Regex::Options::IGNORE_CASE if ignorecase
-        @pattern = Regex.new(pattern, ignorecase, multiline, dotall)
-        @pattern2 = ::Regex.new(pattern, flags)
+        flags |= Regex::Options::DOTALL if dotall
+        flags |= Regex::Options::IGNORE_CASE if ignorecase
+        @pattern = Regex.new(pattern, flags)
         add_actions(node)
       end
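The initializer above shows the two approaches to flags: the '-' side passes booleans straight to the Onigmo wrapper (keeping a parallel ::Regex for comparison), while the '+' side assembles stdlib Regex::Options and leans on the in-pattern (?m) flag rather than MULTILINE. A sketch of that stdlib-side plumbing; the pattern and flag values are arbitrary examples:

# Sketch of the stdlib-side option plumbing above; the pattern and the
# multiline/dotall/ignorecase values are arbitrary example inputs.
pattern = "^\\s+"
multiline = true
dotall = false
ignorecase = false

flags = Regex::Options::None
pattern = "(?m)" + pattern if multiline # in-pattern flag instead of MULTILINE
flags |= Regex::Options::DOTALL if dotall
flags |= Regex::Options::IGNORE_CASE if ignorecase

re = Regex.new(pattern, flags)
puts re.match("  indented\nnext line") # matches the leading whitespace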