Some tests pass!

Roberto Alsina 2024-08-13 19:19:12 -03:00
parent 2a9e7fde0d
commit a704c59fa9
6 changed files with 67 additions and 28 deletions


@@ -42,7 +42,6 @@ known_bad = {
   "#{__DIR__}/tests/mcfunction/selectors.txt",
   "#{__DIR__}/tests/php/anonymous_class.txt",
   "#{__DIR__}/tests/html/javascript_unclosed.txt",
 }

 # Tests that fail because of a limitation in PCRE2


@@ -30,11 +30,11 @@ module Tartrazine
     end

     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        [Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
       when "push"
         states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"
@@ -88,14 +88,14 @@ module Tartrazine
         return [] of Token if match.nil?
         lexer_name = xml["lexer"].downcase
         Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.nil?
         new_lexer = Lexer.from_xml(lexer.xml)
         Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
       when "combined"
         # Combine two states into one anonymous state
         states = xml.attributes.select { |attrib|
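
The dropped Regex::MatchData type and the new .as(Onigmo::Match).value casts follow from the wrapper introduced further down: a match is now an Array of nilable Onigmo::Match entries instead of a stdlib MatchData. A minimal sketch of that unwrapping, assuming the wrapper below; group_value is a hypothetical helper, not part of this commit:

    # Sketch only: assumes Onigmo::Regex#match returns Array(Onigmo::Match?)
    # as defined in onigmo.cr below. `group_value` is a hypothetical helper.
    def group_value(match : Array(Onigmo::Match?), group : Int32) : String?
      m = match[group]?      # out-of-range group index -> nil
      return nil if m.nil?   # group did not participate in the match
      m.value                # the captured substring
    end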


@@ -73,7 +73,7 @@ module Tartrazine
       # These are true/false/nil
       outp << "border: none;" if style.border == false
       outp << "font-weight: bold;" if style.bold
-      outp << "font-weight: #{weight_of_bold};" if style.bold == false
+      outp << "font-weight: #{@weight_of_bold};" if style.bold == false
       outp << "font-style: italic;" if style.italic
       outp << "font-style: normal;" if style.italic == false
       outp << "text-decoration: underline;" if style.underline


@@ -1,15 +1,14 @@
 @[Link("onigmo")]
 @[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
-lib Onigmo
+lib LibOnigmo
   type Regex = Pointer(Void)
   type Region = Pointer(Void)

   fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
                                ignoreCase : Int32,
                                multiline : Int32,
-                               dotall : Int32,
-                               anchored : Int32) : Regex
+                               dotall : Int32) : Regex

   fun free = onigwrap_free(re : Regex)
   fun region_free = onigwrap_region_free(region : Region)
@@ -19,16 +18,51 @@ lib Onigmo
   fun len = onigwrap_len(region : Region, index : Int32) : Int32
 end

-pattern = "a"
+module Onigmo
+  class Match
+    property begin : Int32
+    property end : Int32
+    property value : String
+
+    def initialize(@begin, @end, @value)
+    end
+  end
+
+  class Regex
+    def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
+      @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
+    end
+
+    def finalize
+      LibOnigmo.free(@re)
+    end
+
+    def match(str : String, offset = 0)
+      region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
+      result = [] of Match?
+      num_regs = LibOnigmo.num_regs(region)
+      if num_regs > 0
+        (0...num_regs).each do |i|
+          b = LibOnigmo.pos(region, i)
+          e = b + LibOnigmo.len(region, i)
+          if b == -1 || e == -1
+            result << nil
+          else
+            v = str[b...e]
+            result << Match.new(b, e, v)
+          end
+        end
+      else
+        return [] of Match
+      end
+      LibOnigmo.region_free(region)
+      result
+    end
+  end
+end
+
+pattern = "#.*x"
 str = "# foobar"
-re = Onigmo.create(pattern, pattern.size, false, true, false, false)
-region = Onigmo.search(re, str.to_unsafe, 0, str.bytesize)
-num_regs = Onigmo.num_regs(region)
-(0...num_regs).each do |i|
-  pos = Onigmo.pos(region, i)
-  len = Onigmo.len(region, i)
-  puts "match #{i}: #{str[pos, len]}"
-end
-Onigmo.region_free(region)
-Onigmo.free(re)
+re = Onigmo::Regex.new(pattern, false, false, false)
+p! re.match(str)
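
A short usage sketch for the wrapper above, assuming onigwrap.o has been built and linked as declared in the Link annotations; the alternation pattern is illustrative and shows how non-participating groups come back as nil:

    # Sketch: exercise Onigmo::Regex against an alternation with two groups.
    re = Onigmo::Regex.new("(foo)|(bar)")
    groups = re.match("bar baz")
    # groups[0] is the whole match, groups[1] is nil (the "foo" branch did not
    # participate), groups[2] holds "bar"
    groups.each_with_index do |g, i|
      puts "group #{i}: #{g ? g.value : "nil"}"
    end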


@@ -1,6 +1,6 @@
 #include "onigmo.h"

-regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall, int anchored )
+regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
 {
     regex_t *reg;


@@ -3,6 +3,7 @@ require "./formatter"
 require "./rules"
 require "./styles"
 require "./lexer"
+require "./onigmo"

 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -10,6 +11,8 @@ require "./lexer"
 module Tartrazine
   # This rule matches via a regex pattern
+  alias Regex = Onigmo::Regex
+
   class Rule
     property pattern : Regex = Regex.new ""
     property actions : Array(Action) = [] of Action
@@ -19,7 +22,9 @@ module Tartrazine
       match = pattern.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.end == 0
+      # The `match.begin > pos` is the same as the ANCHORED option
+      return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
+      # p! match.map(&.value), text[pos..pos + 20]
       # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens
@@ -27,21 +32,22 @@ module Tartrazine
         # Emit the token
         tokens += action.emit(match, lexer)
       end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      # Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
+      return true, match[0].as(Onigmo::Match).end, tokens
     end

     def initialize(node : XML::Node, multiline, dotall, ignorecase)
       @xml = node.to_s
       pattern = node["pattern"]
-      flags = Regex::Options::ANCHORED
+      # flags = Regex::Options::ANCHORED
+      # flags = Regex::Options::NO_UTF_CHECK
       # MULTILINE implies DOTALL which we don't want, so we
       # use in-pattern flag (?m) instead
       # flags |= Regex::Options::MULTILINE if multiline
       pattern = "(?m)" + pattern if multiline
-      flags |= Regex::Options::DOTALL if dotall
-      flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, flags)
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, ignorecase, multiline, dotall)
       add_actions(node)
     end
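
The commented-out flags show what no longer goes through PCRE2 options: multiline is folded into the pattern via (?m), and anchoring is emulated by the match.begin > pos check above, since Onigmo's search scans forward from the offset. A hedged sketch of that anchoring check in isolation; the helper name is illustrative, not from the commit:

    # Sketch: emulate PCRE2's ANCHORED option on top of Onigmo's scanning search
    # by accepting a hit only if it starts exactly at `pos`.
    def anchored_match?(re : Onigmo::Regex, text : String, pos : Int32) : Bool
      groups = re.match(text, pos)
      return false if groups.empty? || groups[0].nil?
      groups[0].not_nil!.begin == pos
    end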