6 Commits

5 changed files with 35 additions and 45 deletions

View File

@@ -14,6 +14,8 @@ dependencies:
sixteen: sixteen:
github: ralsina/sixteen github: ralsina/sixteen
branch: main branch: main
cre2:
git: "https://git.ralsina.me/ralsina/cre2.git"
crystal: ">= 1.13.0" crystal: ">= 1.13.0"

View File

@@ -1,3 +1,5 @@
require "xml"
# These are Lexer actions. When a rule matches, it will # These are Lexer actions. When a rule matches, it will
# perform a list of actions. These actions can emit tokens # perform a list of actions. These actions can emit tokens
# or change the state machine. # or change the state machine.
@@ -24,11 +26,12 @@ module Tartrazine
end end
# ameba:disable Metrics/CyclomaticComplexity # ameba:disable Metrics/CyclomaticComplexity
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) def emit(match : MatchData,
lexer : Lexer, match_group = 0) : Array(Token)
case type case type
when "token" when "token"
raise Exception.new "Can't have a token without a match" if match.nil? raise Exception.new "Can't have a token without a match" if match.nil? || match[0].size == 0
[Token.new(type: xml["type"], value: match[match_group])] [Token.new(type: xml["type"], value: match[0])]
when "push" when "push"
states_to_push = xml.attributes.select { |attrib| states_to_push = xml.attributes.select { |attrib|
attrib.name == "state" attrib.name == "state"
@@ -61,35 +64,37 @@ module Tartrazine
when "bygroups" when "bygroups"
# FIXME: handle # FIXME: handle
# ><bygroups> # ><bygroups>
# <token type="Punctuation"/> # <token type="Punctuation"/> (see https://github.com/google/re2/wiki/Syntax)
# None # None
# <token type="LiteralStringRegex"/> # <token type="LiteralStringRegex"/>
# #
# where that None means skipping a group # where that None means skipping a group
# #
raise Exception.new "Can't have a token without a match" if match.nil? raise Exception.new "Can't have a bygroups without a match" if match.nil? || match[0].size == 0
# Each group matches an action. If the group match is empty, # Each group matches an action. If the group match is empty,
# the action is skipped. # the action is skipped.
result = [] of Token result = [] of Token
@actions.each_with_index do |e, i| @actions.each_with_index do |e, i|
next if match[i + 1]?.nil? next if match[i].size == 0
result += e.emit(match, lexer, i + 1) result += e.emit(match, lexer, i)
end end
result result
when "using" when "using"
# Shunt to another lexer entirely # Shunt to another lexer entirely
return [] of Token if match.nil? return [] of Token if match.nil? || match[0].size == 0
lexer_name = xml["lexer"].downcase lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" } # Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true) to_tokenize = match[match_group]
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
when "usingself" when "usingself"
# Shunt to another copy of this lexer # Shunt to another copy of this lexer
return [] of Token if match.nil? return [] of Token if match.nil? || match[0].size == 0
new_lexer = Lexer.from_xml(lexer.xml) new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" } # Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group], usingself: true) to_tokenize = match[match_group]
new_lexer.tokenize(to_tokenize, usingself: true)
when "combined" when "combined"
# Combine two states into one anonymous state # Combine two states into one anonymous state
states = xml.attributes.select { |attrib| states = xml.attributes.select { |attrib|

0
src/re2.cr Normal file
View File

View File

@@ -1,4 +1,5 @@
require "./actions" require "./actions"
# require "cre2"
# These are lexer rules. They match with the text being parsed # These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the # and perform actions, either emitting tokens or changing the
@@ -6,8 +7,12 @@ require "./actions"
module Tartrazine module Tartrazine
# This rule matches via a regex pattern # This rule matches via a regex pattern
# alias Regex = CRe2::Regex
# alias MatchData = CRe2::MatchDataLike | Regex::MatchData | Nil
alias MatchData = Regex::MatchData | Nil
class Rule class Rule
property pattern : Regex = Re2.new "" property pattern : Regex = Regex.new ""
property actions : Array(Action) = [] of Action property actions : Array(Action) = [] of Action
property xml : String = "foo" property xml : String = "foo"
@@ -15,7 +20,8 @@ module Tartrazine
match = pattern.match(text, pos) match = pattern.match(text, pos)
# We don't match if the match doesn't move the cursor # We don't match if the match doesn't move the cursor
# because that causes infinite loops # because that causes infinite loops
return false, pos, [] of Token if match.nil? || match.end == 0
return false, pos, [] of Token if match.nil?
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token tokens = [] of Token
# Emit the tokens # Emit the tokens
@@ -23,18 +29,17 @@ module Tartrazine
# Emit the token # Emit the token
tokens += action.emit(match, lexer) tokens += action.emit(match, lexer)
end end
Log.trace { "#{xml}, #{match.end}, #{tokens}" } # Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match.end, tokens return true, match[0].size, tokens
end end
def initialize(node : XML::Node, multiline, dotall, ignorecase) def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s @xml = node.to_s
@pattern = Re2.new( options = Regex::Options::ANCHORED
node["pattern"], options |= Regex::Options::MULTILINE if multiline
multiline, options |= Regex::Options::DOTALL if dotall
dotall, options |= Regex::Options::IGNORE_CASE if ignorecase
ignorecase, @pattern = Regex.new(node["pattern"], options)
anchored: true)
add_actions(node) add_actions(node)
end end
@@ -86,24 +91,4 @@ module Tartrazine
add_actions(node) add_actions(node)
end end
end end
# This is a hack to work around the fact that Crystal seems to
# disallow having regexes that are multiline but not dot_all.
#
# Re2 subclasses Regex and compiles its pattern directly through
# Regex::PCRE2.compile, assembling the PCRE2 flag set by hand from
# individual booleans.
class Re2 < Regex
# Default values for these instance variables; the constructor below
# only assigns @re and never updates them.
# NOTE(review): presumably this means the regex reports "fa" as its source
# and None as its options regardless of the arguments given - confirm.
# NOTE(review): no JIT compilation call is visible in this block; confirm
# that hard-coding @jit = true is intended.
@source = "fa"
@options = Regex::Options::None
@jit = true
# Compiles *pattern* and stores the compiled result in @re.
#
# Each boolean argument that is true adds the corresponding PCRE2 flag:
# *multiline* -> MULTILINE, *dotall* -> DOTALL, *ignorecase* -> CASELESS,
# *anchored* -> ANCHORED. All of them default to false.
#
# Raises Exception with the compiler's error message when the block
# passed to Regex::PCRE2.compile is invoked.
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
# UTF, DUPNAMES and UCP are always set, independent of the arguments.
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
end
end end

View File

@@ -63,8 +63,6 @@ module Tartrazine
tokens = [] of Token tokens = [] of Token
pos = 0 pos = 0
matched = false matched = false
time = 0
count = 0
# Respect the `ensure_nl` config option # Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself