6 Commits

5 changed files with 35 additions and 45 deletions

shard.yml

@@ -14,6 +14,8 @@ dependencies:
   sixteen:
     github: ralsina/sixteen
     branch: main
+  cre2:
+    git: "https://git.ralsina.me/ralsina/cre2.git"
 
 crystal: ">= 1.13.0"
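
The new dependency pulls in cre2, Crystal bindings for Google's RE2, fetched straight from a git URL rather than from the shard registry. A minimal sketch of how it would be consumed after `shards install`; the CRe2::Regex name is an assumption taken from the commented-out aliases in src/rules.cr below:

    require "cre2"                      # hypothetical: load the shard by name
    pattern = CRe2::Regex.new("[a-z]+") # assumed constructor, mirroring stdlib Regex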

src/actions.cr

@@ -1,3 +1,5 @@
+require "xml"
+
 # These are Lexer actions. When a rule matches, it will
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
@@ -24,11 +26,12 @@ module Tartrazine
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : MatchData,
+             lexer : Lexer, match_group = 0) : Array(Token)
       case type
       when "token"
-        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        raise Exception.new "Can't have a token without a match" if match.nil? || match[0].size == 0
+        [Token.new(type: xml["type"], value: match[0])]
       when "push"
         states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"
@@ -61,35 +64,37 @@ module Tartrazine
       when "bygroups"
         # FIXME: handle
         # ><bygroups>
-        # <token type="Punctuation"/>
+        # <token type="Punctuation"/>https://github.com/google/re2/wiki/Syntax
         # None
         # <token type="LiteralStringRegex"/>
         #
         # where that None means skipping a group
         #
-        raise Exception.new "Can't have a token without a match" if match.nil?
+        raise Exception.new "Can't have a bygroups without a match" if match.nil? || match[0].size == 0
         # Each group matches an action. If the group match is empty,
         # the action is skipped.
         result = [] of Token
         @actions.each_with_index do |e, i|
-          next if match[i + 1]?.nil?
-          result += e.emit(match, lexer, i + 1)
+          next if match[i].size == 0
+          result += e.emit(match, lexer, i)
         end
         result
       when "using"
         # Shunt to another lexer entirely
-        return [] of Token if match.nil?
+        return [] of Token if match.nil? || match[0].size == 0
         lexer_name = xml["lexer"].downcase
-        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        # Log.trace { "to tokenize: #{match[match_group]}" }
+        to_tokenize = match[match_group]
+        Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
-        return [] of Token if match.nil?
+        return [] of Token if match.nil? || match[0].size == 0
         new_lexer = Lexer.from_xml(lexer.xml)
-        Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        # Log.trace { "to tokenize: #{match[match_group]}" }
+        to_tokenize = match[match_group]
+        new_lexer.tokenize(to_tokenize, usingself: true)
       when "combined"
         # Combine two states into one anonymous state
         states = xml.attributes.select { |attrib|
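
The subtle part of this hunk is the bygroups indexing change: with PCRE-style match data, group 0 is the whole match and captures start at 1 (hence the old `i + 1`), while the new code assumes 0-based subgroup access and checks for empty strings instead of nil. A rough sketch of the group-to-action mapping using Crystal's stock Regex, with hypothetical token type names:

    # Each capture group in a bygroups rule is paired with one action;
    # groups that did not participate in the match are skipped.
    match = /(\w+)(\s*)(=)/.match("answer =")
    if match
      types = ["Name", "TextWhitespace", "Punctuation"]
      types.each_with_index do |type, i|
        text = match[i + 1]? # PCRE-style: group 0 is the whole match, captures are 1-based
        next if text.nil? || text.empty?
        puts "#{type}: #{text.inspect}"
      end
    end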

src/re2.cr (new file, empty)

src/rules.cr

@@ -1,4 +1,5 @@
 require "./actions"
+# require "cre2"
 
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
@@ -6,8 +7,12 @@ require "./actions"
 module Tartrazine
   # This rule matches via a regex pattern
+  # alias Regex = CRe2::Regex
+  # alias MatchData = CRe2::MatchDataLike | Regex::MatchData | Nil
+  alias MatchData = Regex::MatchData | Nil
+
   class Rule
-    property pattern : Regex = Re2.new ""
+    property pattern : Regex = Regex.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"
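
Nothing requires cre2 yet: the live change is the MatchData union alias, a seam that names "a match or no match" in one place, so a different engine's match type can later join the union without touching every signature. A small sketch of the pattern:

    # A union alias as a seam: callers accept MatchData and handle nil,
    # and the alias is the single place a concrete engine type is named.
    alias MatchData = Regex::MatchData | Nil

    def whole_match(match : MatchData) : String
      match.nil? ? "" : match[0]
    end

    whole_match(/b+/.match("abbc")) # => "bb"
    whole_match(nil)                # => ""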
@@ -15,7 +20,8 @@ module Tartrazine
       match = pattern.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.end == 0
+      return false, pos, [] of Token if match.nil?
+      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens
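
The dropped `match.end == 0` guard was the infinite-loop protection the comment mentions; the emptiness checks move into the actions instead (`match[0].size == 0` in src/actions.cr above). The failure mode it guards against, sketched with the stdlib Regex:

    # A rule like x* can match the empty string at the cursor; if the
    # tokenizer then advances by the match length (0), it never moves.
    re = Regex.new("x*", Regex::Options::ANCHORED)
    m = re.match("abc", 0)
    m.try &.[0] # => "" : a successful zero-width match, cursor stays at 0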
@@ -23,18 +29,17 @@ module Tartrazine
         # Emit the token
         tokens += action.emit(match, lexer)
       end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      # Log.trace { "#{xml}, #{match.end}, #{tokens}" }
+      return true, match[0].size, tokens
     end
 
     def initialize(node : XML::Node, multiline, dotall, ignorecase)
       @xml = node.to_s
-      @pattern = Re2.new(
-        node["pattern"],
-        multiline,
-        dotall,
-        ignorecase,
-        anchored: true)
+      options = Regex::Options::ANCHORED
+      options |= Regex::Options::MULTILINE if multiline
+      options |= Regex::Options::DOTALL if dotall
+      options |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(node["pattern"], options)
       add_actions(node)
     end
@@ -86,24 +91,4 @@ module Tartrazine
       add_actions(node)
     end
   end
-
-  # This is a hack to workaround that Crystal seems to disallow
-  # having regexes multiline but not dot_all
-  class Re2 < Regex
-    @source = "fa"
-    @options = Regex::Options::None
-    @jit = true
-
-    def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
-              LibPCRE2::UCP
-      flags |= LibPCRE2::MULTILINE if multiline
-      flags |= LibPCRE2::DOTALL if dotall
-      flags |= LibPCRE2::CASELESS if ignorecase
-      flags |= LibPCRE2::ANCHORED if anchored
-      @re = Regex::PCRE2.compile(pattern, flags) do |error_message|
-        raise Exception.new(error_message)
-      end
-    end
-  end
 end
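
The deleted Re2 class existed because Crystal's stdlib couples the two flags (Regex::Options::MULTILINE historically implies DOTALL), so the hack compiled a PCRE2 regex from raw LibPCRE2 flags instead. The replacement builds stdlib Regex::Options directly, and ANCHORED is the part a lexer really needs: the pattern must match exactly at the search position. A small sketch of that behaviour, assuming stdlib semantics:

    # ANCHORED pins the match to the position handed to #match.
    re = Regex.new("[a-z]+", Regex::Options::ANCHORED)
    re.match("abc def", 0).try &.[0] # => "abc"
    re.match("abc def", 3)           # => nil: position 3 is a space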

src/tartrazine.cr

@@ -63,8 +63,6 @@ module Tartrazine
       tokens = [] of Token
       pos = 0
       matched = false
-      time = 0
-      count = 0
 
       # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
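
The `ensure_nl` option mirrors Pygments' ensurenl: lexers converted from Pygments expect the input to end with a newline. A self-contained sketch of the idea, with a hypothetical helper name (the actual append is outside the visible hunk):

    # Make sure the input ends with a newline, as Pygments-style lexers expect.
    def ensure_trailing_nl(text : String, enabled : Bool = true) : String
      return text if !enabled || text.empty? || text.ends_with?('\n')
      text + "\n"
    end

    ensure_trailing_nl("foo")   # => "foo\n"
    ensure_trailing_nl("foo\n") # => "foo\n"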