mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-09-07 22:33:09 +00:00
Compare commits
6 Commits
image_writ
...
e38feb0736
Author | SHA1 | Date | |
---|---|---|---|
e38feb0736 | |||
e88fd3a48f | |||
fb54b08841 | |||
2a19f3889f | |||
b9e51824df | |||
ff1c0012ec |
@@ -14,6 +14,8 @@ dependencies:
|
||||
sixteen:
|
||||
github: ralsina/sixteen
|
||||
branch: main
|
||||
cre2:
|
||||
git: "https://git.ralsina.me/ralsina/cre2.git"
|
||||
|
||||
crystal: ">= 1.13.0"
|
||||
|
||||
|
@@ -1,3 +1,5 @@
|
||||
require "xml"
|
||||
|
||||
# These are Lexer actions. When a rule matches, it will
|
||||
# perform a list of actions. These actions can emit tokens
|
||||
# or change the state machine.
|
||||
@@ -24,11 +26,12 @@ module Tartrazine
|
||||
end
|
||||
|
||||
# ameba:disable Metrics/CyclomaticComplexity
|
||||
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
|
||||
def emit(match : MatchData,
|
||||
lexer : Lexer, match_group = 0) : Array(Token)
|
||||
case type
|
||||
when "token"
|
||||
raise Exception.new "Can't have a token without a match" if match.nil?
|
||||
[Token.new(type: xml["type"], value: match[match_group])]
|
||||
raise Exception.new "Can't have a token without a match" if match.nil? || match[0].size == 0
|
||||
[Token.new(type: xml["type"], value: match[0])]
|
||||
when "push"
|
||||
states_to_push = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
@@ -61,35 +64,37 @@ module Tartrazine
|
||||
when "bygroups"
|
||||
# FIXME: handle
|
||||
# ><bygroups>
|
||||
# <token type="Punctuation"/>
|
||||
# <token type="Punctuation"/>https://github.com/google/re2/wiki/Syntax
|
||||
# None
|
||||
# <token type="LiteralStringRegex"/>
|
||||
#
|
||||
# where that None means skipping a group
|
||||
#
|
||||
raise Exception.new "Can't have a token without a match" if match.nil?
|
||||
raise Exception.new "Can't have a bygroups without a match" if match.nil? || match[0].size == 0
|
||||
|
||||
# Each group matches an action. If the group match is empty,
|
||||
# the action is skipped.
|
||||
result = [] of Token
|
||||
@actions.each_with_index do |e, i|
|
||||
next if match[i + 1]?.nil?
|
||||
result += e.emit(match, lexer, i + 1)
|
||||
next if match[i].size == 0
|
||||
result += e.emit(match, lexer, i)
|
||||
end
|
||||
result
|
||||
when "using"
|
||||
# Shunt to another lexer entirely
|
||||
return [] of Token if match.nil?
|
||||
return [] of Token if match.nil? || match[0].size == 0
|
||||
lexer_name = xml["lexer"].downcase
|
||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
|
||||
# Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
to_tokenize = match[match_group]
|
||||
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
|
||||
when "usingself"
|
||||
# Shunt to another copy of this lexer
|
||||
return [] of Token if match.nil?
|
||||
return [] of Token if match.nil? || match[0].size == 0
|
||||
|
||||
new_lexer = Lexer.from_xml(lexer.xml)
|
||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
new_lexer.tokenize(match[match_group], usingself: true)
|
||||
# Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
to_tokenize = match[match_group]
|
||||
new_lexer.tokenize(to_tokenize, usingself: true)
|
||||
when "combined"
|
||||
# Combine two states into one anonymous state
|
||||
states = xml.attributes.select { |attrib|
|
||||
|
0
src/re2.cr
Normal file
0
src/re2.cr
Normal file
45
src/rules.cr
45
src/rules.cr
@@ -1,4 +1,5 @@
|
||||
require "./actions"
|
||||
# require "cre2"
|
||||
|
||||
# These are lexer rules. They match with the text being parsed
|
||||
# and perform actions, either emitting tokens or changing the
|
||||
@@ -6,8 +7,12 @@ require "./actions"
|
||||
module Tartrazine
|
||||
# This rule matches via a regex pattern
|
||||
|
||||
# alias Regex = CRe2::Regex
|
||||
# alias MatchData = CRe2::MatchDataLike | Regex::MatchData | Nil
|
||||
alias MatchData = Regex::MatchData | Nil
|
||||
|
||||
class Rule
|
||||
property pattern : Regex = Re2.new ""
|
||||
property pattern : Regex = Regex.new ""
|
||||
property actions : Array(Action) = [] of Action
|
||||
property xml : String = "foo"
|
||||
|
||||
@@ -15,7 +20,8 @@ module Tartrazine
|
||||
match = pattern.match(text, pos)
|
||||
# We don't match if the match doesn't move the cursor
|
||||
# because that causes infinite loops
|
||||
return false, pos, [] of Token if match.nil? || match.end == 0
|
||||
|
||||
return false, pos, [] of Token if match.nil?
|
||||
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
|
||||
tokens = [] of Token
|
||||
# Emit the tokens
|
||||
@@ -23,18 +29,17 @@ module Tartrazine
|
||||
# Emit the token
|
||||
tokens += action.emit(match, lexer)
|
||||
end
|
||||
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
||||
return true, match.end, tokens
|
||||
# Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
||||
return true, match[0].size, tokens
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||
@xml = node.to_s
|
||||
@pattern = Re2.new(
|
||||
node["pattern"],
|
||||
multiline,
|
||||
dotall,
|
||||
ignorecase,
|
||||
anchored: true)
|
||||
options = Regex::Options::ANCHORED
|
||||
options |= Regex::Options::MULTILINE if multiline
|
||||
options |= Regex::Options::DOTALL if dotall
|
||||
options |= Regex::Options::IGNORE_CASE if ignorecase
|
||||
@pattern = Regex.new(node["pattern"], options)
|
||||
add_actions(node)
|
||||
end
|
||||
|
||||
@@ -86,24 +91,4 @@ module Tartrazine
|
||||
add_actions(node)
|
||||
end
|
||||
end
|
||||
|
||||
# This is a hack to workaround that Crystal seems to disallow
|
||||
# having regexes multiline but not dot_all
|
||||
class Re2 < Regex
|
||||
@source = "fa"
|
||||
@options = Regex::Options::None
|
||||
@jit = true
|
||||
|
||||
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
|
||||
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
|
||||
LibPCRE2::UCP
|
||||
flags |= LibPCRE2::MULTILINE if multiline
|
||||
flags |= LibPCRE2::DOTALL if dotall
|
||||
flags |= LibPCRE2::CASELESS if ignorecase
|
||||
flags |= LibPCRE2::ANCHORED if anchored
|
||||
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
|
||||
raise Exception.new(error_message)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@@ -63,8 +63,6 @@ module Tartrazine
|
||||
tokens = [] of Token
|
||||
pos = 0
|
||||
matched = false
|
||||
time = 0
|
||||
count = 0
|
||||
|
||||
# Respect the `ensure_nl` config option
|
||||
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
|
||||
|
Reference in New Issue
Block a user