mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-09-07 22:33:09 +00:00
Compare commits
6 Commits
main
...
e38feb0736
Author | SHA1 | Date | |
---|---|---|---|
e38feb0736 | |||
e88fd3a48f | |||
fb54b08841 | |||
2a19f3889f | |||
b9e51824df | |||
ff1c0012ec |
@@ -14,6 +14,8 @@ dependencies:
|
|||||||
sixteen:
|
sixteen:
|
||||||
github: ralsina/sixteen
|
github: ralsina/sixteen
|
||||||
branch: main
|
branch: main
|
||||||
|
cre2:
|
||||||
|
git: "https://git.ralsina.me/ralsina/cre2.git"
|
||||||
|
|
||||||
crystal: ">= 1.13.0"
|
crystal: ">= 1.13.0"
|
||||||
|
|
||||||
|
@@ -1,3 +1,5 @@
|
|||||||
|
require "xml"
|
||||||
|
|
||||||
# These are Lexer actions. When a rule matches, it will
|
# These are Lexer actions. When a rule matches, it will
|
||||||
# perform a list of actions. These actions can emit tokens
|
# perform a list of actions. These actions can emit tokens
|
||||||
# or change the state machine.
|
# or change the state machine.
|
||||||
@@ -24,11 +26,12 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
# ameba:disable Metrics/CyclomaticComplexity
|
# ameba:disable Metrics/CyclomaticComplexity
|
||||||
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
|
def emit(match : MatchData,
|
||||||
|
lexer : Lexer, match_group = 0) : Array(Token)
|
||||||
case type
|
case type
|
||||||
when "token"
|
when "token"
|
||||||
raise Exception.new "Can't have a token without a match" if match.nil?
|
raise Exception.new "Can't have a token without a match" if match.nil? || match[0].size == 0
|
||||||
[Token.new(type: xml["type"], value: match[match_group])]
|
[Token.new(type: xml["type"], value: match[0])]
|
||||||
when "push"
|
when "push"
|
||||||
states_to_push = xml.attributes.select { |attrib|
|
states_to_push = xml.attributes.select { |attrib|
|
||||||
attrib.name == "state"
|
attrib.name == "state"
|
||||||
@@ -61,35 +64,37 @@ module Tartrazine
|
|||||||
when "bygroups"
|
when "bygroups"
|
||||||
# FIXME: handle
|
# FIXME: handle
|
||||||
# ><bygroups>
|
# ><bygroups>
|
||||||
# <token type="Punctuation"/>
|
# <token type="Punctuation"/>https://github.com/google/re2/wiki/Syntax
|
||||||
# None
|
# None
|
||||||
# <token type="LiteralStringRegex"/>
|
# <token type="LiteralStringRegex"/>
|
||||||
#
|
#
|
||||||
# where that None means skipping a group
|
# where that None means skipping a group
|
||||||
#
|
#
|
||||||
raise Exception.new "Can't have a token without a match" if match.nil?
|
raise Exception.new "Can't have a bygroups without a match" if match.nil? || match[0].size == 0
|
||||||
|
|
||||||
# Each group matches an action. If the group match is empty,
|
# Each group matches an action. If the group match is empty,
|
||||||
# the action is skipped.
|
# the action is skipped.
|
||||||
result = [] of Token
|
result = [] of Token
|
||||||
@actions.each_with_index do |e, i|
|
@actions.each_with_index do |e, i|
|
||||||
next if match[i + 1]?.nil?
|
next if match[i].size == 0
|
||||||
result += e.emit(match, lexer, i + 1)
|
result += e.emit(match, lexer, i)
|
||||||
end
|
end
|
||||||
result
|
result
|
||||||
when "using"
|
when "using"
|
||||||
# Shunt to another lexer entirely
|
# Shunt to another lexer entirely
|
||||||
return [] of Token if match.nil?
|
return [] of Token if match.nil? || match[0].size == 0
|
||||||
lexer_name = xml["lexer"].downcase
|
lexer_name = xml["lexer"].downcase
|
||||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
# Log.trace { "to tokenize: #{match[match_group]}" }
|
||||||
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
|
to_tokenize = match[match_group]
|
||||||
|
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
|
||||||
when "usingself"
|
when "usingself"
|
||||||
# Shunt to another copy of this lexer
|
# Shunt to another copy of this lexer
|
||||||
return [] of Token if match.nil?
|
return [] of Token if match.nil? || match[0].size == 0
|
||||||
|
|
||||||
new_lexer = Lexer.from_xml(lexer.xml)
|
new_lexer = Lexer.from_xml(lexer.xml)
|
||||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
# Log.trace { "to tokenize: #{match[match_group]}" }
|
||||||
new_lexer.tokenize(match[match_group], usingself: true)
|
to_tokenize = match[match_group]
|
||||||
|
new_lexer.tokenize(to_tokenize, usingself: true)
|
||||||
when "combined"
|
when "combined"
|
||||||
# Combine two states into one anonymous state
|
# Combine two states into one anonymous state
|
||||||
states = xml.attributes.select { |attrib|
|
states = xml.attributes.select { |attrib|
|
||||||
|
0
src/re2.cr
Normal file
0
src/re2.cr
Normal file
45
src/rules.cr
45
src/rules.cr
@@ -1,4 +1,5 @@
|
|||||||
require "./actions"
|
require "./actions"
|
||||||
|
# require "cre2"
|
||||||
|
|
||||||
# These are lexer rules. They match with the text being parsed
|
# These are lexer rules. They match with the text being parsed
|
||||||
# and perform actions, either emitting tokens or changing the
|
# and perform actions, either emitting tokens or changing the
|
||||||
@@ -6,8 +7,12 @@ require "./actions"
|
|||||||
module Tartrazine
|
module Tartrazine
|
||||||
# This rule matches via a regex pattern
|
# This rule matches via a regex pattern
|
||||||
|
|
||||||
|
# alias Regex = CRe2::Regex
|
||||||
|
# alias MatchData = CRe2::MatchDataLike | Regex::MatchData | Nil
|
||||||
|
alias MatchData = Regex::MatchData | Nil
|
||||||
|
|
||||||
class Rule
|
class Rule
|
||||||
property pattern : Regex = Re2.new ""
|
property pattern : Regex = Regex.new ""
|
||||||
property actions : Array(Action) = [] of Action
|
property actions : Array(Action) = [] of Action
|
||||||
property xml : String = "foo"
|
property xml : String = "foo"
|
||||||
|
|
||||||
@@ -15,7 +20,8 @@ module Tartrazine
|
|||||||
match = pattern.match(text, pos)
|
match = pattern.match(text, pos)
|
||||||
# We don't match if the match doesn't move the cursor
|
# We don't match if the match doesn't move the cursor
|
||||||
# because that causes infinite loops
|
# because that causes infinite loops
|
||||||
return false, pos, [] of Token if match.nil? || match.end == 0
|
|
||||||
|
return false, pos, [] of Token if match.nil?
|
||||||
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
|
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
|
||||||
tokens = [] of Token
|
tokens = [] of Token
|
||||||
# Emit the tokens
|
# Emit the tokens
|
||||||
@@ -23,18 +29,17 @@ module Tartrazine
|
|||||||
# Emit the token
|
# Emit the token
|
||||||
tokens += action.emit(match, lexer)
|
tokens += action.emit(match, lexer)
|
||||||
end
|
end
|
||||||
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
# Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
||||||
return true, match.end, tokens
|
return true, match[0].size, tokens
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||||
@xml = node.to_s
|
@xml = node.to_s
|
||||||
@pattern = Re2.new(
|
options = Regex::Options::ANCHORED
|
||||||
node["pattern"],
|
options |= Regex::Options::MULTILINE if multiline
|
||||||
multiline,
|
options |= Regex::Options::DOTALL if dotall
|
||||||
dotall,
|
options |= Regex::Options::IGNORE_CASE if ignorecase
|
||||||
ignorecase,
|
@pattern = Regex.new(node["pattern"], options)
|
||||||
anchored: true)
|
|
||||||
add_actions(node)
|
add_actions(node)
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -86,24 +91,4 @@ module Tartrazine
|
|||||||
add_actions(node)
|
add_actions(node)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# This is a hack to workaround that Crystal seems to disallow
|
|
||||||
# having regexes multiline but not dot_all
|
|
||||||
class Re2 < Regex
|
|
||||||
@source = "fa"
|
|
||||||
@options = Regex::Options::None
|
|
||||||
@jit = true
|
|
||||||
|
|
||||||
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
|
|
||||||
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
|
|
||||||
LibPCRE2::UCP
|
|
||||||
flags |= LibPCRE2::MULTILINE if multiline
|
|
||||||
flags |= LibPCRE2::DOTALL if dotall
|
|
||||||
flags |= LibPCRE2::CASELESS if ignorecase
|
|
||||||
flags |= LibPCRE2::ANCHORED if anchored
|
|
||||||
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
|
|
||||||
raise Exception.new(error_message)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
@@ -63,8 +63,6 @@ module Tartrazine
|
|||||||
tokens = [] of Token
|
tokens = [] of Token
|
||||||
pos = 0
|
pos = 0
|
||||||
matched = false
|
matched = false
|
||||||
time = 0
|
|
||||||
count = 0
|
|
||||||
|
|
||||||
# Respect the `ensure_nl` config option
|
# Respect the `ensure_nl` config option
|
||||||
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
|
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
|
||||||
|
Reference in New Issue
Block a user