From e0f697f1f97cf058d5431d2b513fdff851f3f2e6 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Tue, 6 Aug 2024 23:34:14 -0300 Subject: [PATCH] refactor --- src/rules.cr | 32 +++++++++++++++++++++++++++++--- src/tartrazine.cr | 21 ++------------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/rules.cr b/src/rules.cr index 37900db..ea0ddc0 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -5,18 +5,19 @@ require "./actions" # state of the lexer. module Tartrazine # This rule matches via a regex pattern + class Rule property pattern : Regex = Re2.new "" property actions : Array(Action) = [] of Action property xml : String = "foo" def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) - tokens = [] of Token match = pattern.match(text, pos) # We don't match if the match doesn't move the cursor # because that causes infinite loops - Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } return false, pos, [] of Token if match.nil? || match.end == 0 + # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } + tokens = [] of Token # Emit the tokens actions.each do |action| # Emit the token @@ -28,7 +29,12 @@ module Tartrazine def initialize(node : XML::Node, multiline, dotall, ignorecase) @xml = node.to_s - @pattern = Re2.new(node["pattern"], multiline, dotall, ignorecase) + @pattern = Re2.new( + node["pattern"], + multiline, + dotall, + ignorecase, + anchored: true) add_actions(node) end @@ -80,4 +86,24 @@ module Tartrazine add_actions(node) end end + + # This is a hack to workaround that Crystal seems to disallow + # having regexes multiline but not dot_all + class Re2 < Regex + @source = "fa" + @options = Regex::Options::None + @jit = true + + def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false) + flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | + LibPCRE2::UCP + flags |= LibPCRE2::MULTILINE if multiline + flags |= LibPCRE2::DOTALL if dotall + flags |= LibPCRE2::CASELESS if ignorecase + flags |= LibPCRE2::ANCHORED if anchored + @re = Regex::PCRE2.compile(pattern, flags) do |error_message| + raise Exception.new(error_message) + end + end + end end diff --git a/src/tartrazine.cr b/src/tartrazine.cr index 8630a94..a6a1f03 100644 --- a/src/tartrazine.cr +++ b/src/tartrazine.cr @@ -63,6 +63,8 @@ module Tartrazine tokens = [] of Token pos = 0 matched = false + time = 0 + count = 0 # Respect the `ensure_nl` config option if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself @@ -184,25 +186,6 @@ module Tartrazine def self.lexer(name : String) : Lexer Lexer.from_xml(File.read("lexers/#{name}.xml")) end - - # This is a hack to workaround that Crystal seems to disallow - # having regexes multiline but not dot_all - class Re2 < Regex - @source = "fa" - @options = Regex::Options::None - @jit = true - - def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false) - flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | - LibPCRE2::UCP | LibPCRE2::ANCHORED - flags |= LibPCRE2::MULTILINE if multiline - flags |= LibPCRE2::DOTALL if dotall - flags |= LibPCRE2::CASELESS if ignorecase - @re = Regex::PCRE2.compile(pattern, flags) do |error_message| - raise Exception.new(error_message) - end - end - end end # Convenience macros to parse XML