From 0626c8619f81b219b271fa0e92f1aba517852327 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Wed, 14 Aug 2024 11:06:53 -0300 Subject: [PATCH] Working bytes-regexes, faster, MORE tests pass --- spec/tartrazine_spec.cr | 14 ++++---- src/actions.cr | 22 ++++++++----- src/bytes_regex.cr | 72 +++++++++++++++++++++++++++++++++++++++++ src/lexer.cr | 14 ++++---- src/rules.cr | 26 +++++++++------ 5 files changed, 116 insertions(+), 32 deletions(-) create mode 100644 src/bytes_regex.cr diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr index 4a06dc2..36662bc 100644 --- a/spec/tartrazine_spec.cr +++ b/spec/tartrazine_spec.cr @@ -20,9 +20,11 @@ unicode_problems = { # but tartrazine is correct bad_in_chroma = { "#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt", + "#{__DIR__}/tests/html/javascript_backtracking.txt", "#{__DIR__}/tests/java/test_default.txt", "#{__DIR__}/tests/java/test_multiline_string.txt", "#{__DIR__}/tests/java/test_numeric_literals.txt", + "#{__DIR__}/tests/octave/test_multilinecomment.txt", "#{__DIR__}/tests/php/test_string_escaping_run.txt", "#{__DIR__}/tests/python_2/test_cls_builtin.txt", } @@ -30,18 +32,14 @@ bad_in_chroma = { known_bad = { "#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt", "#{__DIR__}/tests/bash_session/prompt_in_output.txt", - "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt", - "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt", "#{__DIR__}/tests/bash_session/ps2_prompt.txt", - "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt", - "#{__DIR__}/tests/bash_session/test_virtualenv.txt", + "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt", "#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt", - "#{__DIR__}/tests/c/test_string_resembling_decl_end.txt", - "#{__DIR__}/tests/html/css_backtracking.txt", + "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt", + "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt", + "#{__DIR__}/tests/bash_session/test_virtualenv.txt", "#{__DIR__}/tests/mcfunction/data.txt", "#{__DIR__}/tests/mcfunction/selectors.txt", - "#{__DIR__}/tests/php/anonymous_class.txt", - "#{__DIR__}/tests/html/javascript_unclosed.txt", } # Tests that fail because of a limitation in PCRE2 diff --git a/src/actions.cr b/src/actions.cr index 4ed4008..858ca40 100644 --- a/src/actions.cr +++ b/src/actions.cr @@ -30,11 +30,11 @@ module Tartrazine end # ameba:disable Metrics/CyclomaticComplexity - def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token) + def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token) case type when "token" - raise Exception.new "Can't have a token without a match" if match.nil? - [Token.new(type: xml["type"], value: match[match_group])] + raise Exception.new "Can't have a token without a match" if match.empty? + [Token.new(type: xml["type"], value: String.new(match[match_group].value))] when "push" states_to_push = xml.attributes.select { |attrib| attrib.name == "state" @@ -79,23 +79,29 @@ module Tartrazine # the action is skipped. result = [] of Token @actions.each_with_index do |e, i| - next if match[i + 1]?.nil? + begin + next if match[i + 1].size == 0 + rescue IndexError + # FIXME: This should not actually happen + # No match for this group + next + end result += e.emit(match, lexer, i + 1) end result when "using" # Shunt to another lexer entirely - return [] of Token if match.nil? + return [] of Token if match.empty? lexer_name = xml["lexer"].downcase Log.trace { "to tokenize: #{match[match_group]}" } - Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true) + Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true) when "usingself" # Shunt to another copy of this lexer - return [] of Token if match.nil? + return [] of Token if match.empty? new_lexer = Lexer.from_xml(lexer.xml) Log.trace { "to tokenize: #{match[match_group]}" } - new_lexer.tokenize(match[match_group], usingself: true) + new_lexer.tokenize(String.new(match[match_group].value), usingself: true) when "combined" # Combine two states into one anonymous state states = xml.attributes.select { |attrib| diff --git a/src/bytes_regex.cr b/src/bytes_regex.cr new file mode 100644 index 0000000..737ac62 --- /dev/null +++ b/src/bytes_regex.cr @@ -0,0 +1,72 @@ +module BytesRegex + extend self + + class Regex + def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false) + flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP + flags |= LibPCRE2::MULTILINE if multiline + flags |= LibPCRE2::DOTALL if dotall + flags |= LibPCRE2::CASELESS if ignorecase + flags |= LibPCRE2::ANCHORED if anchored + if @re = LibPCRE2.compile( + pattern, + pattern.bytesize, + flags, + out errorcode, + out erroroffset, + nil) + else + # FIXME: show actual error message + raise Exception.new "Error compiling regex" + end + end + + def finalize + LibPCRE2.code_free(@re) + end + + def match(str : Bytes, pos = 0) : Array(Match) + match_data = LibPCRE2.match_data_create_from_pattern(@re, nil) + match = [] of Match + rc = LibPCRE2.match( + @re, + str, + str.size, + pos, + 0, + match_data, + nil) + if rc < 0 + # FIXME: handle actual errors + else + ovector = LibPCRE2.get_ovector_pointer(match_data) + (0...rc).each do |i| + m_start = ovector[2 * i] + m_size = ovector[2 * i + 1] - m_start + if m_size == 0 + m_value = Bytes.new(0) + else + m_value = str[m_start...m_start + m_size] + end + match << Match.new(m_value, m_start, m_size) + end + end + LibPCRE2.match_data_free(match_data) + match + end + end + + class Match + property value : Bytes + property start : UInt64 + property size : UInt64 + + def initialize(@value : Bytes, @start : UInt64, @size : UInt64) + end + end +end + +# pattern = "foo" +# str = "foo bar" +# re = BytesRegex::Regex.new(pattern) +# p! String.new(re.match(str.to_slice)[0].value) diff --git a/src/lexer.cr b/src/lexer.cr index a22cecb..31fc5e7 100644 --- a/src/lexer.cr +++ b/src/lexer.cr @@ -1,3 +1,4 @@ +require "baked_file_system" require "./constants/lexers" module Tartrazine @@ -65,7 +66,7 @@ module Tartrazine # is true when the lexer is being used to tokenize a string # from a larger text that is already being tokenized. # So, when it's true, we don't modify the text. - def tokenize(text, usingself = false) : Array(Token) + def tokenize(text : String, usingself = false) : Array(Token) @state_stack = ["root"] tokens = [] of Token pos = 0 @@ -76,12 +77,13 @@ module Tartrazine text += "\n" end + text_bytes = text.to_slice # Loop through the text, applying rules - while pos < text.size + while pos < text_bytes.size state = states[@state_stack.last] # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" } state.rules.each do |rule| - matched, new_pos, new_tokens = rule.match(text, pos, self) + matched, new_pos, new_tokens = rule.match(text_bytes, pos, self) if matched # Move position forward, save the tokens, # tokenize from the new position @@ -94,12 +96,12 @@ module Tartrazine end # If no rule matches, emit an error token unless matched - if text[pos] == "\n" + if text_bytes[pos] == 10u8 # at EOL, reset state to "root" - tokens << {type: "TextWhitespace", value: "\n"} + tokens << {type: "Text", value: "\n"} @state_stack = ["root"] else - tokens << {type: "Error", value: text[pos..pos]} + tokens << {type: "Error", value: String.new(text_bytes[pos..pos])} end pos += 1 end diff --git a/src/rules.cr b/src/rules.cr index a7dc872..e88b9bd 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -1,8 +1,9 @@ require "./actions" +require "./bytes_regex" require "./formatter" +require "./lexer" require "./rules" require "./styles" -require "./lexer" # These are lexer rules. They match with the text being parsed # and perform actions, either emitting tokens or changing the @@ -10,16 +11,21 @@ require "./lexer" module Tartrazine # This rule matches via a regex pattern + alias Regex = BytesRegex::Regex + alias Match = BytesRegex::Match + alias MatchData = Array(Match) + class Rule property pattern : Regex = Regex.new "" property actions : Array(Action) = [] of Action property xml : String = "foo" - def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) + def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token)) match = pattern.match(text, pos) # We don't match if the match doesn't move the cursor # because that causes infinite loops - return false, pos, [] of Token if match.nil? || match.size == 0 + return false, pos, [] of Token if match.empty? || match[0].size == 0 + # p! match, String.new(text[pos..pos+20]) # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" } tokens = [] of Token # Emit the tokens @@ -27,21 +33,21 @@ module Tartrazine # Emit the token tokens += action.emit(match, lexer) end - Log.trace { "#{xml}, #{match.end}, #{tokens}" } - return true, match.end, tokens + Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" } + return true, pos + match[0].size, tokens end def initialize(node : XML::Node, multiline, dotall, ignorecase) @xml = node.to_s pattern = node["pattern"] - flags = Regex::Options::ANCHORED + # flags = Regex::Options::ANCHORED # MULTILINE implies DOTALL which we don't want, so we # use in-pattern flag (?m) instead # flags |= Regex::Options::MULTILINE if multiline pattern = "(?m)" + pattern if multiline - flags |= Regex::Options::DOTALL if dotall - flags |= Regex::Options::IGNORE_CASE if ignorecase - @pattern = Regex.new(pattern, flags) + # flags |= Regex::Options::DOTALL if dotall + # flags |= Regex::Options::IGNORE_CASE if ignorecase + @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true) add_actions(node) end @@ -83,7 +89,7 @@ module Tartrazine def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) tokens = [] of Token actions.each do |action| - tokens += action.emit(nil, lexer) + tokens += action.emit([] of Match, lexer) end return true, pos, tokens end