mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-12 22:42:23 +00:00
Working bytes-regexes, faster, MORE tests pass
This commit is contained in:
parent
3725201f8a
commit
0626c8619f
@ -20,9 +20,11 @@ unicode_problems = {
|
||||
# but tartrazine is correct
|
||||
bad_in_chroma = {
|
||||
"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
|
||||
"#{__DIR__}/tests/html/javascript_backtracking.txt",
|
||||
"#{__DIR__}/tests/java/test_default.txt",
|
||||
"#{__DIR__}/tests/java/test_multiline_string.txt",
|
||||
"#{__DIR__}/tests/java/test_numeric_literals.txt",
|
||||
"#{__DIR__}/tests/octave/test_multilinecomment.txt",
|
||||
"#{__DIR__}/tests/php/test_string_escaping_run.txt",
|
||||
"#{__DIR__}/tests/python_2/test_cls_builtin.txt",
|
||||
}
|
||||
@ -30,18 +32,14 @@ bad_in_chroma = {
|
||||
known_bad = {
|
||||
"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
|
||||
"#{__DIR__}/tests/bash_session/prompt_in_output.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
|
||||
"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
|
||||
"#{__DIR__}/tests/html/css_backtracking.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
|
||||
"#{__DIR__}/tests/mcfunction/data.txt",
|
||||
"#{__DIR__}/tests/mcfunction/selectors.txt",
|
||||
"#{__DIR__}/tests/php/anonymous_class.txt",
|
||||
"#{__DIR__}/tests/html/javascript_unclosed.txt",
|
||||
}
|
||||
|
||||
# Tests that fail because of a limitation in PCRE2
|
||||
|
@ -30,11 +30,11 @@ module Tartrazine
|
||||
end
|
||||
|
||||
# ameba:disable Metrics/CyclomaticComplexity
|
||||
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
|
||||
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
|
||||
case type
|
||||
when "token"
|
||||
raise Exception.new "Can't have a token without a match" if match.nil?
|
||||
[Token.new(type: xml["type"], value: match[match_group])]
|
||||
raise Exception.new "Can't have a token without a match" if match.empty?
|
||||
[Token.new(type: xml["type"], value: String.new(match[match_group].value))]
|
||||
when "push"
|
||||
states_to_push = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
@ -79,23 +79,29 @@ module Tartrazine
|
||||
# the action is skipped.
|
||||
result = [] of Token
|
||||
@actions.each_with_index do |e, i|
|
||||
next if match[i + 1]?.nil?
|
||||
begin
|
||||
next if match[i + 1].size == 0
|
||||
rescue IndexError
|
||||
# FIXME: This should not actually happen
|
||||
# No match for this group
|
||||
next
|
||||
end
|
||||
result += e.emit(match, lexer, i + 1)
|
||||
end
|
||||
result
|
||||
when "using"
|
||||
# Shunt to another lexer entirely
|
||||
return [] of Token if match.nil?
|
||||
return [] of Token if match.empty?
|
||||
lexer_name = xml["lexer"].downcase
|
||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
|
||||
Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
|
||||
when "usingself"
|
||||
# Shunt to another copy of this lexer
|
||||
return [] of Token if match.nil?
|
||||
return [] of Token if match.empty?
|
||||
|
||||
new_lexer = Lexer.from_xml(lexer.xml)
|
||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
new_lexer.tokenize(match[match_group], usingself: true)
|
||||
new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
|
||||
when "combined"
|
||||
# Combine two states into one anonymous state
|
||||
states = xml.attributes.select { |attrib|
|
||||
|
72
src/bytes_regex.cr
Normal file
72
src/bytes_regex.cr
Normal file
@ -0,0 +1,72 @@
|
||||
module BytesRegex
|
||||
extend self
|
||||
|
||||
class Regex
|
||||
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
|
||||
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
|
||||
flags |= LibPCRE2::MULTILINE if multiline
|
||||
flags |= LibPCRE2::DOTALL if dotall
|
||||
flags |= LibPCRE2::CASELESS if ignorecase
|
||||
flags |= LibPCRE2::ANCHORED if anchored
|
||||
if @re = LibPCRE2.compile(
|
||||
pattern,
|
||||
pattern.bytesize,
|
||||
flags,
|
||||
out errorcode,
|
||||
out erroroffset,
|
||||
nil)
|
||||
else
|
||||
# FIXME: show actual error message
|
||||
raise Exception.new "Error compiling regex"
|
||||
end
|
||||
end
|
||||
|
||||
def finalize
|
||||
LibPCRE2.code_free(@re)
|
||||
end
|
||||
|
||||
def match(str : Bytes, pos = 0) : Array(Match)
|
||||
match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
|
||||
match = [] of Match
|
||||
rc = LibPCRE2.match(
|
||||
@re,
|
||||
str,
|
||||
str.size,
|
||||
pos,
|
||||
0,
|
||||
match_data,
|
||||
nil)
|
||||
if rc < 0
|
||||
# FIXME: handle actual errors
|
||||
else
|
||||
ovector = LibPCRE2.get_ovector_pointer(match_data)
|
||||
(0...rc).each do |i|
|
||||
m_start = ovector[2 * i]
|
||||
m_size = ovector[2 * i + 1] - m_start
|
||||
if m_size == 0
|
||||
m_value = Bytes.new(0)
|
||||
else
|
||||
m_value = str[m_start...m_start + m_size]
|
||||
end
|
||||
match << Match.new(m_value, m_start, m_size)
|
||||
end
|
||||
end
|
||||
LibPCRE2.match_data_free(match_data)
|
||||
match
|
||||
end
|
||||
end
|
||||
|
||||
class Match
|
||||
property value : Bytes
|
||||
property start : UInt64
|
||||
property size : UInt64
|
||||
|
||||
def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# pattern = "foo"
|
||||
# str = "foo bar"
|
||||
# re = BytesRegex::Regex.new(pattern)
|
||||
# p! String.new(re.match(str.to_slice)[0].value)
|
14
src/lexer.cr
14
src/lexer.cr
@ -1,3 +1,4 @@
|
||||
require "baked_file_system"
|
||||
require "./constants/lexers"
|
||||
|
||||
module Tartrazine
|
||||
@ -65,7 +66,7 @@ module Tartrazine
|
||||
# is true when the lexer is being used to tokenize a string
|
||||
# from a larger text that is already being tokenized.
|
||||
# So, when it's true, we don't modify the text.
|
||||
def tokenize(text, usingself = false) : Array(Token)
|
||||
def tokenize(text : String, usingself = false) : Array(Token)
|
||||
@state_stack = ["root"]
|
||||
tokens = [] of Token
|
||||
pos = 0
|
||||
@ -76,12 +77,13 @@ module Tartrazine
|
||||
text += "\n"
|
||||
end
|
||||
|
||||
text_bytes = text.to_slice
|
||||
# Loop through the text, applying rules
|
||||
while pos < text.size
|
||||
while pos < text_bytes.size
|
||||
state = states[@state_stack.last]
|
||||
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
|
||||
state.rules.each do |rule|
|
||||
matched, new_pos, new_tokens = rule.match(text, pos, self)
|
||||
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
|
||||
if matched
|
||||
# Move position forward, save the tokens,
|
||||
# tokenize from the new position
|
||||
@ -94,12 +96,12 @@ module Tartrazine
|
||||
end
|
||||
# If no rule matches, emit an error token
|
||||
unless matched
|
||||
if text[pos] == "\n"
|
||||
if text_bytes[pos] == 10u8
|
||||
# at EOL, reset state to "root"
|
||||
tokens << {type: "TextWhitespace", value: "\n"}
|
||||
tokens << {type: "Text", value: "\n"}
|
||||
@state_stack = ["root"]
|
||||
else
|
||||
tokens << {type: "Error", value: text[pos..pos]}
|
||||
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
|
||||
end
|
||||
pos += 1
|
||||
end
|
||||
|
26
src/rules.cr
26
src/rules.cr
@ -1,8 +1,9 @@
|
||||
require "./actions"
|
||||
require "./bytes_regex"
|
||||
require "./formatter"
|
||||
require "./lexer"
|
||||
require "./rules"
|
||||
require "./styles"
|
||||
require "./lexer"
|
||||
|
||||
# These are lexer rules. They match with the text being parsed
|
||||
# and perform actions, either emitting tokens or changing the
|
||||
@ -10,16 +11,21 @@ require "./lexer"
|
||||
module Tartrazine
|
||||
# This rule matches via a regex pattern
|
||||
|
||||
alias Regex = BytesRegex::Regex
|
||||
alias Match = BytesRegex::Match
|
||||
alias MatchData = Array(Match)
|
||||
|
||||
class Rule
|
||||
property pattern : Regex = Regex.new ""
|
||||
property actions : Array(Action) = [] of Action
|
||||
property xml : String = "foo"
|
||||
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
match = pattern.match(text, pos)
|
||||
# We don't match if the match doesn't move the cursor
|
||||
# because that causes infinite loops
|
||||
return false, pos, [] of Token if match.nil? || match.size == 0
|
||||
return false, pos, [] of Token if match.empty? || match[0].size == 0
|
||||
# p! match, String.new(text[pos..pos+20])
|
||||
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
|
||||
tokens = [] of Token
|
||||
# Emit the tokens
|
||||
@ -27,21 +33,21 @@ module Tartrazine
|
||||
# Emit the token
|
||||
tokens += action.emit(match, lexer)
|
||||
end
|
||||
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
||||
return true, match.end, tokens
|
||||
Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
|
||||
return true, pos + match[0].size, tokens
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||
@xml = node.to_s
|
||||
pattern = node["pattern"]
|
||||
flags = Regex::Options::ANCHORED
|
||||
# flags = Regex::Options::ANCHORED
|
||||
# MULTILINE implies DOTALL which we don't want, so we
|
||||
# use in-pattern flag (?m) instead
|
||||
# flags |= Regex::Options::MULTILINE if multiline
|
||||
pattern = "(?m)" + pattern if multiline
|
||||
flags |= Regex::Options::DOTALL if dotall
|
||||
flags |= Regex::Options::IGNORE_CASE if ignorecase
|
||||
@pattern = Regex.new(pattern, flags)
|
||||
# flags |= Regex::Options::DOTALL if dotall
|
||||
# flags |= Regex::Options::IGNORE_CASE if ignorecase
|
||||
@pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
|
||||
add_actions(node)
|
||||
end
|
||||
|
||||
@ -83,7 +89,7 @@ module Tartrazine
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
tokens = [] of Token
|
||||
actions.each do |action|
|
||||
tokens += action.emit(nil, lexer)
|
||||
tokens += action.emit([] of Match, lexer)
|
||||
end
|
||||
return true, pos, tokens
|
||||
end
|
||||
|
Loading…
Reference in New Issue
Block a user