From 58fd42d936890dd6fa874205c543901c3520b9b6 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Sat, 24 Aug 2024 19:59:05 -0300 Subject: [PATCH] Rebase to main --- TODO.md | 5 ++- src/actions.cr | 96 +++++++++++++++++++++++----------------------- src/bytes_regex.cr | 6 +-- src/main.cr | 13 +------ src/rules.cr | 31 ++++----------- 5 files changed, 63 insertions(+), 88 deletions(-) diff --git a/TODO.md b/TODO.md index 95e5e2b..91d427d 100644 --- a/TODO.md +++ b/TODO.md @@ -9,4 +9,7 @@ * ✅ Implement lexer loader by file extension * ✅ Add --line-numbers to terminal formatter * Implement lexer loader by mime type -* Implement Pygment's "DelegateLexer" +* ✅ Implement Delegating lexers +* ✅ Add RstLexer +* Add Mako template lexer +* Implement heuristic lexer detection diff --git a/src/actions.cr b/src/actions.cr index 33fc4f8..da514a5 100644 --- a/src/actions.cr +++ b/src/actions.cr @@ -23,14 +23,17 @@ module Tartrazine struct Action property actions : Array(Action) = [] of Action - property token_type : String = "" - property states_to_push : Array(String) = [] of String - property depth = 0 - property lexer_name : String = "" - property states_to_combine : Array(String) = [] of String + @content_index : Array(Int32) = [] of Int32 + @depth : Int32 = 0 + @lexer_index : Int32 = 0 + @lexer_name : String = "" + @states : Array(String) = [] of String + @states_to_push : Array(String) = [] of String + @token_type : String = "" + @type : ActionType = ActionType::Token - def initialize(@type : String, @xml : XML::Node?) - # Extract information from the XML node we will use later + def initialize(t : String, xml : XML::Node?) 
+ @type = ActionType.parse(t.capitalize) # Some actions may have actions in them, like this: # @@ -41,31 +44,30 @@ module Tartrazine # # The token actions match with the first 2 groups in the regex # the using action matches the 3rd and shunts it to another lexer - - known_types = %w(token push pop bygroups using usingself include combined) - raise Exception.new( - "Unknown action type: #{@type}") unless known_types.includes? @type - - @xml.children.each do |node| + xml.children.each do |node| next unless node.element? @actions << Action.new(node.name, node) end + # Prefetch the attributes we need from the XML and keep them case @type - when "token" - @token_type = xml["type"]? || "" - when "push" + when ActionType::Token + @token_type = xml["type"] + when ActionType::Push @states_to_push = xml.attributes.select { |attrib| attrib.name == "state" - }.map &.content || [] of String - when "pop" - @depth = xml["depth"]?.try &.to_i || 0 - when "using" - @lexer_name = xml["lexer"]?.try &.downcase || "" - when "combined" - @states_to_combine = xml.attributes.select { |attrib| + }.map &.content + when ActionType::Pop + @depth = xml["depth"].to_i + when ActionType::Using + @lexer_name = xml["lexer"].downcase + when ActionType::Combined + @states = xml.attributes.select { |attrib| attrib.name == "state" }.map &.content + when ActionType::Usingbygroup + @lexer_index = xml["lexer"].to_i + @content_index = xml["content"].split(",").map(&.to_i) end end @@ -75,25 +77,21 @@ module Tartrazine when ActionType::Token raise Exception.new "Can't have a token without a match" if match.empty? [Token.new(type: @token_type, value: String.new(match[match_group].value))] - when "push" - if @states_to_push.empty? - # Push without a state means push the current state - @states_to_push = [lexer.state_stack.last] - end - @states_to_push.each do |state| - if state == "#pop" + when ActionType::Push + to_push = @states_to_push.empty? ? 
[tokenizer.state_stack.last] : @states_to_push + to_push.each do |state| + if state == "#pop" && tokenizer.state_stack.size > 1 # Pop the state - lexer.state_stack.pop + tokenizer.state_stack.pop else # Really push - lexer.state_stack << state + tokenizer.state_stack << state end end [] of Token - when "pop" - if lexer.state_stack.size > @depth - lexer.state_stack.pop(@depth) - end + when ActionType::Pop + to_pop = [@depth, tokenizer.state_stack.size - 1].min + tokenizer.state_stack.pop(to_pop) [] of Token when ActionType::Bygroups # FIXME: handle @@ -104,7 +102,7 @@ module Tartrazine # # where that None means skipping a group # - raise Exception.new "Can't have a token without a match" if match.empty? + raise Exception.new "Can't have a token without a match" if match.nil? # Each group matches an action. If the group match is empty, # the action is skipped. @@ -113,7 +111,8 @@ module Tartrazine begin next if match[i + 1].size == 0 rescue IndexError - # No match for the last group + # FIXME: This should not actually happen + # No match for this group next end result += e.emit(match, tokenizer, i + 1) @@ -122,16 +121,19 @@ module Tartrazine when ActionType::Using # Shunt to another lexer entirely return [] of Token if match.empty? - Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true) - when "usingself" + Tartrazine.lexer(@lexer_name).tokenizer( + String.new(match[match_group].value), + secondary: true).to_a + when ActionType::Usingself # Shunt to another copy of this lexer return [] of Token if match.empty? 
- new_lexer = Lexer.from_xml(lexer.xml) - new_lexer.tokenize(String.new(match[match_group].value), usingself: true) - when "combined" - # Combine two states into one anonymous state - new_state = @states_to_combine.map { |name| - lexer.states[name] + tokenizer.lexer.tokenizer( + String.new(match[match_group].value), + secondary: true).to_a + when ActionType::Combined + # Combine two or more states into one anonymous state + new_state = @states.map { |name| + tokenizer.lexer.states[name] }.reduce { |state1, state2| state1 + state2 } @@ -149,7 +151,7 @@ module Tartrazine content, secondary: true).to_a else - raise Exception.new("Unhandled action type: #{type}") + raise Exception.new("Unknown action type: #{@type}") end end end diff --git a/src/bytes_regex.cr b/src/bytes_regex.cr index 6e277a8..c73f694 100644 --- a/src/bytes_regex.cr +++ b/src/bytes_regex.cr @@ -31,7 +31,6 @@ module BytesRegex end def match(str : Bytes, pos = 0) : Array(Match) - match = [] of Match rc = LibPCRE2.match( @re, str, @@ -40,9 +39,9 @@ module BytesRegex LibPCRE2::NO_UTF_CHECK, @match_data, nil) - if rc >= 0 + if rc > 0 ovector = LibPCRE2.get_ovector_pointer(@match_data) - (0...rc).each do |i| + (0...rc).map do |i| m_start = ovector[2 * i] m_end = ovector[2 * i + 1] if m_start == m_end @@ -55,7 +54,6 @@ module BytesRegex else [] of Match end - match end end diff --git a/src/main.cr b/src/main.cr index 357a860..8b92e58 100644 --- a/src/main.cr +++ b/src/main.cr @@ -1,18 +1,6 @@ require "docopt" require "./**" -# Performance data (in milliseconds): -# -# Docopt parsing: 0.5 -# Instantiating a theme: 0.1 -# Instantiating a formatter: 1.0 -# Instantiating a lexer: 2.0 -# Tokenizing crycco.cr: 16.0 -# Formatting: 0.5 -# I/O: 1.5 -# --------------------------------- -# Total: 21.6 - HELP = <<-HELP tartrazine: a syntax highlighting tool @@ -96,6 +84,7 @@ if options["-f"] end lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String)) + input = 
File.open(options["FILE"].as(String)).gets_to_end if options["-o"].nil? diff --git a/src/rules.cr b/src/rules.cr index 37000db..6eaa8d7 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -15,28 +15,11 @@ module Tartrazine alias Match = BytesRegex::Match alias MatchData = Array(Match) - class Rule - property pattern : Regex = Regex.new "" - property actions : Array(Action) = [] of Action + abstract struct BaseRule + abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token)) + abstract def initialize(node : XML::Node) - def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token)) - match = pattern.match(text, pos) - # We don't match if the match doesn't move the cursor - # because that causes infinite loops - return false, pos, [] of Token if match.empty? || match[0].size == 0 - tokens = [] of Token - actions.each do |action| - tokens += action.emit(match, lexer) - end - return true, pos + match[0].size, tokens - end - - def initialize(node : XML::Node, multiline, dotall, ignorecase) - pattern = node["pattern"] - pattern = "(?m)" + pattern if multiline - @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true) - add_actions(node) - end + @actions : Array(Action) = [] of Action def add_actions(node : XML::Node) node.children.each do |child| @@ -73,9 +56,9 @@ module Tartrazine struct IncludeStateRule < BaseRule @state : String = "" - def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) - lexer.states[state].rules.each do |rule| - matched, new_pos, new_tokens = rule.match(text, pos, lexer) + def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token)) + tokenizer.@lexer.states[@state].rules.each do |rule| + matched, new_pos, new_tokens = rule.match(text, pos, tokenizer) return true, new_pos, new_tokens if matched end return false, pos, [] of Token