From 7538fc76aae724288635ec53a05a4af44feb74c4 Mon Sep 17 00:00:00 2001
From: Roberto Alsina
Date: Fri, 16 Aug 2024 13:27:02 -0300
Subject: [PATCH] Tokenize via an iterator, makes everything much faster

---
 spec/tartrazine_spec.cr |   3 +-
 src/actions.cr          |  33 ++++----
 src/formatters/ansi.cr  |  17 +++--
 src/formatters/html.cr  |  23 +++---
 src/lexer.cr            | 152 ++++++++++++++++++----------------------
 src/rules.cr            |  16 ++---
 src/styles.cr           |   6 +-
 7 files changed, 128 insertions(+), 122 deletions(-)

diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr
index 5125129..15011e9 100644
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -73,7 +73,8 @@ end
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
   lexer = Tartrazine.lexer(lexer_name)
-  lexer.tokenize(text)
+  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end
 
 # Helper that tokenizes using chroma to validate the lexer
diff --git a/src/actions.cr b/src/actions.cr
index 13d80ae..b626dd2 100644
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -66,26 +66,26 @@ module Tartrazine
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
       case @type
       when ActionType::Token
         raise Exception.new "Can't have a token without a match" if match.empty?
         [Token.new(type: @token_type, value: String.new(match[match_group].value))]
       when ActionType::Push
-        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+        to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
         to_push.each do |state|
-          if state == "#pop" && lexer.state_stack.size > 1
+          if state == "#pop" && tokenizer.state_stack.size > 1
             # Pop the state
-            lexer.state_stack.pop
+            tokenizer.state_stack.pop
           else
             # Really push
-            lexer.state_stack << state
+            tokenizer.state_stack << state
           end
         end
         [] of Token
       when ActionType::Pop
-        to_pop = [@depth, lexer.state_stack.size - 1].min
-        lexer.state_stack.pop(to_pop)
+        to_pop = [@depth, tokenizer.state_stack.size - 1].min
+        tokenizer.state_stack.pop(to_pop)
         [] of Token
       when ActionType::Bygroups
         # FIXME: handle
@@ -109,27 +109,32 @@ module Tartrazine
             # No match for this group
             next
           end
-          result += e.emit(match, lexer, i + 1)
+          result += e.emit(match, tokenizer, i + 1)
         end
         result
       when ActionType::Using
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), secondary: true)
+        Tokenizer.new(
+          Tartrazine.lexer(@lexer_name),
+          String.new(match[match_group].value),
+          secondary: true).to_a
       when ActionType::Usingself
        # Shunt to another copy of this lexer
        return [] of Token if match.empty?
-        new_lexer = lexer.copy
-        new_lexer.tokenize(String.new(match[match_group].value), secondary: true)
+        Tokenizer.new(
+          tokenizer.lexer,
+          String.new(match[match_group].value),
+          secondary: true).to_a
      when ActionType::Combined
        # Combine two or more states into one anonymous state
        new_state = @states.map { |name|
-          lexer.states[name]
+          tokenizer.lexer.states[name]
        }.reduce { |state1, state2|
          state1 + state2
        }
-        lexer.states[new_state.name] = new_state
-        lexer.state_stack << new_state.name
+        tokenizer.lexer.states[new_state.name] = new_state
+        tokenizer.state_stack << new_state.name
        [] of Token
      else
        raise Exception.new("Unknown action type: #{@type}")
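
Because `emit` now receives the Tokenizer rather than the Lexer, the per-run
state (`pos`, `state_stack`, the pending-token queue) lives outside the Lexer,
which is what lets the `Usingself` branch above drop `Lexer#copy` entirely.
A rough sketch of what this buys (the lexer name and inputs are invented for
illustration):

    lexer = Tartrazine.lexer("c")
    t1 = Tartrazine::Tokenizer.new(lexer, "int a;\n")
    t2 = Tartrazine::Tokenizer.new(lexer, "int b;\n")
    t1.to_a # tokens for the first snippet
    t2.to_a # tokens for the second; both runs share one Lexer safely
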
diff --git a/src/formatters/ansi.cr b/src/formatters/ansi.cr
index 5fb6ece..ea86790 100644
--- a/src/formatters/ansi.cr
+++ b/src/formatters/ansi.cr
@@ -7,19 +7,26 @@ module Tartrazine
     def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
     end
 
+    private def line_label(i : Int32) : String
+      "#{i + 1}".rjust(4).ljust(5)
+    end
+
     def format(text : String, lexer : Lexer) : String
+      tokenizer = Tokenizer.new(lexer, text)
+      i = 0
       output = String.build do |outp|
-        lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
-          label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-          outp << label
-          line.each do |token|
-            outp << colorize(token[:value], token[:type])
+        outp << line_label(i) if line_numbers?
+        tokenizer.each do |token|
+          outp << colorize(token[:value], token[:type])
+          if token[:value].includes?("\n")
+            i += 1
+            outp << line_label(i) if line_numbers?
           end
         end
       end
       output
     end
 
     def colorize(text : String, token : String) : String
       style = theme.styles.fetch(token, nil)
       return text if style.nil?
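
In the ANSI formatter above, `line_label` right-justifies the 1-based line
number to four columns and pads one space on the right, and the streaming
`format` re-emits it after every token that carries a newline. A quick check
of the helper's arithmetic:

    line_label(0)    # => "   1 "
    line_label(41)   # => "  42 "
    line_label(9999) # => "10000" (past four digits it simply stops aligning)
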
"
         end
         outp << ""
-        lines.each_with_index(offset: line_number_start - 1) do |line, i|
-          line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-          line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
-          line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
-          outp << "<span #{line_id} #{line_class}>#{line_label} </span>"
-          line.each do |token|
-            outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+        outp << line_label(i) if line_numbers?
+        tokenizer.each do |token|
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+          if token[:value].ends_with? "\n"
+            i += 1
+            outp << line_label(i) if line_numbers?
           end
         end
         outp << "</code></pre>"
diff --git a/src/lexer.cr b/src/lexer.cr
index 540505c..cabf71e 100644
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -37,6 +37,75 @@ module Tartrazine
+  # A token, the output of the tokenizer
+  alias Token = NamedTuple(type: String, value: String)
+
+  struct Tokenizer
+    include Iterator(Token)
+    property lexer : Lexer
+    property text : Bytes
+    property pos : Int32 = 0
+    @dq = Deque(Token).new
+    property state_stack = ["root"]
+
+    def initialize(@lexer : Lexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @text = text.to_slice
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      if pos == @text.size
+        return stop
+      end
+
+      matched = false
+      while @pos < @text.size
+        @lexer.states[@state_stack.last].rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(@text, @pos, self)
+          if matched
+            @pos = new_pos
+            split_tokens(new_tokens).each { |token| @dq << token }
+            break
+          end
+        end
+        if !matched
+          if @text[@pos] == 10u8
+            @dq << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            @dq << {type: "Error", value: String.new(@text[@pos..@pos])}
+          end
+          @pos += 1
+          break
+        end
+      end
+      self.next
+    end
+
+    # If a token contains newlines, split it into one token per line
+    def split_tokens(tokens : Array(Token)) : Array(Token)
+      split_tokens = [] of Token
+      tokens.each do |token|
+        if token[:value].includes?("\n")
+          values = token[:value].split("\n")
+          values.each_with_index do |value, index|
+            value += "\n" if index < values.size - 1
+            split_tokens << {type: token[:type], value: value}
+          end
+        else
+          split_tokens << token
+        end
+      end
+      split_tokens
+    end
+  end
+
   # This implements a lexer for Pygments RegexLexers as expressed
   # in Chroma's XML serialization.
   #
@@ -52,62 +121,7 @@ module Tartrazine
       not_multiline: false,
       ensure_nl:     false,
     }
-    # property xml : String = ""
     property states = {} of String => State
-    property state_stack = ["root"]
-
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = ["root"]
-      new_lexer
-    end
-
-    # Turn the text into a list of tokens. The `secondary` parameter
-    # is true when the lexer is being used to tokenize a string
-    # from a larger text that is already being tokenized.
-    # So, when it's true, we don't modify the text.
-    def tokenize(text : String, secondary = false) : Array(Token)
-      @state_stack = ["root"]
-      tokens = [] of Token
-      pos = 0
-      matched = false
-
-      # Respect the `ensure_nl` config option
-      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !secondary
-        text += "\n"
-      end
-
-      # We operate in bytes from now on
-      text_bytes = text.to_slice
-      # Loop through the text, matching rules
-      while pos < text_bytes.size
-        states[@state_stack.last].rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
-          if matched
-            # Move position forward, save the tokens
-            pos = new_pos
-            tokens += new_tokens
-            # Start matching rules at new position
-            break
-          end
-        end
-        if !matched
-          # at EOL, emit the newline, reset state to "root"
-          if text_bytes[pos] == 10u8
-            tokens << {type: "Text", value: "\n"}
-            @state_stack = ["root"]
-          else
-            # Emit an error token
-            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
-          end
-          # Move forward 1
-          pos += 1
-        end
-      end
-      Lexer.collapse_tokens(tokens)
-    end
 
     # Collapse consecutive tokens of the same type for easier comparison
     # and smaller output
@@ -131,31 +145,6 @@ module Tartrazine
       result
     end
 
-    # Group tokens into lines, splitting them when a newline is found
-    def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
-      split_tokens = [] of Token
-      tokens.each do |token|
-        if token[:value].includes?("\n")
-          values = token[:value].split("\n")
-          values.each_with_index do |value, index|
-            value += "\n" if index < values.size - 1
-            split_tokens << {type: token[:type], value: value}
-          end
-        else
-          split_tokens << token
-        end
-      end
-      lines = [Array(Token).new]
-      split_tokens.each do |token|
-        lines.last << token
-        if token[:value].includes?("\n")
-          lines << Array(Token).new
-        end
-      end
-      lines
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
       lexer = XML.parse(xml).first_element_child
@@ -229,7 +218,4 @@ module Tartrazine
       new_state
     end
   end
-
-  # A token, the output of the tokenizer
-  alias Token = NamedTuple(type: String, value: String)
 end
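
The Tokenizer added above is the heart of the change: `next` drains the
internal deque first, matches more rules only when the buffer runs dry, and
returns `stop` once `pos` reaches the end of the input. Driving it by hand
looks roughly like this (lexer name and input string are made up):

    lexer = Tartrazine.lexer("crystal")
    tok = Tartrazine::Tokenizer.new(lexer, "x = 1\n")
    loop do
      t = tok.next
      break if t.is_a?(Iterator::Stop)
      puts "#{t[:type]} -> #{t[:value].inspect}"
    end

Since `Iterator(Token)` pulls in `Enumerable`, callers normally reach for
`each`, `map`, or `to_a` instead of calling `next` directly.
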
diff --git a/src/rules.cr b/src/rules.cr
index 597ea65..6eaa8d7 100644
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -16,7 +16,7 @@ module Tartrazine
   alias MatchData = Array(Match)
 
   abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
     abstract def initialize(node : XML::Node)
 
     @actions : Array(Action) = [] of Action
@@ -32,12 +32,12 @@ module Tartrazine
   struct Rule < BaseRule
     property pattern : Regex = Regex.new ""
 
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
       # No match
       return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, @actions.flat_map(&.emit(match, lexer))
+      return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
     end
 
     def initialize(node : XML::Node)
@@ -56,9 +56,9 @@ module Tartrazine
   struct IncludeStateRule < BaseRule
     @state : String = ""
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      lexer.states[@state].rules.each do |rule|
-        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+    def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+      tokenizer.lexer.states[@state].rules.each do |rule|
+        matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
@@ -77,8 +77,8 @@ module Tartrazine
   struct UnconditionalRule < BaseRule
     NO_MATCH = [] of Match
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, @actions.flat_map(&.emit(NO_MATCH, lexer))
+    def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
+      return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
     end
 
     def initialize(node : XML::Node)
diff --git a/src/styles.cr b/src/styles.cr
index b3d6f1c..fd52d4a 100644
--- a/src/styles.cr
+++ b/src/styles.cr
@@ -9,7 +9,7 @@ require "xml"
 module Tartrazine
   alias Color = Sixteen::Color
 
-  class ThemeFiles
+  struct ThemeFiles
     extend BakedFileSystem
     bake_folder "../styles", __DIR__
   end
@@ -39,7 +39,7 @@ module Tartrazine
     themes.to_a.sort!
   end
 
-  class Style
+  struct Style
     # These properties are tri-state.
     # true means it's set
     # false means it's not set
@@ -79,7 +79,7 @@ module Tartrazine
     end
   end
 
-  class Theme
+  struct Theme
     property name : String = ""
     property styles = {} of String => Style
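
End to end, callers now build a Tokenizer and collapse its output themselves,
exactly as the updated spec helper at the top of this patch does (lexer name
and source text are placeholders):

    lexer = Tartrazine.lexer("go")
    tokenizer = Tartrazine::Tokenizer.new(lexer, %(fmt.Println("hello")\n))
    tokens = Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
    # collapse_tokens merges consecutive tokens of the same type:
    # [{type: "Text", value: "a"}, {type: "Text", value: "b"}]
    # becomes [{type: "Text", value: "ab"}]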