From 7538fc76aae724288635ec53a05a4af44feb74c4 Mon Sep 17 00:00:00 2001
From: Roberto Alsina
Date: Fri, 16 Aug 2024 13:27:02 -0300
Subject: [PATCH] Tokenize via an iterator, makes everything much faster

---
 spec/tartrazine_spec.cr |   3 +-
 src/actions.cr          |  33 ++++----
 src/formatters/ansi.cr  |  17 +++--
 src/formatters/html.cr  |  23 +++---
 src/lexer.cr            | 152 ++++++++++++++++++----------------------
 src/rules.cr            |  16 ++---
 src/styles.cr           |   6 +-
 7 files changed, 128 insertions(+), 122 deletions(-)

diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr
index 5125129..15011e9 100644
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -73,7 +73,8 @@ end
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
   lexer = Tartrazine.lexer(lexer_name)
-  lexer.tokenize(text)
+  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end
 
 # Helper that tokenizes using chroma to validate the lexer
diff --git a/src/actions.cr b/src/actions.cr
index 13d80ae..b626dd2 100644
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -66,26 +66,26 @@ module Tartrazine
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
       case @type
       when ActionType::Token
         raise Exception.new "Can't have a token without a match" if match.empty?
         [Token.new(type: @token_type, value: String.new(match[match_group].value))]
       when ActionType::Push
-        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+        to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
         to_push.each do |state|
-          if state == "#pop" && lexer.state_stack.size > 1
+          if state == "#pop" && tokenizer.state_stack.size > 1
             # Pop the state
-            lexer.state_stack.pop
+            tokenizer.state_stack.pop
           else
             # Really push
-            lexer.state_stack << state
+            tokenizer.state_stack << state
           end
         end
         [] of Token
       when ActionType::Pop
-        to_pop = [@depth, lexer.state_stack.size - 1].min
-        lexer.state_stack.pop(to_pop)
+        to_pop = [@depth, tokenizer.state_stack.size - 1].min
+        tokenizer.state_stack.pop(to_pop)
         [] of Token
       when ActionType::Bygroups
         # FIXME: handle
@@ -109,27 +109,32 @@ module Tartrazine
             # No match for this group
             next
           end
-          result += e.emit(match, lexer, i + 1)
+          result += e.emit(match, tokenizer, i + 1)
         end
         result
       when ActionType::Using
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), secondary: true)
+        Tokenizer.new(
+          Tartrazine.lexer(@lexer_name),
+          String.new(match[match_group].value),
+          secondary: true).to_a
       when ActionType::Usingself
        # Shunt to another copy of this lexer
        return [] of Token if match.empty?
-        new_lexer = lexer.copy
-        new_lexer.tokenize(String.new(match[match_group].value), secondary: true)
+        Tokenizer.new(
+          tokenizer.lexer,
+          String.new(match[match_group].value),
+          secondary: true).to_a
      when ActionType::Combined
        # Combine two or more states into one anonymous state
        new_state = @states.map { |name|
-          lexer.states[name]
+          tokenizer.lexer.states[name]
        }.reduce { |state1, state2|
          state1 + state2
        }
-        lexer.states[new_state.name] = new_state
-        lexer.state_stack << new_state.name
+        tokenizer.lexer.states[new_state.name] = new_state
+        tokenizer.state_stack << new_state.name
        [] of Token
      else
        raise Exception.new("Unknown action type: #{@type}")
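
Because `emit` now receives the Tokenizer rather than the Lexer, the per-run
state (`pos`, `state_stack`, the pending-token queue) lives outside the Lexer,
which is what lets the `Usingself` branch above drop `Lexer#copy` entirely.
A rough sketch of what this buys (the lexer name and inputs are invented for
illustration):

    lexer = Tartrazine.lexer("c")
    t1 = Tartrazine::Tokenizer.new(lexer, "int a;\n")
    t2 = Tartrazine::Tokenizer.new(lexer, "int b;\n")
    t1.to_a # tokens for the first snippet
    t2.to_a # tokens for the second; both runs share one Lexer safely
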
diff --git a/src/formatters/ansi.cr b/src/formatters/ansi.cr
index 5fb6ece..ea86790 100644
--- a/src/formatters/ansi.cr
+++ b/src/formatters/ansi.cr
@@ -7,19 +7,26 @@ module Tartrazine
     def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
     end
 
+    private def line_label(i : Int32) : String
+      "#{i + 1}".rjust(4).ljust(5)
+    end
+
     def format(text : String, lexer : Lexer) : String
+      tokenizer = Tokenizer.new(lexer, text)
+      i = 0
       output = String.build do |outp|
-        lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
-          label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-          outp << label
-          line.each do |token|
-            outp << colorize(token[:value], token[:type])
+        outp << line_label(i) if line_numbers?
+        tokenizer.each do |token|
+          outp << colorize(token[:value], token[:type])
+          if token[:value].includes?("\n")
+            i += 1
+            outp << line_label(i) if line_numbers?
           end
         end
       end
       output
     end
 
     def colorize(text : String, token : String) : String
       style = theme.styles.fetch(token, nil)
       return text if style.nil?
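
In the ANSI formatter above, `line_label` right-justifies the 1-based line
number to four columns and pads one space on the right, and the streaming
`format` re-emits it after every token that carries a newline. A quick check
of the helper's arithmetic:

    line_label(0)    # => "   1 "
    line_label(41)   # => "  42 "
    line_label(9999) # => "10000" (past four digits it simply stops aligning)
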
"
         end
         outp << ""
-        lines.each_with_index(offset: line_number_start - 1) do |line, i|
-          line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-          line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
-          line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
-          outp << "<span #{line_id} #{line_class}>#{line_label} </span>"
-          line.each do |token|
-            outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+        outp << line_label(i) if line_numbers?
+        tokenizer.each do |token|
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+          if token[:value].ends_with? "\n"
+            i += 1
+            outp << line_label(i) if line_numbers?
           end
         end
         outp << "</code></pre>"
diff --git a/src/lexer.cr b/src/lexer.cr
index 540505c..cabf71e 100644
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -37,6 +37,75 @@ module Tartrazine
+  # A token, the output of the tokenizer
+  alias Token = NamedTuple(type: String, value: String)
+
+  struct Tokenizer
+    include Iterator(Token)
+    property lexer : Lexer
+    property text : Bytes
+    property pos : Int32 = 0
+    @dq = Deque(Token).new
+    property state_stack = ["root"]
+
+    def initialize(@lexer : Lexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @text = text.to_slice
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      if pos == @text.size
+        return stop
+      end
+
+      matched = false
+      while @pos < @text.size
+        @lexer.states[@state_stack.last].rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(@text, @pos, self)
+          if matched
+            @pos = new_pos
+            split_tokens(new_tokens).each { |token| @dq << token }
+            break
+          end
+        end
+        if !matched
+          if @text[@pos] == 10u8
+            @dq << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            @dq << {type: "Error", value: String.new(@text[@pos..@pos])}
+          end
+          @pos += 1
+          break
+        end
+      end
+      self.next
+    end
+
+    # If a token contains newlines, split it into one token per line
+    def split_tokens(tokens : Array(Token)) : Array(Token)
+      split_tokens = [] of Token
+      tokens.each do |token|
+        if token[:value].includes?("\n")
+          values = token[:value].split("\n")
+          values.each_with_index do |value, index|
+            value += "\n" if index < values.size - 1
+            split_tokens << {type: token[:type], value: value}
+          end
+        else
+          split_tokens << token
+        end
+      end
+      split_tokens
+    end
+  end
+
   # This implements a lexer for Pygments RegexLexers as expressed
   # in Chroma's XML serialization.
   #
@@ -52,62 +121,7 @@ module Tartrazine
       not_multiline: false,
       ensure_nl:     false,
     }
-    # property xml : String = ""
     property states = {} of String => State
-    property state_stack = ["root"]
-
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = ["root"]
-      new_lexer
-    end
-
-    # Turn the text into a list of tokens. The `secondary` parameter
-    # is true when the lexer is being used to tokenize a string
-    # from a larger text that is already being tokenized.
-    # So, when it's true, we don't modify the text.
-    def tokenize(text : String, secondary = false) : Array(Token)
-      @state_stack = ["root"]
-      tokens = [] of Token
-      pos = 0
-      matched = false
-
-      # Respect the `ensure_nl` config option
-      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !secondary
-        text += "\n"
-      end
-
-      # We operate in bytes from now on
-      text_bytes = text.to_slice
-      # Loop through the text, matching rules
-      while pos < text_bytes.size
-        states[@state_stack.last].rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
-          if matched
-            # Move position forward, save the tokens
-            pos = new_pos
-            tokens += new_tokens
-            # Start matching rules at new position
-            break
-          end
-        end
-        if !matched
-          # at EOL, emit the newline, reset state to "root"
-          if text_bytes[pos] == 10u8
-            tokens << {type: "Text", value: "\n"}
-            @state_stack = ["root"]
-          else
-            # Emit an error token
-            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
-          end
-          # Move forward 1
-          pos += 1
-        end
-      end
-      Lexer.collapse_tokens(tokens)
-    end
 
     # Collapse consecutive tokens of the same type for easier comparison
     # and smaller output
@@ -131,31 +145,6 @@ module Tartrazine
       result
     end
 
-    # Group tokens into lines, splitting them when a newline is found
-    def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
-      split_tokens = [] of Token
-      tokens.each do |token|
-        if token[:value].includes?("\n")
-          values = token[:value].split("\n")
-          values.each_with_index do |value, index|
-            value += "\n" if index < values.size - 1
-            split_tokens << {type: token[:type], value: value}
-          end
-        else
-          split_tokens << token
-        end
-      end
-      lines = [Array(Token).new]
-      split_tokens.each do |token|
-        lines.last << token
-        if token[:value].includes?("\n")
-          lines << Array(Token).new
-        end
-      end
-      lines
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
       lexer = XML.parse(xml).first_element_child
@@ -229,7 +218,4 @@ module Tartrazine
       new_state
     end
   end
-
-  # A token, the output of the tokenizer
-  alias Token = NamedTuple(type: String, value: String)
 end
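
The Tokenizer added above is the heart of the change: `next` drains the
internal deque first, matches more rules only when the buffer runs dry, and
returns `stop` once `pos` reaches the end of the input. Driving it by hand
looks roughly like this (lexer name and input string are made up):

    lexer = Tartrazine.lexer("crystal")
    tok = Tartrazine::Tokenizer.new(lexer, "x = 1\n")
    loop do
      t = tok.next
      break if t.is_a?(Iterator::Stop)
      puts "#{t[:type]} -> #{t[:value].inspect}"
    end

Since `Iterator(Token)` pulls in `Enumerable`, callers normally reach for
`each`, `map`, or `to_a` instead of calling `next` directly.
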
diff --git a/src/rules.cr b/src/rules.cr
index 597ea65..6eaa8d7 100644
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -16,7 +16,7 @@ module Tartrazine
   alias MatchData = Array(Match)
 
   abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
     abstract def initialize(node : XML::Node)
 
     @actions : Array(Action) = [] of Action
@@ -32,12 +32,12 @@ module Tartrazine
   struct Rule < BaseRule
     property pattern : Regex = Regex.new ""
 
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
       # No match
       return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, @actions.flat_map(&.emit(match, lexer))
+      return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
     end
 
     def initialize(node : XML::Node)
@@ -56,9 +56,9 @@ module Tartrazine
   struct IncludeStateRule < BaseRule
     @state : String = ""
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      lexer.states[@state].rules.each do |rule|
-        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+    def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+      tokenizer.lexer.states[@state].rules.each do |rule|
+        matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
@@ -77,8 +77,8 @@ module Tartrazine
   struct UnconditionalRule < BaseRule
     NO_MATCH = [] of Match
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, @actions.flat_map(&.emit(NO_MATCH, lexer))
+    def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
+      return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
     end
 
     def initialize(node : XML::Node)
diff --git a/src/styles.cr b/src/styles.cr
index b3d6f1c..fd52d4a 100644
--- a/src/styles.cr
+++ b/src/styles.cr
@@ -9,7 +9,7 @@ require "xml"
 module Tartrazine
   alias Color = Sixteen::Color
 
-  class ThemeFiles
+  struct ThemeFiles
     extend BakedFileSystem
     bake_folder "../styles", __DIR__
   end
@@ -39,7 +39,7 @@ module Tartrazine
     themes.to_a.sort!
   end
 
-  class Style
+  struct Style
     # These properties are tri-state.
     # true means it's set
     # false means it's not set
@@ -79,7 +79,7 @@ module Tartrazine
     end
   end
 
-  class Theme
+  struct Theme
     property name : String = ""
     property styles = {} of String => Style
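
End to end, callers now build a Tokenizer and collapse its output themselves,
exactly as the updated spec helper at the top of this patch does (lexer name
and source text are placeholders):

    lexer = Tartrazine.lexer("go")
    tokenizer = Tartrazine::Tokenizer.new(lexer, %(fmt.Println("hello")\n))
    tokens = Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
    # collapse_tokens merges consecutive tokens of the same type:
    # [{type: "Text", value: "a"}, {type: "Text", value: "b"}]
    # becomes [{type: "Text", value: "ab"}]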