v0.5.0

todo management
updated
2025-09-17 10:48:12 +00:00 · 2024-08-16 19:38:40 -03:00 · 2024-08-16 14:05:34 -03:00 · 2024-08-16 14:03:05 -03:00 · 2024-08-16 14:01:16 -03:00 · 2024-08-16 13:36:11 -03:00
11 changed files with 194 additions and 165 deletions
--- a/README.md
+++ b/README.md
@@ -47,7 +47,14 @@ To build from source:
 2. Run `make` to build the `tartrazine` binary
 3. Copy the binary somewhere in your PATH.

-## Usage
+## Usage as a CLI tool
+
+```shell
+$ tartrazine whatever.c -l c -t catppuccin-macchiato --line-numbers \
+  --standalone -o whatever.html 
+```
+
+## Usage as a Library

 This works:

--- a/TODO.md
+++ b/TODO.md
@@ -8,4 +8,5 @@
 * ✅ Implement lexer loader that respects aliases
 * ✅ Implement lexer loader by file extension
 * ✅ Add --line-numbers to terminal formatter
-* Implement lexer loader by mime type
+* Implement lexer loader by mime type
+* Implement Delegating lexers
--- a/shard.yml
+++ b/shard.yml
@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.4.0
+version: 0.5.0

 authors:
  - Roberto Alsina <roberto.alsina@gmail.com>
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -73,7 +73,8 @@ end
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
  lexer = Tartrazine.lexer(lexer_name)
-  lexer.tokenize(text)
+  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end

 # Helper that tokenizes using chroma to validate the lexer
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -8,19 +8,29 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
+  enum ActionType
+    Bygroups
+    Combined
+    Include
+    Pop
+    Push
+    Token
+    Using
+    Usingself
+  end
+
  struct Action
    property actions : Array(Action) = [] of Action
-    property type : String

    @depth : Int32 = 0
    @lexer_name : String = ""
    @states : Array(String) = [] of String
    @states_to_push : Array(String) = [] of String
    @token_type : String = ""
+    @type : ActionType = ActionType::Token

-    def initialize(@type : String, xml : XML::Node?)
-      known_types = %w(token push pop combined bygroups include using usingself)
-      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
+    def initialize(t : String, xml : XML::Node?)
+      @type = ActionType.parse(t.capitalize)

      # Some actions may have actions in them, like this:
      # <bygroups>
@@ -37,18 +47,18 @@ module Tartrazine
      end

      # Prefetch the attributes we need from the XML and keep them
-      case type
-      when "token"
+      case @type
+      when ActionType::Token
        @token_type = xml["type"]
-      when "push"
+      when ActionType::Push
        @states_to_push = xml.attributes.select { |attrib|
          attrib.name == "state"
        }.map &.content
-      when "pop"
+      when ActionType::Pop
        @depth = xml["depth"].to_i
-      when "using"
+      when ActionType::Using
        @lexer_name = xml["lexer"].downcase
-      when "combined"
+      when ActionType::Combined
        @states = xml.attributes.select { |attrib|
          attrib.name == "state"
        }.map &.content
@@ -56,28 +66,28 @@ module Tartrazine
    end

    # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
-      case type
-      when "token"
+    def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
+      case @type
+      when ActionType::Token
        raise Exception.new "Can't have a token without a match" if match.empty?
        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
-      when "push"
-        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+      when ActionType::Push
+        to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
        to_push.each do |state|
-          if state == "#pop" && lexer.state_stack.size > 1
+          if state == "#pop" && tokenizer.state_stack.size > 1
            # Pop the state
-            lexer.state_stack.pop
+            tokenizer.state_stack.pop
          else
            # Really push
-            lexer.state_stack << state
+            tokenizer.state_stack << state
          end
        end
        [] of Token
-      when "pop"
-        to_pop = [@depth, lexer.state_stack.size - 1].min
-        lexer.state_stack.pop(to_pop)
+      when ActionType::Pop
+        to_pop = [@depth, tokenizer.state_stack.size - 1].min
+        tokenizer.state_stack.pop(to_pop)
        [] of Token
-      when "bygroups"
+      when ActionType::Bygroups
        # FIXME: handle
        # ><bygroups>
        # <token type="Punctuation"/>
@@ -99,30 +109,35 @@ module Tartrazine
            # No match for this group
            next
          end
-          result += e.emit(match, lexer, i + 1)
+          result += e.emit(match, tokenizer, i + 1)
        end
        result
-      when "using"
+      when ActionType::Using
        # Shunt to another lexer entirely
        return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
-      when "usingself"
+        Tokenizer.new(
+          Tartrazine.lexer(@lexer_name),
+          String.new(match[match_group].value),
+          secondary: true).to_a
+      when ActionType::Usingself
        # Shunt to another copy of this lexer
        return [] of Token if match.empty?
-        new_lexer = lexer.copy
-        new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
-      when "combined"
-        # Combine two states into one anonymous state
+        Tokenizer.new(
+          tokenizer.lexer,
+          String.new(match[match_group].value),
+          secondary: true).to_a
+      when ActionType::Combined
+        # Combine two or more states into one anonymous state
        new_state = @states.map { |name|
-          lexer.states[name]
+          tokenizer.lexer.states[name]
        }.reduce { |state1, state2|
          state1 + state2
        }
-        lexer.states[new_state.name] = new_state
-        lexer.state_stack << new_state.name
+        tokenizer.lexer.states[new_state.name] = new_state
+        tokenizer.state_stack << new_state.name
        [] of Token
      else
-        raise Exception.new("Unknown action type: #{type}")
+        raise Exception.new("Unknown action type: #{@type}")
      end
    end
  end
--- a/src/bytes_regex.cr
+++ b/src/bytes_regex.cr
@@ -31,7 +31,6 @@ module BytesRegex
    end

    def match(str : Bytes, pos = 0) : Array(Match)
-      match = [] of Match
      rc = LibPCRE2.match(
        @re,
        str,
@@ -42,22 +41,23 @@ module BytesRegex
        nil)
      if rc > 0
        ovector = LibPCRE2.get_ovector_pointer(@match_data)
-        (0...rc).each do |i|
+        (0...rc).map do |i|
          m_start = ovector[2 * i]
-          m_size = ovector[2 * i + 1] - m_start
-          if m_size == 0
+          m_end = ovector[2 * i + 1]
+          if m_start == m_end
            m_value = Bytes.new(0)
          else
-            m_value = str[m_start...m_start + m_size]
+            m_value = str[m_start...m_end]
          end
-          match << Match.new(m_value, m_start, m_size)
+          Match.new(m_value, m_start, m_end - m_start)
        end
+      else
+        [] of Match
      end
-      match
    end
  end

-  class Match
+  struct Match
    property value : Bytes
    property start : UInt64
    property size : UInt64
--- a/src/formatters/ansi.cr
+++ b/src/formatters/ansi.cr
@@ -7,19 +7,39 @@ module Tartrazine
    def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
    end

+    private def line_label(i : Int32) : String
+      "#{i + 1}".rjust(4).ljust(5)
+    end
+
    def format(text : String, lexer : Lexer) : String
+      tokenizer = Tokenizer.new(lexer, text)
+      i = 0
      output = String.build do |outp|
-        lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
-          label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-          outp << label
-          line.each do |token|
-            outp << colorize(token[:value], token[:type])
+        outp << line_label(i) if line_numbers?
+        tokenizer.each do |token|
+          outp << colorize(token[:value], token[:type])
+          if token[:value].includes?("\n")
+            i += 1
+            outp << line_label(i) if line_numbers?
          end
        end
      end
      output
    end

+    # def format(text : String, lexer : Lexer) : String
+    #   output = String.build do |outp|
+    #     lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
+    #       label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
+    #       outp << label
+    #       line.each do |token|
+    #         outp << colorize(token[:value], token[:type])
+    #       end
+    #     end
+    #   end
+    #   output
+    # end
+
    def colorize(text : String, token : String) : String
      style = theme.styles.fetch(token, nil)
      return text if style.nil?
--- a/src/formatters/html.cr
+++ b/src/formatters/html.cr
@@ -54,21 +54,29 @@ module Tartrazine
      output
    end

+    private def line_label(i : Int32) : String
+      line_label = "#{i + 1}".rjust(4).ljust(5)
+      line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
+      line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
+      "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
+    end
+
    def format_text(text : String, lexer : Lexer) : String
-      lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
+      # lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
+      tokenizer = Tokenizer.new(lexer, text)
+      i = 0
      output = String.build do |outp|
        if surrounding_pre?
          pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
          outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
        end
        outp << "<code class=\"#{get_css_class("Background")}\">"
-        lines.each_with_index(offset: line_number_start - 1) do |line, i|
-          line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-          line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
-          line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
-          outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
-          line.each do |token|
-            outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+        outp << line_label(i) if line_numbers?
+        tokenizer.each do |token|
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+          if token[:value].ends_with? "\n"
+            i += 1
+            outp << line_label(i) if line_numbers?
          end
        end
        outp << "</code></pre>"
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -4,7 +4,6 @@ require "./constants/lexers"
 module Tartrazine
  class LexerFiles
    extend BakedFileSystem
-
    bake_folder "../lexers", __DIR__
  end

@@ -38,80 +37,91 @@ module Tartrazine
    LEXERS_BY_NAME.keys.sort!
  end

+  # A token, the output of the tokenizer
+  alias Token = NamedTuple(type: String, value: String)
+
+  struct Tokenizer
+    include Iterator(Token)
+    property lexer : Lexer
+    property text : Bytes
+    property pos : Int32 = 0
+    @dq = Deque(Token).new
+    property state_stack = ["root"]
+
+    def initialize(@lexer : Lexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @text = text.to_slice
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      if pos == @text.size
+        return stop
+      end
+
+      matched = false
+      while @pos < @text.size
+        @lexer.states[@state_stack.last].rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(@text, @pos, self)
+          if matched
+            @pos = new_pos
+            split_tokens(new_tokens).each { |token| @dq << token }
+            break
+          end
+        end
+        if !matched
+          if @text[@pos] == 10u8
+            @dq << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            @dq << {type: "Error", value: String.new(@text[@pos..@pos])}
+          end
+          @pos += 1
+          break
+        end
+      end
+      self.next
+    end
+
+    # If a token contains newlines, split it after each newline into separate tokens
+    def split_tokens(tokens : Array(Token)) : Array(Token)
+      split_tokens = [] of Token
+      tokens.each do |token|
+        if token[:value].includes?("\n")
+          values = token[:value].split("\n")
+          values.each_with_index do |value, index|
+            value += "\n" if index < values.size - 1
+            split_tokens << {type: token[:type], value: value}
+          end
+        else
+          split_tokens << token
+        end
+      end
+      split_tokens
+    end
+  end
+
  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations on what actions and states do
  # the Pygments documentation is a good place to start.
  # https://pygments.org/docs/lexerdevelopment/
-  class Lexer
+  struct Lexer
    property config = {
      name:             "",
-      aliases:          [] of String,
-      filenames:        [] of String,
-      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }
-    # property xml : String = ""
    property states = {} of String => State
-    property state_stack = ["root"]
-
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = state_stack[0..-1]
-      new_lexer
-    end
-
-    # Turn the text into a list of tokens. The `usingself` parameter
-    # is true when the lexer is being used to tokenize a string
-    # from a larger text that is already being tokenized.
-    # So, when it's true, we don't modify the text.
-    def tokenize(text : String, usingself = false) : Array(Token)
-      @state_stack = ["root"]
-      tokens = [] of Token
-      pos = 0
-      matched = false
-
-      # Respect the `ensure_nl` config option
-      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
-        text += "\n"
-      end
-
-      text_bytes = text.to_slice
-      # Loop through the text, applying rules
-      while pos < text_bytes.size
-        state = states[@state_stack.last]
-        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
-        state.rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
-          if matched
-            # Move position forward, save the tokens,
-            # tokenize from the new position
-            pos = new_pos
-            tokens += new_tokens
-            break
-          end
-        end
-        # If no rule matches, emit an error token
-        unless matched
-          if text_bytes[pos] == 10u8
-            # at EOL, reset state to "root"
-            tokens << {type: "Text", value: "\n"}
-            @state_stack = ["root"]
-          else
-            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
-          end
-          pos += 1
-        end
-      end
-      Lexer.collapse_tokens(tokens)
-    end

    # Collapse consecutive tokens of the same type for easier comparison
    # and smaller output
@@ -135,31 +145,6 @@ module Tartrazine
      result
    end

-    # Group tokens into lines, splitting them when a newline is found
-    def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
-      split_tokens = [] of Token
-      tokens.each do |token|
-        if token[:value].includes?("\n")
-          values = token[:value].split("\n")
-          values.each_with_index do |value, index|
-            value += "\n" if index < values.size - 1
-            split_tokens << {type: token[:type], value: value}
-          end
-        else
-          split_tokens << token
-        end
-      end
-      lines = [Array(Token).new]
-      split_tokens.each do |token|
-        lines.last << token
-        if token[:value].includes?("\n")
-          lines << Array(Token).new
-        end
-      end
-      lines
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      lexer = XML.parse(xml).first_element_child
@@ -170,9 +155,6 @@ module Tartrazine
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
-            aliases:          xml_to_a(config, _alias) || [] of String,
-            filenames:        xml_to_a(config, filename) || [] of String,
-            mime_types:       xml_to_a(config, mime_type) || [] of String,
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            dot_all:          xml_to_s(config, dot_all) == "true",
@@ -236,7 +218,4 @@ module Tartrazine
      new_state
    end
  end
-
-  # A token, the output of the tokenizer
-  alias Token = NamedTuple(type: String, value: String)
 end
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -16,10 +16,10 @@ module Tartrazine
  alias MatchData = Array(Match)

  abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
    abstract def initialize(node : XML::Node)

-    property actions : Array(Action) = [] of Action
+    @actions : Array(Action) = [] of Action

    def add_actions(node : XML::Node)
      node.children.each do |child|
@@ -31,14 +31,13 @@ module Tartrazine

  struct Rule < BaseRule
    property pattern : Regex = Regex.new ""
-    property actions : Array(Action) = [] of Action

-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
      match = pattern.match(text, pos)

      # No match
      return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
+      return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
    end

    def initialize(node : XML::Node)
@@ -55,12 +54,11 @@ module Tartrazine
  # This rule includes another state. If any of the rules of the
  # included state matches, this rule matches.
  struct IncludeStateRule < BaseRule
-    property state : String = ""
+    @state : String = ""

-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
-      lexer.states[state].rules.each do |rule|
-        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+    def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+      tokenizer.@lexer.states[@state].rules.each do |rule|
+        matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
        return true, new_pos, new_tokens if matched
      end
      return false, pos, [] of Token
@@ -79,8 +77,8 @@ module Tartrazine
  struct UnconditionalRule < BaseRule
    NO_MATCH = [] of Match

-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
+    def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
+      return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
    end

    def initialize(node : XML::Node)
--- a/src/styles.cr
+++ b/src/styles.cr
@@ -9,7 +9,7 @@ require "xml"
 module Tartrazine
  alias Color = Sixteen::Color

-  class ThemeFiles
+  struct ThemeFiles
    extend BakedFileSystem
    bake_folder "../styles", __DIR__
  end
@@ -39,7 +39,7 @@ module Tartrazine
    themes.to_a.sort!
  end

-  class Style
+  struct Style
    # These properties are tri-state.
    # true means it's set
    # false means it's not set
@@ -79,7 +79,7 @@ module Tartrazine
    end
  end

-  class Theme
+  struct Theme
    property name : String = ""

    property styles = {} of String => Style
Author	SHA1	Message	Date
Roberto Alsina	10842f7074	v0.5.0	2024-08-16 19:38:40 -03:00
Roberto Alsina	ae03e4612e	todo management	2024-08-16 14:05:34 -03:00
Roberto Alsina	471b2f5050	updated	2024-08-16 14:03:05 -03:00
Roberto Alsina	5a3b08e716	lint	2024-08-16 14:01:16 -03:00
Roberto Alsina	9ebb9f2765	Fix off-by-1	2024-08-16 13:36:11 -03:00
Roberto Alsina	7538fc76aa	Tokenize via an iterator, makes everything much faster	2024-08-16 13:27:02 -03:00
Roberto Alsina	788577b226	Fix comment	2024-08-15 23:56:52 -03:00
Roberto Alsina	1f01146b1f	Minor cleanup	2024-08-15 23:21:21 -03:00
Roberto Alsina	9041b763ea	Remove unused bits of lexer config	2024-08-15 23:17:49 -03:00
Roberto Alsina	ada30915c3	Idiomatic changes	2024-08-15 23:16:29 -03:00
Roberto Alsina	78eff45ea0	Idiomatic changes	2024-08-15 23:11:49 -03:00
Roberto Alsina	e817aedd60	Idiomatic changes	2024-08-15 22:41:24 -03:00
Roberto Alsina	20d6b65346	More idiomatic	2024-08-15 22:01:50 -03:00