mirror of https://github.com/ralsina/tartrazine.git
synced 2025-06-08 12:40:25 -03:00

Compare commits

No commits in common. "cb09dff9f16261661e23d79639ec1ad2e140e854" and "ad664d9f934b879b393092497492eac526b52254" have entirely different histories.

cb09dff9f1 ... ad664d9f93
.gitignore (vendored): 1 line changed
@@ -7,4 +7,3 @@ chroma/
 pygments/
 shard.lock
 .vscode/
-.crystal/
README.md: 16 lines changed
@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-The CLI tool can be used to highlight many things in many styles.
+It's not currently usable because it's not finished, but:
 
+* The lexers work for the implemented languages
+* The provided styles work
+* There is a very very simple HTML formatter
+
 # A port of what? Why "kind of"?
 
-Pygments is a staple of the Python ecosystem, and it's great.
-It lets you highlight code in many languages, and it has many
-themes. Chroma is "Pygments for Go", it's actually a port of
-Pygments to Go, and it's great too.
-
-I wanted that in Crystal, so I started this project. But I did
-not read much of the Pygments code. Or much of Chroma's.
+Because I did not read the Pygments code. And this is actually
+based on [Chroma](https://github.com/alecthomas/chroma) ...
+although I did not read that code either.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma
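
For context on the "XML descriptions" mentioned above: Lexer.from_xml in src/lexer.cr (see its hunk below) consumes exactly such files. A minimal sketch, assuming a hypothetical trimmed-down lexer definition; the element structure here is illustrative, mirroring only the <rule>/<token> shapes visible in this diff:

require "tartrazine"

# Hypothetical miniature lexer definition, Chroma-style.
xml = <<-XML
  <lexer>
    <config>
      <name>mini</name>
    </config>
    <rules>
      <state name="root">
        <rule pattern="\\d+"><token type="LiteralNumberInteger"/></rule>
        <rule pattern="\\s+"><token type="Text"/></rule>
      </state>
    </rules>
  </lexer>
  XML

# Build a lexer from the XML and tokenize a string, as the CLI
# does for the shipped lexer definitions.
lexer = Tartrazine::Lexer.from_xml(xml)
p lexer.tokenize("42 7")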
shard.yml

@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.4.0
+version: 0.3.0
 
 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>
@@ -14,7 +14,6 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
-  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize
src/actions.cr

@@ -8,20 +8,12 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  struct Action
-    property actions : Array(Action) = [] of Action
-
-    @depth : Int32 = 0
-    @lexer_name : String = ""
-    @states : Array(String) = [] of String
-    @states_to_push : Array(String) = [] of String
-    @token_type : String = ""
-
-    def initialize(@type : String, xml : XML::Node?)
-      known_types = %w(token push pop combined bygroups include using usingself)
-      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
-
+  class Action
+    property type : String
+    property xml : XML::Node
+    property actions : Array(Action) = [] of Action
+
+    def initialize(@type : String, @xml : XML::Node?)
       # Some actions may have actions in them, like this:
       # <bygroups>
       # <token type="GenericPrompt"/>

@@ -31,28 +23,10 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      xml.children.each do |node|
+      @xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
-
-      # Prefetch the attributes we need from the XML and keep them
-      case type
-      when "token"
-        @token_type = xml["type"]
-      when "push"
-        @states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-      when "pop"
-        @depth = xml["depth"].to_i
-      when "using"
-        @lexer_name = xml["lexer"].downcase
-      when "combined"
-        @states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity

@@ -60,22 +34,35 @@ module Tartrazine
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
+        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
       when "push"
-        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
-        to_push.each do |state|
-          if state == "#pop" && lexer.state_stack.size > 1
+        states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+        if states_to_push.empty?
+          # Push without a state means push the current state
+          states_to_push = [lexer.state_stack.last]
+        end
+        states_to_push.each do |state|
+          if state == "#pop"
+            # Pop the state
+            Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
+            # Really push
             lexer.state_stack << state
+            Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        to_pop = [@depth, lexer.state_stack.size - 1].min
-        lexer.state_stack.pop(to_pop)
+        depth = xml["depth"].to_i
+        Log.trace { "Popping #{depth} states" }
+        if lexer.state_stack.size <= depth
+          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
+        else
+          lexer.state_stack.pop(depth)
+        end
         [] of Token
       when "bygroups"
         # FIXME: handle

@@ -105,15 +92,22 @@ module Tartrazine
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        lexer_name = xml["lexer"].downcase
+        Log.trace { "to tokenize: #{match[match_group]}" }
+        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = lexer.copy
-
+        new_lexer = Lexer.from_xml(lexer.xml)
+        Log.trace { "to tokenize: #{match[match_group]}" }
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        new_state = @states.map { |name|
+        states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+        new_state = states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2

@@ -122,7 +116,7 @@ module Tartrazine
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}")
+        raise Exception.new("Unknown action type: #{type}: #{xml}")
       end
     end
   end
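
The substance of the Action change above is where XML attributes get read: the cb09dff side prefetches them once in the constructor into instance variables, while the ad664d side stores the node and re-reads attributes on every emit. A minimal standalone sketch of the prefetch pattern (class and method names are illustrative, not from the repo):

require "xml"

class PrefetchedAction
  @token_type : String

  def initialize(node : XML::Node)
    # Read the attribute once, at parse time...
    @token_type = node["type"]
  end

  def emit_type : String
    # ...so the hot path never touches the XML again.
    @token_type
  end
end

node = XML.parse(%(<token type="GenericPrompt"/>)).first_element_child.not_nil!
p PrefetchedAction.new(node).emit_type # => "GenericPrompt"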
src/bytes_regex.cr

@@ -3,7 +3,7 @@ module BytesRegex
 
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase

@@ -22,26 +22,27 @@ module BytesRegex
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
-      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
-      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
+      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        LibPCRE2::NO_UTF_CHECK,
-        @match_data,
+        0,
+        match_data,
         nil)
-      if rc > 0
-        ovector = LibPCRE2.get_ovector_pointer(@match_data)
+      if rc < 0
+        # No match, do nothing
+      else
+        ovector = LibPCRE2.get_ovector_pointer(match_data)
         (0...rc).each do |i|
           m_start = ovector[2 * i]
           m_size = ovector[2 * i + 1] - m_start

@@ -53,6 +54,7 @@ module BytesRegex
           match << Match.new(m_value, m_start, m_size)
         end
       end
+      LibPCRE2.match_data_free(match_data)
       match
     end
   end
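
The ovector reads above are the heart of match: for capture group i, PCRE2 stores half-open [start, end) byte offsets at ovector positions 2*i and 2*i + 1, and rc is the number of captured pairs. A worked sketch with hand-filled offsets (the values are what a pattern like ([a-z]+)(\d+) would produce against "abc123"):

str = "abc123".to_slice
rc = 3 # whole match plus two capture groups
ovector = [0_u64, 6_u64, 0_u64, 3_u64, 3_u64, 6_u64]
(0...rc).each do |i|
  m_start = ovector[2 * i]
  m_size = ovector[2 * i + 1] - m_start
  puts String.new(str[m_start, m_size])
end
# Prints "abc123", "abc", "123" — group 0 is the whole match.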
src/formatters/html.cr

@@ -1,6 +1,5 @@
-require "../constants/token_abbrevs.cr"
 require "../formatter"
 require "html"
 
 module Tartrazine
   class Html < Formatter

@@ -68,7 +67,8 @@ module Tartrazine
         line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
         outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
         line.each do |token|
-          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+          fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
+          outp << fragment
         end
       end
       outp << "</code></pre>"

@@ -104,17 +104,15 @@ module Tartrazine
 
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      if !theme.styles.has_key? token
+      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
+
       # Themes don't contain information for each specific
       # token type. However, they may contain information
       # for a parent style. Worst case, we go to the root
       # (Background) style.
-        parent = theme.style_parents(token).reverse.find { |dad|
-          theme.styles.has_key?(dad)
-        }
-        theme.styles[token] = theme.styles[parent]
-      end
-      class_prefix + Abbreviations[token]
+      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
+        theme.styles.has_key?(parent)
+      }]
     end
 
     # Is this line in the highlighted ranges?
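
Both sides of get_css_class implement the same fallback described in the comment: walk the token's ancestry until the theme defines a style. A small illustration, assuming style_parents returns root-first ancestry for a Pygments-style token name (names here are illustrative, not from a real theme):

styles = Set{"Background", "LiteralString"}
# Hypothetical style_parents("LiteralStringDouble"), root first:
parents = ["Background", "Literal", "LiteralString"]
# .reverse.find picks the most specific ancestor with a style:
p parents.reverse.find { |ancestor| styles.includes?(ancestor) }
# => "LiteralString"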
src/lexer.cr: 21 lines changed
@@ -56,17 +56,11 @@ module Tartrazine
       not_multiline: false,
       ensure_nl: false,
     }
-    # property xml : String = ""
-    property states = {} of String => State
-    property state_stack = ["root"]
+    property xml : String = ""
 
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = state_stack[0..-1]
-      new_lexer
-    end
+    property states = {} of String => State
+
+    property state_stack = ["root"]
 
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string

@@ -93,10 +87,12 @@ module Tartrazine
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
+          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
+        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched

@@ -162,6 +158,7 @@ module Tartrazine
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
+      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|

@@ -225,9 +222,9 @@ module Tartrazine
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  struct State
+  class State
     property name : String = ""
-    property rules = [] of BaseRule
+    property rules = [] of Rule
 
     def +(other : State)
       new_state = State.new
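
One detail worth noting in the removed copy method: new_lexer.state_stack = state_stack[0..-1] uses Array#[0..-1] to build a new array, so the copied lexer can push and pop states without mutating the original's stack (states, by contrast, is deliberately shared). In isolation:

a = ["root"]
b = a[0..-1] # shallow copy, not an alias
b << "string"
p a # => ["root"]
p b # => ["root", "string"]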
src/main.cr

@@ -77,7 +77,7 @@ if options["-f"]
 
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf << formatter.style_defs
+      outf.puts formatter.style_defs
     end
     exit 0
   end

@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf << output
+      outf.puts output
     end
   end
 end
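
The only change in these two hunks is IO#<< versus IO#puts: << writes the string as-is, while puts appends a trailing newline, so the written CSS and output files end with "\n". In isolation:

io = IO::Memory.new
io << "body {}"
io.puts "body {}"
p io.to_s # => "body {}body {}\n"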
src/rules.cr: 76 lines changed
@@ -15,11 +15,41 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-    abstract def initialize(node : XML::Node)
-
+  class Rule
+    property pattern : Regex = Regex.new ""
+    property actions : Array(Action) = [] of Action
+    property xml : String = "foo"
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # We don't match if the match doesn't move the cursor
+      # because that causes infinite loops
+      return false, pos, [] of Token if match.empty? || match[0].size == 0
+      # p! match, String.new(text[pos..pos+20])
+      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
+      tokens = [] of Token
+      # Emit the tokens
+      actions.each do |action|
+        # Emit the token
+        tokens += action.emit(match, lexer)
+      end
+      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
+      return true, pos + match[0].size, tokens
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      @xml = node.to_s
+      pattern = node["pattern"]
+      # flags = Regex::Options::ANCHORED
+      # MULTILINE implies DOTALL which we don't want, so we
+      # use in-pattern flag (?m) instead
+      # flags |= Regex::Options::MULTILINE if multiline
+      pattern = "(?m)" + pattern if multiline
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|

@@ -29,44 +59,23 @@ module Tartrazine
       end
     end
 
-  struct Rule < BaseRule
-    property pattern : Regex = Regex.new ""
-    property actions : Array(Action) = [] of Action
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-
-      # No match
-      return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
-    end
-
-    def initialize(node : XML::Node)
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      pattern = node["pattern"]
-      pattern = "(?m)" + pattern if multiline
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
-  end
-
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  struct IncludeStateRule < BaseRule
+  class IncludeStateRule < Rule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
+      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }

@@ -76,14 +85,17 @@ module Tartrazine
     end
 
   # This rule always matches, unconditionally
-  struct UnconditionalRule < BaseRule
-    NO_MATCH = [] of Match
-
+  class UnconditionalRule < Rule
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
+      tokens = [] of Token
+      actions.each do |action|
+        tokens += action.emit([] of Match, lexer)
+      end
+      return true, pos, tokens
     end
 
     def initialize(node : XML::Node)
+      @xml = node.to_s
       add_actions(node)
     end
   end
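
The comment in Rule#match ("the match doesn't move the cursor... causes infinite loops") is easy to verify: a zero-width pattern succeeds without consuming input, so without the match[0].size == 0 guard, tokenize would retry the same rule at the same pos forever. With Crystal's built-in Regex:

# /a*/ matches the empty string at position 0 of "bbb":
md = /a*/.match("bbb", 0).not_nil!
p md[0].size # => 0 — a "successful" match that consumed nothing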