Compare commits


12 Commits

10 changed files with 117 additions and 118 deletions

.gitignore vendored

@@ -7,3 +7,4 @@ chroma/
 pygments/
 shard.lock
 .vscode/
+.crystal/


@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
-It's not currently usable because it's not finished, but:
 * The lexers work for the implemented languages
 * The provided styles work
 * There is a very very simple HTML formatter
+The CLI tool can be used to highlight many things in many styles.
 # A port of what? Why "kind of"?
 Because I did not read the Pygments code. And this is actually
 based on [Chroma](https://github.com/alecthomas/chroma) ...
 although I did not read that code either.
 Pygments is a staple of the Python ecosystem, and it's great.
 It lets you highlight code in many languages, and it has many
 themes. Chroma is "Pygments for Go", it's actually a port of
 Pygments to Go, and it's great too.
 I wanted that in Crystal, so I started this project. But I did
 not read much of the Pygments code. Or much of Chroma's.
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma


@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.3.0
+version: 0.4.0
 
 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>


@@ -14,6 +14,7 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
+  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 # These testcases fail because of differences in the way chroma and tartrazine tokenize


@@ -8,12 +8,20 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  class Action
-    property type : String
-    property xml : XML::Node
+  struct Action
     property actions : Array(Action) = [] of Action
+    property type : String
+
+    @depth : Int32 = 0
+    @lexer_name : String = ""
+    @states : Array(String) = [] of String
+    @states_to_push : Array(String) = [] of String
+    @token_type : String = ""
+
+    def initialize(@type : String, xml : XML::Node?)
+      known_types = %w(token push pop combined bygroups include using usingself)
+      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
-    def initialize(@type : String, @xml : XML::Node?)
       # Some actions may have actions in them, like this:
       # <bygroups>
       #   <token type="GenericPrompt"/>
@@ -23,10 +31,28 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      @xml.children.each do |node|
+      xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
+
+      # Prefetch the attributes we need from the XML and keep them
+      case type
+      when "token"
+        @token_type = xml["type"]
+      when "push"
+        @states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      when "pop"
+        @depth = xml["depth"].to_i
+      when "using"
+        @lexer_name = xml["lexer"].downcase
+      when "combined"
+        @states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
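
The pattern this hunk introduces is parse-once, emit-many: an Action is built a single time when the lexer XML is loaded, but emit runs on every regex match, so moving the XML attribute lookups into the constructor takes them off the hot path. A minimal standalone sketch of the idea (hypothetical names, not the actual Tartrazine code):

    require "xml"

    # Read the XML attribute once, at construction time,
    # instead of on every emit call.
    struct TokenAction
      @token_type : String

      def initialize(xml : XML::Node)
        @token_type = xml["type"] # one XML lookup, at load time
      end

      def emit(value : String)
        {type: @token_type, value: value} # the hot path touches no XML
      end
    end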
@@ -34,35 +60,22 @@
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
+        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
       when "push"
-        states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        if states_to_push.empty?
-          # Push without a state means push the current state
-          states_to_push = [lexer.state_stack.last]
-        end
-        states_to_push.each do |state|
-          if state == "#pop"
+        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+        to_push.each do |state|
+          if state == "#pop" && lexer.state_stack.size > 1
             # Pop the state
             Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
             # Really push
             lexer.state_stack << state
             Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        depth = xml["depth"].to_i
-        Log.trace { "Popping #{depth} states" }
-        if lexer.state_stack.size <= depth
-          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
-        else
-          lexer.state_stack.pop(depth)
-        end
+        to_pop = [@depth, lexer.state_stack.size - 1].min
+        lexer.state_stack.pop(to_pop)
         [] of Token
       when "bygroups"
         # FIXME: handle
@@ -92,22 +105,15 @@
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        lexer_name = xml["lexer"].downcase
-        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = Lexer.from_xml(lexer.xml)
-        Log.trace { "to tokenize: #{match[match_group]}" }
+        new_lexer = lexer.copy
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        new_state = states.map { |name|
+        new_state = @states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
@@ -116,7 +122,7 @@
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}: #{xml}")
+        raise Exception.new("Unknown action type: #{type}")
       end
     end
   end
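
A behavioral detail in the "pop" branch above: the new code clamps the requested depth with [@depth, lexer.state_stack.size - 1].min, so the stack can never be emptied and the root state always survives. A worked example with hypothetical values:

    stack = ["root", "string"]       # size 2
    to_pop = [5, stack.size - 1].min # depth 5 requested, clamped to 1
    stack.pop(to_pop)                # stack is now ["root"]

This also replaces the old log-and-skip branch: an over-deep pop now pops as much as it safely can instead of popping nothing.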


@@ -3,7 +3,7 @@ module BytesRegex
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase
@@ -22,27 +22,26 @@
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
+      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
+      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
-      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        0,
-        match_data,
+        LibPCRE2::NO_UTF_CHECK,
+        @match_data,
         nil)
-      if rc < 0
-        # No match, do nothing
-      else
-        ovector = LibPCRE2.get_ovector_pointer(match_data)
+      if rc > 0
+        ovector = LibPCRE2.get_ovector_pointer(@match_data)
       (0...rc).each do |i|
         m_start = ovector[2 * i]
         m_size = ovector[2 * i + 1] - m_start
@@ -54,7 +53,6 @@
           match << Match.new(m_value, m_start, m_size)
         end
       end
-      LibPCRE2.match_data_free(match_data)
       match
     end
   end
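
The net effect of the changes in this file: the PCRE2 match-data block is now allocated once per compiled pattern (in initialize) and freed once (in finalize), rather than created and destroyed on every match call, and NO_UTF_CHECK skips re-validating the UTF-8 input on each match. A hypothetical usage sketch of the wrapper, using only the API shown above:

    # One match-data buffer lives as long as the Regex itself.
    re = BytesRegex::Regex.new("[a-z]+", ignorecase: true)
    matches = re.match("Hello World".to_slice, 0)

The trade-off of the shared @match_data buffer is that a single Regex instance can no longer run two matches concurrently (for example from two fibers), since both would write into the same block.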


@@ -1,5 +1,6 @@
 require "../constants/token_abbrevs.cr"
 require "../formatter"
+require "html"
 
 module Tartrazine
   class Html < Formatter
@@ -67,8 +68,7 @@
         line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
         outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
         line.each do |token|
-          fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
-          outp << fragment
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
         end
       end
       outp << "</code></pre>"
@@ -104,15 +104,17 @@
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
-      # Themes don't contain information for each specific
-      # token type. However, they may contain information
-      # for a parent style. Worst case, we go to the root
-      # (Background) style.
-      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
-        theme.styles.has_key?(parent)
-      }]
+      if !theme.styles.has_key? token
+        # Themes don't contain information for each specific
+        # token type. However, they may contain information
+        # for a parent style. Worst case, we go to the root
+        # (Background) style.
+        parent = theme.style_parents(token).reverse.find { |dad|
+          theme.styles.has_key?(dad)
+        }
+        theme.styles[token] = theme.styles[parent]
+      end
+      class_prefix + Abbreviations[token]
     end
 
     # Is this line in the highlighted ranges?
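
The HTML.escape change above closes an output-corruption (and potential script-injection) hole: code being highlighted routinely contains <, > and &, which previously landed in the generated HTML verbatim. Crystal's standard-library escaper, as a standalone example:

    require "html"

    # HTML.escape replaces the characters &, <, >, " and ' with entities.
    HTML.escape(%(if a < b && c > d { puts "ok" }))
    # => "if a &lt; b &amp;&amp; c &gt; d { puts &quot;ok&quot; }"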


@@ -56,12 +56,18 @@
       not_multiline: false,
       ensure_nl: false,
     }
-    property xml : String = ""
+    # property xml : String = ""
     property states = {} of String => State
     property state_stack = ["root"]
 
+    def copy : Lexer
+      new_lexer = Lexer.new
+      new_lexer.config = config
+      new_lexer.states = states
+      new_lexer.state_stack = state_stack[0..-1]
+      new_lexer
+    end
+
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
@@ -87,12 +93,10 @@
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
-          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
-        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched
@@ -158,7 +162,6 @@
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
-      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|
@@ -222,9 +225,9 @@
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  class State
+  struct State
     property name : String = ""
-    property rules = [] of Rule
+    property rules = [] of BaseRule
 
     def +(other : State)
       new_state = State.new
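
The new Lexer#copy backs the lexer.copy call in the "usingself" action: config and states are shared between the original and the copy, while state_stack[0..-1] (Crystal's range-slice syntax) produces a fresh array, so the sub-lexer can push and pop states without moving the outer lexer's own stack. A hypothetical usage sketch, assuming a "c" lexer is available:

    lexer = Tartrazine.lexer("c")
    sub = lexer.copy
    sub.state_stack << "string" # mutates only the copy's stack
    lexer.state_stack           # still ["root"]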


@@ -77,7 +77,7 @@ if options["-f"]
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf.puts formatter.style_defs
+      outf << formatter.style_defs
     end
     exit 0
   end
@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf.puts output
+      outf << output
     end
   end
 end
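
Both hunks in this file make the same fix: IO#puts appends a trailing newline (adding a spurious byte to the generated CSS or HTML file), while IO#<< writes the string exactly as given. A minimal illustration:

    File.open("out.txt", "w") { |f| f.puts "abc" } # writes "abc\n", 4 bytes
    File.open("out.txt", "w") { |f| f << "abc" }   # writes "abc", 3 bytes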


@@ -15,41 +15,11 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  class Rule
-    property pattern : Regex = Regex.new ""
+  abstract struct BaseRule
+    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def initialize(node : XML::Node)
+
     property actions : Array(Action) = [] of Action
-    property xml : String = "foo"
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # We don't match if the match doesn't move the cursor
-      # because that causes infinite loops
-      return false, pos, [] of Token if match.empty? || match[0].size == 0
-      # p! match, String.new(text[pos..pos+20])
-      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
-      tokens = [] of Token
-      # Emit the tokens
-      actions.each do |action|
-        # Emit the token
-        tokens += action.emit(match, lexer)
-      end
-      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
-      return true, pos + match[0].size, tokens
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      @xml = node.to_s
-      pattern = node["pattern"]
-      # flags = Regex::Options::ANCHORED
-      # MULTILINE implies DOTALL which we don't want, so we
-      # use in-pattern flag (?m) instead
-      # flags |= Regex::Options::MULTILINE if multiline
-      pattern = "(?m)" + pattern if multiline
-      # flags |= Regex::Options::DOTALL if dotall
-      # flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|
@@ -59,23 +29,44 @@
     end
   end
 
+  struct Rule < BaseRule
+    property pattern : Regex = Regex.new ""
+    property actions : Array(Action) = [] of Action
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # No match
+      return false, pos, [] of Token if match.size == 0
+      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
+    end
+
+    def initialize(node : XML::Node)
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      pattern = node["pattern"]
+      pattern = "(?m)" + pattern if multiline
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
+  end
+
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  class IncludeStateRule < Rule
+  struct IncludeStateRule < BaseRule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
-        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }
@@ -85,17 +76,14 @@
   end
 
   # This rule always matches, unconditionally
-  class UnconditionalRule < Rule
+  struct UnconditionalRule < BaseRule
+    NO_MATCH = [] of Match
+
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      tokens = [] of Token
-      actions.each do |action|
-        tokens += action.emit([] of Match, lexer)
-      end
-      return true, pos, tokens
+      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
     end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       add_actions(node)
     end
   end
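
Taken together, the changes in this file replace one concrete Rule class (whose regex machinery IncludeStateRule and UnconditionalRule inherited but never used) with an abstract BaseRule and three small struct implementations. Crystal structs are passed by value and avoid a heap allocation per rule, and an Array(BaseRule) can still hold all the concrete kinds and dispatch match dynamically. A self-contained sketch of that shape, with simplified hypothetical signatures rather than the real ones:

    # Mirrors State#rules: one array typed as the abstract struct,
    # rules tried in order, first match wins.
    abstract struct BaseRule
      abstract def match(text : String) : Bool
    end

    struct PatternRule < BaseRule
      def initialize(@pattern : String); end

      def match(text : String) : Bool
        text.includes?(@pattern)
      end
    end

    struct UnconditionalRule < BaseRule
      def match(text : String) : Bool
        true # always matches, like the real UnconditionalRule
      end
    end

    rules = [] of BaseRule
    rules << PatternRule.new("def") << UnconditionalRule.new
    puts rules.find { |r| r.match("def foo") } # => PatternRule(@pattern="def")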