Compare commits

12 Commits

10 changed files with 117 additions and 118 deletions

.gitignore (vendored)

@@ -7,3 +7,4 @@ chroma/
 pygments/
 shard.lock
 .vscode/
+.crystal/


@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-It's not currently usable because it's not finished, but:
-
-* The lexers work for the implemented languages
-* The provided styles work
-* There is a very very simple HTML formatter
+The CLI tool can be used to highlight many things in many styles.
 
 # A port of what? Why "kind of"?
 
-Because I did not read the Pygments code. And this is actually
-based on [Chroma](https://github.com/alecthomas/chroma) ...
-although I did not read that code either.
+Pygments is a staple of the Python ecosystem, and it's great.
+It lets you highlight code in many languages, and it has many
+themes. Chroma is "Pygments for Go", it's actually a port of
+Pygments to Go, and it's great too.
+
+I wanted that in Crystal, so I started this project. But I did
+not read much of the Pygments code. Or much of Chroma's.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma
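
For orientation, those XML descriptions are what `Lexer.from_xml` (changed further down in this compare) consumes. A minimal sketch of loading one; the element and attribute names (`rule`, `token`, `state`, `pattern`) are the ones that appear in the diffs below, the exact schema is Chroma's, and the require path is an assumption:

    require "tartrazine" # assumption: the shard is available under this name

    # Hypothetical lexer definition in the Chroma XML style: a "root"
    # state whose rules match a regex and emit a token type.
    definition = <<-XML
      <lexer>
        <config>
          <name>example</name>
        </config>
        <rules>
          <state name="root">
            <rule pattern="[0-9]+">
              <token type="LiteralNumber"/>
            </rule>
            <rule pattern="\\w+">
              <token type="Name"/>
            </rule>
          </state>
        </rules>
      </lexer>
      XML

    lexer = Tartrazine::Lexer.from_xml(definition)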


@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.3.0
+version: 0.4.0
 
 authors:
 - Roberto Alsina <roberto.alsina@gmail.com>


@@ -14,6 +14,7 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
+  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize


@@ -8,12 +8,20 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  class Action
-    property type : String
-    property xml : XML::Node
+  struct Action
     property actions : Array(Action) = [] of Action
+    property type : String
+
+    @depth : Int32 = 0
+    @lexer_name : String = ""
+    @states : Array(String) = [] of String
+    @states_to_push : Array(String) = [] of String
+    @token_type : String = ""
 
-    def initialize(@type : String, @xml : XML::Node?)
+    def initialize(@type : String, xml : XML::Node?)
+      known_types = %w(token push pop combined bygroups include using usingself)
+      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
+
       # Some actions may have actions in them, like this:
       # <bygroups>
       #   <token type="GenericPrompt"/>
@@ -23,10 +31,28 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      @xml.children.each do |node|
+      xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
+
+      # Prefetch the attributes we need from the XML and keep them
+      case type
+      when "token"
+        @token_type = xml["type"]
+      when "push"
+        @states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      when "pop"
+        @depth = xml["depth"].to_i
+      when "using"
+        @lexer_name = xml["lexer"].downcase
+      when "combined"
+        @states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
@@ -34,35 +60,22 @@ module Tartrazine
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
+        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
       when "push"
-        states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        if states_to_push.empty?
-          # Push without a state means push the current state
-          states_to_push = [lexer.state_stack.last]
-        end
-        states_to_push.each do |state|
-          if state == "#pop"
+        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+        to_push.each do |state|
+          if state == "#pop" && lexer.state_stack.size > 1
             # Pop the state
-            Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
             # Really push
             lexer.state_stack << state
-            Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        depth = xml["depth"].to_i
-        Log.trace { "Popping #{depth} states" }
-        if lexer.state_stack.size <= depth
-          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
-        else
-          lexer.state_stack.pop(depth)
-        end
+        to_pop = [@depth, lexer.state_stack.size - 1].min
+        lexer.state_stack.pop(to_pop)
         [] of Token
       when "bygroups"
         # FIXME: handle
@@ -92,22 +105,15 @@ module Tartrazine
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        lexer_name = xml["lexer"].downcase
-        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = Lexer.from_xml(lexer.xml)
-        Log.trace { "to tokenize: #{match[match_group]}" }
+        new_lexer = lexer.copy
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        new_state = states.map { |name|
+        new_state = @states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
@@ -116,7 +122,7 @@ module Tartrazine
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}: #{xml}")
+        raise Exception.new("Unknown action type: #{type}")
       end
     end
   end
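
The shape of this refactor: `Action` used to hold its `XML::Node` and query it on every `emit`; now the constructor validates the action type up front and copies the handful of attributes it needs into plain instance variables, so the per-token path never touches XML again. A minimal sketch of the pattern, with simplified, hypothetical names rather than the real API:

    require "xml"

    # Prefetch-at-construction: one XML lookup when the lexer loads,
    # zero XML lookups per emitted token.
    struct TokenAction
      @token_type : String

      def initialize(node : XML::Node)
        @token_type = node["type"] # raises if the attribute is missing
      end

      def emit(value : String)
        {type: @token_type, value: value} # hot path: plain ivar read
      end
    end

    node = XML.parse(%(<token type="GenericPrompt"/>)).first_element_child.not_nil!
    action = TokenAction.new(node)
    action.emit("$ ") # => {type: "GenericPrompt", value: "$ "}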


@@ -3,7 +3,7 @@ module BytesRegex
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase
@@ -22,27 +22,26 @@ module BytesRegex
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
+      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
+      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
-      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        0,
-        match_data,
+        LibPCRE2::NO_UTF_CHECK,
+        @match_data,
         nil)
-      if rc < 0
-        # No match, do nothing
-      else
-        ovector = LibPCRE2.get_ovector_pointer(match_data)
+      if rc > 0
+        ovector = LibPCRE2.get_ovector_pointer(@match_data)
         (0...rc).each do |i|
           m_start = ovector[2 * i]
           m_size = ovector[2 * i + 1] - m_start
@@ -54,7 +53,6 @@ module BytesRegex
           match << Match.new(m_value, m_start, m_size)
         end
       end
-      LibPCRE2.match_data_free(match_data)
       match
     end
   end
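
Two wins are folded into this file: the PCRE2 match-data block is now allocated once per compiled pattern instead of once per `match` call, and `NO_UTF_CHECK` skips re-validating the subject as UTF-8 on every call, since the lexer feeds the same already-validated bytes through every pattern. The same allocate-once idea in plain, self-contained Crystal, for illustration only (`Matcher` is hypothetical; the trade-off is that one instance can no longer be used from two fibers at once, since both would share the scratch buffer):

    # Scratch state that used to be built and thrown away inside the
    # method now lives with the object and is reused across calls.
    class Matcher
      def initialize
        @scratch = Array(Int32).new(32, 0) # was: allocated per call
      end

      def match(bytes : Bytes) : Int32
        @scratch.fill(0) # reuse the same backing memory
        bytes.each_with_index { |b, i| @scratch[i % 32] += b }
        @scratch.sum
      end
    end

    m = Matcher.new
    m.match("hello".to_slice) # => 532, no per-call buffer allocation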


@@ -1,5 +1,6 @@
 require "../constants/token_abbrevs.cr"
 require "../formatter"
+require "html"
 
 module Tartrazine
   class Html < Formatter
@@ -67,8 +68,7 @@ module Tartrazine
         line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
         outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
         line.each do |token|
-          fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
-          outp << fragment
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
         end
       end
       outp << "</code></pre>"
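
This is a correctness fix, not just style: token values are arbitrary source text, so an unescaped `<` or `&` in the input used to land verbatim inside the generated markup. Crystal's stdlib escaper (the `require "html"` added above) is enough:

    require "html"

    # Without escaping, this token value would open a bogus tag
    # inside the <span> the formatter emits.
    HTML.escape("for i in <list> && done") # => "for i in &lt;list&gt; &amp;&amp; done"
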
@@ -104,15 +104,17 @@ module Tartrazine
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
-
+      if !theme.styles.has_key? token
         # Themes don't contain information for each specific
         # token type. However, they may contain information
         # for a parent style. Worst case, we go to the root
         # (Background) style.
-      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
-        theme.styles.has_key?(parent)
-      }]
+        parent = theme.style_parents(token).reverse.find { |dad|
+          theme.styles.has_key?(dad)
+        }
+        theme.styles[token] = theme.styles[parent]
+      end
+      class_prefix + Abbreviations[token]
     end
 
     # Is this line in the highlighted ranges?
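
The rewrite trades the old every-call parent walk for a memoized one: the first lookup of an unknown token type resolves the nearest themed ancestor and stores it in `theme.styles`, so later lookups are plain hash hits. The pattern in isolation, with hypothetical names and data (the real code walks `theme.style_parents`):

    # First miss resolves the ancestor chain; the result is cached so
    # subsequent calls are a single Hash lookup.
    styles  = {"Literal" => "color: #e6db74"}
    parents = {"LiteralNumberHex" => ["Literal", "LiteralNumber"]}

    def resolve(token : String, styles, parents) : String
      unless styles.has_key?(token)
        parent = parents[token].reverse.find { |p| styles.has_key?(p) }
        styles[token] = styles[parent.not_nil!] # assumes a root style always exists
      end
      styles[token]
    end

    resolve("LiteralNumberHex", styles, parents) # walks the parents once
    resolve("LiteralNumberHex", styles, parents) # cached: direct hit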


@@ -56,12 +56,18 @@ module Tartrazine
       not_multiline: false,
       ensure_nl: false,
     }
-    property xml : String = ""
+    # property xml : String = ""
     property states = {} of String => State
     property state_stack = ["root"]
 
+    def copy : Lexer
+      new_lexer = Lexer.new
+      new_lexer.config = config
+      new_lexer.states = states
+      new_lexer.state_stack = state_stack[0..-1]
+      new_lexer
+    end
+
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
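
`copy` is what makes the `usingself` action cheap (see the actions hunk above, where `Lexer.from_xml(lexer.xml)` became `lexer.copy`): `config` and the compiled `states` are shared, and only the mutable `state_stack` is duplicated via `state_stack[0..-1]`. That slice is a real copy, which is the property the nested tokenizer relies on:

    # Array#[0..-1] allocates a new array over the same elements, so the
    # nested lexer can push and pop states without touching the parent's.
    original = ["root", "string"]
    duplicate = original[0..-1]
    duplicate << "interpolation"
    original  # => ["root", "string"]
    duplicate # => ["root", "string", "interpolation"]
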
@@ -87,12 +93,10 @@ module Tartrazine
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
-          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
-        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched
@@ -158,7 +162,6 @@ module Tartrazine
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
-      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|
@@ -222,9 +225,9 @@ module Tartrazine
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  class State
+  struct State
     property name : String = ""
-    property rules = [] of Rule
+    property rules = [] of BaseRule
 
     def +(other : State)
       new_state = State.new


@@ -77,7 +77,7 @@ if options["-f"]
 
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf.puts formatter.style_defs
+      outf << formatter.style_defs
     end
     exit 0
   end
@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf.puts output
+      outf << output
     end
   end
 end
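
A small but deliberate change: `IO#puts` appends a newline (unless the string already ends in one), while `IO#<<` writes the value byte-for-byte, so the output and CSS files now end exactly where `style_defs` and `output` end:

    io = IO::Memory.new
    io.puts "text" # writes "text\n"
    io << "text"   # writes "text" exactly
    io.to_s        # => "text\ntext"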


@@ -15,41 +15,11 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  class Rule
-    property pattern : Regex = Regex.new ""
+  abstract struct BaseRule
+    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def initialize(node : XML::Node)
+
     property actions : Array(Action) = [] of Action
-    property xml : String = "foo"
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # We don't match if the match doesn't move the cursor
-      # because that causes infinite loops
-      return false, pos, [] of Token if match.empty? || match[0].size == 0
-      # p! match, String.new(text[pos..pos+20])
-      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
-      tokens = [] of Token
-      # Emit the tokens
-      actions.each do |action|
-        # Emit the token
-        tokens += action.emit(match, lexer)
-      end
-      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
-      return true, pos + match[0].size, tokens
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      @xml = node.to_s
-      pattern = node["pattern"]
-      # flags = Regex::Options::ANCHORED
-      # MULTILINE implies DOTALL which we don't want, so we
-      # use in-pattern flag (?m) instead
-      # flags |= Regex::Options::MULTILINE if multiline
-      pattern = "(?m)" + pattern if multiline
-      # flags |= Regex::Options::DOTALL if dotall
-      # flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|
@@ -59,23 +29,44 @@ module Tartrazine
       end
     end
 
+  struct Rule < BaseRule
+    property pattern : Regex = Regex.new ""
+    property actions : Array(Action) = [] of Action
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # No match
+      return false, pos, [] of Token if match.size == 0
+      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
+    end
+
+    def initialize(node : XML::Node)
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      pattern = node["pattern"]
+      pattern = "(?m)" + pattern if multiline
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
+  end
+
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  class IncludeStateRule < Rule
+  struct IncludeStateRule < BaseRule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
-        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }
@@ -85,17 +76,14 @@ module Tartrazine
   end
 
   # This rule always matches, unconditionally
-  class UnconditionalRule < Rule
+  struct UnconditionalRule < BaseRule
+    NO_MATCH = [] of Match
+
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      tokens = [] of Token
-      actions.each do |action|
-        tokens += action.emit([] of Match, lexer)
-      end
-      return true, pos, tokens
+      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
    end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       add_actions(node)
     end
   end
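
The thread running through this whole compare: `Action`, `State`, and the rules went from classes to structs, i.e. from heap-allocated references to inline value types, which matters when a lexer instantiates thousands of them. Because Crystal only allows inheriting from an abstract struct, the old concrete `Rule` base class had to become the `abstract struct BaseRule` above. The language rule in miniature, with hypothetical types:

    # Concrete structs cannot be subclassed; an abstract struct can.
    abstract struct BasePiece
      abstract def weight : Int32
    end

    struct Pawn < BasePiece
      def weight : Int32
        1
      end
    end

    Pawn.new.weight # => 1
    # struct Promoted < Pawn; end  # Error: can't extend non-abstract struct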