9 Commits

8 changed files with 134 additions and 42 deletions

View File

@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-It's not currently usable because it's not finished, but:
-
-* The lexers work for the implemented languages
-* The provided styles work
-* There is a very very simple HTML formatter
+The CLI tool can be used to highlight many things in many styles.
 
 # A port of what? Why "kind of"?
 
-Because I did not read the Pygments code. And this is actually
-based on [Chroma](https://github.com/alecthomas/chroma) ...
-although I did not read that code either.
+Pygments is a staple of the Python ecosystem, and it's great.
+It lets you highlight code in many languages, and it has many
+themes. Chroma is "Pygments for Go"; it's actually a port of
+Pygments to Go, and it's great too.
+
+I wanted that in Crystal, so I started this project. But I did
+not read much of the Pygments code. Or much of Chroma's.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma

View File

@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.3.0
+version: 0.4.0
 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>

View File

@@ -14,15 +14,18 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
+  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize
 # but tartrazine is correct
 bad_in_chroma = {
   "#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
+  "#{__DIR__}/tests/html/javascript_backtracking.txt",
   "#{__DIR__}/tests/java/test_default.txt",
   "#{__DIR__}/tests/java/test_multiline_string.txt",
   "#{__DIR__}/tests/java/test_numeric_literals.txt",
+  "#{__DIR__}/tests/octave/test_multilinecomment.txt",
   "#{__DIR__}/tests/php/test_string_escaping_run.txt",
   "#{__DIR__}/tests/python_2/test_cls_builtin.txt",
 }

@@ -30,19 +33,14 @@ bad_in_chroma = {
 
 known_bad = {
   "#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
   "#{__DIR__}/tests/bash_session/prompt_in_output.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
   "#{__DIR__}/tests/bash_session/ps2_prompt.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
-  "#{__DIR__}/tests/bash_session/test_virtualenv.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
   "#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
-  "#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
-  "#{__DIR__}/tests/html/css_backtracking.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
+  "#{__DIR__}/tests/bash_session/test_virtualenv.txt",
   "#{__DIR__}/tests/mcfunction/data.txt",
   "#{__DIR__}/tests/mcfunction/selectors.txt",
-  "#{__DIR__}/tests/php/anonymous_class.txt",
-  "#{__DIR__}/tests/html/javascript_unclosed.txt",
 }
 
 # Tests that fail because of a limitation in PCRE2
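
These tuples act as exclusion lists when the spec walks the test corpus: anything listed is skipped rather than compared against chroma's reference output. A minimal sketch of that loop, assuming a glob over the tests directory (the exact spec wiring is not shown in this diff):

Dir.glob("#{__DIR__}/tests/**/*.txt").each do |testcase|
  # Skip cases that are known mismatches between chroma and tartrazine
  next if known_bad.includes?(testcase) || bad_in_chroma.includes?(testcase)
  # ... tokenize testcase and compare the output against chroma's
end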

View File

@@ -30,11 +30,11 @@ module Tartrazine
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
       case type
       when "token"
-        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        raise Exception.new "Can't have a token without a match" if match.empty?
+        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
       when "push"
         states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"

@@ -79,23 +79,29 @@ module Tartrazine
         # the action is skipped.
         result = [] of Token
         @actions.each_with_index do |e, i|
-          next if match[i + 1]?.nil?
+          begin
+            next if match[i + 1].size == 0
+          rescue IndexError
+            # FIXME: This should not actually happen
+            # No match for this group
+            next
+          end
           result += e.emit(match, lexer, i + 1)
         end
         result
       when "using"
         # Shunt to another lexer entirely
-        return [] of Token if match.nil?
+        return [] of Token if match.empty?
         lexer_name = xml["lexer"].downcase
         Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
-        return [] of Token if match.nil?
+        return [] of Token if match.empty?
         new_lexer = Lexer.from_xml(lexer.xml)
         Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
         states = xml.attributes.select { |attrib|
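
With this change emit never receives nil: a non-matching rule now passes an empty MatchData, and per-group participation is checked through each group's byte size (the begin/rescue above papers over groups that are absent entirely). A sketch of the new convention, using the MatchData alias introduced in rules.cr; group_text is a hypothetical helper, not part of this PR:

# Hypothetical helper: a group is usable when the rule matched,
# the group exists, and it spans at least one byte.
def group_text(match : MatchData, group : Int32) : String?
  return nil if match.empty?          # the rule did not match at all
  return nil if group >= match.size   # this group did not participate
  String.new(match[group].value)      # convert the Bytes span to String
end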

src/bytes_regex.cr (new file, 75 lines)
View File

@@ -0,0 +1,75 @@
+module BytesRegex
+  extend self
+
+  class Regex
+    def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
+      flags |= LibPCRE2::MULTILINE if multiline
+      flags |= LibPCRE2::DOTALL if dotall
+      flags |= LibPCRE2::CASELESS if ignorecase
+      flags |= LibPCRE2::ANCHORED if anchored
+      if @re = LibPCRE2.compile(
+           pattern,
+           pattern.bytesize,
+           flags,
+           out errorcode,
+           out erroroffset,
+           nil)
+      else
+        msg = String.new(256) do |buffer|
+          bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
+          {bytesize, 0}
+        end
+        raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
+      end
+    end
+
+    def finalize
+      LibPCRE2.code_free(@re)
+    end
+
+    def match(str : Bytes, pos = 0) : Array(Match)
+      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
+      match = [] of Match
+      rc = LibPCRE2.match(
+        @re,
+        str,
+        str.size,
+        pos,
+        LibPCRE2::NO_UTF_CHECK,
+        match_data,
+        nil)
+      if rc < 0
+        # No match, do nothing
+      else
+        ovector = LibPCRE2.get_ovector_pointer(match_data)
+        (0...rc).each do |i|
+          m_start = ovector[2 * i]
+          m_size = ovector[2 * i + 1] - m_start
+          if m_size == 0
+            m_value = Bytes.new(0)
+          else
+            m_value = str[m_start...m_start + m_size]
+          end
+          match << Match.new(m_value, m_start, m_size)
+        end
+      end
+      LibPCRE2.match_data_free(match_data)
+      match
+    end
+  end
+
+  class Match
+    property value : Bytes
+    property start : UInt64
+    property size : UInt64
+
+    def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
+    end
+  end
+end
+
+# pattern = "foo"
+# str = "foo bar"
+# re = BytesRegex::Regex.new(pattern)
+# p! String.new(re.match(str.to_slice)[0].value)
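
This expands on the commented example at the end of the file: match returns the full match at index 0 followed by one Match per participating capture group, since PCRE2 reports one ovector pair for each. A usage sketch (pattern and input are made up):

require "./bytes_regex"

re = BytesRegex::Regex.new("(foo)(bar)?")
re.match("foobaz".to_slice, 0).each_with_index do |m, i|
  # m.value is a Bytes slice of the input; m.start is a byte offset
  puts "group #{i}: #{String.new(m.value).inspect} at byte #{m.start}"
end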

View File

@@ -30,7 +30,7 @@ module Tartrazine
       @standalone : Bool = false,
       @surrounding_pre : Bool = true,
       @wrap_long_lines : Bool = false,
-      @weight_of_bold : Int32 = 600,)
+      @weight_of_bold : Int32 = 600)
     end
 
     def format(text : String, lexer : Lexer) : String
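
The only change here is dropping the stray trailing comma before the closing parenthesis. For orientation, a usage sketch of the options this constructor exposes; the formatter's class name does not appear in this hunk, so HtmlFormatter below is a stand-in, and the lexer name is illustrative:

# HtmlFormatter stands in for the real class name (not shown in this hunk)
formatter = Tartrazine::HtmlFormatter.new(standalone: true, weight_of_bold: 500)
puts formatter.format(File.read("program.cr"), Tartrazine.lexer("crystal"))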

View File

@@ -1,3 +1,4 @@
+require "baked_file_system"
 require "./constants/lexers"
 
 module Tartrazine

@@ -65,7 +66,7 @@ module Tartrazine
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
     # So, when it's true, we don't modify the text.
-    def tokenize(text, usingself = false) : Array(Token)
+    def tokenize(text : String, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0

@@ -76,12 +77,13 @@ module Tartrazine
         text += "\n"
       end
 
+      text_bytes = text.to_slice
       # Loop through the text, applying rules
-      while pos < text.size
+      while pos < text_bytes.size
         state = states[@state_stack.last]
         # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text, pos, self)
+          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
           if matched
             # Move position forward, save the tokens,
             # tokenize from the new position

@@ -94,8 +96,13 @@ module Tartrazine
         end
         # If no rule matches, emit an error token
         unless matched
-          # Log.trace { "Error at #{pos}" }
-          tokens << {type: "Error", value: "#{text[pos]}"}
+          if text_bytes[pos] == 10u8
+            # at EOL, reset state to "root"
+            tokens << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
+          end
           pos += 1
         end
       end
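
Taken together: tokenize now walks byte offsets over text.to_slice, and an unmatched newline byte (10u8) resets the state stack to "root" instead of producing a cascade of Error tokens. A minimal driving sketch (the "java" lexer name and file are illustrative; Token is the {type, value} named tuple used above):

lexer = Tartrazine.lexer("java")
tokens = lexer.tokenize(File.read("Example.java"))
tokens.each do |token|
  # "Error" marks bytes no rule could match
  puts "#{token[:type]}\t#{token[:value].inspect}"
end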

View File

@@ -1,8 +1,9 @@
 require "./actions"
+require "./bytes_regex"
 require "./formatter"
+require "./lexer"
 require "./rules"
 require "./styles"
-require "./lexer"
 
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the

@@ -10,16 +11,21 @@ require "./lexer"
 module Tartrazine
 
   # This rule matches via a regex pattern
+  alias Regex = BytesRegex::Regex
+  alias Match = BytesRegex::Match
+  alias MatchData = Array(Match)
+
   class Rule
     property pattern : Regex = Regex.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.end == 0
+      return false, pos, [] of Token if match.empty? || match[0].size == 0
+      # p! match, String.new(text[pos..pos+20])
       # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens

@@ -27,21 +33,21 @@ module Tartrazine
         # Emit the token
         tokens += action.emit(match, lexer)
       end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
+      return true, pos + match[0].size, tokens
     end
 
     def initialize(node : XML::Node, multiline, dotall, ignorecase)
       @xml = node.to_s
       pattern = node["pattern"]
-      flags = Regex::Options::ANCHORED
+      # flags = Regex::Options::ANCHORED
       # MULTILINE implies DOTALL which we don't want, so we
       # use in-pattern flag (?m) instead
       # flags |= Regex::Options::MULTILINE if multiline
       pattern = "(?m)" + pattern if multiline
-      flags |= Regex::Options::DOTALL if dotall
-      flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, flags)
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
       add_actions(node)
     end

@@ -83,7 +89,7 @@ module Tartrazine
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       tokens = [] of Token
       actions.each do |action|
-        tokens += action.emit(nil, lexer)
+        tokens += action.emit([] of Match, lexer)
       end
       return true, pos, tokens
     end
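
The contract between the lexer loop and Rule#match is the three-tuple shown above: whether the rule matched, the new byte position, and the tokens produced. Because the pattern is now compiled with anchored set to true, a match always starts at pos, so the new position is pos plus the full match's byte size. A sketch of a caller honoring that contract (names borrowed from the lexer loop):

matched, new_pos, new_tokens = rule.match(text_bytes, pos, lexer)
if matched
  pos = new_pos          # advance by the byte size of match[0]
  tokens += new_tokens   # accumulate tokens emitted by the rule's actions
end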