mirror of https://github.com/ralsina/tartrazine.git
synced 2025-06-08 04:30:26 -03:00

Compare commits
17 commits: 5c074344d5 ... 84ee7e6934
Commits in this range (SHA1 only):
84ee7e6934
89d212b71c
a92d2501f7
6b44bcb5ad
86a5894429
be12e0f4f1
96dcb7e15e
d1762f477a
f98f44365f
d0c2b1764a
e6a292ade0
4ced996f90
fd5af6ba3b
47237eecc3
a0ff4e0118
ece3d4163a
3180168261
Dockerfile.static (new file, 15 lines)
@@ -0,0 +1,15 @@
FROM --platform=${TARGETPLATFORM:-linux/amd64} alpine:3.20 AS build
RUN apk add --no-cache \
    crystal \
    shards \
    yaml-dev \
    yaml-static \
    openssl-dev \
    openssl-libs-static \
    libxml2-dev \
    libxml2-static \
    zlib-dev \
    zlib-static \
    xz-dev \
    xz-static \
    make
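The -dev and -static package pairs pull in both the headers and the .a archives needed to link a fully static Crystal binary against Alpine's musl. As a usage note (the command is an assumption, not part of the diff): given the TARGETPLATFORM argument above, the image would be built with something like docker build -f Dockerfile.static . for the host platform, or a docker buildx multi-platform build for cross compilation.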
README.md
@@ -36,9 +36,7 @@ It has 332 themes (64 from Chroma, the rest are base16 themes via

 ## Installation

-This will have a CLI tool that can be installed, but it's not
-there yet.
+This has a CLI but it's not generally usable.

 ## Usage

scripts/token_abbrevs.py
@@ -1,24 +1,46 @@
 import sys
 import string
+import glob

-# Run it as grep token lexers/* | python scripts/token_abbrevs.py
+tokens = {"Highlight"}
+abbrevs = {"Highlight": "hl"}

 def abbr(line):
     return "".join(c for c in line if c in string.ascii_uppercase).lower()

-abbrevs = {}
-tokens = set([])
-for line in sys.stdin:
-    if "<token" not in line:
-        continue
-    line = line.strip()
-    line = line.split('<token ',1)[-1]
-    line = line.split('"')[1]
-    abbrevs[line] = abbr(line)
-    tokens.add(line)
+def check_abbrevs():
+    if len(abbrevs) != len(tokens):
+        print("Warning: Abbreviations are not unique")
+        print(len(abbrevs), len(tokens))
+        sys.exit(1)

-print("Abbreviations: {")
+# Processes all files in lexers looking for token names
+for fname in glob.glob("lexers/*.xml"):
+    with open(fname) as f:
+        for line in f:
+            if "<token" not in line:
+                continue
+            line = line.strip()
+            line = line.split('<token ',1)[-1]
+            line = line.split('"')[1]
+            abbrevs[line] = abbr(line)
+            tokens.add(line)
+check_abbrevs()
+
+# Processes all files in styles looking for token names too
+for fname in glob.glob("styles/*.xml"):
+    with open(fname) as f:
+        for line in f:
+            if "<entry" not in line:
+                continue
+            line = line.strip()
+            line = line.split('type=',1)[-1]
+            line = line.split('"')[1]
+            abbrevs[line] = abbr(line)
+            tokens.add(line)
+check_abbrevs()
+
+print("Abbreviations = {")
 for k, v in abbrevs.items():
     print(f'    "{k}" => "{v}",')
 print("}")
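A usage note (an assumption, not in the diff): since the globs are relative, the rewritten script is presumably run from the repository root as python scripts/token_abbrevs.py, whereas the old version consumed grep output on stdin, as its deleted comment shows. The seeded Highlight => hl entry keeps that non-token style in the table, and check_abbrevs() warns and aborts whenever abbrevs and tokens fall out of sync.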
shard.yml
@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.1.0
+version: 0.1.1

 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>
@@ -15,6 +15,8 @@ dependencies:
     github: crystal-china/base58.cr
   sixteen:
     github: ralsina/sixteen
+  docopt:
+    github: chenkovsky/docopt.cr

 crystal: ">= 1.13.0"
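The new docopt dependency lines up with the docopt-style usage/HELP text this range adds to src/main.cr below, presumably in preparation for real CLI argument parsing.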
src/constants.cr (165 lines changed)
@@ -1,92 +1,99 @@
The Abbreviations hash is re-sorted alphabetically and gains seven entries
(CodeLine, GenericTraceback, LineHighlight, LineNumbers, LineNumbersTable,
LineTable, LineTableTD); every pre-existing entry is kept. The file now reads:

module Tartrazine
  Abbreviations = {
    "Background" => "b",
    "CodeLine" => "cl",
    "Comment" => "c",
    "CommentHashbang" => "ch",
    "CommentMultiline" => "cm",
    "CommentPreproc" => "cp",
    "CommentPreprocFile" => "cpf",
    "CommentSingle" => "cs",
    "CommentSpecial" => "cs",
    "Error" => "e",
    "Generic" => "g",
    "GenericDeleted" => "gd",
    "GenericEmph" => "ge",
    "GenericError" => "ge",
    "GenericHeading" => "gh",
    "GenericInserted" => "gi",
    "GenericOutput" => "go",
    "GenericPrompt" => "gp",
    "GenericStrong" => "gs",
    "GenericSubheading" => "gs",
    "GenericTraceback" => "gt",
    "GenericUnderline" => "gu",
    "Keyword" => "k",
    "KeywordConstant" => "kc",
    "KeywordDeclaration" => "kd",
    "KeywordNamespace" => "kn",
    "KeywordPseudo" => "kp",
    "KeywordReserved" => "kr",
    "KeywordType" => "kt",
    "LineHighlight" => "lh",
    "LineNumbers" => "ln",
    "LineNumbersTable" => "lnt",
    "LineTable" => "lt",
    "LineTableTD" => "lttd",
    "Literal" => "l",
    "LiteralDate" => "ld",
    "LiteralNumber" => "ln",
    "LiteralNumberBin" => "lnb",
    "LiteralNumberFloat" => "lnf",
    "LiteralNumberHex" => "lnh",
    "LiteralNumberInteger" => "lni",
    "LiteralNumberIntegerLong" => "lnil",
    "LiteralNumberOct" => "lno",
    "LiteralOther" => "lo",
    "LiteralString" => "ls",
    "LiteralStringAffix" => "lsa",
    "LiteralStringAtom" => "lsa",
    "LiteralStringBacktick" => "lsb",
    "LiteralStringBoolean" => "lsb",
    "LiteralStringChar" => "lsc",
    "LiteralStringDelimiter" => "lsd",
    "LiteralStringDoc" => "lsd",
    "LiteralStringDouble" => "lsd",
    "LiteralStringEscape" => "lse",
    "LiteralStringHeredoc" => "lsh",
    "LiteralStringInterpol" => "lsi",
    "LiteralStringName" => "lsn",
    "LiteralStringOther" => "lso",
    "LiteralStringRegex" => "lsr",
    "LiteralStringSingle" => "lss",
    "LiteralStringSymbol" => "lss",
    "Name" => "n",
    "NameAttribute" => "na",
    "NameBuiltin" => "nb",
    "NameBuiltinPseudo" => "nbp",
    "NameClass" => "nc",
    "NameConstant" => "nc",
    "NameDecorator" => "nd",
    "NameEntity" => "ne",
    "NameException" => "ne",
    "NameFunction" => "nf",
    "NameFunctionMagic" => "nfm",
    "NameKeyword" => "nk",
    "NameLabel" => "nl",
    "NameNamespace" => "nn",
    "NameOperator" => "no",
    "NameOther" => "no",
    "NameProperty" => "np",
    "NamePseudo" => "np",
    "NameTag" => "nt",
    "NameVariable" => "nv",
    "NameVariableAnonymous" => "nva",
    "NameVariableClass" => "nvc",
    "NameVariableGlobal" => "nvg",
    "NameVariableInstance" => "nvi",
    "NameVariableMagic" => "nvm",
    "None" => "n",
    "Operator" => "o",
    "OperatorWord" => "ow",
    "Other" => "o",
    "Punctuation" => "p",
    "Text" => "t",
    "TextPunctuation" => "tp",
    "TextSymbol" => "ts",
    "TextWhitespace" => "tw",
  }
end
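For orientation, these abbreviations are what the HTML formatter emits as CSS class names (see get_css_class below). A minimal lookup sketch, with token names taken from the table above:

    Tartrazine::Abbreviations["LiteralStringDouble"] # => "lsd"
    Tartrazine::Abbreviations["LineHighlight"]       # => "lh"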
src/formatter.cr
@@ -19,92 +19,4 @@ module Tartrazine
     raise Exception.new("Not implemented")
   end
 end

-[~88 lines removed: the Ansi and Html formatter classes, which move out of
- this file into the new src/formatters/ansi.cr and src/formatters/html.cr
- shown below; the Html one is substantially reworked in the move]
src/formatters/ansi.cr (new file, 37 lines)
@@ -0,0 +1,37 @@
require "../formatter"

module Tartrazine
  class Ansi < Formatter
    def format(text : String, lexer : Lexer, theme : Theme) : String
      output = String.build do |outp|
        lexer.tokenize(text).each do |token|
          outp << self.colorize(token[:value], token[:type], theme)
        end
      end
      output
    end

    def colorize(text : String, token : String, theme : Theme) : String
      style = theme.styles.fetch(token, nil)
      return text if style.nil?
      if theme.styles.has_key?(token)
        s = theme.styles[token]
      else
        # Themes don't contain information for each specific
        # token type. However, they may contain information
        # for a parent style. Worst case, we go to the root
        # (Background) style.
        s = theme.styles[theme.style_parents(token).reverse.find { |parent|
          theme.styles.has_key?(parent)
        }]
      end
      colorized = text.colorize
      s.color.try { |col| colorized = colorized.fore(col.colorize) }
      # Intentionally not setting background color
      colorized.mode(:bold) if s.bold
      colorized.mode(:italic) if s.italic
      colorized.mode(:underline) if s.underline
      colorized.to_s
    end
  end
end
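A minimal usage sketch (the lexer and theme names are examples, not part of the diff):

    lexer = Tartrazine.lexer("crystal")
    theme = Tartrazine.theme("default-dark")
    puts Tartrazine::Ansi.new.format(File.read("src/main.cr"), lexer, theme)

The colorize fallback walks style_parents until it finds a style the theme actually defines, so unknown token types degrade to a parent's color rather than failing.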
src/formatters/html.cr (new file, 127 lines)
@@ -0,0 +1,127 @@
require "../formatter"

module Tartrazine
  class Html < Formatter
    # property line_number_in_table : Bool = false
    # property with_classes : Bool = true
    property class_prefix : String = ""
    property highlight_lines : Array(Range(Int32, Int32)) = [] of Range(Int32, Int32)
    property line_number_id_prefix : String = "line-"
    property line_number_start : Int32 = 1
    property tab_width = 8
    property? line_numbers : Bool = false
    property? linkable_line_numbers : Bool = true
    property? standalone : Bool = false
    property? surrounding_pre : Bool = true
    property? wrap_long_lines : Bool = false

    def format(text : String, lexer : Lexer, theme : Theme) : String
      text = format_text(text, lexer, theme)
      if standalone?
        text = wrap_standalone(text, theme)
      end
      text
    end

    # Wrap text into a full HTML document, including the CSS for the theme
    def wrap_standalone(text, theme) : String
      output = String.build do |outp|
        outp << "<!DOCTYPE html><html><head><style>"
        outp << get_style_defs(theme)
        outp << "</style></head><body>"
        outp << text
        outp << "</body></html>"
      end
      output
    end

    def format_text(text : String, lexer : Lexer, theme : Theme) : String
      lines = group_tokens_in_lines(lexer.tokenize(text))
      output = String.build do |outp|
        if surrounding_pre?
          pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
          outp << "<pre class=\"#{get_css_class("Background", theme)}\" #{pre_style}>"
        end
        "<code class=\"#{get_css_class("Background", theme)}\">"
        lines.each_with_index(offset: line_number_start - 1) do |line, i|
          line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
          line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight", theme)}\"" : ""
          line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
          outp << "<span #{line_id} #{line_class}>#{line_label}</span>"
          line.each do |token|
            fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
            outp << fragment
          end
        end
        outp << "</code></pre>"
      end
      output
    end

    # ameba:disable Metrics/CyclomaticComplexity
    def get_style_defs(theme : Theme) : String
      output = String.build do |outp|
        theme.styles.each do |token, style|
          outp << ".#{get_css_class(token, theme)} {"
          # These are set or nil
          outp << "color: ##{style.color.try &.hex};" if style.color
          outp << "background-color: ##{style.background.try &.hex};" if style.background
          outp << "border: 1px solid ##{style.border.try &.hex};" if style.border

          # These are true/false/nil
          outp << "border: none;" if style.border == false
          outp << "font-weight: bold;" if style.bold
          outp << "font-weight: 400;" if style.bold == false
          outp << "font-style: italic;" if style.italic
          outp << "font-style: normal;" if style.italic == false
          outp << "text-decoration: underline;" if style.underline
          outp << "text-decoration: none;" if style.underline == false
          outp << "tab-size: #{tab_width};" if token == "Background"

          outp << "}"
        end
      end
      output
    end

    # Given a token type, return the CSS class to use.
    def get_css_class(token, theme)
      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)

      # Themes don't contain information for each specific
      # token type. However, they may contain information
      # for a parent style. Worst case, we go to the root
      # (Background) style.
      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
        theme.styles.has_key?(parent)
      }]
    end

    def highlighted?(line : Int) : Bool
      highlight_lines.any?(&.includes?(line))
    end

    def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
      split_tokens = [] of Token
      tokens.each do |token|
        if token[:value].includes?("\n")
          values = token[:value].split("\n")
          values.each_with_index do |value, index|
            value += "\n" if index < values.size - 1
            split_tokens << {type: token[:type], value: value}
          end
        else
          split_tokens << token
        end
      end
      lines = [Array(Token).new]
      split_tokens.each do |token|
        lines.last << token
        if token[:value].includes?("\n")
          lines << Array(Token).new
        end
      end
      lines
    end
  end
end
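A configuration sketch for the new knobs (the values are illustrative; they mirror the commented-out examples in src/main.cr below):

    formatter = Tartrazine::Html.new
    formatter.standalone = true                 # wrap output in a full document with the theme's CSS
    formatter.line_numbers = true               # label each line, right-justified in four columns
    formatter.highlight_lines = [3..7, 20..30]  # ranges rendered with the LineHighlight style
    formatter.class_prefix = "hl-"              # namespace for the generated CSS classes
    puts formatter.format(File.read("some_file.cr"), lexer, theme)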
src/formatters/json.cr (new file, 11 lines)
@@ -0,0 +1,11 @@
require "../formatter"

module Tartrazine
  class Json < Formatter
    property name = "json"

    def format(text : String, lexer : Lexer, _theme : Theme) : String
      lexer.tokenize(text).to_json
    end
  end
end
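This is what src/main.cr below wires up as puts formatter.format(File.read(ARGV[0]), lexer, theme): the collapsed token stream serialized by to_json, i.e. an array of {"type": ..., "value": ...} objects.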
src/lexer.cr (new file, 180 lines)
@@ -0,0 +1,180 @@
module Tartrazine
  class LexerFiles
    extend BakedFileSystem

    bake_folder "../lexers", __DIR__
  end

  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations on what actions and states do
  # the Pygments documentation is a good place to start.
  # https://pygments.org/docs/lexerdevelopment/
  class Lexer
    property config = {
      name:             "",
      aliases:          [] of String,
      filenames:        [] of String,
      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }
    property xml : String = ""

    property states = {} of String => State

    property state_stack = ["root"]

    # Turn the text into a list of tokens. The `usingself` parameter
    # is true when the lexer is being used to tokenize a string
    # from a larger text that is already being tokenized.
    # So, when it's true, we don't modify the text.
    def tokenize(text, usingself = false) : Array(Token)
      @state_stack = ["root"]
      tokens = [] of Token
      pos = 0
      matched = false

      # Respect the `ensure_nl` config option
      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
        text += "\n"
      end

      # Loop through the text, applying rules
      while pos < text.size
        state = states[@state_stack.last]
        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
        state.rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(text, pos, self)
          if matched
            # Move position forward, save the tokens,
            # tokenize from the new position
            # Log.trace { "MATCHED: #{rule.xml}" }
            pos = new_pos
            tokens += new_tokens
            break
          end
          # Log.trace { "NOT MATCHED: #{rule.xml}" }
        end
        # If no rule matches, emit an error token
        unless matched
          # Log.trace { "Error at #{pos}" }
          tokens << {type: "Error", value: "#{text[pos]}"}
          pos += 1
        end
      end
      Lexer.collapse_tokens(tokens)
    end

    # Collapse consecutive tokens of the same type for easier comparison
    # and smaller output
    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
      result = [] of Tartrazine::Token
      tokens = tokens.reject { |token| token[:value] == "" }
      tokens.each do |token|
        if result.empty?
          result << token
          next
        end
        last = result.last
        if last[:type] == token[:type]
          new_token = {type: last[:type], value: last[:value] + token[:value]}
          result.pop
          result << new_token
        else
          result << token
        end
      end
      result
    end

    # ameba:disable Metrics/CyclomaticComplexity
    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      l.xml = xml
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |node|
          node.name == "config"
        }
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
            aliases:          xml_to_a(config, _alias) || [] of String,
            filenames:        xml_to_a(config, filename) || [] of String,
            mime_types:       xml_to_a(config, mime_type) || [] of String,
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end

        rules = lexer.children.find { |node|
          node.name == "rules"
        }
        if rules
          # Rules contains states 🤷
          rules.children.select { |node|
            node.name == "state"
          }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              raise Exception.new("Duplicate state: #{state.name}")
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |node|
              node.name == "rule"
            }.each do |rule_node|
              case rule_node["pattern"]?
              when nil
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new(rule_node)
                else
                  rule = UnconditionalRule.new(rule_node)
                end
              else
                rule = Rule.new(rule_node,
                  multiline: !l.config[:not_multiline],
                  dotall: l.config[:dot_all],
                  ignorecase: l.config[:case_insensitive])
              end
              state.rules << rule
            end
          end
        end
      end
      l
    end
  end

  # A Lexer state. A state has a name and a list of rules.
  # The state machine has a state stack containing references
  # to states to decide which rules to apply.
  class State
    property name : String = ""
    property rules = [] of Rule

    def +(other : State)
      new_state = State.new
      new_state.name = Random.base58(8)
      new_state.rules = rules + other.rules
      new_state
    end
  end

  # A token, the output of the tokenizer
  alias Token = NamedTuple(type: String, value: String)

  def self.lexer(name : String) : Lexer
    Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
  end
end
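A minimal tokenization sketch (the lexer name is an example; any XML baked into lexers/ works):

    lexer = Tartrazine.lexer("crystal")
    lexer.tokenize("puts 1\n").each do |token|
      puts "#{token[:type]} -> #{token[:value].inspect}"
    end

Adjacent tokens of the same type come back already merged by collapse_tokens, and any character no rule matches surfaces as an Error token.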
src/main.cr (31 lines changed)
@@ -1,5 +1,34 @@
 require "./**"

+HELP = <<-HELP
+    tartrazine: a syntax highlighting tool
+
+    Usage:
+        tartrazine FILE -f html [-t theme][--standalone][--line-numbers]
+                                [-l lexer] [-o output][--css]
+        tartrazine FILE -f terminal [-t theme][-l lexer][-o output]
+        tartrazine FILE -f json [-o output]
+        tartrazine --list-themes
+        tartrazine --list-lexers
+
+    -f <formatter>    Format to use (html, terminal, json)
+    -t <theme>        Theme to use (see --list-themes)
+    -l <lexer>        Lexer (language) to use (see --list-lexers)
+    -o <output>       Output file (default: stdout)
+    --standalone      Generate a standalone HTML file
+    --css             Generate a CSS file for the theme
+    --line-numbers    Include line numbers in the output
+HELP
+
 lexer = Tartrazine.lexer("crystal")
 theme = Tartrazine.theme(ARGV[1])
-puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)
+formatter = Tartrazine::Json.new
+# formatter.standalone = true
+# formatter.class_prefix = "hl-"
+# formatter.line_number_id_prefix = "ln-"
+# formatter.line_numbers = true
+# formatter.highlight_lines = [3..7, 20..30]
+# formatter.linkable_line_numbers = false
+# formatter.wrap_long_lines = false
+puts formatter.format(File.read(ARGV[0]), lexer, theme)
@ -3,7 +3,7 @@ require "./constants"
|
|||||||
require "./formatter"
|
require "./formatter"
|
||||||
require "./rules"
|
require "./rules"
|
||||||
require "./styles"
|
require "./styles"
|
||||||
require "./tartrazine"
|
require "./lexer"
|
||||||
|
|
||||||
# These are lexer rules. They match with the text being parsed
|
# These are lexer rules. They match with the text being parsed
|
||||||
# and perform actions, either emitting tokens or changing the
|
# and perform actions, either emitting tokens or changing the
|
||||||
|
src/styles.cr
@@ -11,8 +11,16 @@ module Tartrazine
   alias Color = Sixteen::Color

   def self.theme(name : String) : Theme
-    return Theme.from_base16(name[7..]) if name.starts_with? "base16_"
-    Theme.from_xml(ThemeFiles.get("/#{name}.xml").gets_to_end)
+    begin
+      return Theme.from_base16(name)
+    rescue ex : Exception
+      raise ex unless ex.message.try &.includes? "Theme not found"
+    end
+    begin
+      return Theme.from_xml(ThemeFiles.get("/#{name}.xml").gets_to_end)
+    rescue
+      raise Exception.new("Theme #{name} not found")
+    end
   end

   class ThemeFiles
@@ -104,6 +112,7 @@ module Tartrazine
     # https://github.com/mohd-akram/base16-pygments/

     theme.styles["Background"] = Style.new(color: t["base05"], background: t["base00"])
+    theme.styles["LineHighlight"] = Style.new(color: t["base0D"], background: t["base01"])
     theme.styles["Text"] = Style.new(color: t["base05"])
     theme.styles["Error"] = Style.new(color: t["base08"])
     theme.styles["Comment"] = Style.new(color: t["base03"])
@@ -162,7 +171,31 @@ module Tartrazine

       theme.styles[node["type"]] = s
     end
+    # We really want a LineHighlight class
+    if !theme.styles.has_key?("LineHighlight")
+      theme.styles["LineHighlight"] = Style.new
+      theme.styles["LineHighlight"].background = make_highlight_color(theme.styles["Background"].background)
+    end
     theme
   end

+  # If the color is dark, make it brighter and viceversa
+  def self.make_highlight_color(base_color)
+    # FIXME: do a proper luminance adjustment in the color class
+    return nil if base_color.nil?
+    color = Color.new(base_color.hex)
+    if base_color.light?
+      color.r = [(base_color.r - 40), 255].min.to_u8
+      color.g = [(base_color.g - 40), 255].min.to_u8
+      color.b = [(base_color.b - 40), 255].min.to_u8
+    else
+      color.r = [(base_color.r + 40), 255].min.to_u8
+      color.g = [(base_color.g + 40), 255].min.to_u8
+      color.b = [(base_color.b + 40), 255].min.to_u8
+    end
+    # Bug in color, setting rgb doesn't update hex
+    color.hex = "#{color.r.to_s(16)}#{color.g.to_s(16)}#{color.b.to_s(16)}"
+    color
+  end
 end
 end
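A resolution sketch for the reworked Tartrazine.theme (the theme names here are examples): base16 schemes are tried first via sixteen, then the baked-in XML themes, and anything else raises.

    Tartrazine.theme("solarized-dark") # found by sixteen, returned as a base16 theme
    Tartrazine.theme("autumn")         # not a base16 scheme, loaded from the XML theme files

Any theme that still lacks a LineHighlight style gets one synthesized by make_highlight_color, which nudges the background 40 units per channel toward the opposite brightness.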
src/tartrazine.cr
@@ -12,189 +12,9 @@ require "xml"

 module Tartrazine
   extend self
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"

   Log = ::Log.for("tartrazine")

-  [~178 lines removed: the State, LexerFiles, Token and Lexer definitions and
-   Tartrazine.lexer, all moved essentially unchanged into the new src/lexer.cr
-   shown above]
 end

 # Convenience macros to parse XML