test: added tests for CSS generation

Roberto Alsina 2024-09-10 22:33:22 -03:00
parent e288a55812
commit ac8b7e3800
5 changed files with 430 additions and 1 deletion

spec/css/manni.css Normal file

@@ -0,0 +1 @@
.e {color: #aa0000;background-color: #ffaaaa;}.b {background-color: #f0f3f3;tab-size: 8;}.k {color: #006699;font-weight: bold;}.kp {font-weight: 600;}.kt {color: #007788;}.na {color: #330099;}.nb {color: #336666;}.nc {color: #00aa88;font-weight: bold;}.nc {color: #336600;}.nd {color: #9999ff;}.ne {color: #999999;font-weight: bold;}.ne {color: #cc0000;font-weight: bold;}.nf {color: #cc00ff;}.nl {color: #9999ff;}.nn {color: #00ccff;font-weight: bold;}.nt {color: #330099;font-weight: bold;}.nv {color: #003333;}.ls {color: #cc3300;}.lsd {font-style: italic;}.lse {color: #cc3300;font-weight: bold;}.lsi {color: #aa0000;}.lso {color: #cc3300;}.lsr {color: #33aaaa;}.lss {color: #ffcc33;}.ln {color: #ff6600;}.o {color: #555555;}.ow {color: #000000;font-weight: bold;}.c {color: #0099ff;font-style: italic;}.cs {font-weight: bold;}.cp {color: #009999;font-style: normal;}.gd {background-color: #ffcccc;border: 1px solid #cc0000;}.ge {font-style: italic;}.ge {color: #ff0000;}.gh {color: #003300;font-weight: bold;}.gi {background-color: #ccffcc;border: 1px solid #00cc00;}.go {color: #aaaaaa;}.gp {color: #000099;font-weight: bold;}.gs {font-weight: bold;}.gs {color: #003300;font-weight: bold;}.gt {color: #99cc66;}.gu {text-decoration: underline;}.tw {color: #bbbbbb;}.lh {}

spec/css/vesper.css Normal file

@@ -0,0 +1 @@
.b {color: #b7b7b7;background-color: #101010;font-weight: bold;tab-size: 8;}.lh {color: #8eaaaa;background-color: #232323;}.t {color: #b7b7b7;}.e {color: #de6e6e;}.c {color: #333333;}.cp {color: #876c4f;}.cpf {color: #5f8787;}.k {color: #d69094;}.kt {color: #de6e6e;}.na {color: #8eaaaa;}.nb {color: #de6e6e;}.nbp {color: #de6e6e;}.nc {color: #8eaaaa;}.nc {color: #dab083;}.nd {color: #dab083;}.nf {color: #8eaaaa;}.nn {color: #8eaaaa;}.nt {color: #d69094;}.nv {color: #8eaaaa;}.nvi {color: #de6e6e;}.ln {color: #dab083;}.o {color: #60a592;}.ow {color: #d69094;}.l {color: #5f8787;}.ls {color: #5f8787;}.lsi {color: #876c4f;}.lsr {color: #60a592;}.lss {color: #dab083;}


@@ -0,0 +1,413 @@
require "./constants/lexers"
require "./heuristics"
require "baked_file_system"
require "crystal/syntax_highlighter"
module Tartrazine
class LexerFiles
extend BakedFileSystem
bake_folder "../lexers", __DIR__
end
# Get the lexer object for a language name
# FIXME: support mimetypes
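# Usage sketch: Tartrazine.lexer("crystal") resolves by name,
# Tartrazine.lexer(filename: "foo.py") resolves by filename pattern
# (the extension here is illustrative), and a call with no arguments
# falls back to the plaintext lexer.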
def self.lexer(name : String? = nil, filename : String? = nil, mimetype : String? = nil) : BaseLexer
return lexer_by_name(name) if name && name != "autodetect"
return lexer_by_filename(filename) if filename
return lexer_by_mimetype(mimetype) if mimetype
RegexLexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
end
private def self.lexer_by_mimetype(mimetype : String) : BaseLexer
lexer_file_name = LEXERS_BY_MIMETYPE.fetch(mimetype, nil)
raise Exception.new("Unknown mimetype: #{mimetype}") if lexer_file_name.nil?
RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.lexer_by_name(name : String) : BaseLexer
return CrystalLexer.new if name == "crystal"
lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
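# Pick a lexer from the filename: ".cr" short-circuits to the Crystal
# lexer, otherwise the filename is matched against the known glob
# patterns; ambiguous matches are settled by content heuristics below.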
private def self.lexer_by_filename(filename : String) : BaseLexer
if filename.ends_with?(".cr")
return CrystalLexer.new
end
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename))
end
case candidates.size
when 0
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
lexer_file_name = self.lexer_by_content(filename)
begin
return self.lexer(lexer_file_name)
rescue ex : Exception
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
end
end
RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.lexer_by_content(fname : String) : String?
h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
result = h.run(fname, File.read(fname))
case result
when Nil
raise Exception.new "No lexer found for #{fname}"
when String
result.as(String)
when Array(String)
result.first
end
end
private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2)
language_lexer = lexer(language)
root_lexer = lexer(root)
DelegatingLexer.new(language_lexer, root_lexer)
end
# Return a list of all lexers
def self.lexers : Array(String)
LEXERS_BY_NAME.keys.sort!
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
abstract class BaseTokenizer
end
class Tokenizer < BaseTokenizer
include Iterator(Token)
property lexer : BaseLexer
property text : Bytes
property pos : Int32 = 0
@dq = Deque(Token).new
property state_stack = ["root"]
def initialize(@lexer : BaseLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@text = text.to_slice
end
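# Advance through the text: try the rules of the current state at @pos;
# when nothing matches, emit the offending byte as an Error token (or a
# newline as Text, resetting the state stack to "root") and move on.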
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
if pos == @text.size
return stop
end
matched = false
while @pos < @text.size
@lexer.states[@state_stack.last].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(@text, @pos, self)
if matched
@pos = new_pos
split_tokens(new_tokens).each { |token| @dq << token }
break
end
end
if !matched
if @text[@pos] == 10u8
@dq << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
@dq << {type: "Error", value: String.new(@text[@pos..@pos])}
end
@pos += 1
break
end
end
self.next
end
# If a token contains a newline, split it into two tokens
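# For example, {type: "Text", value: "a\nb"} becomes
# {type: "Text", value: "a\n"} and {type: "Text", value: "b"}.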
def split_tokens(tokens : Array(Token)) : Array(Token)
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
split_tokens
end
end
alias BaseLexer = Lexer
abstract class Lexer
property config = {
name: "",
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
property states = {} of String => State
def tokenizer(text : String, secondary = false) : BaseTokenizer
Tokenizer.new(self, text, secondary)
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
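#
# The XML shape read by from_xml below: a root <lexer> element with an
# optional <config> node (name, priority, not_multiline, dot_all,
# case_insensitive, ensure_nl) and a <rules> node whose <state name="...">
# children hold <rule> nodes; a rule either carries a "pattern" attribute,
# wraps an <include>, or is unconditional.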
class RegexLexer < BaseLexer
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
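# For example, adjacent {type: "Text", value: "foo"} and
# {type: "Text", value: "bar"} collapse into {type: "Text", value: "foobar"};
# empty-valued tokens are dropped first.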
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
result = [] of Tartrazine::Token
tokens = tokens.reject { |token| token[:value] == "" }
tokens.each do |token|
if result.empty?
result << token
next
end
last = result.last
if last[:type] == token[:type]
new_token = {type: last[:type], value: last[:value] + token[:value]}
result.pop
result << new_token
else
result << token
end
end
result
end
def self.from_xml(xml : String) : Lexer
l = RegexLexer.new
lexer = XML.parse(xml).first_element_child
if lexer
config = lexer.children.find { |node|
node.name == "config"
}
if config
l.config = {
name: xml_to_s(config, name) || "",
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
case_insensitive: xml_to_s(config, case_insensitive) == "true",
ensure_nl: xml_to_s(config, ensure_nl) == "true",
}
end
rules = lexer.children.find { |node|
node.name == "rules"
}
if rules
# Rules contains states 🤷
rules.children.select { |node|
node.name == "state"
}.each do |state_node|
state = State.new
state.name = state_node["name"]
if l.states.has_key?(state.name)
raise Exception.new("Duplicate state: #{state.name}")
else
l.states[state.name] = state
end
# And states contain rules 🤷
state_node.children.select { |node|
node.name == "rule"
}.each do |rule_node|
case rule_node["pattern"]?
when nil
if rule_node.first_element_child.try &.name == "include"
rule = IncludeStateRule.new(rule_node)
else
rule = UnconditionalRule.new(rule_node)
end
else
rule = Rule.new(rule_node,
multiline: !l.config[:not_multiline],
dotall: l.config[:dot_all],
ignorecase: l.config[:case_insensitive])
end
state.rules << rule
end
end
end
end
l
end
end
# A lexer that takes two lexers as arguments. A root lexer
# and a language lexer. Everything is scanned using the
# language lexer; afterwards all `Other` tokens are lexed
# using the root lexer.
#
# This is useful for things like template languages, where
# you have Jinja + HTML or Jinja + CSS and so on.
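#
# Usage sketch: a name such as "jinja+html" (names are illustrative) goes
# through create_delegating_lexer above, which builds one of these from
# the language lexer and the root lexer.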
class DelegatingLexer < Lexer
property language_lexer : BaseLexer
property root_lexer : BaseLexer
def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
end
def tokenizer(text : String, secondary = false) : DelegatingTokenizer
DelegatingTokenizer.new(self, text, secondary)
end
end
# This Tokenizer works with a DelegatingLexer. It first tokenizes
# using the language lexer, and "Other" tokens are tokenized using
# the root lexer.
class DelegatingTokenizer < BaseTokenizer
include Iterator(Token)
@dq = Deque(Token).new
@language_tokenizer : BaseTokenizer
def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
token = @language_tokenizer.next
if token.is_a? Iterator::Stop
return stop
elsif token.as(Token).[:type] == "Other"
root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
root_tokenizer.each do |root_token|
@dq << root_token
end
else
@dq << token.as(Token)
end
self.next
end
end
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
struct State
property name : String = ""
property rules = [] of BaseRule
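# Combining two states concatenates their rules under a fresh random name.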
def +(other : State)
new_state = State.new
new_state.name = Random.base58(8)
new_state.rules = rules + other.rules
new_state
end
end
class CustomCrystalHighlighter < Crystal::SyntaxHighlighter
@tokens = [] of Token
def render_delimiter(&block)
@tokens << {type: "LiteralString", value: block.call.to_s}
end
def render_interpolation(&block)
@tokens << {type: "LiteralStringInterpol", value: "\#{"}
@tokens << {type: "Text", value: block.call.to_s}
@tokens << {type: "LiteralStringInterpol", value: "}"}
end
def render_string_array(&block)
@tokens << {type: "LiteralString", value: block.call.to_s}
end
# ameba:disable Metrics/CyclomaticComplexity
def render(type : TokenType, value : String)
case type
when .comment?
@tokens << {type: "Comment", value: value}
when .number?
@tokens << {type: "LiteralNumber", value: value}
when .char?
@tokens << {type: "LiteralStringChar", value: value}
when .symbol?
@tokens << {type: "LiteralStringSymbol", value: value}
when .const?
@tokens << {type: "NameConstant", value: value}
when .string?
@tokens << {type: "LiteralString", value: value}
when .ident?
@tokens << {type: "NameVariable", value: value}
when .keyword?, .self?
@tokens << {type: "NameKeyword", value: value}
when .primitive_literal?
@tokens << {type: "Literal", value: value}
when .operator?
@tokens << {type: "Operator", value: value}
when Crystal::SyntaxHighlighter::TokenType::DELIMITED_TOKEN, Crystal::SyntaxHighlighter::TokenType::DELIMITER_START, Crystal::SyntaxHighlighter::TokenType::DELIMITER_END
@tokens << {type: "LiteralString", value: value}
else
@tokens << {type: "Text", value: value}
end
end
end
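# A tokenizer that reuses Crystal's own parser (via the
# CustomCrystalHighlighter above) instead of the regex state machine:
# the whole text is highlighted up front and the collected tokens are
# then served through the iterator.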
class CrystalTokenizer < Tartrazine::BaseTokenizer
include Iterator(Token)
@hl = CustomCrystalHighlighter.new
@lexer : BaseLexer
@iter : Iterator(Token)
# delegate next, to: @iter
def initialize(@lexer : BaseLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
# Just do the tokenizing
@hl.highlight(text)
@iter = @hl.@tokens.each
end
def next : Iterator::Stop | Token
@iter.next
end
end
class CrystalLexer < BaseLexer
def tokenizer(text : String, secondary = false) : BaseTokenizer
CrystalTokenizer.new(self, text, secondary)
end
end
end


@@ -6,6 +6,9 @@ testcases = Dir.glob("#{__DIR__}/tests/**/*txt").sort
# These are custom testcases
examples = Dir.glob("#{__DIR__}/examples/**/*.*").reject(&.ends_with? ".json").sort!
# CSS Stylesheets
css_files = Dir.glob("#{__DIR__}/css/*.css")
# These lexers don't load because of parsing issues
failing_lexers = {
"webgpu_shading_language",
@@ -81,6 +84,17 @@ describe Tartrazine do
end
end
describe "formatter" do
css_files.each do |css_file|
it "generates #{css_file}" do
css = File.read(css_file)
theme = Tartrazine.theme(File.basename(css_file, ".css"))
formatter = Tartrazine::Html.new(theme: theme)
formatter.style_defs.strip.should eq css.strip
end
end
end
describe "to_html" do
it "should do basic highlighting" do
html = Tartrazine.to_html("puts 'Hello, World!'", "ruby", standalone: false)


@@ -1 +1 @@
require "../spec/**"
require "../spec/tartrazine_spec.cr"