tartrazine/spec/tartrazine_spec.cr

require "./spec_helper"

# Root directory of the testcases imported from Pygments. Every list below
# holds full testcase paths under this directory.
tests_root = "#{__DIR__}/tests"
testcases = Dir.glob("#{tests_root}/**/*txt").sort

# Lexers (named after their testcase directory) that don't load because of
# parsing issues
failing_lexers = {
  "webgpu_shading_language",
}

# Testcases that fail only because chroma and tartrazine represent unicode
# differently; tartrazine's output is actually correct
unicode_problems = {
  "java/test_string_literals.txt",
  "json/test_strings.txt",
  "systemd/example1.txt",
}.map { |relative| "#{tests_root}/#{relative}" }

# Testcases where chroma and tartrazine tokenize differently but tartrazine
# is the one that is correct
bad_in_chroma = {
  "bash_session/test_comment_after_prompt.txt",
  "java/test_default.txt",
  "java/test_multiline_string.txt",
  "java/test_numeric_literals.txt",
  "php/test_string_escaping_run.txt",
  "python_2/test_cls_builtin.txt",
}.map { |relative| "#{tests_root}/#{relative}" }

# Testcases that are registered as pending in the spec below instead of
# being run
known_bad = {
  "bash_session/fake_ps2_prompt.txt",
  "bash_session/prompt_in_output.txt",
  "bash_session/test_newline_in_echo_no_ps2.txt",
  "bash_session/test_newline_in_ls_ps2.txt",
  "bash_session/ps2_prompt.txt",
  "bash_session/test_newline_in_ls_no_ps2.txt",
  "bash_session/test_virtualenv.txt",
  "bash_session/test_newline_in_echo_ps2.txt",
  "c/test_string_resembling_decl_end.txt",
  "html/css_backtracking.txt",
  "mcfunction/data.txt",
  "mcfunction/selectors.txt",
  "php/anonymous_class.txt",
  "html/javascript_unclosed.txt",
}.map { |relative| "#{tests_root}/#{relative}" }

# Testcases that fail because of a limitation in PCRE2
not_my_fault = {
  "fortran/test_string_cataback.txt",
}.map { |relative| "#{tests_root}/#{relative}" }

# Registers one example per Pygments testcase. Known-bad testcases are
# registered as pending; all others compare tartrazine's tokens against
# chroma's unless the testcase or its lexer is on one of the skip lists.
describe Tartrazine do
  describe "Lexer" do
    testcases.each do |testcase|
      # Splitting on "/" and keeping the last two segments leaves just
      # "<lexer directory>/<file name>" as the example description.
      description = "parses #{testcase}".split("/")[-2...].join("/")

      if known_bad.includes?(testcase)
        pending description do
        end
        next
      end

      it description do
        # The input to tokenize sits between the "---input---" and
        # "---tokens---" markers of the testcase file.
        after_input_marker = File.read(testcase).split("---input---\n").last
        text = after_input_marker.split("---tokens---").first

        # The lexer is named after the directory containing the testcase.
        lexer_name = File.basename(File.dirname(testcase)).downcase

        skip_comparison = failing_lexers.includes?(lexer_name) ||
                          unicode_problems.includes?(testcase) ||
                          bad_in_chroma.includes?(testcase) ||
                          not_my_fault.includes?(testcase)

        unless skip_comparison
          tokenize(lexer_name, text).should eq(chroma_tokenize(lexer_name, text))
        end
      end
    end
  end
end

# Looks up the tartrazine lexer called *lexer_name*, tokenizes *text* with it
# and returns the tokens after passing them through `collapse_tokens`.
def tokenize(lexer_name, text)
  raw_tokens = Tartrazine.get_lexer(lexer_name).tokenize(text)
  collapse_tokens(raw_tokens)
end

# Helper that tokenizes using chroma to validate the lexer.
#
# Feeds *text* on stdin to `chroma -f json -l <lexer_name>`, parses the JSON
# it prints into tokens and returns them after `collapse_tokens`.
#
# Raises if the chroma process does not finish successfully, including
# whatever it wrote to stderr, so a broken chroma run is reported as such
# instead of surfacing as a JSON parse error on incomplete output.
def chroma_tokenize(lexer_name, text)
  output = IO::Memory.new
  error = IO::Memory.new
  input = IO::Memory.new(text)
  status = Process.run(
    "chroma",
    ["-f", "json", "-l", lexer_name],
    input: input, output: output, error: error
  )
  unless status.success?
    raise "chroma failed for lexer #{lexer_name.inspect}: #{error.to_s.strip}"
  end
  collapse_tokens(Array(Tartrazine::Token).from_json(output.to_s))
end

# Merges each run of adjacent tokens that share a type into a single token
# whose value is the concatenation of the run's values, which makes two token
# streams easier to compare. Returns a new array; *tokens* is not modified.
def collapse_tokens(tokens : Array(Tartrazine::Token))
  tokens.each_with_object([] of Tartrazine::Token) do |token, merged|
    previous = merged.last?
    if previous && previous[:type] == token[:type]
      # Same type as the token before it: extend that token in place.
      merged[-1] = {type: previous[:type], value: previous[:value] + token[:value]}
    else
      merged << token
    end
  end
end
Initial dumb stuff 2024-08-02 20:03:39 +00:00			`require "./spec_helper"`

Reorganize tests into a real spec suite 2024-08-04 22:18:43 +00:00			`# These are the testcases from Pygments`
			`testcases = Dir.glob("#{__DIR__}/tests/**/*txt").sort`

			`# These lexers don't load because of parsing issues`
			`failing_lexers = {`
			`"webgpu_shading_language",`
			`}`

			`# These testcases fail because of differences in the way chroma and tartrazine`
			`# represent unicode, but they are actually correct`
			`unicode_problems = {`
			`"#{__DIR__}/tests/java/test_string_literals.txt",`
			`"#{__DIR__}/tests/json/test_strings.txt",`
			`"#{__DIR__}/tests/systemd/example1.txt",`
			`}`

			`# These testcases fail because of differences in the way chroma and tartrazine tokenize`
			`# but tartrazine is correct`
			`bad_in_chroma = {`
			`"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",`
			`"#{__DIR__}/tests/java/test_default.txt",`
			`"#{__DIR__}/tests/java/test_multiline_string.txt",`
			`"#{__DIR__}/tests/java/test_numeric_literals.txt",`
			`"#{__DIR__}/tests/php/test_string_escaping_run.txt",`
			`"#{__DIR__}/tests/python_2/test_cls_builtin.txt",`
			`}`

			`known_bad = {`
			`"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",`
			`"#{__DIR__}/tests/bash_session/prompt_in_output.txt",`
			`"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",`
			`"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",`
			`"#{__DIR__}/tests/bash_session/ps2_prompt.txt",`
			`"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",`
			`"#{__DIR__}/tests/bash_session/test_virtualenv.txt",`
			`"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",`
			`"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",`
			`"#{__DIR__}/tests/html/css_backtracking.txt",`
			`"#{__DIR__}/tests/mcfunction/data.txt",`
			`"#{__DIR__}/tests/mcfunction/selectors.txt",`
			`"#{__DIR__}/tests/php/anonymous_class.txt",`
			`"#{__DIR__}/tests/html/javascript_unclosed.txt",`

			`}`

			`# Tests that fail because of a limitation in PCRE2`
			`not_my_fault = {`
			`"#{__DIR__}/tests/fortran/test_string_cataback.txt",`
			`}`

Initial dumb stuff 2024-08-02 20:03:39 +00:00			`describe Tartrazine do`
Reorganize tests into a real spec suite 2024-08-04 22:18:43 +00:00			`describe "Lexer" do`
			`testcases.each do \|testcase\|`
			`if known_bad.includes?(testcase)`
			`pending "parses #{testcase}".split("/")[-2...].join("/") do`
			`end`
			`else`
			`it "parses #{testcase}".split("/")[-2...].join("/") do`
			`text = File.read(testcase).split("---input---\n").last.split("---tokens---").first`
			`lexer_name = File.basename(File.dirname(testcase)).downcase`
			`unless failing_lexers.includes?(lexer_name) \|\|`
			`unicode_problems.includes?(testcase) \|\|`
			`bad_in_chroma.includes?(testcase) \|\|`
			`not_my_fault.includes?(testcase)`
			`tokenize(lexer_name, text).should eq(chroma_tokenize(lexer_name, text))`
			`end`
			`end`
			`end`
			`end`
			`end`
			`end`

			`# Helper that creates lexer and tokenizes`
			`def tokenize(lexer_name, text)`
refactor 2024-08-04 23:00:48 +00:00			`lexer = Tartrazine.get_lexer(lexer_name)`
Reorganize tests into a real spec suite 2024-08-04 22:18:43 +00:00			`collapse_tokens(lexer.tokenize(text))`
			`end`

			`# Helper that tokenizes using chroma to validate the lexer`
			`def chroma_tokenize(lexer_name, text)`
			`output = IO::Memory.new`
			`input = IO::Memory.new(text)`
			`Process.run(`
			`"chroma",`
			`["-f", "json", "-l", lexer_name],`
			`input: input, output: output`
			`)`
			`collapse_tokens(Array(Tartrazine::Token).from_json(output.to_s))`
			`end`

			`# Collapse consecutive tokens of the same type for easier comparison`
			`def collapse_tokens(tokens : Array(Tartrazine::Token))`
			`result = [] of Tartrazine::Token`
Initial dumb stuff 2024-08-02 20:03:39 +00:00
Reorganize tests into a real spec suite 2024-08-04 22:18:43 +00:00			`tokens.each do \|token\|`
			`if result.empty?`
			`result << token`
			`next`
			`end`
			`last = result.last`
			`if last[:type] == token[:type]`
			`new_token = {type: last[:type], value: last[:value] + token[:value]}`
			`result.pop`
			`result << new_token`
			`else`
			`result << token`
			`end`
Initial dumb stuff 2024-08-02 20:03:39 +00:00			`end`
Reorganize tests into a real spec suite 2024-08-04 22:18:43 +00:00			`result`
Initial dumb stuff 2024-08-02 20:03:39 +00:00			`end`