diff --git a/spec/css/manni.css b/spec/css/manni.css
new file mode 100644
index 0000000..8e61de8
--- /dev/null
+++ b/spec/css/manni.css
@@ -0,0 +1 @@
+.e {color: #aa0000;background-color: #ffaaaa;}.b {background-color: #f0f3f3;tab-size: 8;}.k {color: #006699;font-weight: bold;}.kp {font-weight: 600;}.kt {color: #007788;}.na {color: #330099;}.nb {color: #336666;}.nc {color: #00aa88;font-weight: bold;}.nc {color: #336600;}.nd {color: #9999ff;}.ne {color: #999999;font-weight: bold;}.ne {color: #cc0000;font-weight: bold;}.nf {color: #cc00ff;}.nl {color: #9999ff;}.nn {color: #00ccff;font-weight: bold;}.nt {color: #330099;font-weight: bold;}.nv {color: #003333;}.ls {color: #cc3300;}.lsd {font-style: italic;}.lse {color: #cc3300;font-weight: bold;}.lsi {color: #aa0000;}.lso {color: #cc3300;}.lsr {color: #33aaaa;}.lss {color: #ffcc33;}.ln {color: #ff6600;}.o {color: #555555;}.ow {color: #000000;font-weight: bold;}.c {color: #0099ff;font-style: italic;}.cs {font-weight: bold;}.cp {color: #009999;font-style: normal;}.gd {background-color: #ffcccc;border: 1px solid #cc0000;}.ge {font-style: italic;}.ge {color: #ff0000;}.gh {color: #003300;font-weight: bold;}.gi {background-color: #ccffcc;border: 1px solid #00cc00;}.go {color: #aaaaaa;}.gp {color: #000099;font-weight: bold;}.gs {font-weight: bold;}.gs {color: #003300;font-weight: bold;}.gt {color: #99cc66;}.gu {text-decoration: underline;}.tw {color: #bbbbbb;}.lh {}
diff --git a/spec/css/vesper.css b/spec/css/vesper.css
new file mode 100644
index 0000000..0efb561
--- /dev/null
+++ b/spec/css/vesper.css
@@ -0,0 +1 @@
+.b {color: #b7b7b7;background-color: #101010;font-weight: bold;tab-size: 8;}.lh {color: #8eaaaa;background-color: #232323;}.t {color: #b7b7b7;}.e {color: #de6e6e;}.c {color: #333333;}.cp {color: #876c4f;}.cpf {color: #5f8787;}.k {color: #d69094;}.kt {color: #de6e6e;}.na {color: #8eaaaa;}.nb {color: #de6e6e;}.nbp {color: #de6e6e;}.nc {color: #8eaaaa;}.nc {color: #dab083;}.nd {color: #dab083;}.nf {color: #8eaaaa;}.nn {color: #8eaaaa;}.nt {color: #d69094;}.nv {color: #8eaaaa;}.nvi {color: #de6e6e;}.ln {color: #dab083;}.o {color: #60a592;}.ow {color: #d69094;}.l {color: #5f8787;}.ls {color: #5f8787;}.lsi {color: #876c4f;}.lsr {color: #60a592;}.lss {color: #dab083;}
diff --git a/spec/examples/crystal/lexer.cr b/spec/examples/crystal/lexer.cr
new file mode 100644
index 0000000..6ec8522
--- /dev/null
+++ b/spec/examples/crystal/lexer.cr
@@ -0,0 +1,413 @@
+require "./constants/lexers"
+require "./heuristics"
+require "baked_file_system"
+require "crystal/syntax_highlighter"
+
+module Tartrazine
+  class LexerFiles
+    extend BakedFileSystem
+    bake_folder "../lexers", __DIR__
+  end
+
+  # Get the lexer object for a language name
+  # FIXME: support mimetypes
+  def self.lexer(name : String? = nil, filename : String? = nil, mimetype : String? = nil) : BaseLexer
+    return lexer_by_name(name) if name && name != "autodetect"
+    return lexer_by_filename(filename) if filename
+    return lexer_by_mimetype(mimetype) if mimetype
+
+    RegexLexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
+  end
+
+  private def self.lexer_by_mimetype(mimetype : String) : BaseLexer
+    lexer_file_name = LEXERS_BY_MIMETYPE.fetch(mimetype, nil)
+    raise Exception.new("Unknown mimetype: #{mimetype}") if lexer_file_name.nil?
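+    # At this point the mimetype has resolved to a known lexer file name,
+    # e.g. (hypothetical) Tartrazine.lexer(mimetype: "text/x-ruby") resolving
+    # to "ruby"; the baked-in XML definition for it is loaded below.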
+
+    RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
+  end
+
+  private def self.lexer_by_name(name : String) : BaseLexer
+    return CrystalLexer.new if name == "crystal"
+    lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
+    return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
+    raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
+
+    RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
+  end
+
+  private def self.lexer_by_filename(filename : String) : BaseLexer
+    if filename.ends_with?(".cr")
+      return CrystalLexer.new
+    end
+
+    candidates = Set(String).new
+    LEXERS_BY_FILENAME.each do |k, v|
+      candidates += v.to_set if File.match?(k, File.basename(filename))
+    end
+
+    case candidates.size
+    when 0
+      lexer_file_name = LEXERS_BY_NAME["plaintext"]
+    when 1
+      lexer_file_name = candidates.first
+    else
+      lexer_file_name = self.lexer_by_content(filename)
+      begin
+        return self.lexer(lexer_file_name)
+      rescue ex : Exception
+        raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
+      end
+    end
+
+    RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
+  end
+
+  private def self.lexer_by_content(fname : String) : String?
+    h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
+    result = h.run(fname, File.read(fname))
+    case result
+    when Nil
+      raise Exception.new "No lexer found for #{fname}"
+    when String
+      result.as(String)
+    when Array(String)
+      result.first
+    end
+  end
+
+  private def self.create_delegating_lexer(name : String) : BaseLexer
+    language, root = name.split("+", 2)
+    language_lexer = lexer(language)
+    root_lexer = lexer(root)
+    DelegatingLexer.new(language_lexer, root_lexer)
+  end
+
+  # Return a list of all lexers
+  def self.lexers : Array(String)
+    LEXERS_BY_NAME.keys.sort!
+  end
+
+  # A token, the output of the tokenizer
+  alias Token = NamedTuple(type: String, value: String)
+
+  abstract class BaseTokenizer
+  end
+
+  class Tokenizer < BaseTokenizer
+    include Iterator(Token)
+    property lexer : BaseLexer
+    property text : Bytes
+    property pos : Int32 = 0
+    @dq = Deque(Token).new
+    property state_stack = ["root"]
+
+    def initialize(@lexer : BaseLexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @text = text.to_slice
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      if pos == @text.size
+        return stop
+      end
+
+      matched = false
+      while @pos < @text.size
+        @lexer.states[@state_stack.last].rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(@text, @pos, self)
+          if matched
+            @pos = new_pos
+            split_tokens(new_tokens).each { |token| @dq << token }
+            break
+          end
+        end
+        if !matched
+          if @text[@pos] == 10u8
+            @dq << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            @dq << {type: "Error", value: String.new(@text[@pos..@pos])}
+          end
+          @pos += 1
+          break
+        end
+      end
+      self.next
+    end
+
+    # If a token contains newlines, split it into one token per line
+    def split_tokens(tokens : Array(Token)) : Array(Token)
+      split_tokens = [] of Token
+      tokens.each do |token|
+        if token[:value].includes?("\n")
+          values = token[:value].split("\n")
+          values.each_with_index do |value, index|
+            value += "\n" if index < values.size - 1
+            split_tokens << {type: token[:type], value: value}
+          end
+        else
+          split_tokens << token
+        end
+      end
+      split_tokens
+    end
+  end
+
+  alias BaseLexer = Lexer
+
+  abstract class Lexer
+    property config = {
+      name:             "",
+      priority:         0.0,
+      case_insensitive: false,
+      dot_all:          false,
+      not_multiline:    false,
+      ensure_nl:        false,
+    }
+    property states = {} of String => State
+
+    def tokenizer(text : String, secondary = false) : BaseTokenizer
+      Tokenizer.new(self, text, secondary)
+    end
+  end
+
+  # This implements a lexer for Pygments RegexLexers as expressed
+  # in Chroma's XML serialization.
+  #
+  # For explanations on what actions and states do,
+  # the Pygments documentation is a good place to start:
+  # https://pygments.org/docs/lexerdevelopment/
+  class RegexLexer < BaseLexer
+    # Collapse consecutive tokens of the same type for easier comparison
+    # and smaller output
+    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
+      result = [] of Tartrazine::Token
+      tokens = tokens.reject { |token| token[:value] == "" }
+      tokens.each do |token|
+        if result.empty?
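+          # First token seen: nothing accumulated to collapse into yet.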
+          result << token
+          next
+        end
+        last = result.last
+        if last[:type] == token[:type]
+          new_token = {type: last[:type], value: last[:value] + token[:value]}
+          result.pop
+          result << new_token
+        else
+          result << token
+        end
+      end
+      result
+    end
+
+    def self.from_xml(xml : String) : Lexer
+      l = RegexLexer.new
+      lexer = XML.parse(xml).first_element_child
+      if lexer
+        config = lexer.children.find { |node|
+          node.name == "config"
+        }
+        if config
+          l.config = {
+            name:             xml_to_s(config, name) || "",
+            priority:         xml_to_f(config, priority) || 0.0,
+            not_multiline:    xml_to_s(config, not_multiline) == "true",
+            dot_all:          xml_to_s(config, dot_all) == "true",
+            case_insensitive: xml_to_s(config, case_insensitive) == "true",
+            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
+          }
+        end
+
+        rules = lexer.children.find { |node|
+          node.name == "rules"
+        }
+        if rules
+          # Rules contains states 🤷
+          rules.children.select { |node|
+            node.name == "state"
+          }.each do |state_node|
+            state = State.new
+            state.name = state_node["name"]
+            if l.states.has_key?(state.name)
+              raise Exception.new("Duplicate state: #{state.name}")
+            else
+              l.states[state.name] = state
+            end
+            # And states contain rules 🤷
+            state_node.children.select { |node|
+              node.name == "rule"
+            }.each do |rule_node|
+              case rule_node["pattern"]?
+              when nil
+                if rule_node.first_element_child.try &.name == "include"
+                  rule = IncludeStateRule.new(rule_node)
+                else
+                  rule = UnconditionalRule.new(rule_node)
+                end
+              else
+                rule = Rule.new(rule_node,
+                  multiline: !l.config[:not_multiline],
+                  dotall: l.config[:dot_all],
+                  ignorecase: l.config[:case_insensitive])
+              end
+              state.rules << rule
+            end
+          end
+        end
+      end
+      l
+    end
+  end
+
+  # A lexer that takes two lexers as arguments: a root lexer
+  # and a language lexer. Everything is scanned using the
+  # language lexer, afterwards all `Other` tokens are lexed
+  # using the root lexer.
+  #
+  # This is useful for things like template languages, where
+  # you have Jinja + HTML or Jinja + CSS and so on.
+  class DelegatingLexer < Lexer
+    property language_lexer : BaseLexer
+    property root_lexer : BaseLexer
+
+    def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
+    end
+
+    def tokenizer(text : String, secondary = false) : DelegatingTokenizer
+      DelegatingTokenizer.new(self, text, secondary)
+    end
+  end
+
+  # This Tokenizer works with a DelegatingLexer. It first tokenizes
+  # using the language lexer, and "Other" tokens are tokenized using
+  # the root lexer.
+  class DelegatingTokenizer < BaseTokenizer
+    include Iterator(Token)
+    @dq = Deque(Token).new
+    @language_tokenizer : BaseTokenizer
+
+    def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      token = @language_tokenizer.next
+      if token.is_a? Iterator::Stop
+        return stop
+      elsif token.as(Token).[:type] == "Other"
+        root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
+        root_tokenizer.each do |root_token|
+          @dq << root_token
+        end
+      else
+        @dq << token.as(Token)
+      end
+      self.next
+    end
+  end
+
+  # A Lexer state. A state has a name and a list of rules.
+  # The state machine has a state stack containing references
+  # to states to decide which rules to apply.
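+  #
+  # Illustrative example (assumed, following standard Pygments push/pop
+  # semantics): a rule in "root" can push "string" onto the stack, so the
+  # "string" state's rules apply until another rule pops back to "root".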
+  struct State
+    property name : String = ""
+    property rules = [] of BaseRule
+
+    def +(other : State)
+      new_state = State.new
+      new_state.name = Random.base58(8)
+      new_state.rules = rules + other.rules
+      new_state
+    end
+  end
+
+  class CustomCrystalHighlighter < Crystal::SyntaxHighlighter
+    @tokens = [] of Token
+
+    def render_delimiter(&block)
+      @tokens << {type: "LiteralString", value: block.call.to_s}
+    end
+
+    def render_interpolation(&block)
+      @tokens << {type: "LiteralStringInterpol", value: "\#{"}
+      @tokens << {type: "Text", value: block.call.to_s}
+      @tokens << {type: "LiteralStringInterpol", value: "}"}
+    end
+
+    def render_string_array(&block)
+      @tokens << {type: "LiteralString", value: block.call.to_s}
+    end
+
+    # ameba:disable Metrics/CyclomaticComplexity
+    def render(type : TokenType, value : String)
+      case type
+      when .comment?
+        @tokens << {type: "Comment", value: value}
+      when .number?
+        @tokens << {type: "LiteralNumber", value: value}
+      when .char?
+        @tokens << {type: "LiteralStringChar", value: value}
+      when .symbol?
+        @tokens << {type: "LiteralStringSymbol", value: value}
+      when .const?
+        @tokens << {type: "NameConstant", value: value}
+      when .string?
+        @tokens << {type: "LiteralString", value: value}
+      when .ident?
+        @tokens << {type: "NameVariable", value: value}
+      when .keyword?, .self?
+        @tokens << {type: "NameKeyword", value: value}
+      when .primitive_literal?
+        @tokens << {type: "Literal", value: value}
+      when .operator?
+        @tokens << {type: "Operator", value: value}
+      when Crystal::SyntaxHighlighter::TokenType::DELIMITED_TOKEN, Crystal::SyntaxHighlighter::TokenType::DELIMITER_START, Crystal::SyntaxHighlighter::TokenType::DELIMITER_END
+        @tokens << {type: "LiteralString", value: value}
+      else
+        @tokens << {type: "Text", value: value}
+      end
+    end
+  end
+
+  class CrystalTokenizer < Tartrazine::BaseTokenizer
+    include Iterator(Token)
+    @hl = CustomCrystalHighlighter.new
+    @lexer : BaseLexer
+    @iter : Iterator(Token)
+
+    # delegate next, to: @iter
+
+    def initialize(@lexer : BaseLexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      # Just do the tokenizing
+      @hl.highlight(text)
+      @iter = @hl.@tokens.each
+    end
+
+    def next : Iterator::Stop | Token
+      @iter.next
+    end
+  end
+
+  class CrystalLexer < BaseLexer
+    def tokenizer(text : String, secondary = false) : BaseTokenizer
+      CrystalTokenizer.new(self, text, secondary)
+    end
+  end
+end
diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr
index ce29075..b402cb8 100644
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -6,6 +6,9 @@ testcases = Dir.glob("#{__DIR__}/tests/**/*txt").sort
 # These are custom testcases
 examples = Dir.glob("#{__DIR__}/examples/**/*.*").reject(&.ends_with? ".json").sort!
 
+# CSS Stylesheets
+css_files = Dir.glob("#{__DIR__}/css/*.css")
+
 # These lexers don't load because of parsing issues
 failing_lexers = {
   "webgpu_shading_language",
@@ -81,6 +84,17 @@ describe Tartrazine do
     end
   end
 
+  describe "formatter" do
+    css_files.each do |css_file|
+      it "generates #{css_file}" do
+        css = File.read(css_file)
+        theme = Tartrazine.theme(File.basename(css_file, ".css"))
+        formatter = Tartrazine::Html.new(theme: theme)
+        formatter.style_defs.strip.should eq css.strip
+      end
+    end
+  end
+
   describe "to_html" do
     it "should do basic highlighting" do
       html = Tartrazine.to_html("puts 'Hello, World!'", "ruby", standalone: false)
diff --git a/src/run_tests.cr b/src/run_tests.cr
index a567ffd..4b5f229 100644
--- a/src/run_tests.cr
+++ b/src/run_tests.cr
@@ -1 +1 @@
-require "../spec/**"
+require "../spec/tartrazine_spec.cr"