require "./constants/lexers" require "./heuristics" require "baked_file_system" require "crystal/syntax_highlighter" module Tartrazine class LexerFiles extend BakedFileSystem bake_folder "../lexers", __DIR__ end # Get the lexer object for a language name # FIXME: support mimetypes def self.lexer(name : String? = nil, filename : String? = nil, mimetype : String? = nil) : BaseLexer return lexer_by_name(name) if name && name != "autodetect" return lexer_by_filename(filename) if filename return lexer_by_mimetype(mimetype) if mimetype RegexLexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end) end private def self.lexer_by_mimetype(mimetype : String) : BaseLexer lexer_file_name = LEXERS_BY_MIMETYPE.fetch(mimetype, nil) raise Exception.new("Unknown mimetype: #{mimetype}") if lexer_file_name.nil? RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end) end private def self.lexer_by_name(name : String) : BaseLexer return CrystalLexer.new if name == "crystal" lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil) return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+" raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil? RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end) end private def self.lexer_by_filename(filename : String) : BaseLexer if filename.ends_with?(".cr") return CrystalLexer.new end candidates = Set(String).new LEXERS_BY_FILENAME.each do |k, v| candidates += v.to_set if File.match?(k, File.basename(filename)) end case candidates.size when 0 lexer_file_name = LEXERS_BY_NAME["plaintext"] when 1 lexer_file_name = candidates.first else lexer_file_name = self.lexer_by_content(filename) begin return self.lexer(lexer_file_name) rescue ex : Exception raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.") end end RegexLexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end) end private def self.lexer_by_content(fname : String) : String? h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end) result = h.run(fname, File.read(fname)) case result when Nil raise Exception.new "No lexer found for #{fname}" when String result.as(String) when Array(String) result.first end end private def self.create_delegating_lexer(name : String) : BaseLexer language, root = name.split("+", 2) language_lexer = lexer(language) root_lexer = lexer(root) DelegatingLexer.new(language_lexer, root_lexer) end # Return a list of all lexers def self.lexers : Array(String) LEXERS_BY_NAME.keys.sort! 
  # A token, the output of the tokenizer
  alias Token = NamedTuple(type: String, value: String)

  abstract class BaseTokenizer
  end

  class Tokenizer < BaseTokenizer
    include Iterator(Token)
    property lexer : BaseLexer
    property text : Bytes
    property pos : Int32 = 0
    @dq = Deque(Token).new
    property state_stack = ["root"]

    def initialize(@lexer : BaseLexer, text : String, secondary = false)
      # Respect the `ensure_nl` config option
      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
        text += "\n"
      end
      @text = text.to_slice
    end

    def next : Iterator::Stop | Token
      if @dq.size > 0
        return @dq.shift
      end
      if pos == @text.size
        return stop
      end

      matched = false
      while @pos < @text.size
        @lexer.states[@state_stack.last].rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(@text, @pos, self)
          if matched
            @pos = new_pos
            split_tokens(new_tokens).each { |token| @dq << token }
            break
          end
        end
        if !matched
          if @text[@pos] == 10u8
            @dq << {type: "Text", value: "\n"}
            @state_stack = ["root"]
          else
            @dq << {type: "Error", value: String.new(@text[@pos..@pos])}
          end
          @pos += 1
          break
        end
      end
      self.next
    end

    # If a token contains a newline, split it into two tokens
    def split_tokens(tokens : Array(Token)) : Array(Token)
      split_tokens = [] of Token
      tokens.each do |token|
        if token[:value].includes?("\n")
          values = token[:value].split("\n")
          values.each_with_index do |value, index|
            value += "\n" if index < values.size - 1
            split_tokens << {type: token[:type], value: value}
          end
        else
          split_tokens << token
        end
      end
      split_tokens
    end
  end

  alias BaseLexer = Lexer

  abstract class Lexer
    property config = {
      name:             "",
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }
    property states = {} of String => State

    def tokenizer(text : String, secondary = false) : BaseTokenizer
      Tokenizer.new(self, text, secondary)
    end
  end
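
  # A rough illustration of the tokenizer contract above. The lexer name and
  # token types are only examples, not guaranteed output.
  #
  # ```
  # lexer = Tartrazine.lexer("plaintext")
  # lexer.tokenizer("hello\nworld").each do |token|
  #   puts "#{token[:type]}: #{token[:value].inspect}"
  # end
  # # Because of `split_tokens`, no token value spans more than one line:
  # # a single match over "hello\nworld" is split into "hello\n" and "world".
  # ```
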
  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations on what actions and states do
  # the Pygments documentation is a good place to start.
  # https://pygments.org/docs/lexerdevelopment/
  class RegexLexer < BaseLexer
    # Collapse consecutive tokens of the same type for easier comparison
    # and smaller output
    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
      result = [] of Tartrazine::Token
      tokens = tokens.reject { |token| token[:value] == "" }
      tokens.each do |token|
        if result.empty?
          result << token
          next
        end
        last = result.last
        if last[:type] == token[:type]
          new_token = {type: last[:type], value: last[:value] + token[:value]}
          result.pop
          result << new_token
        else
          result << token
        end
      end
      result
    end

    def self.from_xml(xml : String) : Lexer
      l = RegexLexer.new
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |node| node.name == "config" }
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end

        rules = lexer.children.find { |node| node.name == "rules" }
        if rules
          # Rules contains states 🤷
          rules.children.select { |node| node.name == "state" }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              raise Exception.new("Duplicate state: #{state.name}")
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |node| node.name == "rule" }.each do |rule_node|
              case rule_node["pattern"]?
              when nil
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new(rule_node)
                else
                  rule = UnconditionalRule.new(rule_node)
                end
              else
                rule = Rule.new(rule_node,
                  multiline: !l.config[:not_multiline],
                  dotall: l.config[:dot_all],
                  ignorecase: l.config[:case_insensitive])
              end
              state.rules << rule
            end
          end
        end
      end
      l
    end
  end

  # A lexer that takes two lexers as arguments. A root lexer
  # and a language lexer. Everything is scanned using the
  # language lexer, afterwards all `Other` tokens are lexed
  # using the root lexer.
  #
  # This is useful for things like template languages, where
  # you have Jinja + HTML or Jinja + CSS and so on.
  class DelegatingLexer < Lexer
    property language_lexer : BaseLexer
    property root_lexer : BaseLexer

    def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
    end

    def tokenizer(text : String, secondary = false) : DelegatingTokenizer
      DelegatingTokenizer.new(self, text, secondary)
    end
  end

  # This Tokenizer works with a DelegatingLexer. It first tokenizes
  # using the language lexer, and "Other" tokens are tokenized using
  # the root lexer.
  class DelegatingTokenizer < BaseTokenizer
    include Iterator(Token)
    @dq = Deque(Token).new
    @language_tokenizer : BaseTokenizer

    def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
      # Respect the `ensure_nl` config option
      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
        text += "\n"
      end
      @language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
    end

    def next : Iterator::Stop | Token
      if @dq.size > 0
        return @dq.shift
      end
      token = @language_tokenizer.next
      if token.is_a? Iterator::Stop
        return stop
      elsif token.as(Token).[:type] == "Other"
        root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
        root_tokenizer.each do |root_token|
          @dq << root_token
        end
      else
        @dq << token.as(Token)
      end
      self.next
    end
  end
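
  # A sketch of how delegation is typically wired up. The "django+html" name
  # is only an example and assumes both a "django" and an "html" lexer exist
  # in the bundled lexer set.
  #
  # ```
  # lexer = Tartrazine.lexer("django+html")
  # # create_delegating_lexer splits on "+": "django" becomes the language
  # # lexer and "html" the root lexer that re-lexes the "Other" tokens.
  # ```
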
  # A Lexer state. A state has a name and a list of rules.
  # The state machine has a state stack containing references
  # to states to decide which rules to apply.
  struct State
    property name : String = ""
    property rules = [] of BaseRule

    def +(other : State)
      new_state = State.new
      new_state.name = Random.base58(8)
      new_state.rules = rules + other.rules
      new_state
    end
  end

  class CustomCrystalHighlighter < Crystal::SyntaxHighlighter
    @tokens = [] of Token

    def render_delimiter(&block)
      @tokens << {type: "LiteralString", value: block.call.to_s}
    end

    def render_interpolation(&block)
      @tokens << {type: "LiteralStringInterpol", value: "\#{"}
      @tokens << {type: "Text", value: block.call.to_s}
      @tokens << {type: "LiteralStringInterpol", value: "}"}
    end

    def render_string_array(&block)
      @tokens << {type: "LiteralString", value: block.call.to_s}
    end

    # ameba:disable Metrics/CyclomaticComplexity
    def render(type : TokenType, value : String)
      case type
      when .comment?
        @tokens << {type: "Comment", value: value}
      when .number?
        @tokens << {type: "LiteralNumber", value: value}
      when .char?
        @tokens << {type: "LiteralStringChar", value: value}
      when .symbol?
        @tokens << {type: "LiteralStringSymbol", value: value}
      when .const?
        @tokens << {type: "NameConstant", value: value}
      when .string?
        @tokens << {type: "LiteralString", value: value}
      when .ident?
        @tokens << {type: "NameVariable", value: value}
      when .keyword?, .self?
        @tokens << {type: "NameKeyword", value: value}
      when .primitive_literal?
        @tokens << {type: "Literal", value: value}
      when .operator?
        @tokens << {type: "Operator", value: value}
      when Crystal::SyntaxHighlighter::TokenType::DELIMITED_TOKEN,
           Crystal::SyntaxHighlighter::TokenType::DELIMITER_START,
           Crystal::SyntaxHighlighter::TokenType::DELIMITER_END
        @tokens << {type: "LiteralString", value: value}
      else
        @tokens << {type: "Text", value: value}
      end
    end
  end

  class CrystalTokenizer < Tartrazine::BaseTokenizer
    include Iterator(Token)
    @hl = CustomCrystalHighlighter.new
    @lexer : BaseLexer
    @iter : Iterator(Token)

    # delegate next, to: @iter
    def initialize(@lexer : BaseLexer, text : String, secondary = false)
      # Respect the `ensure_nl` config option
      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
        text += "\n"
      end
      # Just do the tokenizing
      @hl.highlight(text)
      @iter = @hl.@tokens.each
    end

    def next : Iterator::Stop | Token
      @iter.next
    end
  end

  class CrystalLexer < BaseLexer
    def tokenizer(text : String, secondary = false) : BaseTokenizer
      CrystalTokenizer.new(self, text, secondary)
    end
  end
end
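
# A quick end-to-end sketch of the Crystal fast path above: ".cr" filenames
# bypass the XML lexers and use Crystal::SyntaxHighlighter via CrystalLexer.
# The token stream shown is illustrative; exact types depend on the stdlib
# highlighter.
#
# ```
# lexer = Tartrazine.lexer(filename: "shard.cr") # returns a CrystalLexer
# lexer.tokenizer("puts 1 + 2").to_a
# # => e.g. [{type: "NameVariable", value: "puts"}, {type: "LiteralNumber", value: "1"}, ...]
# ```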