diff --git a/spec/tartrazine_spec.cr b/spec/tartrazine_spec.cr
index 15011e9..a139af9 100644
--- a/spec/tartrazine_spec.cr
+++ b/spec/tartrazine_spec.cr
@@ -72,8 +72,7 @@ end
 
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
-  lexer = Tartrazine.lexer(lexer_name)
-  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
   Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end
 
diff --git a/src/actions.cr b/src/actions.cr
index b626dd2..74ac190 100644
--- a/src/actions.cr
+++ b/src/actions.cr
@@ -115,15 +115,13 @@ module Tartrazine
       when ActionType::Using
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tokenizer.new(
-          Tartrazine.lexer(@lexer_name),
+        Tartrazine.lexer(@lexer_name).tokenizer(
           String.new(match[match_group].value),
           secondary: true).to_a
       when ActionType::Usingself
         # Shunt to another copy of this lexer
        return [] of Token if match.empty?
-        Tokenizer.new(
-          tokenizer.lexer,
+        tokenizer.lexer.tokenizer(
           String.new(match[match_group].value),
           secondary: true).to_a
       when ActionType::Combined
diff --git a/src/formatters/ansi.cr b/src/formatters/ansi.cr
index fc9e608..aaf8864 100644
--- a/src/formatters/ansi.cr
+++ b/src/formatters/ansi.cr
@@ -17,8 +17,8 @@ module Tartrazine
       outp.to_s
     end
 
-    def format(text : String, lexer : Lexer, outp : IO) : Nil
-      tokenizer = Tokenizer.new(lexer, text)
+    def format(text : String, lexer : BaseLexer, outp : IO) : Nil
+      tokenizer = lexer.tokenizer(text)
       i = 0
       outp << line_label(i) if line_numbers?
       tokenizer.each do |token|
diff --git a/src/formatters/html.cr b/src/formatters/html.cr
index 6fab641..05079b4 100644
--- a/src/formatters/html.cr
+++ b/src/formatters/html.cr
@@ -40,7 +40,7 @@ module Tartrazine
       outp.to_s
     end
 
-    def format(text : String, lexer : Lexer, io : IO) : Nil
+    def format(text : String, lexer : BaseLexer, io : IO) : Nil
       pre, post = wrap_standalone
       io << pre if standalone?
       format_text(text, lexer, io)
@@ -64,8 +64,8 @@ module Tartrazine
       "#{line_label} "
     end
 
-    def format_text(text : String, lexer : Lexer, outp : IO)
-      tokenizer = Tokenizer.new(lexer, text)
+    def format_text(text : String, lexer : BaseLexer, outp : IO)
+      tokenizer = lexer.tokenizer(text)
       i = 0
       if surrounding_pre?
         pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
diff --git a/src/formatters/json.cr b/src/formatters/json.cr
index 5ba28ea..2669181 100644
--- a/src/formatters/json.cr
+++ b/src/formatters/json.cr
@@ -4,14 +4,14 @@ module Tartrazine
   class Json < Formatter
     property name = "json"
 
-    def format(text : String, lexer : Lexer) : String
+    def format(text : String, lexer : BaseLexer) : String
       outp = String::Builder.new("")
       format(text, lexer, outp)
       outp.to_s
     end
 
-    def format(text : String, lexer : Lexer, io : IO) : Nil
-      tokenizer = Tokenizer.new(lexer, text)
+    def format(text : String, lexer : BaseLexer, io : IO) : Nil
+      tokenizer = lexer.tokenizer(text)
       io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
     end
   end
diff --git a/src/lexer.cr b/src/lexer.cr
index 20b0eb9..04bf093 100644
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -9,11 +9,20 @@ module Tartrazine
 
   # Get the lexer object for a language name
   # FIXME: support mimetypes
-  def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
+  def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
     if name.nil? && filename.nil?
       lexer_file_name = LEXERS_BY_NAME["plaintext"]
     elsif name && name != "autodetect"
-      lexer_file_name = LEXERS_BY_NAME[name.downcase]
+      lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
+      if lexer_file_name.nil? && name.includes? "+"
+        # Delegating lexer
+        language, root = name.split("+", 2)
+        language_lexer = lexer(language)
+        root_lexer = lexer(root)
+        return DelegatingLexer.new(language_lexer, root_lexer)
+      elsif lexer_file_name.nil?
+        raise Exception.new("Unknown lexer: #{name}")
+      end
     else
       # Guess by filename
       candidates = Set(String).new
@@ -40,7 +49,10 @@ module Tartrazine
   # A token, the output of the tokenizer
   alias Token = NamedTuple(type: String, value: String)
 
-  struct Tokenizer
+  abstract class BaseTokenizer
+  end
+
+  class Tokenizer < BaseTokenizer
     include Iterator(Token)
     property lexer : BaseLexer
     property text : Bytes
@@ -48,7 +60,7 @@ module Tartrazine
     @dq = Deque(Token).new
     property state_stack = ["root"]
 
-    def initialize(@lexer : Lexer, text : String, secondary = false)
+    def initialize(@lexer : BaseLexer, text : String, secondary = false)
       # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
         text += "\n"
@@ -106,16 +118,7 @@ module Tartrazine
     end
   end
 
-  abstract struct BaseLexer
-  end
-
-  # This implements a lexer for Pygments RegexLexers as expressed
-  # in Chroma's XML serialization.
-  #
-  # For explanations on what actions and states do
-  # the Pygments documentation is a good place to start.
-  # https://pygments.org/docs/lexerdevelopment/
-  struct Lexer < BaseLexer
+  abstract class BaseLexer
     property config = {
       name: "",
       priority: 0.0,
@@ -126,6 +129,18 @@ module Tartrazine
     }
     property states = {} of String => State
 
+    def tokenizer(text : String, secondary = false) : BaseTokenizer
+      Tokenizer.new(self, text, secondary)
+    end
+  end
+
+  # This implements a lexer for Pygments RegexLexers as expressed
+  # in Chroma's XML serialization.
+  #
+  # For explanations on what actions and states do
+  # the Pygments documentation is a good place to start.
+  # https://pygments.org/docs/lexerdevelopment/
+  class Lexer < BaseLexer
     # Collapse consecutive tokens of the same type for easier comparison
     # and smaller output
     def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
@@ -214,27 +229,32 @@ module Tartrazine
   #
   # This is useful for things like template languages, where
   # you have Jinja + HTML or Jinja + CSS and so on.
-  struct DelegatingLexer < BaseLexer
-    property root_lexer : Lexer
-    property language_lexer : Lexer
+  class DelegatingLexer < BaseLexer
+    property language_lexer : BaseLexer
+    property root_lexer : BaseLexer
 
-    def initialize(@lexer : Lexer, @delegate : Lexer)
+    def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
+    end
+
+    def tokenizer(text : String, secondary = false) : DelegatingTokenizer
+      DelegatingTokenizer.new(self, text, secondary)
     end
   end
 
   # This Tokenizer works with a DelegatingLexer. It first tokenizes
   # using the language lexer, and "Other" tokens are tokenized using
   # the root lexer.
-  struct DelegatingTokenizer
+  class DelegatingTokenizer < BaseTokenizer
     include Iterator(Token)
     @dq = Deque(Token).new
+    @language_tokenizer : BaseTokenizer
 
-    def initialize(@lexer : Lexer, text : String, secondary = false)
+    def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
       # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
         text += "\n"
       end
-      @language_tokenizer = Tokenizer.new(@lexer.language_lexer, text, true)
+      @language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
     end
 
     def next : Iterator::Stop | Token
@@ -242,16 +262,15 @@ module Tartrazine
         return @dq.shift
       end
       token = @language_tokenizer.next
-      if token == Iterator::Stop
+      if token.is_a? Iterator::Stop
         return stop
-      end
-      if token[:type] == "Other"
-        @root_tokenizer = Tokenizer.new(@lexer.root_lexer, token[:value], true)
-        @root_tokenizer.each do |root_token|
+      elsif token.as(Token).[:type] == "Other"
+        root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
+        root_tokenizer.each do |root_token|
           @dq << root_token
         end
       else
-        dq << token
+        @dq << token.as(Token)
       end
       self.next
     end
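
A minimal usage sketch of the API this patch introduces, not part of the diff itself: a lexer now builds its own tokenizer through BaseLexer#tokenizer, and a name of the form "language+root" (for example "jinja+html") yields a DelegatingLexer whose "Other" tokens are re-tokenized by the root lexer. The lexer names and sample text below are illustrative assumptions, not values taken from the patch.

require "tartrazine"

# Plain lexer: the lexer object creates its own tokenizer.
lexer = Tartrazine.lexer("crystal") # assumed lexer name
tokens = Tartrazine::Lexer.collapse_tokens(lexer.tokenizer("puts 1 + 1").to_a)
puts tokens

# Delegating lexer: tokenize with the Jinja lexer first, then re-tokenize
# "Other" runs with the HTML root lexer (both names assumed to exist in LEXERS_BY_NAME).
Tartrazine.lexer("jinja+html").tokenizer("<p>{{ name }}</p>").each do |token|
  puts "#{token[:type]}: #{token[:value].inspect}"
end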