Compare commits


3 Commits

7 changed files with 1292 additions and 1216 deletions

View File

@@ -72,8 +72,7 @@ end
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
-  lexer = Tartrazine.lexer(lexer_name)
-  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
   Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end
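
The spec helper now goes through the new `tokenizer` factory method instead of constructing `Tartrazine::Tokenizer` by hand. A minimal sketch of how calling code changes under this API (the `require` path and the "crystal" lexer name are assumptions, used only for illustration):

    require "tartrazine"

    # Old style: build the tokenizer directly from a concrete Lexer
    #   lexer = Tartrazine.lexer("crystal")
    #   tokenizer = Tartrazine::Tokenizer.new(lexer, "puts 1")

    # New style: any BaseLexer hands out its own tokenizer
    tokenizer = Tartrazine.lexer("crystal").tokenizer("puts 1")
    tokens = Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
    puts tokens.size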

View File

@@ -115,15 +115,13 @@ module Tartrazine
       when ActionType::Using
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tokenizer.new(
-          Tartrazine.lexer(@lexer_name),
+        Tartrazine.lexer(@lexer_name).tokenizer(
           String.new(match[match_group].value),
           secondary: true).to_a
       when ActionType::Usingself
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        Tokenizer.new(
-          tokenizer.lexer,
+        tokenizer.lexer.tokenizer(
           String.new(match[match_group].value),
           secondary: true).to_a
       when ActionType::Combined

View File

@@ -17,8 +17,8 @@ module Tartrazine
       outp.to_s
     end

-    def format(text : String, lexer : Lexer, outp : IO) : Nil
-      tokenizer = Tokenizer.new(lexer, text)
+    def format(text : String, lexer : BaseLexer, outp : IO) : Nil
+      tokenizer = lexer.tokenizer(text)
       i = 0
       outp << line_label(i) if line_numbers?
       tokenizer.each do |token|

View File

@@ -40,7 +40,7 @@ module Tartrazine
       outp.to_s
     end

-    def format(text : String, lexer : Lexer, io : IO) : Nil
+    def format(text : String, lexer : BaseLexer, io : IO) : Nil
       pre, post = wrap_standalone
       io << pre if standalone?
       format_text(text, lexer, io)
@@ -64,8 +64,8 @@ module Tartrazine
       "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
     end

-    def format_text(text : String, lexer : Lexer, outp : IO)
-      tokenizer = Tokenizer.new(lexer, text)
+    def format_text(text : String, lexer : BaseLexer, outp : IO)
+      tokenizer = lexer.tokenizer(text)
       i = 0
       if surrounding_pre?
         pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""

View File

@@ -4,14 +4,14 @@ module Tartrazine
   class Json < Formatter
     property name = "json"

-    def format(text : String, lexer : Lexer) : String
+    def format(text : String, lexer : BaseLexer) : String
       outp = String::Builder.new("")
       format(text, lexer, outp)
       outp.to_s
     end

-    def format(text : String, lexer : Lexer, io : IO) : Nil
-      tokenizer = Tokenizer.new(lexer, text)
+    def format(text : String, lexer : BaseLexer, io : IO) : Nil
+      tokenizer = lexer.tokenizer(text)
       io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
     end
   end
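
With these signature changes every formatter accepts a `BaseLexer`, so the same call works for a plain `Lexer` or a `DelegatingLexer`. A hedged usage sketch; the `format(text, lexer)` shape comes from the diff, while the formatter constructor and the "markdown" lexer name are assumptions:

    require "tartrazine"

    text = "# hello"
    lexer = Tartrazine.lexer("markdown")   # any BaseLexer works here
    formatter = Tartrazine::Json.new       # assumed default constructor
    puts formatter.format(text, lexer)     # collapsed token stream as JSON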

View File

@@ -9,17 +9,27 @@ module Tartrazine
   # Get the lexer object for a language name
   # FIXME: support mimetypes
-  def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
-    if name.nil? && filename.nil?
-      lexer_file_name = LEXERS_BY_NAME["plaintext"]
-    elsif name && name != "autodetect"
-      lexer_file_name = LEXERS_BY_NAME[name.downcase]
-    else
-      # Guess by filename
+  def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
+    return lexer_by_name(name) if name && name != "autodetect"
+    return lexer_by_filename(filename) if filename
+
+    Lexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
+  end
+
+  private def self.lexer_by_name(name : String) : BaseLexer
+    lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
+    return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
+    raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
+
+    Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
+  end
+
+  private def self.lexer_by_filename(filename : String) : BaseLexer
     candidates = Set(String).new
     LEXERS_BY_FILENAME.each do |k, v|
-      candidates += v.to_set if File.match?(k, File.basename(filename.to_s))
+      candidates += v.to_set if File.match?(k, File.basename(filename))
     end
     case candidates.size
     when 0
       lexer_file_name = LEXERS_BY_NAME["plaintext"]
@@ -28,10 +38,16 @@ module Tartrazine
     else
       raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
     end
-    end

     Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
   end

+  private def self.create_delegating_lexer(name : String) : BaseLexer
+    language, root = name.split("+", 2)
+    language_lexer = lexer(language)
+    root_lexer = lexer(root)
+    DelegatingLexer.new(language_lexer, root_lexer)
+  end
+
   # Return a list of all lexers
   def self.lexers : Array(String)
     LEXERS_BY_NAME.keys.sort!
@@ -40,15 +56,18 @@ module Tartrazine
   # A token, the output of the tokenizer
   alias Token = NamedTuple(type: String, value: String)

-  struct Tokenizer
+  abstract class BaseTokenizer
+  end
+
+  class Tokenizer < BaseTokenizer
     include Iterator(Token)
-    property lexer : Lexer
+    property lexer : BaseLexer
     property text : Bytes
     property pos : Int32 = 0
     @dq = Deque(Token).new
     property state_stack = ["root"]

-    def initialize(@lexer : Lexer, text : String, secondary = false)
+    def initialize(@lexer : BaseLexer, text : String, secondary = false)
       # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
         text += "\n"
@@ -106,13 +125,7 @@ module Tartrazine
     end
   end

-  # This implements a lexer for Pygments RegexLexers as expressed
-  # in Chroma's XML serialization.
-  #
-  # For explanations on what actions and states do
-  # the Pygments documentation is a good place to start.
-  # https://pygments.org/docs/lexerdevelopment/
-  struct Lexer
+  abstract class BaseLexer
     property config = {
       name: "",
       priority: 0.0,
@@ -123,6 +136,18 @@ module Tartrazine
     }
     property states = {} of String => State

+    def tokenizer(text : String, secondary = false) : BaseTokenizer
+      Tokenizer.new(self, text, secondary)
+    end
+  end
+
+  # This implements a lexer for Pygments RegexLexers as expressed
+  # in Chroma's XML serialization.
+  #
+  # For explanations on what actions and states do
+  # the Pygments documentation is a good place to start.
+  # https://pygments.org/docs/lexerdevelopment/
+  class Lexer < BaseLexer
     # Collapse consecutive tokens of the same type for easier comparison
     # and smaller output
     def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
@@ -204,6 +229,60 @@ module Tartrazine
     end
   end

+  # A lexer that takes two lexers as arguments. A root lexer
+  # and a language lexer. Everything is scanned using the
+  # language lexer; afterwards all `Other` tokens are lexed
+  # using the root lexer.
+  #
+  # This is useful for things like template languages, where
+  # you have Jinja + HTML or Jinja + CSS and so on.
+  class DelegatingLexer < BaseLexer
+    property language_lexer : BaseLexer
+    property root_lexer : BaseLexer
+
+    def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
+    end
+
+    def tokenizer(text : String, secondary = false) : DelegatingTokenizer
+      DelegatingTokenizer.new(self, text, secondary)
+    end
+  end
+
+  # This tokenizer works with a DelegatingLexer. It first tokenizes
+  # using the language lexer, and "Other" tokens are tokenized using
+  # the root lexer.
+  class DelegatingTokenizer < BaseTokenizer
+    include Iterator(Token)
+    @dq = Deque(Token).new
+    @language_tokenizer : BaseTokenizer
+
+    def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      token = @language_tokenizer.next
+      if token.is_a? Iterator::Stop
+        return stop
+      elsif token.as(Token).[:type] == "Other"
+        root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
+        root_tokenizer.each do |root_token|
+          @dq << root_token
+        end
+      else
+        @dq << token.as(Token)
+      end
+      self.next
+    end
+  end
+
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
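
Taken together, the lexer.cr changes mean a lexer name containing `+` resolves to a `DelegatingLexer`: `lexer_by_name` calls `create_delegating_lexer`, which splits the name into a language half and a root half, and the `DelegatingTokenizer` re-lexes every `Other` token produced by the language lexer with the root lexer. A usage sketch under the assumption that both halves are registered lexer names; the `jinja+html` spelling is only illustrative, taken from the comment in the diff:

    require "tartrazine"

    # "language+root": the language lexer runs first, its Other tokens
    # are then re-tokenized by the root lexer.
    lexer = Tartrazine.lexer("jinja+html")   # returns a DelegatingLexer
    tokenizer = lexer.tokenizer("<p>{{ user.name }}</p>")
    tokenizer.each do |token|
      puts "#{token[:type]}: #{token[:value].inspect}"
    end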