Compare commits


No commits in common. "a2884c4c78aaf01b9a71d28ad95e1b2b5c807b32" and "7f4296e9d72659ef798ff9fe0801dab0326ee54b" have entirely different histories.

7 changed files with 1213 additions and 1289 deletions


@@ -72,7 +72,8 @@ end
# Helper that creates lexer and tokenizes
def tokenize(lexer_name, text)
tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
lexer = Tartrazine.lexer(lexer_name)
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
end
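
For context, this is the call pattern the updated helper uses, sketched as a standalone snippet; the lexer name "crystal" and the sample source are placeholders for illustration, not taken from the diff:

    lexer = Tartrazine.lexer("crystal")                        # look the lexer up by name ("crystal" is an assumed name)
    tokenizer = Tartrazine::Tokenizer.new(lexer, "puts 1")     # build a tokenizer over the source text
    tokens = Tartrazine::Lexer.collapse_tokens(tokenizer.to_a) # merge runs of same-typed tokens
    tokens.each { |t| puts "#{t[:type]} #{t[:value].inspect}" }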


@@ -115,13 +115,15 @@ module Tartrazine
when ActionType::Using
# Shunt to another lexer entirely
return [] of Token if match.empty?
Tartrazine.lexer(@lexer_name).tokenizer(
Tokenizer.new(
Tartrazine.lexer(@lexer_name),
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Usingself
# Shunt to another copy of this lexer
return [] of Token if match.empty?
tokenizer.lexer.tokenizer(
Tokenizer.new(
tokenizer.lexer,
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Combined

File diff suppressed because it is too large.


@@ -17,8 +17,8 @@ module Tartrazine
outp.to_s
end
def format(text : String, lexer : BaseLexer, outp : IO) : Nil
tokenizer = lexer.tokenizer(text)
def format(text : String, lexer : Lexer, outp : IO) : Nil
tokenizer = Tokenizer.new(lexer, text)
i = 0
outp << line_label(i) if line_numbers?
tokenizer.each do |token|


@@ -40,7 +40,7 @@ module Tartrazine
outp.to_s
end
def format(text : String, lexer : BaseLexer, io : IO) : Nil
def format(text : String, lexer : Lexer, io : IO) : Nil
pre, post = wrap_standalone
io << pre if standalone?
format_text(text, lexer, io)
@@ -64,8 +64,8 @@ module Tartrazine
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
end
def format_text(text : String, lexer : BaseLexer, outp : IO)
tokenizer = lexer.tokenizer(text)
def format_text(text : String, lexer : Lexer, outp : IO)
tokenizer = Tokenizer.new(lexer, text)
i = 0
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""


@@ -4,14 +4,14 @@ module Tartrazine
class Json < Formatter
property name = "json"
def format(text : String, lexer : BaseLexer) : String
def format(text : String, lexer : Lexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : BaseLexer, io : IO) : Nil
tokenizer = lexer.tokenizer(text)
def format(text : String, lexer : Lexer, io : IO) : Nil
tokenizer = Tokenizer.new(lexer, text)
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
end
end
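
A usage sketch for this formatter, under the assumptions that the class lives at Tartrazine::Json (the hunk's enclosing module) and that Json.new takes no required arguments; the lexer name and sample text are placeholders:

    lexer = Tartrazine.lexer("crystal")      # assumed lexer name, as in the earlier sketch
    formatter = Tartrazine::Json.new         # class path inferred from the `module Tartrazine` context above
    puts formatter.format("puts 1", lexer)   # String overload; wraps the IO overload shown in the hunk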


@@ -9,45 +9,29 @@ module Tartrazine
# Get the lexer object for a language name
# FIXME: support mimetypes
def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
return lexer_by_name(name) if name && name != "autodetect"
return lexer_by_filename(filename) if filename
Lexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
end
private def self.lexer_by_name(name : String) : BaseLexer
lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.lexer_by_filename(filename : String) : BaseLexer
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename))
end
case candidates.size
when 0
def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
if name.nil? && filename.nil?
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
elsif name && name != "autodetect"
lexer_file_name = LEXERS_BY_NAME[name.downcase]
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
# Guess by filename
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename.to_s))
end
case candidates.size
when 0
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
end
end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2)
language_lexer = lexer(language)
root_lexer = lexer(root)
DelegatingLexer.new(language_lexer, root_lexer)
end
# Return a list of all lexers
def self.lexers : Array(String)
LEXERS_BY_NAME.keys.sort!
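
The rewritten dispatch above covers three paths: no arguments (plaintext fallback), an explicit name, or a filename-based guess. Summarized as call sites, with the concrete name and filename invented for illustration:

    Tartrazine.lexer                          # no arguments: falls back to the "plaintext" lexer
    Tartrazine.lexer("ruby")                  # by name (placeholder; must be a key in LEXERS_BY_NAME)
    Tartrazine.lexer(filename: "index.html")  # guessed from LEXERS_BY_FILENAME glob patterns
    Tartrazine.lexers                         # sorted names of every bundled lexer

If a filename matches the patterns of more than one lexer, the code above raises instead of guessing.
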
@@ -56,18 +40,15 @@ module Tartrazine
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
abstract class BaseTokenizer
end
class Tokenizer < BaseTokenizer
struct Tokenizer
include Iterator(Token)
property lexer : BaseLexer
property lexer : Lexer
property text : Bytes
property pos : Int32 = 0
@dq = Deque(Token).new
property state_stack = ["root"]
def initialize(@lexer : BaseLexer, text : String, secondary = false)
def initialize(@lexer : Lexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
@@ -125,7 +106,13 @@
end
end
abstract class BaseLexer
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
struct Lexer
property config = {
name: "",
priority: 0.0,
@@ -136,18 +123,6 @@
}
property states = {} of String => State
def tokenizer(text : String, secondary = false) : BaseTokenizer
Tokenizer.new(self, text, secondary)
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
class Lexer < BaseLexer
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
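
As a worked illustration of what collapsing means here, with invented token values and the expected result following from the comment above:

    tokens = [
      {type: "Text", value: "foo"},
      {type: "Text", value: "bar"},       # consecutive token of the same type as the previous one
      {type: "Keyword", value: "def"},
    ]
    Tartrazine::Lexer.collapse_tokens(tokens)
    # => [{type: "Text", value: "foobar"}, {type: "Keyword", value: "def"}]
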
@@ -229,60 +204,6 @@
end
end
# A lexer that takes two lexers as arguments. A root lexer
# and a language lexer. Everything is scanned using the
# language lexer, afterwards all `Other` tokens are lexed
# using the root lexer.
#
# This is useful for things like template languages, where
# you have Jinja + HTML or Jinja + CSS and so on.
class DelegatingLexer < BaseLexer
property language_lexer : BaseLexer
property root_lexer : BaseLexer
def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
end
def tokenizer(text : String, secondary = false) : DelegatingTokenizer
DelegatingTokenizer.new(self, text, secondary)
end
end
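
For reference, a sketch of how the delegating path being removed here was reached through the public API; the composite name "jinja+html" assumes both lexers are registered and follows the split-on-"+" convention in create_delegating_lexer further up:

    # Removed API, sketched only: the part before "+" scans everything,
    # the part after "+" re-lexes the "Other" tokens.
    lexer = Tartrazine.lexer("jinja+html")   # returned a DelegatingLexer in the removed code
    tokenizer = lexer.tokenizer("{% if x %}<b>hi</b>{% endif %}")
    tokenizer.each { |token| p token }
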
# This Tokenizer works with a DelegatingLexer. It first tokenizes
# using the language lexer, and "Other" tokens are tokenized using
# the root lexer.
class DelegatingTokenizer < BaseTokenizer
include Iterator(Token)
@dq = Deque(Token).new
@language_tokenizer : BaseTokenizer
def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
token = @language_tokenizer.next
if token.is_a? Iterator::Stop
return stop
elsif token.as(Token).[:type] == "Other"
root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
root_tokenizer.each do |root_token|
@dq << root_token
end
else
@dq << token.as(Token)
end
self.next
end
end
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.