Use classes instead of structs to allow properties of the same type

Roberto Alsina 2024-08-22 21:52:59 -03:00
parent 0f3b7fc3c5
commit bd3df10d2c
6 changed files with 57 additions and 41 deletions
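Context for the change: in Crystal a struct is a value type stored inline, so a struct cannot have an instance variable whose type is, or can contain, the struct's own type; the compiler rejects it as a recursive struct. That is exactly the shape DelegatingLexer needs (a lexer holding two other lexers), hence the move to classes. A minimal sketch of the problem with hypothetical names (exact error wording may vary):

    # Structs store fields inline, so a self-referential hierarchy
    # cannot be sized and fails to compile:
    abstract struct Shape
    end

    struct Pair < Shape
      # Error: recursive struct Pair detected (a Shape stored inline
      # could itself be a Pair, making the value infinitely large)
      property left : Shape
      property right : Shape

      def initialize(@left : Shape, @right : Shape)
      end
    end

    # With classes the fields are references, so the same shape is fine:
    abstract class Node
    end

    class PairNode < Node
      property left : Node
      property right : Node

      def initialize(@left : Node, @right : Node)
      end
    end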

View File

@@ -72,8 +72,7 @@ end
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
-  lexer = Tartrazine.lexer(lexer_name)
-  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
   Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end
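The spec helper now goes through the lexer's own tokenizer factory instead of constructing Tartrazine::Tokenizer directly. For reference, a standalone usage sketch of the new entry point (the "crystal" lexer name is an assumption; any bundled lexer works):

    require "tartrazine"

    tokenizer = Tartrazine.lexer("crystal").tokenizer("puts 1 + 2\n")
    tokens = Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
    tokens.each { |t| puts "#{t[:type]}: #{t[:value].inspect}" }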

View File

@@ -115,15 +115,13 @@ module Tartrazine
       when ActionType::Using
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tokenizer.new(
-          Tartrazine.lexer(@lexer_name),
+        Tartrazine.lexer(@lexer_name).tokenizer(
           String.new(match[match_group].value),
           secondary: true).to_a
       when ActionType::Usingself
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        Tokenizer.new(
-          tokenizer.lexer,
+        tokenizer.lexer.tokenizer(
           String.new(match[match_group].value),
           secondary: true).to_a
       when ActionType::Combined
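Both action branches now delegate tokenizer construction to the lexer: Using tokenizes the matched bytes with a named lexer, Usingself with the current tokenizer's own lexer. Reduced to a sketch of the Using case (lexer name and match text are illustrative):

    # What ActionType::Using boils down to: feed the matched text to
    # another lexer as a secondary tokenizer and splice in its tokens.
    sub_tokens = Tartrazine.lexer("javascript")
      .tokenizer("alert(1)", secondary: true)
      .to_a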

View File

@@ -17,8 +17,8 @@ module Tartrazine
       outp.to_s
     end

-    def format(text : String, lexer : Lexer, outp : IO) : Nil
-      tokenizer = Tokenizer.new(lexer, text)
+    def format(text : String, lexer : BaseLexer, outp : IO) : Nil
+      tokenizer = lexer.tokenizer(text)
       i = 0
       outp << line_label(i) if line_numbers?
       tokenizer.each do |token|

View File

@@ -40,7 +40,7 @@ module Tartrazine
       outp.to_s
     end

-    def format(text : String, lexer : Lexer, io : IO) : Nil
+    def format(text : String, lexer : BaseLexer, io : IO) : Nil
       pre, post = wrap_standalone
       io << pre if standalone?
       format_text(text, lexer, io)
@@ -64,8 +64,8 @@ module Tartrazine
       "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
     end

-    def format_text(text : String, lexer : Lexer, outp : IO)
-      tokenizer = Tokenizer.new(lexer, text)
+    def format_text(text : String, lexer : BaseLexer, outp : IO)
+      tokenizer = lexer.tokenizer(text)
       i = 0
       if surrounding_pre?
         pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""

View File

@@ -4,14 +4,14 @@ module Tartrazine
   class Json < Formatter
     property name = "json"

-    def format(text : String, lexer : Lexer) : String
+    def format(text : String, lexer : BaseLexer) : String
       outp = String::Builder.new("")
       format(text, lexer, outp)
       outp.to_s
     end

-    def format(text : String, lexer : Lexer, io : IO) : Nil
-      tokenizer = Tokenizer.new(lexer, text)
+    def format(text : String, lexer : BaseLexer, io : IO) : Nil
+      tokenizer = lexer.tokenizer(text)
       io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
     end
   end

View File

@@ -9,11 +9,20 @@ module Tartrazine
   # Get the lexer object for a language name
   # FIXME: support mimetypes
-  def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
+  def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
     if name.nil? && filename.nil?
       lexer_file_name = LEXERS_BY_NAME["plaintext"]
     elsif name && name != "autodetect"
-      lexer_file_name = LEXERS_BY_NAME[name.downcase]
+      lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
+      if lexer_file_name.nil? && name.includes? "+"
+        # Delegating lexer
+        language, root = name.split("+", 2)
+        language_lexer = lexer(language)
+        root_lexer = lexer(root)
+        return DelegatingLexer.new(language_lexer, root_lexer)
+      elsif lexer_file_name.nil?
+        raise Exception.new("Unknown lexer: #{name}")
+      end
     else
       # Guess by filename
       candidates = Set(String).new
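With the fetch-based lookup, a name containing "+" that is not itself a registered lexer is split into a language/root pair and served by the new DelegatingLexer, while genuinely unknown names now raise instead of failing a bare Hash lookup. A sketch (lexer names are examples; availability depends on the bundled definitions):

    lexer = Tartrazine.lexer("jinja+html") # DelegatingLexer(jinja, html)

    begin
      Tartrazine.lexer("not-a-language")
    rescue ex
      puts ex.message # => "Unknown lexer: not-a-language"
    end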
@@ -40,7 +49,10 @@ module Tartrazine
   # A token, the output of the tokenizer
   alias Token = NamedTuple(type: String, value: String)

-  struct Tokenizer
+  abstract class BaseTokenizer
+  end
+
+  class Tokenizer < BaseTokenizer
     include Iterator(Token)
     property lexer : BaseLexer
     property text : Bytes
@@ -48,7 +60,7 @@ module Tartrazine
     @dq = Deque(Token).new
     property state_stack = ["root"]

-    def initialize(@lexer : Lexer, text : String, secondary = false)
+    def initialize(@lexer : BaseLexer, text : String, secondary = false)
       # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
         text += "\n"
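The ensure_nl handling is unchanged in substance: a top-level tokenizer appends a trailing newline when the lexer's config asks for it, while secondary tokenizers (spawned for Using/Usingself and for delegation) leave the text alone so spliced-in fragments are not padded. Assuming a lexer whose config sets ensure_nl:

    lexer = Tartrazine.lexer("crystal")        # assumed: ensure_nl is true
    lexer.tokenizer("puts 1")                  # tokenizes "puts 1\n"
    lexer.tokenizer("puts 1", secondary: true) # tokenizes "puts 1" as-is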
@@ -106,16 +118,7 @@ module Tartrazine
     end
   end

-  abstract struct BaseLexer
-  end
-
-  # This implements a lexer for Pygments RegexLexers as expressed
-  # in Chroma's XML serialization.
-  #
-  # For explanations on what actions and states do
-  # the Pygments documentation is a good place to start.
-  # https://pygments.org/docs/lexerdevelopment/
-  struct Lexer < BaseLexer
+  abstract class BaseLexer
     property config = {
       name:     "",
       priority: 0.0,
@@ -126,6 +129,18 @@ module Tartrazine
     }
     property states = {} of String => State

+    def tokenizer(text : String, secondary = false) : BaseTokenizer
+      Tokenizer.new(self, text, secondary)
+    end
+  end
+
+  # This implements a lexer for Pygments RegexLexers as expressed
+  # in Chroma's XML serialization.
+  #
+  # For explanations on what actions and states do
+  # the Pygments documentation is a good place to start.
+  # https://pygments.org/docs/lexerdevelopment/
+  class Lexer < BaseLexer
     # Collapse consecutive tokens of the same type for easier comparison
     # and smaller output
     def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
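Moving config, states, and a default tokenizer factory up into BaseLexer is what lets the formatters above accept BaseLexer: they call lexer.tokenizer(text) and never name a concrete tokenizer class, so DelegatingLexer can substitute its own tokenizer type. The dispatch, reduced to a sketch with a hypothetical helper:

    # Hypothetical helper: works for Lexer and DelegatingLexer alike,
    # because each returns its matching BaseTokenizer subclass.
    def dump_tokens(text : String, lexer : Tartrazine::BaseLexer)
      lexer.tokenizer(text).each do |token|
        puts "#{token[:type]}\t#{token[:value].inspect}"
      end
    end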
@@ -214,27 +229,32 @@ module Tartrazine
   #
   # This is useful for things like template languages, where
   # you have Jinja + HTML or Jinja + CSS and so on.
-  struct DelegatingLexer < BaseLexer
-    property root_lexer : Lexer
-    property language_lexer : Lexer
+  class DelegatingLexer < BaseLexer
+    property language_lexer : BaseLexer
+    property root_lexer : BaseLexer

-    def initialize(@lexer : Lexer, @delegate : Lexer)
+    def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
+    end
+
+    def tokenizer(text : String, secondary = false) : DelegatingTokenizer
+      DelegatingTokenizer.new(self, text, secondary)
     end
   end

   # This Tokenizer works with a DelegatingLexer. It first tokenizes
   # using the language lexer, and "Other" tokens are tokenized using
   # the root lexer.
-  struct DelegatingTokenizer
+  class DelegatingTokenizer < BaseTokenizer
     include Iterator(Token)
     @dq = Deque(Token).new
+    @language_tokenizer : BaseTokenizer

-    def initialize(@lexer : Lexer, text : String, secondary = false)
+    def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
       # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
         text += "\n"
       end
-      @language_tokenizer = Tokenizer.new(@lexer.language_lexer, text, true)
+      @language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
     end

     def next : Iterator::Stop | Token
@@ -242,16 +262,15 @@
         return @dq.shift
       end
       token = @language_tokenizer.next
-      if token == Iterator::Stop
+      if token.is_a? Iterator::Stop
         return stop
-      end
-      if token[:type] == "Other"
-        @root_tokenizer = Tokenizer.new(@lexer.root_lexer, token[:value], true)
-        @root_tokenizer.each do |root_token|
+      elsif token.as(Token).[:type] == "Other"
+        root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
+        root_tokenizer.each do |root_token|
           @dq << root_token
         end
       else
-        dq << token
+        @dq << token.as(Token)
       end
       self.next
     end
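End to end, the delegation works like this: the language tokenizer emits its own tokens plus "Other" spans for text it does not claim, and each "Other" span is re-tokenized by the root lexer, with the resulting tokens buffered in @dq and drained first on subsequent next calls. A usage sketch (lexer names and input are illustrative):

    lexer = Tartrazine.lexer("jinja+html")
    lexer.tokenizer("<b>{{ name }}</b>\n").each do |token|
      # "{{ name }}" comes from the Jinja language lexer; "<b>"/"</b>"
      # were "Other" to it and are re-tokenized by the HTML root lexer.
      puts "#{token[:type]} #{token[:value].inspect}"
    end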