Initial implementation of delegatinglexer

This commit is contained in:
Roberto Alsina 2024-08-22 20:55:08 -03:00
parent 7f4296e9d7
commit 0f3b7fc3c5
2 changed files with 1226 additions and 1173 deletions

File diff suppressed because it is too large Load Diff

View File

@ -42,7 +42,7 @@ module Tartrazine
struct Tokenizer struct Tokenizer
include Iterator(Token) include Iterator(Token)
property lexer : Lexer property lexer : BaseLexer
property text : Bytes property text : Bytes
property pos : Int32 = 0 property pos : Int32 = 0
@dq = Deque(Token).new @dq = Deque(Token).new
@ -106,13 +106,16 @@ module Tartrazine
end end
end end
# Common abstract parent of the lexer structs in this module
# (`Lexer` and `DelegatingLexer` both inherit from it). It declares
# no members of its own; it exists so a lexer can be referred to by
# a single type regardless of which concrete kind it is.
abstract struct BaseLexer
end
# This implements a lexer for Pygments RegexLexers as expressed # This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization. # in Chroma's XML serialization.
# #
# For explanations on what actions and states do # For explanations on what actions and states do
# the Pygments documentation is a good place to start. # the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/ # https://pygments.org/docs/lexerdevelopment/
struct Lexer struct Lexer < BaseLexer
property config = { property config = {
name: "", name: "",
priority: 0.0, priority: 0.0,
@ -204,6 +207,56 @@ module Tartrazine
end end
end end
# A lexer that takes two lexers as arguments: a root lexer
# and a language lexer. Everything is scanned using the
# language lexer; afterwards all `Other` tokens are lexed
# using the root lexer.
#
# This is useful for things like template languages, where
# you have Jinja + HTML or Jinja + CSS and so on.
struct DelegatingLexer < BaseLexer
  # Lexer used to re-lex the text of every `Other` token.
  property root_lexer : Lexer

  # Lexer that scans the whole input first.
  property language_lexer : Lexer

  # Builds a delegating lexer from its two component lexers.
  #
  # The declared properties must be assigned here: storing the
  # arguments in unrelated instance variables leaves `root_lexer`
  # and `language_lexer` uninitialized.
  #
  # NOTE(review): `lexer` is taken to be the language lexer and
  # `delegate` the root lexer (the one `Other` tokens are delegated
  # to) -- confirm against the call sites.
  def initialize(lexer : Lexer, delegate : Lexer)
    @language_lexer = lexer
    @root_lexer = delegate
  end
end
# This Tokenizer works with a DelegatingLexer. It first tokenizes
# using the language lexer, and "Other" tokens are tokenized using
# the root lexer.
struct DelegatingTokenizer
  include Iterator(Token)

  # Tokens already produced by the root lexer but not yet returned.
  @dq = Deque(Token).new

  # Creates a tokenizer for *text* driven by the given delegating *lexer*.
  #
  # When *secondary* is false and the `ensure_nl` option is set, a
  # trailing newline is appended to non-empty text that lacks one.
  def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
    # Respect the `ensure_nl` config option.
    # NOTE(review): DelegatingLexer has no `config` of its own in the
    # visible code, so the language lexer's config is consulted here
    # -- confirm this is the intended source of the option.
    if !secondary && !text.empty? && !text.ends_with?('\n') && @lexer.language_lexer.config[:ensure_nl]
      text += "\n"
    end
    # The inner tokenizer runs as secondary: newline handling was done above.
    @language_tokenizer = Tokenizer.new(@lexer.language_lexer, text, true)
  end

  # Returns the next token, or `stop` once the language tokenizer is
  # exhausted and no buffered root tokens remain.
  def next : Iterator::Stop | Token
    loop do
      # Drain tokens produced from a previous "Other" token first.
      return @dq.shift unless @dq.empty?

      token = @language_tokenizer.next
      # `is_a?` (not `==` against the class) both detects the stop
      # marker and narrows `token` to `Token` for the code below.
      return stop if token.is_a?(Iterator::Stop)

      # Anything the language lexer recognised is returned as is.
      return token unless token[:type] == "Other"

      # Re-lex the text of the "Other" token with the root lexer and
      # buffer the result. If it yields no tokens, the loop simply
      # moves on to the next language token.
      root_tokenizer = Tokenizer.new(@lexer.root_lexer, token[:value], true)
      root_tokenizer.each do |root_token|
        @dq << root_token
      end
    end
  end
end
# A Lexer state. A state has a name and a list of rules. # A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references # The state machine has a state stack containing references
# to states to decide which rules to apply. # to states to decide which rules to apply.