mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-08 04:30:26 -03:00
Compare commits
3 Commits
7f4296e9d7
...
a2884c4c78
Author | SHA1 | Date | |
---|---|---|---|
a2884c4c78 | |||
bd3df10d2c | |||
0f3b7fc3c5 |
@ -72,8 +72,7 @@ end
|
|||||||
|
|
||||||
# Helper that creates lexer and tokenizes
|
# Helper that creates lexer and tokenizes
|
||||||
def tokenize(lexer_name, text)
|
def tokenize(lexer_name, text)
|
||||||
lexer = Tartrazine.lexer(lexer_name)
|
tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
|
||||||
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
|
|
||||||
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
|
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -115,15 +115,13 @@ module Tartrazine
|
|||||||
when ActionType::Using
|
when ActionType::Using
|
||||||
# Shunt to another lexer entirely
|
# Shunt to another lexer entirely
|
||||||
return [] of Token if match.empty?
|
return [] of Token if match.empty?
|
||||||
Tokenizer.new(
|
Tartrazine.lexer(@lexer_name).tokenizer(
|
||||||
Tartrazine.lexer(@lexer_name),
|
|
||||||
String.new(match[match_group].value),
|
String.new(match[match_group].value),
|
||||||
secondary: true).to_a
|
secondary: true).to_a
|
||||||
when ActionType::Usingself
|
when ActionType::Usingself
|
||||||
# Shunt to another copy of this lexer
|
# Shunt to another copy of this lexer
|
||||||
return [] of Token if match.empty?
|
return [] of Token if match.empty?
|
||||||
Tokenizer.new(
|
tokenizer.lexer.tokenizer(
|
||||||
tokenizer.lexer,
|
|
||||||
String.new(match[match_group].value),
|
String.new(match[match_group].value),
|
||||||
secondary: true).to_a
|
secondary: true).to_a
|
||||||
when ActionType::Combined
|
when ActionType::Combined
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -17,8 +17,8 @@ module Tartrazine
|
|||||||
outp.to_s
|
outp.to_s
|
||||||
end
|
end
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer, outp : IO) : Nil
|
def format(text : String, lexer : BaseLexer, outp : IO) : Nil
|
||||||
tokenizer = Tokenizer.new(lexer, text)
|
tokenizer = lexer.tokenizer(text)
|
||||||
i = 0
|
i = 0
|
||||||
outp << line_label(i) if line_numbers?
|
outp << line_label(i) if line_numbers?
|
||||||
tokenizer.each do |token|
|
tokenizer.each do |token|
|
||||||
|
@ -40,7 +40,7 @@ module Tartrazine
|
|||||||
outp.to_s
|
outp.to_s
|
||||||
end
|
end
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer, io : IO) : Nil
|
def format(text : String, lexer : BaseLexer, io : IO) : Nil
|
||||||
pre, post = wrap_standalone
|
pre, post = wrap_standalone
|
||||||
io << pre if standalone?
|
io << pre if standalone?
|
||||||
format_text(text, lexer, io)
|
format_text(text, lexer, io)
|
||||||
@ -64,8 +64,8 @@ module Tartrazine
|
|||||||
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
|
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
|
||||||
end
|
end
|
||||||
|
|
||||||
def format_text(text : String, lexer : Lexer, outp : IO)
|
def format_text(text : String, lexer : BaseLexer, outp : IO)
|
||||||
tokenizer = Tokenizer.new(lexer, text)
|
tokenizer = lexer.tokenizer(text)
|
||||||
i = 0
|
i = 0
|
||||||
if surrounding_pre?
|
if surrounding_pre?
|
||||||
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
|
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
|
||||||
|
@ -4,14 +4,14 @@ module Tartrazine
|
|||||||
class Json < Formatter
|
class Json < Formatter
|
||||||
property name = "json"
|
property name = "json"
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer) : String
|
def format(text : String, lexer : BaseLexer) : String
|
||||||
outp = String::Builder.new("")
|
outp = String::Builder.new("")
|
||||||
format(text, lexer, outp)
|
format(text, lexer, outp)
|
||||||
outp.to_s
|
outp.to_s
|
||||||
end
|
end
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer, io : IO) : Nil
|
def format(text : String, lexer : BaseLexer, io : IO) : Nil
|
||||||
tokenizer = Tokenizer.new(lexer, text)
|
tokenizer = lexer.tokenizer(text)
|
||||||
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
|
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
139
src/lexer.cr
139
src/lexer.cr
@ -9,29 +9,45 @@ module Tartrazine
|
|||||||
|
|
||||||
# Get the lexer object for a language name
|
# Get the lexer object for a language name
|
||||||
# FIXME: support mimetypes
|
# FIXME: support mimetypes
|
||||||
def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
|
def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
|
||||||
if name.nil? && filename.nil?
|
return lexer_by_name(name) if name && name != "autodetect"
|
||||||
lexer_file_name = LEXERS_BY_NAME["plaintext"]
|
return lexer_by_filename(filename) if filename
|
||||||
elsif name && name != "autodetect"
|
|
||||||
lexer_file_name = LEXERS_BY_NAME[name.downcase]
|
Lexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
|
||||||
else
|
end
|
||||||
# Guess by filename
|
|
||||||
candidates = Set(String).new
|
private def self.lexer_by_name(name : String) : BaseLexer
|
||||||
LEXERS_BY_FILENAME.each do |k, v|
|
lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
|
||||||
candidates += v.to_set if File.match?(k, File.basename(filename.to_s))
|
return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
|
||||||
end
|
raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
|
||||||
case candidates.size
|
|
||||||
when 0
|
|
||||||
lexer_file_name = LEXERS_BY_NAME["plaintext"]
|
|
||||||
when 1
|
|
||||||
lexer_file_name = candidates.first
|
|
||||||
else
|
|
||||||
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
|
|
||||||
end
|
|
||||||
end
|
|
||||||
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
|
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
private def self.lexer_by_filename(filename : String) : BaseLexer
|
||||||
|
candidates = Set(String).new
|
||||||
|
LEXERS_BY_FILENAME.each do |k, v|
|
||||||
|
candidates += v.to_set if File.match?(k, File.basename(filename))
|
||||||
|
end
|
||||||
|
|
||||||
|
case candidates.size
|
||||||
|
when 0
|
||||||
|
lexer_file_name = LEXERS_BY_NAME["plaintext"]
|
||||||
|
when 1
|
||||||
|
lexer_file_name = candidates.first
|
||||||
|
else
|
||||||
|
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
|
||||||
|
end
|
||||||
|
|
||||||
|
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
|
||||||
|
end
|
||||||
|
|
||||||
|
private def self.create_delegating_lexer(name : String) : BaseLexer
|
||||||
|
language, root = name.split("+", 2)
|
||||||
|
language_lexer = lexer(language)
|
||||||
|
root_lexer = lexer(root)
|
||||||
|
DelegatingLexer.new(language_lexer, root_lexer)
|
||||||
|
end
|
||||||
# Return a list of all lexers
|
# Return a list of all lexers
|
||||||
def self.lexers : Array(String)
|
def self.lexers : Array(String)
|
||||||
LEXERS_BY_NAME.keys.sort!
|
LEXERS_BY_NAME.keys.sort!
|
||||||
@ -40,15 +56,18 @@ module Tartrazine
|
|||||||
# A token, the output of the tokenizer
|
# A token, the output of the tokenizer
|
||||||
alias Token = NamedTuple(type: String, value: String)
|
alias Token = NamedTuple(type: String, value: String)
|
||||||
|
|
||||||
struct Tokenizer
|
abstract class BaseTokenizer
|
||||||
|
end
|
||||||
|
|
||||||
|
class Tokenizer < BaseTokenizer
|
||||||
include Iterator(Token)
|
include Iterator(Token)
|
||||||
property lexer : Lexer
|
property lexer : BaseLexer
|
||||||
property text : Bytes
|
property text : Bytes
|
||||||
property pos : Int32 = 0
|
property pos : Int32 = 0
|
||||||
@dq = Deque(Token).new
|
@dq = Deque(Token).new
|
||||||
property state_stack = ["root"]
|
property state_stack = ["root"]
|
||||||
|
|
||||||
def initialize(@lexer : Lexer, text : String, secondary = false)
|
def initialize(@lexer : BaseLexer, text : String, secondary = false)
|
||||||
# Respect the `ensure_nl` config option
|
# Respect the `ensure_nl` config option
|
||||||
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
|
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
|
||||||
text += "\n"
|
text += "\n"
|
||||||
@ -106,13 +125,7 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# This implements a lexer for Pygments RegexLexers as expressed
|
abstract class BaseLexer
|
||||||
# in Chroma's XML serialization.
|
|
||||||
#
|
|
||||||
# For explanations on what actions and states do
|
|
||||||
# the Pygments documentation is a good place to start.
|
|
||||||
# https://pygments.org/docs/lexerdevelopment/
|
|
||||||
struct Lexer
|
|
||||||
property config = {
|
property config = {
|
||||||
name: "",
|
name: "",
|
||||||
priority: 0.0,
|
priority: 0.0,
|
||||||
@ -123,6 +136,18 @@ module Tartrazine
|
|||||||
}
|
}
|
||||||
property states = {} of String => State
|
property states = {} of String => State
|
||||||
|
|
||||||
|
def tokenizer(text : String, secondary = false) : BaseTokenizer
|
||||||
|
Tokenizer.new(self, text, secondary)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# This implements a lexer for Pygments RegexLexers as expressed
|
||||||
|
# in Chroma's XML serialization.
|
||||||
|
#
|
||||||
|
# For explanations on what actions and states do
|
||||||
|
# the Pygments documentation is a good place to start.
|
||||||
|
# https://pygments.org/docs/lexerdevelopment/
|
||||||
|
class Lexer < BaseLexer
|
||||||
# Collapse consecutive tokens of the same type for easier comparison
|
# Collapse consecutive tokens of the same type for easier comparison
|
||||||
# and smaller output
|
# and smaller output
|
||||||
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
|
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
|
||||||
@ -204,6 +229,60 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# A lexer that takes two lexers as arguments. A root lexer
|
||||||
|
# and a language lexer. Everything is scanned using the
|
||||||
|
# language lexer, afterwards all `Other` tokens are lexed
|
||||||
|
# using the root lexer.
|
||||||
|
#
|
||||||
|
# This is useful for things like template languages, where
|
||||||
|
# you have Jinja + HTML or Jinja + CSS and so on.
|
||||||
|
class DelegatingLexer < BaseLexer
|
||||||
|
property language_lexer : BaseLexer
|
||||||
|
property root_lexer : BaseLexer
|
||||||
|
|
||||||
|
def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
|
||||||
|
end
|
||||||
|
|
||||||
|
def tokenizer(text : String, secondary = false) : DelegatingTokenizer
|
||||||
|
DelegatingTokenizer.new(self, text, secondary)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# This Tokenizer works with a DelegatingLexer. It first tokenizes
|
||||||
|
# using the language lexer, and "Other" tokens are tokenized using
|
||||||
|
# the root lexer.
|
||||||
|
class DelegatingTokenizer < BaseTokenizer
|
||||||
|
include Iterator(Token)
|
||||||
|
@dq = Deque(Token).new
|
||||||
|
@language_tokenizer : BaseTokenizer
|
||||||
|
|
||||||
|
def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
|
||||||
|
# Respect the `ensure_nl` config option
|
||||||
|
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
|
||||||
|
text += "\n"
|
||||||
|
end
|
||||||
|
@language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
|
||||||
|
end
|
||||||
|
|
||||||
|
def next : Iterator::Stop | Token
|
||||||
|
if @dq.size > 0
|
||||||
|
return @dq.shift
|
||||||
|
end
|
||||||
|
token = @language_tokenizer.next
|
||||||
|
if token.is_a? Iterator::Stop
|
||||||
|
return stop
|
||||||
|
elsif token.as(Token).[:type] == "Other"
|
||||||
|
root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
|
||||||
|
root_tokenizer.each do |root_token|
|
||||||
|
@dq << root_token
|
||||||
|
end
|
||||||
|
else
|
||||||
|
@dq << token.as(Token)
|
||||||
|
end
|
||||||
|
self.next
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
# A Lexer state. A state has a name and a list of rules.
|
# A Lexer state. A state has a name and a list of rules.
|
||||||
# The state machine has a state stack containing references
|
# The state machine has a state stack containing references
|
||||||
# to states to decide which rules to apply.
|
# to states to decide which rules to apply.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user