7 Commits

10 changed files with 1322 additions and 1230 deletions


@@ -9,4 +9,5 @@
* ✅ Implement lexer loader by file extension
* ✅ Add --line-numbers to terminal formatter
* Implement lexer loader by mime type
* Implement Delegating lexers
* ✅ Implement Delegating lexers
* Add RstLexer maybe others?


@@ -18,19 +18,21 @@
<rule pattern="^(\s*)([*-])(\s)(.+\n)"><bygroups><token type="TextWhitespace"/><token type="Keyword"/><token type="TextWhitespace"/><usingself state="inline"/></bygroups></rule>
<rule pattern="^(\s*)([0-9]+\.)( .+\n)"><bygroups><token type="TextWhitespace"/><token type="Keyword"/><usingself state="inline"/></bygroups></rule>
<rule pattern="^(\s*&gt;\s)(.+\n)"><bygroups><token type="Keyword"/><token type="GenericEmph"/></bygroups></rule>
<rule pattern="^(\s*```\n[\w\W]*?^\s*```$\n)"><token type="LiteralStringBacktick"/></rule>
<rule pattern="(?x)
^(?P&lt;initial&gt;\s*```)
(?P&lt;lang&gt;[\w\-]+)
(?P&lt;afterlang&gt;
(?P&lt;whitespace&gt;[^\S\n]+)
(?P&lt;extra&gt;.*))?
(?P&lt;newline&gt;\n)
(?P&lt;code&gt;(.|\n)*?)
(?P&lt;terminator&gt;^\s*```$\n)
">
<!-- Implement actions for delegating via a capture group -->
<token type="Text"/>
<rule pattern="^(```\n)([\w\W]*?)(^```$)">
<bygroups>
<token type="LiteralStringBacktick"/>
<token type="Text"/>
<token type="LiteralStringBacktick"/>
</bygroups>
</rule>
<rule pattern="^(```)(\w+)(\n)([\w\W]*?)(^```$)">
<bygroups>
<token type="LiteralStringBacktick"/>
<token type="NameLabel"/>
<token type="TextWhitespace"/>
<UsingByGroup lexer="2" content="4"/>
<token type="LiteralStringBacktick"/>
</bygroups>
</rule>
<rule><include state="inline"/></rule>
</state>
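
The practical effect of the new rules: a bare fence is emitted whole as LiteralStringBacktick, while a fence tagged with a language name hands capture group 4 (the code body) to the lexer named by capture group 2. A minimal sketch of exercising this through the public API (assuming the markdown lexer is registered under the name "markdown"):

require "tartrazine"

text = <<-MD
```crystal
puts "hello"
```
MD

# The fences come back as LiteralStringBacktick, "crystal" as NameLabel,
# and the body is tokenized by the crystal lexer via UsingByGroup.
tokens = Tartrazine.lexer("markdown").tokenizer(text).to_a
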


@@ -1,5 +1,5 @@
name: tartrazine
version: 0.5.1
version: 0.6.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>


@@ -72,8 +72,7 @@ end
# Helper that creates lexer and tokenizes
def tokenize(lexer_name, text)
lexer = Tartrazine.lexer(lexer_name)
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
end
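
With the tokenizer now built by the lexer itself, the same helper works unchanged for delegating lexers. Hypothetical spec usage (lexer name and input chosen for illustration):

# Exact token types depend on the crystal lexer's rules.
tokens = tokenize("crystal", "puts 1")
tokens.each { |t| puts "#{t[:type]}: #{t[:value].inspect}" }
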


@@ -16,13 +16,16 @@ module Tartrazine
Push
Token
Using
Usingbygroup
Usingself
end
struct Action
property actions : Array(Action) = [] of Action
@content_index : Int32 = 0
@depth : Int32 = 0
@lexer_index : Int32 = 0
@lexer_name : String = ""
@states : Array(String) = [] of String
@states_to_push : Array(String) = [] of String
@@ -62,6 +65,9 @@ module Tartrazine
@states = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
when ActionType::Usingbygroup
@lexer_index = xml["lexer"].to_i
@content_index = xml["content"].to_i
end
end
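
For the markdown rule shown earlier, <UsingByGroup lexer="2" content="4"/> parses into an action whose @lexer_index is 2 and whose @content_index is 4, i.e. the capture groups holding the language name and the code body. A standalone sketch of that attribute extraction (illustrative, outside the real parser):

require "xml"

node = XML.parse(%(<UsingByGroup lexer="2" content="4"/>)).first_element_child.not_nil!
lexer_index   = node["lexer"].to_i    # => 2, group with the lexer name
content_index = node["content"].to_i  # => 4, group with the code to delegate
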
@@ -115,15 +121,13 @@ module Tartrazine
when ActionType::Using
# Shunt to another lexer entirely
return [] of Token if match.empty?
Tokenizer.new(
Tartrazine.lexer(@lexer_name),
Tartrazine.lexer(@lexer_name).tokenizer(
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Usingself
# Shunt to another copy of this lexer
return [] of Token if match.empty?
Tokenizer.new(
tokenizer.lexer,
tokenizer.lexer.tokenizer(
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Combined
@@ -136,6 +140,12 @@ module Tartrazine
tokenizer.lexer.states[new_state.name] = new_state
tokenizer.state_stack << new_state.name
[] of Token
when ActionType::Usingbygroup
# Shunt to content-specified lexer
return [] of Token if match.empty?
Tartrazine.lexer(String.new(match[@lexer_index].value)).tokenizer(
String.new(match[@content_index].value),
secondary: true).to_a
else
raise Exception.new("Unknown action type: #{@type}")
end
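
At match time this means the rule itself names the target lexer: for a fence like ```crystal followed by puts "hi", group 2 captures "crystal" and group 4 the body, so the action effectively performs the equivalent of this call (sketch):

Tartrazine.lexer("crystal").tokenizer(%(puts "hi"\n), secondary: true).to_a
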

File diff suppressed because it is too large.


@@ -17,8 +17,8 @@ module Tartrazine
outp.to_s
end
def format(text : String, lexer : Lexer, outp : IO) : Nil
tokenizer = Tokenizer.new(lexer, text)
def format(text : String, lexer : BaseLexer, outp : IO) : Nil
tokenizer = lexer.tokenizer(text)
i = 0
outp << line_label(i) if line_numbers?
tokenizer.each do |token|


@@ -40,7 +40,7 @@ module Tartrazine
outp.to_s
end
def format(text : String, lexer : Lexer, io : IO) : Nil
def format(text : String, lexer : BaseLexer, io : IO) : Nil
pre, post = wrap_standalone
io << pre if standalone?
format_text(text, lexer, io)
@@ -64,8 +64,8 @@ module Tartrazine
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
end
def format_text(text : String, lexer : Lexer, outp : IO)
tokenizer = Tokenizer.new(lexer, text)
def format_text(text : String, lexer : BaseLexer, outp : IO)
tokenizer = lexer.tokenizer(text)
i = 0
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""


@@ -4,14 +4,14 @@ module Tartrazine
class Json < Formatter
property name = "json"
def format(text : String, lexer : Lexer) : String
def format(text : String, lexer : BaseLexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : Lexer, io : IO) : Nil
tokenizer = Tokenizer.new(lexer, text)
def format(text : String, lexer : BaseLexer, io : IO) : Nil
tokenizer = lexer.tokenizer(text)
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
end
end
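
Because the formatters now rely only on the BaseLexer interface and its tokenizer factory, any lexer, delegating or not, can be passed straight in. A hedged sketch with the JSON formatter (input and argument-less construction assumed for illustration):

require "tartrazine"

lexer = Tartrazine.lexer("crystal")
puts Tartrazine::Json.new.format(%(puts "hello"\n), lexer)
# => JSON array of collapsed {type, value} tokens
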


@@ -9,29 +9,46 @@ module Tartrazine
# Get the lexer object for a language name
# FIXME: support mimetypes
def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
if name.nil? && filename.nil?
lexer_file_name = LEXERS_BY_NAME["plaintext"]
elsif name && name != "autodetect"
lexer_file_name = LEXERS_BY_NAME[name.downcase]
else
# Guess by filename
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename.to_s))
end
case candidates.size
when 0
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
end
end
def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
return lexer_by_name(name) if name && name != "autodetect"
return lexer_by_filename(filename) if filename
Lexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
end
private def self.lexer_by_name(name : String) : BaseLexer
lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.lexer_by_filename(filename : String) : BaseLexer
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename))
end
case candidates.size
when 0
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2)
language_lexer = lexer(language)
root_lexer = lexer(root)
DelegatingLexer.new(language_lexer, root_lexer)
end
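
The lookup now falls through in a fixed order: explicit name, combined "language+root" name, filename glob, and finally plaintext. Roughly (names and globs illustrative):

Tartrazine.lexer("crystal")               # by registered name
Tartrazine.lexer("jinja+html")            # unknown name containing "+" -> DelegatingLexer
Tartrazine.lexer(filename: "shard.yml")   # matched via LEXERS_BY_FILENAME globs
Tartrazine.lexer                          # no hints -> plaintext lexer
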
# Return a list of all lexers
def self.lexers : Array(String)
LEXERS_BY_NAME.keys.sort!
@@ -40,15 +57,18 @@ module Tartrazine
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
struct Tokenizer
abstract class BaseTokenizer
end
class Tokenizer < BaseTokenizer
include Iterator(Token)
property lexer : Lexer
property lexer : BaseLexer
property text : Bytes
property pos : Int32 = 0
@dq = Deque(Token).new
property state_stack = ["root"]
def initialize(@lexer : Lexer, text : String, secondary = false)
def initialize(@lexer : BaseLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
@@ -106,13 +126,7 @@ module Tartrazine
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
struct Lexer
abstract class BaseLexer
property config = {
name: "",
priority: 0.0,
@@ -123,6 +137,18 @@ module Tartrazine
}
property states = {} of String => State
def tokenizer(text : String, secondary = false) : BaseTokenizer
Tokenizer.new(self, text, secondary)
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
class Lexer < BaseLexer
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
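
collapse_tokens merges runs of consecutive tokens of one type into a single token, which keeps spec fixtures and JSON output compact. A sketch of the intent (values invented):

tokens = [
  {type: "Text", value: "foo"},
  {type: "Text", value: " "},
  {type: "Keyword", value: "def"},
]
Tartrazine::Lexer.collapse_tokens(tokens)
# => [{type: "Text", value: "foo "}, {type: "Keyword", value: "def"}]
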
@@ -204,6 +230,60 @@ module Tartrazine
end
end
# A lexer that takes two lexers as arguments. A root lexer
# and a language lexer. Everything is scanned using the
# language lexer, afterwards all `Other` tokens are lexed
# using the root lexer.
#
# This is useful for things like template languages, where
# you have Jinja + HTML or Jinja + CSS and so on.
class DelegatingLexer < BaseLexer
property language_lexer : BaseLexer
property root_lexer : BaseLexer
def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
end
def tokenizer(text : String, secondary = false) : DelegatingTokenizer
DelegatingTokenizer.new(self, text, secondary)
end
end
# This Tokenizer works with a DelegatingLexer. It first tokenizes
# using the language lexer, and "Other" tokens are tokenized using
# the root lexer.
class DelegatingTokenizer < BaseTokenizer
include Iterator(Token)
@dq = Deque(Token).new
@language_tokenizer : BaseTokenizer
def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
token = @language_tokenizer.next
if token.is_a? Iterator::Stop
return stop
elsif token.as(Token).[:type] == "Other"
root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
root_tokenizer.each do |root_token|
@dq << root_token
end
else
@dq << token.as(Token)
end
self.next
end
end
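
In short, the language lexer runs first and anything it marks as Other is re-tokenized by the root lexer, while every other token passes through untouched. A rough usage sketch (lexer names illustrative, assuming both are registered):

lexer = Tartrazine.lexer("jinja+html")  # a DelegatingLexer
lexer.tokenizer("<b>{{ user }}</b>").each do |token|
  # {{ user }} is tokenized by the jinja lexer; <b>...</b> is emitted as
  # Other and re-lexed by the html root lexer.
  puts "#{token[:type]}: #{token[:value].inspect}"
end
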
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.