13 Commits

Author SHA1 Message Date
10842f7074 v0.5.0 2024-08-16 19:38:40 -03:00
ae03e4612e todo management 2024-08-16 14:05:34 -03:00
471b2f5050 updated 2024-08-16 14:03:05 -03:00
5a3b08e716 lint 2024-08-16 14:01:16 -03:00
9ebb9f2765 Fix off-by-1 2024-08-16 13:36:11 -03:00
7538fc76aa Tokenize via an iterator, makes everything much faster 2024-08-16 13:27:02 -03:00
788577b226 Fix comment 2024-08-15 23:56:52 -03:00
1f01146b1f Minor cleanup 2024-08-15 23:21:21 -03:00
9041b763ea Remove unused bits of lexer config 2024-08-15 23:17:49 -03:00
ada30915c3 Idiomatic changes 2024-08-15 23:16:29 -03:00
78eff45ea0 Idiomatic changes 2024-08-15 23:11:49 -03:00
e817aedd60 Idiomatic changes 2024-08-15 22:41:24 -03:00
20d6b65346 More idiomatic 2024-08-15 22:01:50 -03:00
11 changed files with 194 additions and 165 deletions

View File

@@ -47,7 +47,14 @@ To build from source:
2. Run `make` to build the `tartrazine` binary
3. Copy the binary somewhere in your PATH.
## Usage
## Usage as a CLI tool
```shell
$ tartrazine whatever.c -l c -t catppuccin-macchiato --line-numbers \
--standalone -o whatever.html
```
## Usage as a Library
This works:

View File

@@ -8,4 +8,5 @@
* ✅ Implement lexer loader that respects aliases
* ✅ Implement lexer loader by file extension
* ✅ Add --line-numbers to terminal formatter
* Implement lexer loader by mime type
* Implement lexer loader by mime type
* Implement Delegating lexers

View File

@@ -1,5 +1,5 @@
name: tartrazine
version: 0.4.0
version: 0.5.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>

View File

@@ -73,7 +73,8 @@ end
# Helper that creates lexer and tokenizes
def tokenize(lexer_name, text)
lexer = Tartrazine.lexer(lexer_name)
lexer.tokenize(text)
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
end
# Helper that tokenizes using chroma to validate the lexer

View File

@@ -8,19 +8,29 @@ require "./tartrazine"
# perform a list of actions. These actions can emit tokens
# or change the state machine.
module Tartrazine
enum ActionType
Bygroups
Combined
Include
Pop
Push
Token
Using
Usingself
end
struct Action
property actions : Array(Action) = [] of Action
property type : String
@depth : Int32 = 0
@lexer_name : String = ""
@states : Array(String) = [] of String
@states_to_push : Array(String) = [] of String
@token_type : String = ""
@type : ActionType = ActionType::Token
def initialize(@type : String, xml : XML::Node?)
known_types = %w(token push pop combined bygroups include using usingself)
raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
def initialize(t : String, xml : XML::Node?)
@type = ActionType.parse(t.capitalize)
# Some actions may have actions in them, like this:
# <bygroups>
@@ -37,18 +47,18 @@ module Tartrazine
end
# Prefetch the attributes we ned from the XML and keep them
case type
when "token"
case @type
when ActionType::Token
@token_type = xml["type"]
when "push"
when ActionType::Push
@states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
when "pop"
when ActionType::Pop
@depth = xml["depth"].to_i
when "using"
when ActionType::Using
@lexer_name = xml["lexer"].downcase
when "combined"
when ActionType::Combined
@states = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
@@ -56,28 +66,28 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
case @type
when ActionType::Token
raise Exception.new "Can't have a token without a match" if match.empty?
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
when "push"
to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
when ActionType::Push
to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
to_push.each do |state|
if state == "#pop" && lexer.state_stack.size > 1
if state == "#pop" && tokenizer.state_stack.size > 1
# Pop the state
lexer.state_stack.pop
tokenizer.state_stack.pop
else
# Really push
lexer.state_stack << state
tokenizer.state_stack << state
end
end
[] of Token
when "pop"
to_pop = [@depth, lexer.state_stack.size - 1].min
lexer.state_stack.pop(to_pop)
when ActionType::Pop
to_pop = [@depth, tokenizer.state_stack.size - 1].min
tokenizer.state_stack.pop(to_pop)
[] of Token
when "bygroups"
when ActionType::Bygroups
# FIXME: handle
# ><bygroups>
# <token type="Punctuation"/>
@@ -99,30 +109,35 @@ module Tartrazine
# No match for this group
next
end
result += e.emit(match, lexer, i + 1)
result += e.emit(match, tokenizer, i + 1)
end
result
when "using"
when ActionType::Using
# Shunt to another lexer entirely
return [] of Token if match.empty?
Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
when "usingself"
Tokenizer.new(
Tartrazine.lexer(@lexer_name),
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Usingself
# Shunt to another copy of this lexer
return [] of Token if match.empty?
new_lexer = lexer.copy
new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
when "combined"
# Combine two states into one anonymous state
Tokenizer.new(
tokenizer.lexer,
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Combined
# Combine two or more states into one anonymous state
new_state = @states.map { |name|
lexer.states[name]
tokenizer.lexer.states[name]
}.reduce { |state1, state2|
state1 + state2
}
lexer.states[new_state.name] = new_state
lexer.state_stack << new_state.name
tokenizer.lexer.states[new_state.name] = new_state
tokenizer.state_stack << new_state.name
[] of Token
else
raise Exception.new("Unknown action type: #{type}")
raise Exception.new("Unknown action type: #{@type}")
end
end
end

View File

@@ -31,7 +31,6 @@ module BytesRegex
end
def match(str : Bytes, pos = 0) : Array(Match)
match = [] of Match
rc = LibPCRE2.match(
@re,
str,
@@ -42,22 +41,23 @@ module BytesRegex
nil)
if rc > 0
ovector = LibPCRE2.get_ovector_pointer(@match_data)
(0...rc).each do |i|
(0...rc).map do |i|
m_start = ovector[2 * i]
m_size = ovector[2 * i + 1] - m_start
if m_size == 0
m_end = ovector[2 * i + 1]
if m_start == m_end
m_value = Bytes.new(0)
else
m_value = str[m_start...m_start + m_size]
m_value = str[m_start...m_end]
end
match << Match.new(m_value, m_start, m_size)
Match.new(m_value, m_start, m_end - m_start)
end
else
[] of Match
end
match
end
end
class Match
struct Match
property value : Bytes
property start : UInt64
property size : UInt64

View File

@@ -7,19 +7,39 @@ module Tartrazine
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
end
private def line_label(i : Int32) : String
"#{i + 1}".rjust(4).ljust(5)
end
def format(text : String, lexer : Lexer) : String
tokenizer = Tokenizer.new(lexer, text)
i = 0
output = String.build do |outp|
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
outp << label
line.each do |token|
outp << colorize(token[:value], token[:type])
outp << line_label(i) if line_numbers?
tokenizer.each do |token|
outp << colorize(token[:value], token[:type])
if token[:value].includes?("\n")
i += 1
outp << line_label(i) if line_numbers?
end
end
end
output
end
# def format(text : String, lexer : Lexer) : String
# output = String.build do |outp|
# lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
# label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
# outp << label
# line.each do |token|
# outp << colorize(token[:value], token[:type])
# end
# end
# end
# output
# end
def colorize(text : String, token : String) : String
style = theme.styles.fetch(token, nil)
return text if style.nil?

View File

@@ -54,21 +54,29 @@ module Tartrazine
output
end
private def line_label(i : Int32) : String
line_label = "#{i + 1}".rjust(4).ljust(5)
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
end
def format_text(text : String, lexer : Lexer) : String
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
# lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
tokenizer = Tokenizer.new(lexer, text)
i = 0
output = String.build do |outp|
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
end
outp << "<code class=\"#{get_css_class("Background")}\">"
lines.each_with_index(offset: line_number_start - 1) do |line, i|
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
line.each do |token|
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
outp << line_label(i) if line_numbers?
tokenizer.each do |token|
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
if token[:value].ends_with? "\n"
i += 1
outp << line_label(i) if line_numbers?
end
end
outp << "</code></pre>"

View File

@@ -4,7 +4,6 @@ require "./constants/lexers"
module Tartrazine
class LexerFiles
extend BakedFileSystem
bake_folder "../lexers", __DIR__
end
@@ -38,80 +37,91 @@ module Tartrazine
LEXERS_BY_NAME.keys.sort!
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
struct Tokenizer
include Iterator(Token)
property lexer : Lexer
property text : Bytes
property pos : Int32 = 0
@dq = Deque(Token).new
property state_stack = ["root"]
def initialize(@lexer : Lexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@text = text.to_slice
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
if pos == @text.size
return stop
end
matched = false
while @pos < @text.size
@lexer.states[@state_stack.last].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(@text, @pos, self)
if matched
@pos = new_pos
split_tokens(new_tokens).each { |token| @dq << token }
break
end
end
if !matched
if @text[@pos] == 10u8
@dq << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
@dq << {type: "Error", value: String.new(@text[@pos..@pos])}
end
@pos += 1
break
end
end
self.next
end
# If a token contains a newline, split it into two tokens
def split_tokens(tokens : Array(Token)) : Array(Token)
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
split_tokens
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
class Lexer
struct Lexer
property config = {
name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
# property xml : String = ""
property states = {} of String => State
property state_stack = ["root"]
def copy : Lexer
new_lexer = Lexer.new
new_lexer.config = config
new_lexer.states = states
new_lexer.state_stack = state_stack[0..-1]
new_lexer
end
# Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text.
def tokenize(text : String, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
matched = false
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
text_bytes = text.to_slice
# Loop through the text, applying rules
while pos < text_bytes.size
state = states[@state_stack.last]
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
if matched
# Move position forward, save the tokens,
# tokenize from the new position
pos = new_pos
tokens += new_tokens
break
end
end
# If no rule matches, emit an error token
unless matched
if text_bytes[pos] == 10u8
# at EOL, reset state to "root"
tokens << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
end
pos += 1
end
end
Lexer.collapse_tokens(tokens)
end
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
@@ -135,31 +145,6 @@ module Tartrazine
result
end
# Group tokens into lines, splitting them when a newline is found
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
lines = [Array(Token).new]
split_tokens.each do |token|
lines.last << token
if token[:value].includes?("\n")
lines << Array(Token).new
end
end
lines
end
# ameba:disable Metrics/CyclomaticComplexity
def self.from_xml(xml : String) : Lexer
l = Lexer.new
lexer = XML.parse(xml).first_element_child
@@ -170,9 +155,6 @@ module Tartrazine
if config
l.config = {
name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
@@ -236,7 +218,4 @@ module Tartrazine
new_state
end
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
end

View File

@@ -16,10 +16,10 @@ module Tartrazine
alias MatchData = Array(Match)
abstract struct BaseRule
abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
abstract def initialize(node : XML::Node)
property actions : Array(Action) = [] of Action
@actions : Array(Action) = [] of Action
def add_actions(node : XML::Node)
node.children.each do |child|
@@ -31,14 +31,13 @@ module Tartrazine
struct Rule < BaseRule
property pattern : Regex = Regex.new ""
property actions : Array(Action) = [] of Action
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
# No match
return false, pos, [] of Token if match.size == 0
return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
end
def initialize(node : XML::Node)
@@ -55,12 +54,11 @@ module Tartrazine
# This rule includes another state. If any of the rules of the
# included state matches, this rule matches.
struct IncludeStateRule < BaseRule
property state : String = ""
@state : String = ""
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
lexer.states[state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
tokenizer.@lexer.states[@state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
return true, new_pos, new_tokens if matched
end
return false, pos, [] of Token
@@ -79,8 +77,8 @@ module Tartrazine
struct UnconditionalRule < BaseRule
NO_MATCH = [] of Match
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
end
def initialize(node : XML::Node)

View File

@@ -9,7 +9,7 @@ require "xml"
module Tartrazine
alias Color = Sixteen::Color
class ThemeFiles
struct ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
end
@@ -39,7 +39,7 @@ module Tartrazine
themes.to_a.sort!
end
class Style
struct Style
# These properties are tri-state.
# true means it's set
# false means it's not set
@@ -79,7 +79,7 @@ module Tartrazine
end
end
class Theme
struct Theme
property name : String = ""
property styles = {} of String => Style