mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-09-17 10:48:12 +00:00
Compare commits
20 Commits
cb09dff9f1
...
v0.5.1
Author | SHA1 | Date | |
---|---|---|---|
91b973f464 | |||
3ebedec6c1 | |||
57e63f2308 | |||
4a598a575b | |||
9042138053 | |||
fa647e898a | |||
ad92929a10 | |||
bb952a44b8 | |||
ae03e4612e | |||
471b2f5050 | |||
5a3b08e716 | |||
9ebb9f2765 | |||
7538fc76aa | |||
788577b226 | |||
1f01146b1f | |||
9041b763ea | |||
ada30915c3 | |||
78eff45ea0 | |||
e817aedd60 | |||
20d6b65346 |
@@ -47,7 +47,14 @@ To build from source:
|
|||||||
2. Run `make` to build the `tartrazine` binary
|
2. Run `make` to build the `tartrazine` binary
|
||||||
3. Copy the binary somewhere in your PATH.
|
3. Copy the binary somewhere in your PATH.
|
||||||
|
|
||||||
## Usage
|
## Usage as a CLI tool
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ tartrazine whatever.c -l c -t catppuccin-macchiato --line-numbers \
|
||||||
|
--standalone -o whatever.html
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage as a Library
|
||||||
|
|
||||||
This works:
|
This works:
|
||||||
|
|
||||||
|
1
TODO.md
1
TODO.md
@@ -9,3 +9,4 @@
|
|||||||
* ✅ Implement lexer loader by file extension
|
* ✅ Implement lexer loader by file extension
|
||||||
* ✅ Add --line-numbers to terminal formatter
|
* ✅ Add --line-numbers to terminal formatter
|
||||||
* Implement lexer loader by mime type
|
* Implement lexer loader by mime type
|
||||||
|
* Implement Delegating lexers
|
@@ -1,5 +1,5 @@
|
|||||||
name: tartrazine
|
name: tartrazine
|
||||||
version: 0.4.0
|
version: 0.5.1
|
||||||
|
|
||||||
authors:
|
authors:
|
||||||
- Roberto Alsina <roberto.alsina@gmail.com>
|
- Roberto Alsina <roberto.alsina@gmail.com>
|
||||||
|
@@ -73,7 +73,8 @@ end
|
|||||||
# Helper that creates lexer and tokenizes
|
# Helper that creates lexer and tokenizes
|
||||||
def tokenize(lexer_name, text)
|
def tokenize(lexer_name, text)
|
||||||
lexer = Tartrazine.lexer(lexer_name)
|
lexer = Tartrazine.lexer(lexer_name)
|
||||||
lexer.tokenize(text)
|
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
|
||||||
|
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Helper that tokenizes using chroma to validate the lexer
|
# Helper that tokenizes using chroma to validate the lexer
|
||||||
|
@@ -8,19 +8,29 @@ require "./tartrazine"
|
|||||||
# perform a list of actions. These actions can emit tokens
|
# perform a list of actions. These actions can emit tokens
|
||||||
# or change the state machine.
|
# or change the state machine.
|
||||||
module Tartrazine
|
module Tartrazine
|
||||||
|
enum ActionType
|
||||||
|
Bygroups
|
||||||
|
Combined
|
||||||
|
Include
|
||||||
|
Pop
|
||||||
|
Push
|
||||||
|
Token
|
||||||
|
Using
|
||||||
|
Usingself
|
||||||
|
end
|
||||||
|
|
||||||
struct Action
|
struct Action
|
||||||
property actions : Array(Action) = [] of Action
|
property actions : Array(Action) = [] of Action
|
||||||
property type : String
|
|
||||||
|
|
||||||
@depth : Int32 = 0
|
@depth : Int32 = 0
|
||||||
@lexer_name : String = ""
|
@lexer_name : String = ""
|
||||||
@states : Array(String) = [] of String
|
@states : Array(String) = [] of String
|
||||||
@states_to_push : Array(String) = [] of String
|
@states_to_push : Array(String) = [] of String
|
||||||
@token_type : String = ""
|
@token_type : String = ""
|
||||||
|
@type : ActionType = ActionType::Token
|
||||||
|
|
||||||
def initialize(@type : String, xml : XML::Node?)
|
def initialize(t : String, xml : XML::Node?)
|
||||||
known_types = %w(token push pop combined bygroups include using usingself)
|
@type = ActionType.parse(t.capitalize)
|
||||||
raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
|
|
||||||
|
|
||||||
# Some actions may have actions in them, like this:
|
# Some actions may have actions in them, like this:
|
||||||
# <bygroups>
|
# <bygroups>
|
||||||
@@ -37,18 +47,18 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Prefetch the attributes we ned from the XML and keep them
|
# Prefetch the attributes we ned from the XML and keep them
|
||||||
case type
|
case @type
|
||||||
when "token"
|
when ActionType::Token
|
||||||
@token_type = xml["type"]
|
@token_type = xml["type"]
|
||||||
when "push"
|
when ActionType::Push
|
||||||
@states_to_push = xml.attributes.select { |attrib|
|
@states_to_push = xml.attributes.select { |attrib|
|
||||||
attrib.name == "state"
|
attrib.name == "state"
|
||||||
}.map &.content
|
}.map &.content
|
||||||
when "pop"
|
when ActionType::Pop
|
||||||
@depth = xml["depth"].to_i
|
@depth = xml["depth"].to_i
|
||||||
when "using"
|
when ActionType::Using
|
||||||
@lexer_name = xml["lexer"].downcase
|
@lexer_name = xml["lexer"].downcase
|
||||||
when "combined"
|
when ActionType::Combined
|
||||||
@states = xml.attributes.select { |attrib|
|
@states = xml.attributes.select { |attrib|
|
||||||
attrib.name == "state"
|
attrib.name == "state"
|
||||||
}.map &.content
|
}.map &.content
|
||||||
@@ -56,28 +66,28 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
# ameba:disable Metrics/CyclomaticComplexity
|
# ameba:disable Metrics/CyclomaticComplexity
|
||||||
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
|
def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
|
||||||
case type
|
case @type
|
||||||
when "token"
|
when ActionType::Token
|
||||||
raise Exception.new "Can't have a token without a match" if match.empty?
|
raise Exception.new "Can't have a token without a match" if match.empty?
|
||||||
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
|
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
|
||||||
when "push"
|
when ActionType::Push
|
||||||
to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
|
to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
|
||||||
to_push.each do |state|
|
to_push.each do |state|
|
||||||
if state == "#pop" && lexer.state_stack.size > 1
|
if state == "#pop" && tokenizer.state_stack.size > 1
|
||||||
# Pop the state
|
# Pop the state
|
||||||
lexer.state_stack.pop
|
tokenizer.state_stack.pop
|
||||||
else
|
else
|
||||||
# Really push
|
# Really push
|
||||||
lexer.state_stack << state
|
tokenizer.state_stack << state
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
[] of Token
|
[] of Token
|
||||||
when "pop"
|
when ActionType::Pop
|
||||||
to_pop = [@depth, lexer.state_stack.size - 1].min
|
to_pop = [@depth, tokenizer.state_stack.size - 1].min
|
||||||
lexer.state_stack.pop(to_pop)
|
tokenizer.state_stack.pop(to_pop)
|
||||||
[] of Token
|
[] of Token
|
||||||
when "bygroups"
|
when ActionType::Bygroups
|
||||||
# FIXME: handle
|
# FIXME: handle
|
||||||
# ><bygroups>
|
# ><bygroups>
|
||||||
# <token type="Punctuation"/>
|
# <token type="Punctuation"/>
|
||||||
@@ -99,30 +109,35 @@ module Tartrazine
|
|||||||
# No match for this group
|
# No match for this group
|
||||||
next
|
next
|
||||||
end
|
end
|
||||||
result += e.emit(match, lexer, i + 1)
|
result += e.emit(match, tokenizer, i + 1)
|
||||||
end
|
end
|
||||||
result
|
result
|
||||||
when "using"
|
when ActionType::Using
|
||||||
# Shunt to another lexer entirely
|
# Shunt to another lexer entirely
|
||||||
return [] of Token if match.empty?
|
return [] of Token if match.empty?
|
||||||
Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
|
Tokenizer.new(
|
||||||
when "usingself"
|
Tartrazine.lexer(@lexer_name),
|
||||||
|
String.new(match[match_group].value),
|
||||||
|
secondary: true).to_a
|
||||||
|
when ActionType::Usingself
|
||||||
# Shunt to another copy of this lexer
|
# Shunt to another copy of this lexer
|
||||||
return [] of Token if match.empty?
|
return [] of Token if match.empty?
|
||||||
new_lexer = lexer.copy
|
Tokenizer.new(
|
||||||
new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
|
tokenizer.lexer,
|
||||||
when "combined"
|
String.new(match[match_group].value),
|
||||||
# Combine two states into one anonymous state
|
secondary: true).to_a
|
||||||
|
when ActionType::Combined
|
||||||
|
# Combine two or more states into one anonymous state
|
||||||
new_state = @states.map { |name|
|
new_state = @states.map { |name|
|
||||||
lexer.states[name]
|
tokenizer.lexer.states[name]
|
||||||
}.reduce { |state1, state2|
|
}.reduce { |state1, state2|
|
||||||
state1 + state2
|
state1 + state2
|
||||||
}
|
}
|
||||||
lexer.states[new_state.name] = new_state
|
tokenizer.lexer.states[new_state.name] = new_state
|
||||||
lexer.state_stack << new_state.name
|
tokenizer.state_stack << new_state.name
|
||||||
[] of Token
|
[] of Token
|
||||||
else
|
else
|
||||||
raise Exception.new("Unknown action type: #{type}")
|
raise Exception.new("Unknown action type: #{@type}")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@@ -31,7 +31,6 @@ module BytesRegex
|
|||||||
end
|
end
|
||||||
|
|
||||||
def match(str : Bytes, pos = 0) : Array(Match)
|
def match(str : Bytes, pos = 0) : Array(Match)
|
||||||
match = [] of Match
|
|
||||||
rc = LibPCRE2.match(
|
rc = LibPCRE2.match(
|
||||||
@re,
|
@re,
|
||||||
str,
|
str,
|
||||||
@@ -42,22 +41,23 @@ module BytesRegex
|
|||||||
nil)
|
nil)
|
||||||
if rc > 0
|
if rc > 0
|
||||||
ovector = LibPCRE2.get_ovector_pointer(@match_data)
|
ovector = LibPCRE2.get_ovector_pointer(@match_data)
|
||||||
(0...rc).each do |i|
|
(0...rc).map do |i|
|
||||||
m_start = ovector[2 * i]
|
m_start = ovector[2 * i]
|
||||||
m_size = ovector[2 * i + 1] - m_start
|
m_end = ovector[2 * i + 1]
|
||||||
if m_size == 0
|
if m_start == m_end
|
||||||
m_value = Bytes.new(0)
|
m_value = Bytes.new(0)
|
||||||
else
|
else
|
||||||
m_value = str[m_start...m_start + m_size]
|
m_value = str[m_start...m_end]
|
||||||
end
|
end
|
||||||
match << Match.new(m_value, m_start, m_size)
|
Match.new(m_value, m_start, m_end - m_start)
|
||||||
end
|
end
|
||||||
|
else
|
||||||
|
[] of Match
|
||||||
end
|
end
|
||||||
match
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Match
|
struct Match
|
||||||
property value : Bytes
|
property value : Bytes
|
||||||
property start : UInt64
|
property start : UInt64
|
||||||
property size : UInt64
|
property size : UInt64
|
||||||
|
@@ -12,6 +12,10 @@ module Tartrazine
|
|||||||
property theme : Theme = Tartrazine.theme("default-dark")
|
property theme : Theme = Tartrazine.theme("default-dark")
|
||||||
|
|
||||||
# Format the text using the given lexer.
|
# Format the text using the given lexer.
|
||||||
|
def format(text : String, lexer : Lexer, io : IO = nil) : Nil
|
||||||
|
raise Exception.new("Not implemented")
|
||||||
|
end
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer) : String
|
def format(text : String, lexer : Lexer) : String
|
||||||
raise Exception.new("Not implemented")
|
raise Exception.new("Not implemented")
|
||||||
end
|
end
|
||||||
|
@@ -7,18 +7,28 @@ module Tartrazine
|
|||||||
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
|
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
private def line_label(i : Int32) : String
|
||||||
|
"#{i + 1}".rjust(4).ljust(5)
|
||||||
|
end
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer) : String
|
def format(text : String, lexer : Lexer) : String
|
||||||
output = String.build do |outp|
|
outp = String::Builder.new("")
|
||||||
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
|
format(text, lexer, outp)
|
||||||
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
|
outp.to_s
|
||||||
outp << label
|
end
|
||||||
line.each do |token|
|
|
||||||
|
def format(text : String, lexer : Lexer, outp : IO) : Nil
|
||||||
|
tokenizer = Tokenizer.new(lexer, text)
|
||||||
|
i = 0
|
||||||
|
outp << line_label(i) if line_numbers?
|
||||||
|
tokenizer.each do |token|
|
||||||
outp << colorize(token[:value], token[:type])
|
outp << colorize(token[:value], token[:type])
|
||||||
|
if token[:value].includes?("\n")
|
||||||
|
i += 1
|
||||||
|
outp << line_label(i) if line_numbers?
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
output
|
|
||||||
end
|
|
||||||
|
|
||||||
def colorize(text : String, token : String) : String
|
def colorize(text : String, token : String) : String
|
||||||
style = theme.styles.fetch(token, nil)
|
style = theme.styles.fetch(token, nil)
|
||||||
|
@@ -35,46 +35,53 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer) : String
|
def format(text : String, lexer : Lexer) : String
|
||||||
text = format_text(text, lexer)
|
outp = String::Builder.new("")
|
||||||
if standalone?
|
format(text, lexer, outp)
|
||||||
text = wrap_standalone(text)
|
outp.to_s
|
||||||
end
|
end
|
||||||
text
|
|
||||||
|
def format(text : String, lexer : Lexer, io : IO) : Nil
|
||||||
|
pre, post = wrap_standalone
|
||||||
|
io << pre if standalone?
|
||||||
|
format_text(text, lexer, io)
|
||||||
|
io << post if standalone?
|
||||||
end
|
end
|
||||||
|
|
||||||
# Wrap text into a full HTML document, including the CSS for the theme
|
# Wrap text into a full HTML document, including the CSS for the theme
|
||||||
def wrap_standalone(text) : String
|
def wrap_standalone
|
||||||
output = String.build do |outp|
|
output = String.build do |outp|
|
||||||
outp << "<!DOCTYPE html><html><head><style>"
|
outp << "<!DOCTYPE html><html><head><style>"
|
||||||
outp << style_defs
|
outp << style_defs
|
||||||
outp << "</style></head><body>"
|
outp << "</style></head><body>"
|
||||||
outp << text
|
|
||||||
outp << "</body></html>"
|
|
||||||
end
|
end
|
||||||
output
|
{output.to_s, "</body></html>"}
|
||||||
end
|
end
|
||||||
|
|
||||||
def format_text(text : String, lexer : Lexer) : String
|
private def line_label(i : Int32) : String
|
||||||
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
|
line_label = "#{i + 1}".rjust(4).ljust(5)
|
||||||
output = String.build do |outp|
|
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
|
||||||
|
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
|
||||||
|
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
|
||||||
|
end
|
||||||
|
|
||||||
|
def format_text(text : String, lexer : Lexer, outp : IO)
|
||||||
|
tokenizer = Tokenizer.new(lexer, text)
|
||||||
|
i = 0
|
||||||
if surrounding_pre?
|
if surrounding_pre?
|
||||||
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
|
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
|
||||||
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
|
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
|
||||||
end
|
end
|
||||||
outp << "<code class=\"#{get_css_class("Background")}\">"
|
outp << "<code class=\"#{get_css_class("Background")}\">"
|
||||||
lines.each_with_index(offset: line_number_start - 1) do |line, i|
|
outp << line_label(i) if line_numbers?
|
||||||
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
|
tokenizer.each do |token|
|
||||||
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
|
|
||||||
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
|
|
||||||
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
|
|
||||||
line.each do |token|
|
|
||||||
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
|
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
|
||||||
|
if token[:value].ends_with? "\n"
|
||||||
|
i += 1
|
||||||
|
outp << line_label(i) if line_numbers?
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
outp << "</code></pre>"
|
outp << "</code></pre>"
|
||||||
end
|
end
|
||||||
output
|
|
||||||
end
|
|
||||||
|
|
||||||
# ameba:disable Metrics/CyclomaticComplexity
|
# ameba:disable Metrics/CyclomaticComplexity
|
||||||
def style_defs : String
|
def style_defs : String
|
||||||
|
@@ -4,8 +4,15 @@ module Tartrazine
|
|||||||
class Json < Formatter
|
class Json < Formatter
|
||||||
property name = "json"
|
property name = "json"
|
||||||
|
|
||||||
def format(text : String, lexer : Lexer, _theme : Theme) : String
|
def format(text : String, lexer : Lexer) : String
|
||||||
lexer.tokenize(text).to_json
|
outp = String::Builder.new("")
|
||||||
|
format(text, lexer, outp)
|
||||||
|
outp.to_s
|
||||||
|
end
|
||||||
|
|
||||||
|
def format(text : String, lexer : Lexer, io : IO) : Nil
|
||||||
|
tokenizer = Tokenizer.new(lexer, text)
|
||||||
|
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
161
src/lexer.cr
161
src/lexer.cr
@@ -4,7 +4,6 @@ require "./constants/lexers"
|
|||||||
module Tartrazine
|
module Tartrazine
|
||||||
class LexerFiles
|
class LexerFiles
|
||||||
extend BakedFileSystem
|
extend BakedFileSystem
|
||||||
|
|
||||||
bake_folder "../lexers", __DIR__
|
bake_folder "../lexers", __DIR__
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -38,80 +37,91 @@ module Tartrazine
|
|||||||
LEXERS_BY_NAME.keys.sort!
|
LEXERS_BY_NAME.keys.sort!
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# A token, the output of the tokenizer
|
||||||
|
alias Token = NamedTuple(type: String, value: String)
|
||||||
|
|
||||||
|
struct Tokenizer
|
||||||
|
include Iterator(Token)
|
||||||
|
property lexer : Lexer
|
||||||
|
property text : Bytes
|
||||||
|
property pos : Int32 = 0
|
||||||
|
@dq = Deque(Token).new
|
||||||
|
property state_stack = ["root"]
|
||||||
|
|
||||||
|
def initialize(@lexer : Lexer, text : String, secondary = false)
|
||||||
|
# Respect the `ensure_nl` config option
|
||||||
|
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
|
||||||
|
text += "\n"
|
||||||
|
end
|
||||||
|
@text = text.to_slice
|
||||||
|
end
|
||||||
|
|
||||||
|
def next : Iterator::Stop | Token
|
||||||
|
if @dq.size > 0
|
||||||
|
return @dq.shift
|
||||||
|
end
|
||||||
|
if pos == @text.size
|
||||||
|
return stop
|
||||||
|
end
|
||||||
|
|
||||||
|
matched = false
|
||||||
|
while @pos < @text.size
|
||||||
|
@lexer.states[@state_stack.last].rules.each do |rule|
|
||||||
|
matched, new_pos, new_tokens = rule.match(@text, @pos, self)
|
||||||
|
if matched
|
||||||
|
@pos = new_pos
|
||||||
|
split_tokens(new_tokens).each { |token| @dq << token }
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if !matched
|
||||||
|
if @text[@pos] == 10u8
|
||||||
|
@dq << {type: "Text", value: "\n"}
|
||||||
|
@state_stack = ["root"]
|
||||||
|
else
|
||||||
|
@dq << {type: "Error", value: String.new(@text[@pos..@pos])}
|
||||||
|
end
|
||||||
|
@pos += 1
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
self.next
|
||||||
|
end
|
||||||
|
|
||||||
|
# If a token contains a newline, split it into two tokens
|
||||||
|
def split_tokens(tokens : Array(Token)) : Array(Token)
|
||||||
|
split_tokens = [] of Token
|
||||||
|
tokens.each do |token|
|
||||||
|
if token[:value].includes?("\n")
|
||||||
|
values = token[:value].split("\n")
|
||||||
|
values.each_with_index do |value, index|
|
||||||
|
value += "\n" if index < values.size - 1
|
||||||
|
split_tokens << {type: token[:type], value: value}
|
||||||
|
end
|
||||||
|
else
|
||||||
|
split_tokens << token
|
||||||
|
end
|
||||||
|
end
|
||||||
|
split_tokens
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
# This implements a lexer for Pygments RegexLexers as expressed
|
# This implements a lexer for Pygments RegexLexers as expressed
|
||||||
# in Chroma's XML serialization.
|
# in Chroma's XML serialization.
|
||||||
#
|
#
|
||||||
# For explanations on what actions and states do
|
# For explanations on what actions and states do
|
||||||
# the Pygments documentation is a good place to start.
|
# the Pygments documentation is a good place to start.
|
||||||
# https://pygments.org/docs/lexerdevelopment/
|
# https://pygments.org/docs/lexerdevelopment/
|
||||||
class Lexer
|
struct Lexer
|
||||||
property config = {
|
property config = {
|
||||||
name: "",
|
name: "",
|
||||||
aliases: [] of String,
|
|
||||||
filenames: [] of String,
|
|
||||||
mime_types: [] of String,
|
|
||||||
priority: 0.0,
|
priority: 0.0,
|
||||||
case_insensitive: false,
|
case_insensitive: false,
|
||||||
dot_all: false,
|
dot_all: false,
|
||||||
not_multiline: false,
|
not_multiline: false,
|
||||||
ensure_nl: false,
|
ensure_nl: false,
|
||||||
}
|
}
|
||||||
# property xml : String = ""
|
|
||||||
property states = {} of String => State
|
property states = {} of String => State
|
||||||
property state_stack = ["root"]
|
|
||||||
|
|
||||||
def copy : Lexer
|
|
||||||
new_lexer = Lexer.new
|
|
||||||
new_lexer.config = config
|
|
||||||
new_lexer.states = states
|
|
||||||
new_lexer.state_stack = state_stack[0..-1]
|
|
||||||
new_lexer
|
|
||||||
end
|
|
||||||
|
|
||||||
# Turn the text into a list of tokens. The `usingself` parameter
|
|
||||||
# is true when the lexer is being used to tokenize a string
|
|
||||||
# from a larger text that is already being tokenized.
|
|
||||||
# So, when it's true, we don't modify the text.
|
|
||||||
def tokenize(text : String, usingself = false) : Array(Token)
|
|
||||||
@state_stack = ["root"]
|
|
||||||
tokens = [] of Token
|
|
||||||
pos = 0
|
|
||||||
matched = false
|
|
||||||
|
|
||||||
# Respect the `ensure_nl` config option
|
|
||||||
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
|
|
||||||
text += "\n"
|
|
||||||
end
|
|
||||||
|
|
||||||
text_bytes = text.to_slice
|
|
||||||
# Loop through the text, applying rules
|
|
||||||
while pos < text_bytes.size
|
|
||||||
state = states[@state_stack.last]
|
|
||||||
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
|
|
||||||
state.rules.each do |rule|
|
|
||||||
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
|
|
||||||
if matched
|
|
||||||
# Move position forward, save the tokens,
|
|
||||||
# tokenize from the new position
|
|
||||||
pos = new_pos
|
|
||||||
tokens += new_tokens
|
|
||||||
break
|
|
||||||
end
|
|
||||||
end
|
|
||||||
# If no rule matches, emit an error token
|
|
||||||
unless matched
|
|
||||||
if text_bytes[pos] == 10u8
|
|
||||||
# at EOL, reset state to "root"
|
|
||||||
tokens << {type: "Text", value: "\n"}
|
|
||||||
@state_stack = ["root"]
|
|
||||||
else
|
|
||||||
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
|
|
||||||
end
|
|
||||||
pos += 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
Lexer.collapse_tokens(tokens)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Collapse consecutive tokens of the same type for easier comparison
|
# Collapse consecutive tokens of the same type for easier comparison
|
||||||
# and smaller output
|
# and smaller output
|
||||||
@@ -135,31 +145,6 @@ module Tartrazine
|
|||||||
result
|
result
|
||||||
end
|
end
|
||||||
|
|
||||||
# Group tokens into lines, splitting them when a newline is found
|
|
||||||
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
|
|
||||||
split_tokens = [] of Token
|
|
||||||
tokens.each do |token|
|
|
||||||
if token[:value].includes?("\n")
|
|
||||||
values = token[:value].split("\n")
|
|
||||||
values.each_with_index do |value, index|
|
|
||||||
value += "\n" if index < values.size - 1
|
|
||||||
split_tokens << {type: token[:type], value: value}
|
|
||||||
end
|
|
||||||
else
|
|
||||||
split_tokens << token
|
|
||||||
end
|
|
||||||
end
|
|
||||||
lines = [Array(Token).new]
|
|
||||||
split_tokens.each do |token|
|
|
||||||
lines.last << token
|
|
||||||
if token[:value].includes?("\n")
|
|
||||||
lines << Array(Token).new
|
|
||||||
end
|
|
||||||
end
|
|
||||||
lines
|
|
||||||
end
|
|
||||||
|
|
||||||
# ameba:disable Metrics/CyclomaticComplexity
|
|
||||||
def self.from_xml(xml : String) : Lexer
|
def self.from_xml(xml : String) : Lexer
|
||||||
l = Lexer.new
|
l = Lexer.new
|
||||||
lexer = XML.parse(xml).first_element_child
|
lexer = XML.parse(xml).first_element_child
|
||||||
@@ -170,9 +155,6 @@ module Tartrazine
|
|||||||
if config
|
if config
|
||||||
l.config = {
|
l.config = {
|
||||||
name: xml_to_s(config, name) || "",
|
name: xml_to_s(config, name) || "",
|
||||||
aliases: xml_to_a(config, _alias) || [] of String,
|
|
||||||
filenames: xml_to_a(config, filename) || [] of String,
|
|
||||||
mime_types: xml_to_a(config, mime_type) || [] of String,
|
|
||||||
priority: xml_to_f(config, priority) || 0.0,
|
priority: xml_to_f(config, priority) || 0.0,
|
||||||
not_multiline: xml_to_s(config, not_multiline) == "true",
|
not_multiline: xml_to_s(config, not_multiline) == "true",
|
||||||
dot_all: xml_to_s(config, dot_all) == "true",
|
dot_all: xml_to_s(config, dot_all) == "true",
|
||||||
@@ -236,7 +218,4 @@ module Tartrazine
|
|||||||
new_state
|
new_state
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# A token, the output of the tokenizer
|
|
||||||
alias Token = NamedTuple(type: String, value: String)
|
|
||||||
end
|
end
|
||||||
|
@@ -85,13 +85,11 @@ if options["-f"]
|
|||||||
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
|
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
|
||||||
|
|
||||||
input = File.open(options["FILE"].as(String)).gets_to_end
|
input = File.open(options["FILE"].as(String)).gets_to_end
|
||||||
output = formatter.format(input, lexer)
|
|
||||||
|
|
||||||
if options["-o"].nil?
|
if options["-o"].nil?
|
||||||
puts output
|
outf = STDOUT
|
||||||
else
|
else
|
||||||
File.open(options["-o"].as(String), "w") do |outf|
|
outf = File.open(options["-o"].as(String), "w")
|
||||||
outf << output
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
formatter.format(input, lexer, outf)
|
||||||
end
|
end
|
||||||
|
22
src/rules.cr
22
src/rules.cr
@@ -16,10 +16,10 @@ module Tartrazine
|
|||||||
alias MatchData = Array(Match)
|
alias MatchData = Array(Match)
|
||||||
|
|
||||||
abstract struct BaseRule
|
abstract struct BaseRule
|
||||||
abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||||
abstract def initialize(node : XML::Node)
|
abstract def initialize(node : XML::Node)
|
||||||
|
|
||||||
property actions : Array(Action) = [] of Action
|
@actions : Array(Action) = [] of Action
|
||||||
|
|
||||||
def add_actions(node : XML::Node)
|
def add_actions(node : XML::Node)
|
||||||
node.children.each do |child|
|
node.children.each do |child|
|
||||||
@@ -31,14 +31,13 @@ module Tartrazine
|
|||||||
|
|
||||||
struct Rule < BaseRule
|
struct Rule < BaseRule
|
||||||
property pattern : Regex = Regex.new ""
|
property pattern : Regex = Regex.new ""
|
||||||
property actions : Array(Action) = [] of Action
|
|
||||||
|
|
||||||
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||||
match = pattern.match(text, pos)
|
match = pattern.match(text, pos)
|
||||||
|
|
||||||
# No match
|
# No match
|
||||||
return false, pos, [] of Token if match.size == 0
|
return false, pos, [] of Token if match.size == 0
|
||||||
return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
|
return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(node : XML::Node)
|
def initialize(node : XML::Node)
|
||||||
@@ -55,12 +54,11 @@ module Tartrazine
|
|||||||
# This rule includes another state. If any of the rules of the
|
# This rule includes another state. If any of the rules of the
|
||||||
# included state matches, this rule matches.
|
# included state matches, this rule matches.
|
||||||
struct IncludeStateRule < BaseRule
|
struct IncludeStateRule < BaseRule
|
||||||
property state : String = ""
|
@state : String = ""
|
||||||
|
|
||||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||||
Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
|
tokenizer.@lexer.states[@state].rules.each do |rule|
|
||||||
lexer.states[state].rules.each do |rule|
|
matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
|
||||||
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
|
|
||||||
return true, new_pos, new_tokens if matched
|
return true, new_pos, new_tokens if matched
|
||||||
end
|
end
|
||||||
return false, pos, [] of Token
|
return false, pos, [] of Token
|
||||||
@@ -79,8 +77,8 @@ module Tartrazine
|
|||||||
struct UnconditionalRule < BaseRule
|
struct UnconditionalRule < BaseRule
|
||||||
NO_MATCH = [] of Match
|
NO_MATCH = [] of Match
|
||||||
|
|
||||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||||
return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
|
return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(node : XML::Node)
|
def initialize(node : XML::Node)
|
||||||
|
@@ -9,7 +9,7 @@ require "xml"
|
|||||||
module Tartrazine
|
module Tartrazine
|
||||||
alias Color = Sixteen::Color
|
alias Color = Sixteen::Color
|
||||||
|
|
||||||
class ThemeFiles
|
struct ThemeFiles
|
||||||
extend BakedFileSystem
|
extend BakedFileSystem
|
||||||
bake_folder "../styles", __DIR__
|
bake_folder "../styles", __DIR__
|
||||||
end
|
end
|
||||||
@@ -39,7 +39,7 @@ module Tartrazine
|
|||||||
themes.to_a.sort!
|
themes.to_a.sort!
|
||||||
end
|
end
|
||||||
|
|
||||||
class Style
|
struct Style
|
||||||
# These properties are tri-state.
|
# These properties are tri-state.
|
||||||
# true means it's set
|
# true means it's set
|
||||||
# false means it's not set
|
# false means it's not set
|
||||||
@@ -79,7 +79,7 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Theme
|
struct Theme
|
||||||
property name : String = ""
|
property name : String = ""
|
||||||
|
|
||||||
property styles = {} of String => Style
|
property styles = {} of String => Style
|
||||||
|
Reference in New Issue
Block a user