mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-07-01 12:27:08 -03:00
Compare commits
32 Commits
Author | SHA1 | Date | |
---|---|---|---|
10842f7074 | |||
ae03e4612e | |||
471b2f5050 | |||
5a3b08e716 | |||
9ebb9f2765 | |||
7538fc76aa | |||
788577b226 | |||
1f01146b1f | |||
9041b763ea | |||
ada30915c3 | |||
78eff45ea0 | |||
e817aedd60 | |||
20d6b65346 | |||
cb09dff9f1 | |||
b589726352 | |||
a3a7b5bd9a | |||
58e8dac038 | |||
f72a40f095 | |||
bf257a5b82 | |||
029495590c | |||
115debdec6 | |||
4612db58fe | |||
f45a86c83a | |||
27008640a6 | |||
7db8fdc9e4 | |||
ad664d9f93 | |||
0626c8619f | |||
3725201f8a | |||
6f64b76c44 | |||
5218af6855 | |||
c898f395a1 | |||
56e49328fb |
1
.gitignore
vendored
1
.gitignore
vendored
@ -7,3 +7,4 @@ chroma/
|
||||
pygments/
|
||||
shard.lock
|
||||
.vscode/
|
||||
.crystal/
|
||||
|
25
README.md
25
README.md
@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
|
||||
a port of [Pygments](https://pygments.org/) to
|
||||
[Crystal](https://crystal-lang.org/). Kind of.
|
||||
|
||||
It's not currently usable because it's not finished, but:
|
||||
|
||||
* The lexers work for the implemented languages
|
||||
* The provided styles work
|
||||
* There is a very very simple HTML formatter
|
||||
The CLI tool can be used to highlight many things in many styles.
|
||||
|
||||
# A port of what? Why "kind of"?
|
||||
|
||||
Because I did not read the Pygments code. And this is actually
|
||||
based on [Chroma](https://github.com/alecthomas/chroma) ...
|
||||
although I did not read that code either.
|
||||
Pygments is a staple of the Python ecosystem, and it's great.
|
||||
It lets you highlight code in many languages, and it has many
|
||||
themes. Chroma is "Pygments for Go"; it's actually a port of
|
||||
Pygments to Go, and it's great too.
|
||||
|
||||
I wanted that in Crystal, so I started this project. But I did
|
||||
not read much of the Pygments code. Or much of Chroma's.
|
||||
|
||||
Chroma has taken most of the Pygments lexers and turned them into
|
||||
XML descriptions. What I did was take those XML files from Chroma
|
||||
@ -47,7 +47,14 @@ To build from source:
|
||||
2. Run `make` to build the `tartrazine` binary
|
||||
3. Copy the binary somewhere in your PATH.
|
||||
|
||||
## Usage
|
||||
## Usage as a CLI tool
|
||||
|
||||
```shell
|
||||
$ tartrazine whatever.c -l c -t catppuccin-macchiato --line-numbers \
|
||||
--standalone -o whatever.html
|
||||
```
|
||||
|
||||
## Usage as a Library
|
||||
|
||||
This works:
|
||||
|
||||
|
3
TODO.md
3
TODO.md
@ -8,4 +8,5 @@
|
||||
* ✅ Implement lexer loader that respects aliases
|
||||
* ✅ Implement lexer loader by file extension
|
||||
* ✅ Add --line-numbers to terminal formatter
|
||||
* Implement lexer loader by mime type
|
||||
* Implement lexer loader by mime type
|
||||
* Implement Delegating lexers
|
@ -1,5 +1,5 @@
|
||||
name: tartrazine
|
||||
version: 0.3.0
|
||||
version: 0.5.0
|
||||
|
||||
authors:
|
||||
- Roberto Alsina <roberto.alsina@gmail.com>
|
||||
|
@ -14,15 +14,18 @@ unicode_problems = {
|
||||
"#{__DIR__}/tests/java/test_string_literals.txt",
|
||||
"#{__DIR__}/tests/json/test_strings.txt",
|
||||
"#{__DIR__}/tests/systemd/example1.txt",
|
||||
"#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
|
||||
}
|
||||
|
||||
# These testcases fail because of differences in the way chroma and tartrazine tokenize
|
||||
# but tartrazine is correct
|
||||
bad_in_chroma = {
|
||||
"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
|
||||
"#{__DIR__}/tests/html/javascript_backtracking.txt",
|
||||
"#{__DIR__}/tests/java/test_default.txt",
|
||||
"#{__DIR__}/tests/java/test_multiline_string.txt",
|
||||
"#{__DIR__}/tests/java/test_numeric_literals.txt",
|
||||
"#{__DIR__}/tests/octave/test_multilinecomment.txt",
|
||||
"#{__DIR__}/tests/php/test_string_escaping_run.txt",
|
||||
"#{__DIR__}/tests/python_2/test_cls_builtin.txt",
|
||||
}
|
||||
@ -30,19 +33,14 @@ bad_in_chroma = {
|
||||
known_bad = {
|
||||
"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
|
||||
"#{__DIR__}/tests/bash_session/prompt_in_output.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
|
||||
"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
|
||||
"#{__DIR__}/tests/html/css_backtracking.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
|
||||
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
|
||||
"#{__DIR__}/tests/mcfunction/data.txt",
|
||||
"#{__DIR__}/tests/mcfunction/selectors.txt",
|
||||
"#{__DIR__}/tests/php/anonymous_class.txt",
|
||||
"#{__DIR__}/tests/html/javascript_unclosed.txt",
|
||||
|
||||
}
|
||||
|
||||
# Tests that fail because of a limitation in PCRE2
|
||||
@ -75,7 +73,8 @@ end
|
||||
# Helper that creates lexer and tokenizes
|
||||
def tokenize(lexer_name, text)
|
||||
lexer = Tartrazine.lexer(lexer_name)
|
||||
lexer.tokenize(text)
|
||||
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
|
||||
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
|
||||
end
|
||||
|
||||
# Helper that tokenizes using chroma to validate the lexer
|
||||
|
139
src/actions.cr
139
src/actions.cr
@ -8,12 +8,30 @@ require "./tartrazine"
|
||||
# perform a list of actions. These actions can emit tokens
|
||||
# or change the state machine.
|
||||
module Tartrazine
|
||||
class Action
|
||||
property type : String
|
||||
property xml : XML::Node
|
||||
enum ActionType
|
||||
Bygroups
|
||||
Combined
|
||||
Include
|
||||
Pop
|
||||
Push
|
||||
Token
|
||||
Using
|
||||
Usingself
|
||||
end
|
||||
|
||||
struct Action
|
||||
property actions : Array(Action) = [] of Action
|
||||
|
||||
def initialize(@type : String, @xml : XML::Node?)
|
||||
@depth : Int32 = 0
|
||||
@lexer_name : String = ""
|
||||
@states : Array(String) = [] of String
|
||||
@states_to_push : Array(String) = [] of String
|
||||
@token_type : String = ""
|
||||
@type : ActionType = ActionType::Token
|
||||
|
||||
def initialize(t : String, xml : XML::Node?)
|
||||
@type = ActionType.parse(t.capitalize)
|
||||
|
||||
# Some actions may have actions in them, like this:
|
||||
# <bygroups>
|
||||
# <token type="GenericPrompt"/>
|
||||
@ -23,48 +41,53 @@ module Tartrazine
|
||||
#
|
||||
# The token actions match with the first 2 groups in the regex
|
||||
# the using action matches the 3rd and shunts it to another lexer
|
||||
@xml.children.each do |node|
|
||||
xml.children.each do |node|
|
||||
next unless node.element?
|
||||
@actions << Action.new(node.name, node)
|
||||
end
|
||||
|
||||
# Prefetch the attributes we need from the XML and keep them
|
||||
case @type
|
||||
when ActionType::Token
|
||||
@token_type = xml["type"]
|
||||
when ActionType::Push
|
||||
@states_to_push = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
}.map &.content
|
||||
when ActionType::Pop
|
||||
@depth = xml["depth"].to_i
|
||||
when ActionType::Using
|
||||
@lexer_name = xml["lexer"].downcase
|
||||
when ActionType::Combined
|
||||
@states = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
}.map &.content
|
||||
end
|
||||
end
|
||||
|
||||
# ameba:disable Metrics/CyclomaticComplexity
|
||||
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
|
||||
case type
|
||||
when "token"
|
||||
raise Exception.new "Can't have a token without a match" if match.nil?
|
||||
[Token.new(type: xml["type"], value: match[match_group])]
|
||||
when "push"
|
||||
states_to_push = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
}.map &.content
|
||||
if states_to_push.empty?
|
||||
# Push without a state means push the current state
|
||||
states_to_push = [lexer.state_stack.last]
|
||||
end
|
||||
states_to_push.each do |state|
|
||||
if state == "#pop"
|
||||
def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
|
||||
case @type
|
||||
when ActionType::Token
|
||||
raise Exception.new "Can't have a token without a match" if match.empty?
|
||||
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
|
||||
when ActionType::Push
|
||||
to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
|
||||
to_push.each do |state|
|
||||
if state == "#pop" && tokenizer.state_stack.size > 1
|
||||
# Pop the state
|
||||
Log.trace { "Popping state" }
|
||||
lexer.state_stack.pop
|
||||
tokenizer.state_stack.pop
|
||||
else
|
||||
# Really push
|
||||
lexer.state_stack << state
|
||||
Log.trace { "Pushed #{lexer.state_stack}" }
|
||||
tokenizer.state_stack << state
|
||||
end
|
||||
end
|
||||
[] of Token
|
||||
when "pop"
|
||||
depth = xml["depth"].to_i
|
||||
Log.trace { "Popping #{depth} states" }
|
||||
if lexer.state_stack.size <= depth
|
||||
Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
|
||||
else
|
||||
lexer.state_stack.pop(depth)
|
||||
end
|
||||
when ActionType::Pop
|
||||
to_pop = [@depth, tokenizer.state_stack.size - 1].min
|
||||
tokenizer.state_stack.pop(to_pop)
|
||||
[] of Token
|
||||
when "bygroups"
|
||||
when ActionType::Bygroups
|
||||
# FIXME: handle
|
||||
# ><bygroups>
|
||||
# <token type="Punctuation"/>
|
||||
@ -79,38 +102,42 @@ module Tartrazine
|
||||
# the action is skipped.
|
||||
result = [] of Token
|
||||
@actions.each_with_index do |e, i|
|
||||
next if match[i + 1]?.nil?
|
||||
result += e.emit(match, lexer, i + 1)
|
||||
begin
|
||||
next if match[i + 1].size == 0
|
||||
rescue IndexError
|
||||
# FIXME: This should not actually happen
|
||||
# No match for this group
|
||||
next
|
||||
end
|
||||
result += e.emit(match, tokenizer, i + 1)
|
||||
end
|
||||
result
|
||||
when "using"
|
||||
when ActionType::Using
|
||||
# Shunt to another lexer entirely
|
||||
return [] of Token if match.nil?
|
||||
lexer_name = xml["lexer"].downcase
|
||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
|
||||
when "usingself"
|
||||
return [] of Token if match.empty?
|
||||
Tokenizer.new(
|
||||
Tartrazine.lexer(@lexer_name),
|
||||
String.new(match[match_group].value),
|
||||
secondary: true).to_a
|
||||
when ActionType::Usingself
|
||||
# Shunt to another copy of this lexer
|
||||
return [] of Token if match.nil?
|
||||
|
||||
new_lexer = Lexer.from_xml(lexer.xml)
|
||||
Log.trace { "to tokenize: #{match[match_group]}" }
|
||||
new_lexer.tokenize(match[match_group], usingself: true)
|
||||
when "combined"
|
||||
# Combine two states into one anonymous state
|
||||
states = xml.attributes.select { |attrib|
|
||||
attrib.name == "state"
|
||||
}.map &.content
|
||||
new_state = states.map { |name|
|
||||
lexer.states[name]
|
||||
return [] of Token if match.empty?
|
||||
Tokenizer.new(
|
||||
tokenizer.lexer,
|
||||
String.new(match[match_group].value),
|
||||
secondary: true).to_a
|
||||
when ActionType::Combined
|
||||
# Combine two or more states into one anonymous state
|
||||
new_state = @states.map { |name|
|
||||
tokenizer.lexer.states[name]
|
||||
}.reduce { |state1, state2|
|
||||
state1 + state2
|
||||
}
|
||||
lexer.states[new_state.name] = new_state
|
||||
lexer.state_stack << new_state.name
|
||||
tokenizer.lexer.states[new_state.name] = new_state
|
||||
tokenizer.state_stack << new_state.name
|
||||
[] of Token
|
||||
else
|
||||
raise Exception.new("Unknown action type: #{type}: #{xml}")
|
||||
raise Exception.new("Unknown action type: #{@type}")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
73
src/bytes_regex.cr
Normal file
73
src/bytes_regex.cr
Normal file
@ -0,0 +1,73 @@
|
||||
module BytesRegex
|
||||
extend self
|
||||
|
||||
class Regex
|
||||
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
|
||||
flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
|
||||
flags |= LibPCRE2::MULTILINE if multiline
|
||||
flags |= LibPCRE2::DOTALL if dotall
|
||||
flags |= LibPCRE2::CASELESS if ignorecase
|
||||
flags |= LibPCRE2::ANCHORED if anchored
|
||||
if @re = LibPCRE2.compile(
|
||||
pattern,
|
||||
pattern.bytesize,
|
||||
flags,
|
||||
out errorcode,
|
||||
out erroroffset,
|
||||
nil)
|
||||
else
|
||||
msg = String.new(256) do |buffer|
|
||||
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
|
||||
{bytesize, 0}
|
||||
end
|
||||
raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
|
||||
end
|
||||
@match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
|
||||
end
|
||||
|
||||
def finalize
|
||||
LibPCRE2.match_data_free(@match_data)
|
||||
LibPCRE2.code_free(@re)
|
||||
end
|
||||
|
||||
def match(str : Bytes, pos = 0) : Array(Match)
|
||||
rc = LibPCRE2.match(
|
||||
@re,
|
||||
str,
|
||||
str.size,
|
||||
pos,
|
||||
LibPCRE2::NO_UTF_CHECK,
|
||||
@match_data,
|
||||
nil)
|
||||
if rc > 0
|
||||
ovector = LibPCRE2.get_ovector_pointer(@match_data)
|
||||
(0...rc).map do |i|
|
||||
m_start = ovector[2 * i]
|
||||
m_end = ovector[2 * i + 1]
|
||||
if m_start == m_end
|
||||
m_value = Bytes.new(0)
|
||||
else
|
||||
m_value = str[m_start...m_end]
|
||||
end
|
||||
Match.new(m_value, m_start, m_end - m_start)
|
||||
end
|
||||
else
|
||||
[] of Match
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
struct Match
|
||||
property value : Bytes
|
||||
property start : UInt64
|
||||
property size : UInt64
|
||||
|
||||
def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# pattern = "foo"
|
||||
# str = "foo bar"
|
||||
# re = BytesRegex::Regex.new(pattern)
|
||||
# p! String.new(re.match(str.to_slice)[0].value)
|
@ -7,19 +7,39 @@ module Tartrazine
|
||||
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
|
||||
end
|
||||
|
||||
private def line_label(i : Int32) : String
|
||||
"#{i + 1}".rjust(4).ljust(5)
|
||||
end
|
||||
|
||||
def format(text : String, lexer : Lexer) : String
|
||||
tokenizer = Tokenizer.new(lexer, text)
|
||||
i = 0
|
||||
output = String.build do |outp|
|
||||
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
|
||||
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
|
||||
outp << label
|
||||
line.each do |token|
|
||||
outp << colorize(token[:value], token[:type])
|
||||
outp << line_label(i) if line_numbers?
|
||||
tokenizer.each do |token|
|
||||
outp << colorize(token[:value], token[:type])
|
||||
if token[:value].includes?("\n")
|
||||
i += 1
|
||||
outp << line_label(i) if line_numbers?
|
||||
end
|
||||
end
|
||||
end
|
||||
output
|
||||
end
|
||||
|
||||
# def format(text : String, lexer : Lexer) : String
|
||||
# output = String.build do |outp|
|
||||
# lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
|
||||
# label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
|
||||
# outp << label
|
||||
# line.each do |token|
|
||||
# outp << colorize(token[:value], token[:type])
|
||||
# end
|
||||
# end
|
||||
# end
|
||||
# output
|
||||
# end
|
||||
|
||||
def colorize(text : String, token : String) : String
|
||||
style = theme.styles.fetch(token, nil)
|
||||
return text if style.nil?
|
||||
|
@ -1,5 +1,6 @@
|
||||
require "../constants/token_abbrevs.cr"
|
||||
require "../formatter"
|
||||
require "html"
|
||||
|
||||
module Tartrazine
|
||||
class Html < Formatter
|
||||
@ -30,7 +31,7 @@ module Tartrazine
|
||||
@standalone : Bool = false,
|
||||
@surrounding_pre : Bool = true,
|
||||
@wrap_long_lines : Bool = false,
|
||||
@weight_of_bold : Int32 = 600,)
|
||||
@weight_of_bold : Int32 = 600)
|
||||
end
|
||||
|
||||
def format(text : String, lexer : Lexer) : String
|
||||
@ -53,22 +54,29 @@ module Tartrazine
|
||||
output
|
||||
end
|
||||
|
||||
private def line_label(i : Int32) : String
|
||||
line_label = "#{i + 1}".rjust(4).ljust(5)
|
||||
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
|
||||
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
|
||||
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
|
||||
end
|
||||
|
||||
def format_text(text : String, lexer : Lexer) : String
|
||||
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
|
||||
# lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
|
||||
tokenizer = Tokenizer.new(lexer, text)
|
||||
i = 0
|
||||
output = String.build do |outp|
|
||||
if surrounding_pre?
|
||||
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
|
||||
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
|
||||
end
|
||||
outp << "<code class=\"#{get_css_class("Background")}\">"
|
||||
lines.each_with_index(offset: line_number_start - 1) do |line, i|
|
||||
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
|
||||
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
|
||||
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
|
||||
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
|
||||
line.each do |token|
|
||||
fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
|
||||
outp << fragment
|
||||
outp << line_label(i) if line_numbers?
|
||||
tokenizer.each do |token|
|
||||
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
|
||||
if token[:value].ends_with? "\n"
|
||||
i += 1
|
||||
outp << line_label(i) if line_numbers?
|
||||
end
|
||||
end
|
||||
outp << "</code></pre>"
|
||||
@ -104,15 +112,17 @@ module Tartrazine
|
||||
|
||||
# Given a token type, return the CSS class to use.
|
||||
def get_css_class(token : String) : String
|
||||
return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
|
||||
|
||||
# Themes don't contain information for each specific
|
||||
# token type. However, they may contain information
|
||||
# for a parent style. Worst case, we go to the root
|
||||
# (Background) style.
|
||||
class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
|
||||
theme.styles.has_key?(parent)
|
||||
}]
|
||||
if !theme.styles.has_key? token
|
||||
# Themes don't contain information for each specific
|
||||
# token type. However, they may contain information
|
||||
# for a parent style. Worst case, we go to the root
|
||||
# (Background) style.
|
||||
parent = theme.style_parents(token).reverse.find { |dad|
|
||||
theme.styles.has_key?(dad)
|
||||
}
|
||||
theme.styles[token] = theme.styles[parent]
|
||||
end
|
||||
class_prefix + Abbreviations[token]
|
||||
end
|
||||
|
||||
# Is this line in the highlighted ranges?
|
||||
|
157
src/lexer.cr
157
src/lexer.cr
@ -1,9 +1,9 @@
|
||||
require "baked_file_system"
|
||||
require "./constants/lexers"
|
||||
|
||||
module Tartrazine
|
||||
class LexerFiles
|
||||
extend BakedFileSystem
|
||||
|
||||
bake_folder "../lexers", __DIR__
|
||||
end
|
||||
|
||||
@ -37,71 +37,92 @@ module Tartrazine
|
||||
LEXERS_BY_NAME.keys.sort!
|
||||
end
|
||||
|
||||
# A token, the output of the tokenizer
|
||||
alias Token = NamedTuple(type: String, value: String)
|
||||
|
||||
struct Tokenizer
|
||||
include Iterator(Token)
|
||||
property lexer : Lexer
|
||||
property text : Bytes
|
||||
property pos : Int32 = 0
|
||||
@dq = Deque(Token).new
|
||||
property state_stack = ["root"]
|
||||
|
||||
def initialize(@lexer : Lexer, text : String, secondary = false)
|
||||
# Respect the `ensure_nl` config option
|
||||
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
|
||||
text += "\n"
|
||||
end
|
||||
@text = text.to_slice
|
||||
end
|
||||
|
||||
def next : Iterator::Stop | Token
|
||||
if @dq.size > 0
|
||||
return @dq.shift
|
||||
end
|
||||
if pos == @text.size
|
||||
return stop
|
||||
end
|
||||
|
||||
matched = false
|
||||
while @pos < @text.size
|
||||
@lexer.states[@state_stack.last].rules.each do |rule|
|
||||
matched, new_pos, new_tokens = rule.match(@text, @pos, self)
|
||||
if matched
|
||||
@pos = new_pos
|
||||
split_tokens(new_tokens).each { |token| @dq << token }
|
||||
break
|
||||
end
|
||||
end
|
||||
if !matched
|
||||
if @text[@pos] == 10u8
|
||||
@dq << {type: "Text", value: "\n"}
|
||||
@state_stack = ["root"]
|
||||
else
|
||||
@dq << {type: "Error", value: String.new(@text[@pos..@pos])}
|
||||
end
|
||||
@pos += 1
|
||||
break
|
||||
end
|
||||
end
|
||||
self.next
|
||||
end
|
||||
|
||||
# If a token contains newlines, split it at each newline into separate tokens
|
||||
def split_tokens(tokens : Array(Token)) : Array(Token)
|
||||
split_tokens = [] of Token
|
||||
tokens.each do |token|
|
||||
if token[:value].includes?("\n")
|
||||
values = token[:value].split("\n")
|
||||
values.each_with_index do |value, index|
|
||||
value += "\n" if index < values.size - 1
|
||||
split_tokens << {type: token[:type], value: value}
|
||||
end
|
||||
else
|
||||
split_tokens << token
|
||||
end
|
||||
end
|
||||
split_tokens
|
||||
end
|
||||
end
|
||||
|
||||
# This implements a lexer for Pygments RegexLexers as expressed
|
||||
# in Chroma's XML serialization.
|
||||
#
|
||||
# For explanations on what actions and states do
|
||||
# the Pygments documentation is a good place to start.
|
||||
# https://pygments.org/docs/lexerdevelopment/
|
||||
class Lexer
|
||||
struct Lexer
|
||||
property config = {
|
||||
name: "",
|
||||
aliases: [] of String,
|
||||
filenames: [] of String,
|
||||
mime_types: [] of String,
|
||||
priority: 0.0,
|
||||
case_insensitive: false,
|
||||
dot_all: false,
|
||||
not_multiline: false,
|
||||
ensure_nl: false,
|
||||
}
|
||||
property xml : String = ""
|
||||
|
||||
property states = {} of String => State
|
||||
|
||||
property state_stack = ["root"]
|
||||
|
||||
# Turn the text into a list of tokens. The `usingself` parameter
|
||||
# is true when the lexer is being used to tokenize a string
|
||||
# from a larger text that is already being tokenized.
|
||||
# So, when it's true, we don't modify the text.
|
||||
def tokenize(text, usingself = false) : Array(Token)
|
||||
@state_stack = ["root"]
|
||||
tokens = [] of Token
|
||||
pos = 0
|
||||
matched = false
|
||||
|
||||
# Respect the `ensure_nl` config option
|
||||
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
|
||||
text += "\n"
|
||||
end
|
||||
|
||||
# Loop through the text, applying rules
|
||||
while pos < text.size
|
||||
state = states[@state_stack.last]
|
||||
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
|
||||
state.rules.each do |rule|
|
||||
matched, new_pos, new_tokens = rule.match(text, pos, self)
|
||||
if matched
|
||||
# Move position forward, save the tokens,
|
||||
# tokenize from the new position
|
||||
# Log.trace { "MATCHED: #{rule.xml}" }
|
||||
pos = new_pos
|
||||
tokens += new_tokens
|
||||
break
|
||||
end
|
||||
# Log.trace { "NOT MATCHED: #{rule.xml}" }
|
||||
end
|
||||
# If no rule matches, emit an error token
|
||||
unless matched
|
||||
# Log.trace { "Error at #{pos}" }
|
||||
tokens << {type: "Error", value: "#{text[pos]}"}
|
||||
pos += 1
|
||||
end
|
||||
end
|
||||
Lexer.collapse_tokens(tokens)
|
||||
end
|
||||
|
||||
# Collapse consecutive tokens of the same type for easier comparison
|
||||
# and smaller output
|
||||
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
|
||||
@ -124,34 +145,8 @@ module Tartrazine
|
||||
result
|
||||
end
|
||||
|
||||
# Group tokens into lines, splitting them when a newline is found
|
||||
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
|
||||
split_tokens = [] of Token
|
||||
tokens.each do |token|
|
||||
if token[:value].includes?("\n")
|
||||
values = token[:value].split("\n")
|
||||
values.each_with_index do |value, index|
|
||||
value += "\n" if index < values.size - 1
|
||||
split_tokens << {type: token[:type], value: value}
|
||||
end
|
||||
else
|
||||
split_tokens << token
|
||||
end
|
||||
end
|
||||
lines = [Array(Token).new]
|
||||
split_tokens.each do |token|
|
||||
lines.last << token
|
||||
if token[:value].includes?("\n")
|
||||
lines << Array(Token).new
|
||||
end
|
||||
end
|
||||
lines
|
||||
end
|
||||
|
||||
# ameba:disable Metrics/CyclomaticComplexity
|
||||
def self.from_xml(xml : String) : Lexer
|
||||
l = Lexer.new
|
||||
l.xml = xml
|
||||
lexer = XML.parse(xml).first_element_child
|
||||
if lexer
|
||||
config = lexer.children.find { |node|
|
||||
@ -160,9 +155,6 @@ module Tartrazine
|
||||
if config
|
||||
l.config = {
|
||||
name: xml_to_s(config, name) || "",
|
||||
aliases: xml_to_a(config, _alias) || [] of String,
|
||||
filenames: xml_to_a(config, filename) || [] of String,
|
||||
mime_types: xml_to_a(config, mime_type) || [] of String,
|
||||
priority: xml_to_f(config, priority) || 0.0,
|
||||
not_multiline: xml_to_s(config, not_multiline) == "true",
|
||||
dot_all: xml_to_s(config, dot_all) == "true",
|
||||
@ -215,9 +207,9 @@ module Tartrazine
|
||||
# A Lexer state. A state has a name and a list of rules.
|
||||
# The state machine has a state stack containing references
|
||||
# to states to decide which rules to apply.
|
||||
class State
|
||||
struct State
|
||||
property name : String = ""
|
||||
property rules = [] of Rule
|
||||
property rules = [] of BaseRule
|
||||
|
||||
def +(other : State)
|
||||
new_state = State.new
|
||||
@ -226,7 +218,4 @@ module Tartrazine
|
||||
new_state
|
||||
end
|
||||
end
|
||||
|
||||
# A token, the output of the tokenizer
|
||||
alias Token = NamedTuple(type: String, value: String)
|
||||
end
|
||||
|
@ -77,7 +77,7 @@ if options["-f"]
|
||||
|
||||
if formatter.is_a?(Tartrazine::Html) && options["--css"]
|
||||
File.open("#{options["-t"].as(String)}.css", "w") do |outf|
|
||||
outf.puts formatter.style_defs
|
||||
outf << formatter.style_defs
|
||||
end
|
||||
exit 0
|
||||
end
|
||||
@ -91,7 +91,7 @@ if options["-f"]
|
||||
puts output
|
||||
else
|
||||
File.open(options["-o"].as(String), "w") do |outf|
|
||||
outf.puts output
|
||||
outf << output
|
||||
end
|
||||
end
|
||||
end
|
||||
|
90
src/rules.cr
90
src/rules.cr
@ -1,8 +1,9 @@
|
||||
require "./actions"
|
||||
require "./bytes_regex"
|
||||
require "./formatter"
|
||||
require "./lexer"
|
||||
require "./rules"
|
||||
require "./styles"
|
||||
require "./lexer"
|
||||
|
||||
# These are lexer rules. They match with the text being parsed
|
||||
# and perform actions, either emitting tokens or changing the
|
||||
@ -10,40 +11,15 @@ require "./lexer"
|
||||
module Tartrazine
|
||||
# This rule matches via a regex pattern
|
||||
|
||||
class Rule
|
||||
property pattern : Regex = Regex.new ""
|
||||
property actions : Array(Action) = [] of Action
|
||||
property xml : String = "foo"
|
||||
alias Regex = BytesRegex::Regex
|
||||
alias Match = BytesRegex::Match
|
||||
alias MatchData = Array(Match)
|
||||
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
match = pattern.match(text, pos)
|
||||
# We don't match if the match doesn't move the cursor
|
||||
# because that causes infinite loops
|
||||
return false, pos, [] of Token if match.nil? || match.end == 0
|
||||
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
|
||||
tokens = [] of Token
|
||||
# Emit the tokens
|
||||
actions.each do |action|
|
||||
# Emit the token
|
||||
tokens += action.emit(match, lexer)
|
||||
end
|
||||
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
|
||||
return true, match.end, tokens
|
||||
end
|
||||
abstract struct BaseRule
|
||||
abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||
abstract def initialize(node : XML::Node)
|
||||
|
||||
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||
@xml = node.to_s
|
||||
pattern = node["pattern"]
|
||||
flags = Regex::Options::ANCHORED
|
||||
# MULTILINE implies DOTALL which we don't want, so we
|
||||
# use in-pattern flag (?m) instead
|
||||
# flags |= Regex::Options::MULTILINE if multiline
|
||||
pattern = "(?m)" + pattern if multiline
|
||||
flags |= Regex::Options::DOTALL if dotall
|
||||
flags |= Regex::Options::IGNORE_CASE if ignorecase
|
||||
@pattern = Regex.new(pattern, flags)
|
||||
add_actions(node)
|
||||
end
|
||||
@actions : Array(Action) = [] of Action
|
||||
|
||||
def add_actions(node : XML::Node)
|
||||
node.children.each do |child|
|
||||
@ -53,23 +29,42 @@ module Tartrazine
|
||||
end
|
||||
end
|
||||
|
||||
struct Rule < BaseRule
|
||||
property pattern : Regex = Regex.new ""
|
||||
|
||||
def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||
match = pattern.match(text, pos)
|
||||
|
||||
# No match
|
||||
return false, pos, [] of Token if match.size == 0
|
||||
return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node)
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node, multiline, dotall, ignorecase)
|
||||
pattern = node["pattern"]
|
||||
pattern = "(?m)" + pattern if multiline
|
||||
@pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
|
||||
add_actions(node)
|
||||
end
|
||||
end
|
||||
|
||||
# This rule includes another state. If any of the rules of the
|
||||
# included state matches, this rule matches.
|
||||
class IncludeStateRule < Rule
|
||||
property state : String = ""
|
||||
struct IncludeStateRule < BaseRule
|
||||
@state : String = ""
|
||||
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
|
||||
lexer.states[state].rules.each do |rule|
|
||||
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
|
||||
Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
|
||||
def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||
tokenizer.@lexer.states[@state].rules.each do |rule|
|
||||
matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
|
||||
return true, new_pos, new_tokens if matched
|
||||
end
|
||||
return false, pos, [] of Token
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node)
|
||||
@xml = node.to_s
|
||||
include_node = node.children.find { |child|
|
||||
child.name == "include"
|
||||
}
|
||||
@ -79,17 +74,14 @@ module Tartrazine
|
||||
end
|
||||
|
||||
# This rule always matches, unconditionally
|
||||
class UnconditionalRule < Rule
|
||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||
tokens = [] of Token
|
||||
actions.each do |action|
|
||||
tokens += action.emit(nil, lexer)
|
||||
end
|
||||
return true, pos, tokens
|
||||
struct UnconditionalRule < BaseRule
|
||||
NO_MATCH = [] of Match
|
||||
|
||||
def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
|
||||
return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
|
||||
end
|
||||
|
||||
def initialize(node : XML::Node)
|
||||
@xml = node.to_s
|
||||
add_actions(node)
|
||||
end
|
||||
end
|
||||
|
@ -9,7 +9,7 @@ require "xml"
|
||||
module Tartrazine
|
||||
alias Color = Sixteen::Color
|
||||
|
||||
class ThemeFiles
|
||||
struct ThemeFiles
|
||||
extend BakedFileSystem
|
||||
bake_folder "../styles", __DIR__
|
||||
end
|
||||
@ -39,7 +39,7 @@ module Tartrazine
|
||||
themes.to_a.sort!
|
||||
end
|
||||
|
||||
class Style
|
||||
struct Style
|
||||
# These properties are tri-state.
|
||||
# true means it's set
|
||||
# false means it's not set
|
||||
@ -79,7 +79,7 @@ module Tartrazine
|
||||
end
|
||||
end
|
||||
|
||||
class Theme
|
||||
struct Theme
|
||||
property name : String = ""
|
||||
|
||||
property styles = {} of String => Style
|
||||
|
Reference in New Issue
Block a user