5 Commits

19 changed files with 502 additions and 420 deletions

1
.gitignore vendored
View File

@@ -7,4 +7,3 @@ chroma/
pygments/
shard.lock
.vscode/
.crystal/

View File

@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
a port of [Pygments](https://pygments.org/) to
[Crystal](https://crystal-lang.org/). Kind of.
The CLI tool can be used to highlight many things in many styles.
It's not currently usable because it's not finished, but:
* The lexers work for the implemented languages
* The provided styles work
* There is a very very simple HTML formatter
# A port of what? Why "kind of"?
Pygments is a staple of the Python ecosystem, and it's great.
It lets you highlight code in many languages, and it has many
themes. Chroma is "Pygments for Go", it's actually a port of
Pygments to Go, and it's great too.
I wanted that in Crystal, so I started this project. But I did
not read much of the Pygments code. Or much of Chroma's.
Because I did not read the Pygments code. And this is actually
based on [Chroma](https://github.com/alecthomas/chroma) ...
although I did not read that code either.
Chroma has taken most of the Pygments lexers and turned them into
XML descriptions. What I did was take those XML files from Chroma
@@ -47,14 +47,7 @@ To build from source:
2. Run `make` to build the `tartrazine` binary
3. Copy the binary somewhere in your PATH.
## Usage as a CLI tool
```shell
$ tartrazine whatever.c -l c -t catppuccin-macchiato --line-numbers \
--standalone -o whatever.html
```
## Usage as a Library
## Usage
This works:

View File

@@ -8,5 +8,4 @@
* ✅ Implement lexer loader that respects aliases
* ✅ Implement lexer loader by file extension
* ✅ Add --line-numbers to terminal formatter
* Implement lexer loader by mime type
* Implement Delegating lexers
* Implement lexer loader by mime type

View File

@@ -1,5 +1,5 @@
name: tartrazine
version: 0.5.1
version: 0.2.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>

View File

@@ -14,18 +14,15 @@ unicode_problems = {
"#{__DIR__}/tests/java/test_string_literals.txt",
"#{__DIR__}/tests/json/test_strings.txt",
"#{__DIR__}/tests/systemd/example1.txt",
"#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
}
# These testcases fail because of differences in the way chroma and tartrazine tokenize
# but tartrazine is correct
bad_in_chroma = {
"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
"#{__DIR__}/tests/html/javascript_backtracking.txt",
"#{__DIR__}/tests/java/test_default.txt",
"#{__DIR__}/tests/java/test_multiline_string.txt",
"#{__DIR__}/tests/java/test_numeric_literals.txt",
"#{__DIR__}/tests/octave/test_multilinecomment.txt",
"#{__DIR__}/tests/php/test_string_escaping_run.txt",
"#{__DIR__}/tests/python_2/test_cls_builtin.txt",
}
@@ -33,14 +30,22 @@ bad_in_chroma = {
known_bad = {
"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/prompt_in_output.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
"#{__DIR__}/tests/html/css_backtracking.txt",
"#{__DIR__}/tests/mcfunction/data.txt",
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
# BAD FOR ONIGMO
"#{__DIR__}/tests/json/test_backtracking.txt",
}
# Tests that fail because of a limitation in PCRE2
@@ -56,6 +61,7 @@ describe Tartrazine do
end
else
it "parses #{testcase}".split("/")[-2...].join("/") do
p! testcase
text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
lexer_name = File.basename(File.dirname(testcase)).downcase
unless failing_lexers.includes?(lexer_name) ||
@@ -73,8 +79,7 @@ end
# Helper that creates lexer and tokenizes
def tokenize(lexer_name, text)
lexer = Tartrazine.lexer(lexer_name)
tokenizer = Tartrazine::Tokenizer.new(lexer, text)
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
lexer.tokenize(text)
end
# Helper that tokenizes using chroma to validate the lexer

View File

@@ -8,30 +8,12 @@ require "./tartrazine"
# perform a list of actions. These actions can emit tokens
# or change the state machine.
module Tartrazine
enum ActionType
Bygroups
Combined
Include
Pop
Push
Token
Using
Usingself
end
struct Action
class Action
property type : String
property xml : XML::Node
property actions : Array(Action) = [] of Action
@depth : Int32 = 0
@lexer_name : String = ""
@states : Array(String) = [] of String
@states_to_push : Array(String) = [] of String
@token_type : String = ""
@type : ActionType = ActionType::Token
def initialize(t : String, xml : XML::Node?)
@type = ActionType.parse(t.capitalize)
def initialize(@type : String, @xml : XML::Node?)
# Some actions may have actions in them, like this:
# <bygroups>
# <token type="GenericPrompt"/>
@@ -41,53 +23,48 @@ module Tartrazine
#
# The token actions match with the first 2 groups in the regex
# the using action matches the 3rd and shunts it to another lexer
xml.children.each do |node|
@xml.children.each do |node|
next unless node.element?
@actions << Action.new(node.name, node)
end
# Prefetch the attributes we ned from the XML and keep them
case @type
when ActionType::Token
@token_type = xml["type"]
when ActionType::Push
@states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
when ActionType::Pop
@depth = xml["depth"].to_i
when ActionType::Using
@lexer_name = xml["lexer"].downcase
when ActionType::Combined
@states = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
end
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
case @type
when ActionType::Token
raise Exception.new "Can't have a token without a match" if match.empty?
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
when ActionType::Push
to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
to_push.each do |state|
if state == "#pop" && tokenizer.state_stack.size > 1
def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
if states_to_push.empty?
# Push without a state means push the current state
states_to_push = [lexer.state_stack.last]
end
states_to_push.each do |state|
if state == "#pop"
# Pop the state
tokenizer.state_stack.pop
Log.trace { "Popping state" }
lexer.state_stack.pop
else
# Really push
tokenizer.state_stack << state
lexer.state_stack << state
Log.trace { "Pushed #{lexer.state_stack}" }
end
end
[] of Token
when ActionType::Pop
to_pop = [@depth, tokenizer.state_stack.size - 1].min
tokenizer.state_stack.pop(to_pop)
when "pop"
depth = xml["depth"].to_i
Log.trace { "Popping #{depth} states" }
if lexer.state_stack.size <= depth
Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
else
lexer.state_stack.pop(depth)
end
[] of Token
when ActionType::Bygroups
when "bygroups"
# FIXME: handle
# ><bygroups>
# <token type="Punctuation"/>
@@ -102,42 +79,38 @@ module Tartrazine
# the action is skipped.
result = [] of Token
@actions.each_with_index do |e, i|
begin
next if match[i + 1].size == 0
rescue IndexError
# FIXME: This should not actually happen
# No match for this group
next
end
result += e.emit(match, tokenizer, i + 1)
next if match[i + 1]?.nil?
result += e.emit(match, lexer, i + 1)
end
result
when ActionType::Using
when "using"
# Shunt to another lexer entirely
return [] of Token if match.empty?
Tokenizer.new(
Tartrazine.lexer(@lexer_name),
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Usingself
return [] of Token if match.nil?
lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.empty?
Tokenizer.new(
tokenizer.lexer,
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Combined
# Combine two or more states into one anonymous state
new_state = @states.map { |name|
tokenizer.lexer.states[name]
return [] of Token if match.nil?
new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
new_state = states.map { |name|
lexer.states[name]
}.reduce { |state1, state2|
state1 + state2
}
tokenizer.lexer.states[new_state.name] = new_state
tokenizer.state_stack << new_state.name
lexer.states[new_state.name] = new_state
lexer.state_stack << new_state.name
[] of Token
else
raise Exception.new("Unknown action type: #{@type}")
raise Exception.new("Unknown action type: #{type}: #{xml}")
end
end
end

View File

@@ -1,73 +0,0 @@
module BytesRegex
extend self
class Regex
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
if @re = LibPCRE2.compile(
pattern,
pattern.bytesize,
flags,
out errorcode,
out erroroffset,
nil)
else
msg = String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
{bytesize, 0}
end
raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
end
@match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
end
def finalize
LibPCRE2.match_data_free(@match_data)
LibPCRE2.code_free(@re)
end
def match(str : Bytes, pos = 0) : Array(Match)
rc = LibPCRE2.match(
@re,
str,
str.size,
pos,
LibPCRE2::NO_UTF_CHECK,
@match_data,
nil)
if rc > 0
ovector = LibPCRE2.get_ovector_pointer(@match_data)
(0...rc).map do |i|
m_start = ovector[2 * i]
m_end = ovector[2 * i + 1]
if m_start == m_end
m_value = Bytes.new(0)
else
m_value = str[m_start...m_end]
end
Match.new(m_value, m_start, m_end - m_start)
end
else
[] of Match
end
end
end
struct Match
property value : Bytes
property start : UInt64
property size : UInt64
def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
end
end
end
# pattern = "foo"
# str = "foo bar"
# re = BytesRegex::Regex.new(pattern)
# p! String.new(re.match(str.to_slice)[0].value)

View File

@@ -9,19 +9,12 @@ module Tartrazine
# This is the base class for all formatters.
abstract class Formatter
property name : String = ""
property theme : Theme = Tartrazine.theme("default-dark")
# Format the text using the given lexer.
def format(text : String, lexer : Lexer, io : IO = nil) : Nil
def format(text : String, lexer : Lexer, theme : Theme) : String
raise Exception.new("Not implemented")
end
def format(text : String, lexer : Lexer) : String
raise Exception.new("Not implemented")
end
# Return the styles, if the formatter supports it.
def style_defs : String
def get_style_defs(theme : Theme) : String
raise Exception.new("Not implemented")
end
end

View File

@@ -4,33 +4,20 @@ module Tartrazine
class Ansi < Formatter
property? line_numbers : Bool = false
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
end
private def line_label(i : Int32) : String
"#{i + 1}".rjust(4).ljust(5)
end
def format(text : String, lexer : Lexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : Lexer, outp : IO) : Nil
tokenizer = Tokenizer.new(lexer, text)
i = 0
outp << line_label(i) if line_numbers?
tokenizer.each do |token|
outp << colorize(token[:value], token[:type])
if token[:value].includes?("\n")
i += 1
outp << line_label(i) if line_numbers?
def format(text : String, lexer : Lexer, theme : Theme) : String
output = String.build do |outp|
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
outp << label
line.each do |token|
outp << colorize(token[:value], token[:type], theme)
end
end
end
output
end
def colorize(text : String, token : String) : String
def colorize(text : String, token : String, theme : Theme) : String
style = theme.styles.fetch(token, nil)
return text if style.nil?
if theme.styles.has_key?(token)

View File

@@ -1,6 +1,5 @@
require "../constants/token_abbrevs.cr"
require "../formatter"
require "html"
module Tartrazine
class Html < Formatter
@@ -16,78 +15,56 @@ module Tartrazine
property? standalone : Bool = false
property? surrounding_pre : Bool = true
property? wrap_long_lines : Bool = false
property weight_of_bold : Int32 = 600
property? weight_of_bold : Int32 = 600
property theme : Theme
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), *,
@highlight_lines = [] of Range(Int32, Int32),
@class_prefix : String = "",
@line_number_id_prefix = "line-",
@line_number_start = 1,
@tab_width = 8,
@line_numbers : Bool = false,
@linkable_line_numbers : Bool = true,
@standalone : Bool = false,
@surrounding_pre : Bool = true,
@wrap_long_lines : Bool = false,
@weight_of_bold : Int32 = 600)
end
def format(text : String, lexer : Lexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : Lexer, io : IO) : Nil
pre, post = wrap_standalone
io << pre if standalone?
format_text(text, lexer, io)
io << post if standalone?
def format(text : String, lexer : Lexer, theme : Theme) : String
text = format_text(text, lexer, theme)
if standalone?
text = wrap_standalone(text, theme)
end
text
end
# Wrap text into a full HTML document, including the CSS for the theme
def wrap_standalone
def wrap_standalone(text, theme) : String
output = String.build do |outp|
outp << "<!DOCTYPE html><html><head><style>"
outp << style_defs
outp << get_style_defs(theme)
outp << "</style></head><body>"
outp << text
outp << "</body></html>"
end
{output.to_s, "</body></html>"}
output
end
private def line_label(i : Int32) : String
line_label = "#{i + 1}".rjust(4).ljust(5)
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
end
def format_text(text : String, lexer : Lexer, outp : IO)
tokenizer = Tokenizer.new(lexer, text)
i = 0
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
end
outp << "<code class=\"#{get_css_class("Background")}\">"
outp << line_label(i) if line_numbers?
tokenizer.each do |token|
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
if token[:value].ends_with? "\n"
i += 1
outp << line_label(i) if line_numbers?
def format_text(text : String, lexer : Lexer, theme : Theme) : String
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
output = String.build do |outp|
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background", theme)}\" #{pre_style}>"
end
outp << "<code class=\"#{get_css_class("Background", theme)}\">"
lines.each_with_index(offset: line_number_start - 1) do |line, i|
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight", theme)}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
line.each do |token|
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
outp << fragment
end
end
outp << "</code></pre>"
end
outp << "</code></pre>"
output
end
# ameba:disable Metrics/CyclomaticComplexity
def style_defs : String
def get_style_defs(theme : Theme) : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token)} {"
outp << ".#{get_css_class(token, theme)} {"
# These are set or nil
outp << "color: ##{style.color.try &.hex};" if style.color
outp << "background-color: ##{style.background.try &.hex};" if style.background
@@ -110,21 +87,18 @@ module Tartrazine
end
# Given a token type, return the CSS class to use.
def get_css_class(token : String) : String
if !theme.styles.has_key? token
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
parent = theme.style_parents(token).reverse.find { |dad|
theme.styles.has_key?(dad)
}
theme.styles[token] = theme.styles[parent]
end
class_prefix + Abbreviations[token]
def get_css_class(token, theme)
return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}]
end
# Is this line in the highlighted ranges?
def highlighted?(line : Int) : Bool
highlight_lines.any?(&.includes?(line))
end

View File

@@ -4,15 +4,8 @@ module Tartrazine
class Json < Formatter
property name = "json"
def format(text : String, lexer : Lexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : Lexer, io : IO) : Nil
tokenizer = Tokenizer.new(lexer, text)
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
def format(text : String, lexer : Lexer, _theme : Theme) : String
lexer.tokenize(text).to_json
end
end
end

View File

@@ -1,9 +1,9 @@
require "baked_file_system"
require "./constants/lexers"
module Tartrazine
class LexerFiles
extend BakedFileSystem
bake_folder "../lexers", __DIR__
end
@@ -37,92 +37,71 @@ module Tartrazine
LEXERS_BY_NAME.keys.sort!
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
struct Tokenizer
include Iterator(Token)
property lexer : Lexer
property text : Bytes
property pos : Int32 = 0
@dq = Deque(Token).new
property state_stack = ["root"]
def initialize(@lexer : Lexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@text = text.to_slice
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
if pos == @text.size
return stop
end
matched = false
while @pos < @text.size
@lexer.states[@state_stack.last].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(@text, @pos, self)
if matched
@pos = new_pos
split_tokens(new_tokens).each { |token| @dq << token }
break
end
end
if !matched
if @text[@pos] == 10u8
@dq << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
@dq << {type: "Error", value: String.new(@text[@pos..@pos])}
end
@pos += 1
break
end
end
self.next
end
# If a token contains a newline, split it into two tokens
def split_tokens(tokens : Array(Token)) : Array(Token)
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
split_tokens
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
struct Lexer
class Lexer
property config = {
name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
property xml : String = ""
property states = {} of String => State
property state_stack = ["root"]
# Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text.
def tokenize(text, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
matched = false
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
# Loop through the text, applying rules
while pos < text.size
state = states[@state_stack.last]
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, self)
if matched
# Move position forward, save the tokens,
# tokenize from the new position
# Log.trace { "MATCHED: #{rule.xml}" }
pos = new_pos
tokens += new_tokens
break
end
# Log.trace { "NOT MATCHED: #{rule.xml}" }
end
# If no rule matches, emit an error token
unless matched
# Log.trace { "Error at #{pos}" }
tokens << {type: "Error", value: "#{text[pos]}"}
pos += 1
end
end
Lexer.collapse_tokens(tokens)
end
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
@@ -145,8 +124,34 @@ module Tartrazine
result
end
# Group tokens into lines, splitting them when a newline is found
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
lines = [Array(Token).new]
split_tokens.each do |token|
lines.last << token
if token[:value].includes?("\n")
lines << Array(Token).new
end
end
lines
end
# ameba:disable Metrics/CyclomaticComplexity
def self.from_xml(xml : String) : Lexer
l = Lexer.new
l.xml = xml
lexer = XML.parse(xml).first_element_child
if lexer
config = lexer.children.find { |node|
@@ -155,6 +160,9 @@ module Tartrazine
if config
l.config = {
name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
@@ -207,9 +215,9 @@ module Tartrazine
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
struct State
class State
property name : String = ""
property rules = [] of BaseRule
property rules = [] of Rule
def +(other : State)
new_state = State.new
@@ -218,4 +226,7 @@ module Tartrazine
new_state
end
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
end

View File

@@ -54,8 +54,6 @@ if options["--list-formatters"]
exit 0
end
theme = Tartrazine.theme(options["-t"].as(String))
if options["-f"]
formatter = options["-f"].as(String)
case formatter
@@ -63,11 +61,9 @@ if options["-f"]
formatter = Tartrazine::Html.new
formatter.standalone = options["--standalone"] != nil
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "terminal"
formatter = Tartrazine::Ansi.new
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "json"
formatter = Tartrazine::Json.new
else
@@ -75,9 +71,11 @@ if options["-f"]
exit 1
end
theme = Tartrazine.theme(options["-t"].as(String))
if formatter.is_a?(Tartrazine::Html) && options["--css"]
File.open("#{options["-t"].as(String)}.css", "w") do |outf|
outf << formatter.style_defs
outf.puts formatter.get_style_defs(theme)
end
exit 0
end
@@ -85,11 +83,13 @@ if options["-f"]
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
input = File.open(options["FILE"].as(String)).gets_to_end
output = formatter.format(input, lexer, theme)
if options["-o"].nil?
outf = STDOUT
puts output
else
outf = File.open(options["-o"].as(String), "w")
File.open(options["-o"].as(String), "w") do |outf|
outf.puts output
end
end
formatter.format(input, lexer, outf)
end

85
src/onigmo.cr Normal file
View File

@@ -0,0 +1,85 @@
@[Link("onigmo")]
@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
lib LibOnigmo
type Regex = Pointer(Void)
type Region = Pointer(Void)
fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
ignoreCase : Int32,
multiline : Int32,
dotall : Int32) : Regex
fun free = onigwrap_free(re : Regex)
fun region_free = onigwrap_region_free(region : Region)
fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
fun num_regs = onigwrap_num_regs(region : Region) : Int32
fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
fun len = onigwrap_len(region : Region, index : Int32) : Int32
end
module Onigmo
class Match
property begin : Int32
property end : Int32
property value : String
def initialize(@begin, @end, @value)
end
def to_s
@value
end
end
class Regex
def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
@re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
end
def finalize
LibOnigmo.free(@re)
end
def match(str : String, offset = 0)
# The offset argument is a character index, but Onigmo expects a byte index
offset = str.char_index_to_byte_index(offset)
if offset.nil?
raise Exception.new "Invalid offset"
end
region = LibOnigmo.search(@re, str.to_unsafe, offset, str.bytesize)
result = [] of Match?
num_regs = LibOnigmo.num_regs(region)
if num_regs > 0
(0...num_regs).each do |i|
pos = LibOnigmo.pos(region, i)
l = LibOnigmo.len(region, i)
if pos == -1 || l == -1
result << nil
else
b = str.byte_index_to_char_index(pos)
e = str.byte_index_to_char_index(pos + l)
# p! pos, l, b, e, str[pos..]
if b.nil? || e.nil?
raise Exception.new "Invalid substring"
end
v = str[b...e]
result << Match.new(b, b + v.size, v)
end
end
else
return [] of Match
end
LibOnigmo.region_free(region)
result
end
end
end
# pattern = "\\w"
# str = "α"
# re = Onigmo::Regex.new(pattern, false, false, false)
# p! re.match(str)

94
src/onigmo/onigwrap.c Normal file
View File

@@ -0,0 +1,94 @@
#include "onigmo.h"
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
{
regex_t *reg;
OnigErrorInfo einfo;
OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
if (ignoreCase == 1)
onigOptions |= ONIG_OPTION_IGNORECASE;
if (multiline == 1)
onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
if (dotall == 1)
onigOptions |= ONIG_OPTION_DOTALL;
OnigUChar *stringStart = (OnigUChar*) pattern;
OnigUChar *stringEnd = (OnigUChar*) pattern + len;
int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
return reg;
}
void onigwrap_region_free(OnigRegion *region)
{
onig_region_free(region, 1);
}
void onigwrap_free(regex_t *reg)
{
onig_free(reg);
}
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
{
OnigUChar *stringStart = (OnigUChar*) charPtr;
OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
OnigUChar *stringRange = (OnigUChar*) stringEnd;
OnigRegion *region = onig_region_new();
int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
onig_region_free(region, 1);
if (result >= 0)
return result >> 1;
if (result == ONIG_MISMATCH)
return -1;
return -2;
}
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
{
OnigUChar *stringStart = (OnigUChar*) charPtr;
OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
OnigUChar *stringRange = (OnigUChar*) stringEnd;
OnigRegion *region = onig_region_new();
int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
return region;
}
int onigwrap_num_regs(OnigRegion *region)
{
return region->num_regs;
}
int onigwrap_pos(OnigRegion *region, int nth)
{
if (nth < region->num_regs)
{
int result = region->beg[nth];
if (result < 0)
return -1;
return result;
}
return -1;
}
int onigwrap_len(OnigRegion *region, int nth)
{
if (nth < region->num_regs)
{
int result = region->end[nth] - region->beg[nth];
return result;
}
return -1;
}

32
src/onigmo/onigwrap.h Normal file
View File

@@ -0,0 +1,32 @@
#include "onigmo.h"
#if defined(_WIN32)
#define ONIGWRAP_EXTERN extern __declspec(dllexport)
#else
#define ONIGWRAP_EXTERN extern
#endif
ONIGWRAP_EXTERN
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline);
ONIGWRAP_EXTERN
void onigwrap_region_free(OnigRegion *region);
ONIGWRAP_EXTERN
void onigwrap_free(regex_t *reg);
ONIGWRAP_EXTERN
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);
ONIGWRAP_EXTERN
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);
ONIGWRAP_EXTERN
int onigwrap_num_regs(OnigRegion *region);
ONIGWRAP_EXTERN
int onigwrap_pos(OnigRegion *region, int nth);
ONIGWRAP_EXTERN
int onigwrap_len(OnigRegion *region, int nth);

View File

@@ -1,9 +1,9 @@
require "./actions"
require "./bytes_regex"
require "./formatter"
require "./lexer"
require "./rules"
require "./styles"
require "./lexer"
require "./onigmo"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@@ -11,15 +11,48 @@ require "./styles"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = BytesRegex::Regex
alias Match = BytesRegex::Match
alias MatchData = Array(Match)
alias Regex = Onigmo::Regex
abstract struct BaseRule
abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
abstract def initialize(node : XML::Node)
class Rule
property pattern : Regex = Regex.new ""
property pattern2 : ::Regex = ::Regex.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
@actions : Array(Action) = [] of Action
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
match2 = pattern2.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
# The `match.begin > pos` is the same as the ANCHORED option
return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
# p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
actions.each do |action|
# Emit the token
tokens += action.emit(match, lexer)
end
# Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
return true, pos + match[0].as(Onigmo::Match).value.size, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
pattern = node["pattern"]
# flags = Regex::Options::ANCHORED
flags = ::Regex::Options::NO_UTF_CHECK
# MULTILINE implies DOTALL which we don't want, so we
# use in-pattern flag (?m) instead
flags |= ::Regex::Options::MULTILINE if multiline
pattern = "(?m)" + pattern if multiline
flags |= ::Regex::Options::DOTALL if dotall
flags |= ::Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, ignorecase, multiline, dotall)
@pattern2 = ::Regex.new(pattern, flags)
add_actions(node)
end
def add_actions(node : XML::Node)
node.children.each do |child|
@@ -29,42 +62,23 @@ module Tartrazine
end
end
struct Rule < BaseRule
property pattern : Regex = Regex.new ""
def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
# No match
return false, pos, [] of Token if match.size == 0
return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
end
def initialize(node : XML::Node)
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
pattern = node["pattern"]
pattern = "(?m)" + pattern if multiline
@pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
add_actions(node)
end
end
# This rule includes another state. If any of the rules of the
# included state matches, this rule matches.
struct IncludeStateRule < BaseRule
@state : String = ""
class IncludeStateRule < Rule
property state : String = ""
def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
tokenizer.@lexer.states[@state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
lexer.states[state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
return true, new_pos, new_tokens if matched
end
return false, pos, [] of Token
end
def initialize(node : XML::Node)
@xml = node.to_s
include_node = node.children.find { |child|
child.name == "include"
}
@@ -74,14 +88,17 @@ module Tartrazine
end
# This rule always matches, unconditionally
struct UnconditionalRule < BaseRule
NO_MATCH = [] of Match
def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
class UnconditionalRule < Rule
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token
actions.each do |action|
tokens += action.emit(nil, lexer)
end
return true, pos, tokens
end
def initialize(node : XML::Node)
@xml = node.to_s
add_actions(node)
end
end

View File

@@ -9,7 +9,7 @@ require "xml"
module Tartrazine
alias Color = Sixteen::Color
struct ThemeFiles
class ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
end
@@ -39,7 +39,7 @@ module Tartrazine
themes.to_a.sort!
end
struct Style
class Style
# These properties are tri-state.
# true means it's set
# false means it's not set
@@ -79,7 +79,7 @@ module Tartrazine
end
end
struct Theme
class Theme
property name : String = ""
property styles = {} of String => Style

View File

@@ -11,7 +11,7 @@ require "xml"
module Tartrazine
extend self
VERSION = {{ `shards version #{__DIR__}`.chomp.stringify }}
VERSION = "0.2.0"
Log = ::Log.for("tartrazine")
end