Compare commits


12 Commits

10 changed files with 117 additions and 118 deletions

.gitignore vendored

@@ -7,3 +7,4 @@ chroma/
 pygments/
 shard.lock
 .vscode/
+.crystal/


@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
-It's not currently usable because it's not finished, but:
 * The lexers work for the implemented languages
 * The provided styles work
 * There is a very very simple HTML formatter
+The CLI tool can be used to highlight many things in many styles.
 # A port of what? Why "kind of"?
 Because I did not read the Pygments code. And this is actually
 based on [Chroma](https://github.com/alecthomas/chroma) ...
 although I did not read that code either.
 Pygments is a staple of the Python ecosystem, and it's great.
 It lets you highlight code in many languages, and it has many
 themes. Chroma is "Pygments for Go", it's actually a port of
 Pygments to Go, and it's great too.
 I wanted that in Crystal, so I started this project. But I did
 not read much of the Pygments code. Or much of Chroma's.
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma


@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.3.0
+version: 0.4.0
 
 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>


@@ -14,6 +14,7 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
+  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 # These testcases fail because of differences in the way chroma and tartrazine tokenize


@@ -8,12 +8,20 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  class Action
-    property type : String
-    property xml : XML::Node
+  struct Action
     property actions : Array(Action) = [] of Action
+    property type : String
+
+    @depth : Int32 = 0
+    @lexer_name : String = ""
+    @states : Array(String) = [] of String
+    @states_to_push : Array(String) = [] of String
+    @token_type : String = ""
+
+    def initialize(@type : String, xml : XML::Node?)
+      known_types = %w(token push pop combined bygroups include using usingself)
+      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
-    def initialize(@type : String, @xml : XML::Node?)
       # Some actions may have actions in them, like this:
       # <bygroups>
       #   <token type="GenericPrompt"/>
@@ -23,10 +31,28 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      @xml.children.each do |node|
+      xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
+
+      # Prefetch the attributes we need from the XML and keep them
+      case type
+      when "token"
+        @token_type = xml["type"]
+      when "push"
+        @states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      when "pop"
+        @depth = xml["depth"].to_i
+      when "using"
+        @lexer_name = xml["lexer"].downcase
+      when "combined"
+        @states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
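
The pattern this hunk introduces is parse-once, emit-many: an Action is built a single time when the lexer XML is loaded, but emit runs on every regex match, so moving the XML attribute lookups into the constructor takes them off the hot path. A minimal standalone sketch of the idea (hypothetical names, not the actual Tartrazine code):

    require "xml"

    # Read the XML attribute once, at construction time,
    # instead of on every emit call.
    struct TokenAction
      @token_type : String

      def initialize(xml : XML::Node)
        @token_type = xml["type"] # one XML lookup, at load time
      end

      def emit(value : String)
        {type: @token_type, value: value} # the hot path touches no XML
      end
    end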
@@ -34,35 +60,22 @@
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
+        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
       when "push"
-        states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        if states_to_push.empty?
-          # Push without a state means push the current state
-          states_to_push = [lexer.state_stack.last]
-        end
-        states_to_push.each do |state|
-          if state == "#pop"
+        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+        to_push.each do |state|
+          if state == "#pop" && lexer.state_stack.size > 1
             # Pop the state
             Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
             # Really push
             lexer.state_stack << state
             Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        depth = xml["depth"].to_i
-        Log.trace { "Popping #{depth} states" }
-        if lexer.state_stack.size <= depth
-          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
-        else
-          lexer.state_stack.pop(depth)
-        end
+        to_pop = [@depth, lexer.state_stack.size - 1].min
+        lexer.state_stack.pop(to_pop)
         [] of Token
       when "bygroups"
         # FIXME: handle
@@ -92,22 +105,15 @@
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        lexer_name = xml["lexer"].downcase
-        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = Lexer.from_xml(lexer.xml)
-        Log.trace { "to tokenize: #{match[match_group]}" }
+        new_lexer = lexer.copy
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        new_state = states.map { |name|
+        new_state = @states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
@@ -116,7 +122,7 @@
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}: #{xml}")
+        raise Exception.new("Unknown action type: #{type}")
       end
     end
   end
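
A behavioral detail in the "pop" branch above: the new code clamps the requested depth with [@depth, lexer.state_stack.size - 1].min, so the stack can never be emptied and the root state always survives. A worked example with hypothetical values:

    stack = ["root", "string"]       # size 2
    to_pop = [5, stack.size - 1].min # depth 5 requested, clamped to 1
    stack.pop(to_pop)                # stack is now ["root"]

This also replaces the old log-and-skip branch: an over-deep pop now pops as much as it safely can instead of popping nothing.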


@@ -3,7 +3,7 @@ module BytesRegex
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase
@@ -22,27 +22,26 @@
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
+      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
+      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
-      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        0,
-        match_data,
+        LibPCRE2::NO_UTF_CHECK,
+        @match_data,
         nil)
-      if rc < 0
-        # No match, do nothing
-      else
-        ovector = LibPCRE2.get_ovector_pointer(match_data)
+      if rc > 0
+        ovector = LibPCRE2.get_ovector_pointer(@match_data)
       (0...rc).each do |i|
         m_start = ovector[2 * i]
         m_size = ovector[2 * i + 1] - m_start
@@ -54,7 +53,6 @@
           match << Match.new(m_value, m_start, m_size)
         end
       end
-      LibPCRE2.match_data_free(match_data)
       match
     end
   end
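
The net effect of the changes in this file: the PCRE2 match-data block is now allocated once per compiled pattern (in initialize) and freed once (in finalize), rather than created and destroyed on every match call, and NO_UTF_CHECK skips re-validating the UTF-8 input on each match. A hypothetical usage sketch of the wrapper, using only the API shown above:

    # One match-data buffer lives as long as the Regex itself.
    re = BytesRegex::Regex.new("[a-z]+", ignorecase: true)
    matches = re.match("Hello World".to_slice, 0)

The trade-off of the shared @match_data buffer is that a single Regex instance can no longer run two matches concurrently (for example from two fibers), since both would write into the same block.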


@@ -1,5 +1,6 @@
 require "../constants/token_abbrevs.cr"
 require "../formatter"
+require "html"
 
 module Tartrazine
   class Html < Formatter
@@ -67,8 +68,7 @@
         line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
         outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
         line.each do |token|
-          fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
-          outp << fragment
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
         end
       end
       outp << "</code></pre>"
@@ -104,15 +104,17 @@
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
-      # Themes don't contain information for each specific
-      # token type. However, they may contain information
-      # for a parent style. Worst case, we go to the root
-      # (Background) style.
-      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
-        theme.styles.has_key?(parent)
-      }]
+      if !theme.styles.has_key? token
+        # Themes don't contain information for each specific
+        # token type. However, they may contain information
+        # for a parent style. Worst case, we go to the root
+        # (Background) style.
+        parent = theme.style_parents(token).reverse.find { |dad|
+          theme.styles.has_key?(dad)
+        }
+        theme.styles[token] = theme.styles[parent]
+      end
+      class_prefix + Abbreviations[token]
     end
 
     # Is this line in the highlighted ranges?
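
The HTML.escape change above closes an output-corruption (and potential script-injection) hole: code being highlighted routinely contains <, > and &, which previously landed in the generated HTML verbatim. Crystal's standard-library escaper, as a standalone example:

    require "html"

    # HTML.escape replaces the characters &, <, >, " and ' with entities.
    HTML.escape(%(if a < b && c > d { puts "ok" }))
    # => "if a &lt; b &amp;&amp; c &gt; d { puts &quot;ok&quot; }"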


@@ -56,12 +56,18 @@
       not_multiline: false,
       ensure_nl: false,
     }
-    property xml : String = ""
+    # property xml : String = ""
     property states = {} of String => State
     property state_stack = ["root"]
 
+    def copy : Lexer
+      new_lexer = Lexer.new
+      new_lexer.config = config
+      new_lexer.states = states
+      new_lexer.state_stack = state_stack[0..-1]
+      new_lexer
+    end
+
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
@@ -87,12 +93,10 @@
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
-          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
-        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched
@@ -158,7 +162,6 @@
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
-      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|
@@ -222,9 +225,9 @@
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  class State
+  struct State
     property name : String = ""
-    property rules = [] of Rule
+    property rules = [] of BaseRule
 
     def +(other : State)
       new_state = State.new
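
The new Lexer#copy backs the lexer.copy call in the "usingself" action: config and states are shared between the original and the copy, while state_stack[0..-1] (Crystal's range-slice syntax) produces a fresh array, so the sub-lexer can push and pop states without moving the outer lexer's own stack. A hypothetical usage sketch, assuming a "c" lexer is available:

    lexer = Tartrazine.lexer("c")
    sub = lexer.copy
    sub.state_stack << "string" # mutates only the copy's stack
    lexer.state_stack           # still ["root"]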


@@ -77,7 +77,7 @@ if options["-f"]
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf.puts formatter.style_defs
+      outf << formatter.style_defs
     end
     exit 0
   end
@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf.puts output
+      outf << output
     end
   end
 end
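
Both hunks in this file make the same fix: IO#puts appends a trailing newline (adding a spurious byte to the generated CSS or HTML file), while IO#<< writes the string exactly as given. A minimal illustration:

    File.open("out.txt", "w") { |f| f.puts "abc" } # writes "abc\n", 4 bytes
    File.open("out.txt", "w") { |f| f << "abc" }   # writes "abc", 3 bytes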


@@ -15,41 +15,11 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  class Rule
-    property pattern : Regex = Regex.new ""
+  abstract struct BaseRule
+    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def initialize(node : XML::Node)
+
     property actions : Array(Action) = [] of Action
-    property xml : String = "foo"
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # We don't match if the match doesn't move the cursor
-      # because that causes infinite loops
-      return false, pos, [] of Token if match.empty? || match[0].size == 0
-      # p! match, String.new(text[pos..pos+20])
-      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
-      tokens = [] of Token
-      # Emit the tokens
-      actions.each do |action|
-        # Emit the token
-        tokens += action.emit(match, lexer)
-      end
-      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
-      return true, pos + match[0].size, tokens
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      @xml = node.to_s
-      pattern = node["pattern"]
-      # flags = Regex::Options::ANCHORED
-      # MULTILINE implies DOTALL which we don't want, so we
-      # use in-pattern flag (?m) instead
-      # flags |= Regex::Options::MULTILINE if multiline
-      pattern = "(?m)" + pattern if multiline
-      # flags |= Regex::Options::DOTALL if dotall
-      # flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|
@@ -59,23 +29,44 @@
     end
   end
 
+  struct Rule < BaseRule
+    property pattern : Regex = Regex.new ""
+    property actions : Array(Action) = [] of Action
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # No match
+      return false, pos, [] of Token if match.size == 0
+      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
+    end
+
+    def initialize(node : XML::Node)
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      pattern = node["pattern"]
+      pattern = "(?m)" + pattern if multiline
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
+  end
+
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  class IncludeStateRule < Rule
+  struct IncludeStateRule < BaseRule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
-        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }
@@ -85,17 +76,14 @@
   end
 
   # This rule always matches, unconditionally
-  class UnconditionalRule < Rule
+  struct UnconditionalRule < BaseRule
+    NO_MATCH = [] of Match
+
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      tokens = [] of Token
-      actions.each do |action|
-        tokens += action.emit([] of Match, lexer)
-      end
-      return true, pos, tokens
+      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
     end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       add_actions(node)
     end
   end
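
Taken together, the changes in this file replace one concrete Rule class (whose regex machinery IncludeStateRule and UnconditionalRule inherited but never used) with an abstract BaseRule and three small struct implementations. Crystal structs are passed by value and avoid a heap allocation per rule, and an Array(BaseRule) can still hold all the concrete kinds and dispatch match dynamically. A self-contained sketch of that shape, with simplified hypothetical signatures rather than the real ones:

    # Mirrors State#rules: one array typed as the abstract struct,
    # rules tried in order, first match wins.
    abstract struct BaseRule
      abstract def match(text : String) : Bool
    end

    struct PatternRule < BaseRule
      def initialize(@pattern : String); end

      def match(text : String) : Bool
        text.includes?(@pattern)
      end
    end

    struct UnconditionalRule < BaseRule
      def match(text : String) : Bool
        true # always matches, like the real UnconditionalRule
      end
    end

    rules = [] of BaseRule
    rules << PatternRule.new("def") << UnconditionalRule.new
    puts rules.find { |r| r.match("def foo") } # => PatternRule(@pattern="def")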