Compare commits

12 Commits

10 changed files with 117 additions and 118 deletions

.gitignore (vendored)

@@ -7,3 +7,4 @@ chroma/
 pygments/
 shard.lock
 .vscode/
+.crystal/


@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-It's not currently usable because it's not finished, but:
-
-* The lexers work for the implemented languages
-* The provided styles work
-* There is a very very simple HTML formatter
+The CLI tool can be used to highlight many things in many styles.
 
 # A port of what? Why "kind of"?
 
-Because I did not read the Pygments code. And this is actually
-based on [Chroma](https://github.com/alecthomas/chroma) ...
-although I did not read that code either.
+Pygments is a staple of the Python ecosystem, and it's great.
+It lets you highlight code in many languages, and it has many
+themes. Chroma is "Pygments for Go", it's actually a port of
+Pygments to Go, and it's great too.
+
+I wanted that in Crystal, so I started this project. But I did
+not read much of the Pygments code. Or much of Chroma's.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma
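
For orientation, those XML descriptions are what `Lexer.from_xml` (changed further down in this compare) consumes. A minimal sketch of loading one; the element and attribute names (`rule`, `token`, `state`, `pattern`) are the ones that appear in the diffs below, the exact schema is Chroma's, and the require path is an assumption:

    require "tartrazine" # assumption: the shard is available under this name

    # Hypothetical lexer definition in the Chroma XML style: a "root"
    # state whose rules match a regex and emit a token type.
    definition = <<-XML
      <lexer>
        <config>
          <name>example</name>
        </config>
        <rules>
          <state name="root">
            <rule pattern="[0-9]+">
              <token type="LiteralNumber"/>
            </rule>
            <rule pattern="\\w+">
              <token type="Name"/>
            </rule>
          </state>
        </rules>
      </lexer>
      XML

    lexer = Tartrazine::Lexer.from_xml(definition)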


@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.3.0
+version: 0.4.0
 
 authors:
 - Roberto Alsina <roberto.alsina@gmail.com>


@@ -14,6 +14,7 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
+  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize


@@ -8,12 +8,20 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  class Action
-    property type : String
-    property xml : XML::Node
+  struct Action
     property actions : Array(Action) = [] of Action
+    property type : String
+
+    @depth : Int32 = 0
+    @lexer_name : String = ""
+    @states : Array(String) = [] of String
+    @states_to_push : Array(String) = [] of String
+    @token_type : String = ""
 
-    def initialize(@type : String, @xml : XML::Node?)
+    def initialize(@type : String, xml : XML::Node?)
+      known_types = %w(token push pop combined bygroups include using usingself)
+      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
+
       # Some actions may have actions in them, like this:
       # <bygroups>
       #   <token type="GenericPrompt"/>
@@ -23,10 +31,28 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      @xml.children.each do |node|
+      xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
+
+      # Prefetch the attributes we need from the XML and keep them
+      case type
+      when "token"
+        @token_type = xml["type"]
+      when "push"
+        @states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      when "pop"
+        @depth = xml["depth"].to_i
+      when "using"
+        @lexer_name = xml["lexer"].downcase
+      when "combined"
+        @states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
@@ -34,35 +60,22 @@ module Tartrazine
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
+        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
       when "push"
-        states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        if states_to_push.empty?
-          # Push without a state means push the current state
-          states_to_push = [lexer.state_stack.last]
-        end
-        states_to_push.each do |state|
-          if state == "#pop"
+        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+        to_push.each do |state|
+          if state == "#pop" && lexer.state_stack.size > 1
             # Pop the state
-            Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
             # Really push
             lexer.state_stack << state
-            Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        depth = xml["depth"].to_i
-        Log.trace { "Popping #{depth} states" }
-        if lexer.state_stack.size <= depth
-          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
-        else
-          lexer.state_stack.pop(depth)
-        end
+        to_pop = [@depth, lexer.state_stack.size - 1].min
+        lexer.state_stack.pop(to_pop)
         [] of Token
       when "bygroups"
         # FIXME: handle
@@ -92,22 +105,15 @@ module Tartrazine
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        lexer_name = xml["lexer"].downcase
-        Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = Lexer.from_xml(lexer.xml)
-        Log.trace { "to tokenize: #{match[match_group]}" }
+        new_lexer = lexer.copy
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-        new_state = states.map { |name|
+        new_state = @states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
@@ -116,7 +122,7 @@ module Tartrazine
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}: #{xml}")
+        raise Exception.new("Unknown action type: #{type}")
       end
     end
   end
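
The shape of this refactor: `Action` used to hold its `XML::Node` and query it on every `emit`; now the constructor validates the action type up front and copies the handful of attributes it needs into plain instance variables, so the per-token path never touches XML again. A minimal sketch of the pattern, with simplified, hypothetical names rather than the real API:

    require "xml"

    # Prefetch-at-construction: one XML lookup when the lexer loads,
    # zero XML lookups per emitted token.
    struct TokenAction
      @token_type : String

      def initialize(node : XML::Node)
        @token_type = node["type"] # raises if the attribute is missing
      end

      def emit(value : String)
        {type: @token_type, value: value} # hot path: plain ivar read
      end
    end

    node = XML.parse(%(<token type="GenericPrompt"/>)).first_element_child.not_nil!
    action = TokenAction.new(node)
    action.emit("$ ") # => {type: "GenericPrompt", value: "$ "}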


@@ -3,7 +3,7 @@ module BytesRegex
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase
@@ -22,27 +22,26 @@ module BytesRegex
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
+      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
+      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
-      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        0,
-        match_data,
+        LibPCRE2::NO_UTF_CHECK,
+        @match_data,
         nil)
-      if rc < 0
-        # No match, do nothing
-      else
-        ovector = LibPCRE2.get_ovector_pointer(match_data)
+      if rc > 0
+        ovector = LibPCRE2.get_ovector_pointer(@match_data)
         (0...rc).each do |i|
           m_start = ovector[2 * i]
           m_size = ovector[2 * i + 1] - m_start
@@ -54,7 +53,6 @@ module BytesRegex
           match << Match.new(m_value, m_start, m_size)
         end
       end
-      LibPCRE2.match_data_free(match_data)
       match
     end
   end
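
Two wins are folded into this file: the PCRE2 match-data block is now allocated once per compiled pattern instead of once per `match` call, and `NO_UTF_CHECK` skips re-validating the subject as UTF-8 on every call, since the lexer feeds the same already-validated bytes through every pattern. The same allocate-once idea in plain, self-contained Crystal, for illustration only (`Matcher` is hypothetical; the trade-off is that one instance can no longer be used from two fibers at once, since both would share the scratch buffer):

    # Scratch state that used to be built and thrown away inside the
    # method now lives with the object and is reused across calls.
    class Matcher
      def initialize
        @scratch = Array(Int32).new(32, 0) # was: allocated per call
      end

      def match(bytes : Bytes) : Int32
        @scratch.fill(0) # reuse the same backing memory
        bytes.each_with_index { |b, i| @scratch[i % 32] += b }
        @scratch.sum
      end
    end

    m = Matcher.new
    m.match("hello".to_slice) # => 532, no per-call buffer allocation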


@@ -1,5 +1,6 @@
 require "../constants/token_abbrevs.cr"
 require "../formatter"
+require "html"
 
 module Tartrazine
   class Html < Formatter
@@ -67,8 +68,7 @@ module Tartrazine
         line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
         outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
         line.each do |token|
-          fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
-          outp << fragment
+          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
         end
       end
       outp << "</code></pre>"
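
This is a correctness fix, not just style: token values are arbitrary source text, so an unescaped `<` or `&` in the input used to land verbatim inside the generated markup. Crystal's stdlib escaper (the `require "html"` added above) is enough:

    require "html"

    # Without escaping, this token value would open a bogus tag
    # inside the <span> the formatter emits.
    HTML.escape("for i in <list> && done") # => "for i in &lt;list&gt; &amp;&amp; done"
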
@@ -104,15 +104,17 @@ module Tartrazine
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
-
+      if !theme.styles.has_key? token
         # Themes don't contain information for each specific
         # token type. However, they may contain information
         # for a parent style. Worst case, we go to the root
         # (Background) style.
-      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
-        theme.styles.has_key?(parent)
-      }]
+        parent = theme.style_parents(token).reverse.find { |dad|
+          theme.styles.has_key?(dad)
+        }
+        theme.styles[token] = theme.styles[parent]
+      end
+      class_prefix + Abbreviations[token]
     end
 
     # Is this line in the highlighted ranges?
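
The rewrite trades the old every-call parent walk for a memoized one: the first lookup of an unknown token type resolves the nearest themed ancestor and stores it in `theme.styles`, so later lookups are plain hash hits. The pattern in isolation, with hypothetical names and data (the real code walks `theme.style_parents`):

    # First miss resolves the ancestor chain; the result is cached so
    # subsequent calls are a single Hash lookup.
    styles  = {"Literal" => "color: #e6db74"}
    parents = {"LiteralNumberHex" => ["Literal", "LiteralNumber"]}

    def resolve(token : String, styles, parents) : String
      unless styles.has_key?(token)
        parent = parents[token].reverse.find { |p| styles.has_key?(p) }
        styles[token] = styles[parent.not_nil!] # assumes a root style always exists
      end
      styles[token]
    end

    resolve("LiteralNumberHex", styles, parents) # walks the parents once
    resolve("LiteralNumberHex", styles, parents) # cached: direct hit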


@@ -56,12 +56,18 @@ module Tartrazine
       not_multiline: false,
       ensure_nl: false,
     }
-    property xml : String = ""
+    # property xml : String = ""
     property states = {} of String => State
     property state_stack = ["root"]
 
+    def copy : Lexer
+      new_lexer = Lexer.new
+      new_lexer.config = config
+      new_lexer.states = states
+      new_lexer.state_stack = state_stack[0..-1]
+      new_lexer
+    end
+
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
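
`copy` is what makes the `usingself` action cheap (see the actions hunk above, where `Lexer.from_xml(lexer.xml)` became `lexer.copy`): `config` and the compiled `states` are shared, and only the mutable `state_stack` is duplicated via `state_stack[0..-1]`. That slice is a real copy, which is the property the nested tokenizer relies on:

    # Array#[0..-1] allocates a new array over the same elements, so the
    # nested lexer can push and pop states without touching the parent's.
    original = ["root", "string"]
    duplicate = original[0..-1]
    duplicate << "interpolation"
    original  # => ["root", "string"]
    duplicate # => ["root", "string", "interpolation"]
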
@@ -87,12 +93,10 @@ module Tartrazine
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
-          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
-        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched
@@ -158,7 +162,6 @@ module Tartrazine
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
-      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|
@@ -222,9 +225,9 @@ module Tartrazine
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  class State
+  struct State
     property name : String = ""
-    property rules = [] of Rule
+    property rules = [] of BaseRule
 
     def +(other : State)
       new_state = State.new


@@ -77,7 +77,7 @@ if options["-f"]
 
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf.puts formatter.style_defs
+      outf << formatter.style_defs
     end
     exit 0
   end
@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf.puts output
+      outf << output
     end
   end
 end
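
A small but deliberate change: `IO#puts` appends a newline (unless the string already ends in one), while `IO#<<` writes the value byte-for-byte, so the output and CSS files now end exactly where `style_defs` and `output` end:

    io = IO::Memory.new
    io.puts "text" # writes "text\n"
    io << "text"   # writes "text" exactly
    io.to_s        # => "text\ntext"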


@@ -15,41 +15,11 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  class Rule
-    property pattern : Regex = Regex.new ""
+  abstract struct BaseRule
+    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def initialize(node : XML::Node)
+
     property actions : Array(Action) = [] of Action
-    property xml : String = "foo"
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # We don't match if the match doesn't move the cursor
-      # because that causes infinite loops
-      return false, pos, [] of Token if match.empty? || match[0].size == 0
-      # p! match, String.new(text[pos..pos+20])
-      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
-      tokens = [] of Token
-      # Emit the tokens
-      actions.each do |action|
-        # Emit the token
-        tokens += action.emit(match, lexer)
-      end
-      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
-      return true, pos + match[0].size, tokens
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      @xml = node.to_s
-      pattern = node["pattern"]
-      # flags = Regex::Options::ANCHORED
-      # MULTILINE implies DOTALL which we don't want, so we
-      # use in-pattern flag (?m) instead
-      # flags |= Regex::Options::MULTILINE if multiline
-      pattern = "(?m)" + pattern if multiline
-      # flags |= Regex::Options::DOTALL if dotall
-      # flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|
@@ -59,23 +29,44 @@ module Tartrazine
       end
     end
 
+  struct Rule < BaseRule
+    property pattern : Regex = Regex.new ""
+    property actions : Array(Action) = [] of Action
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # No match
+      return false, pos, [] of Token if match.size == 0
+      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
+    end
+
+    def initialize(node : XML::Node)
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      pattern = node["pattern"]
+      pattern = "(?m)" + pattern if multiline
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
+  end
+
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  class IncludeStateRule < Rule
+  struct IncludeStateRule < BaseRule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
-        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }
@@ -85,17 +76,14 @@ module Tartrazine
   end
 
   # This rule always matches, unconditionally
-  class UnconditionalRule < Rule
+  struct UnconditionalRule < BaseRule
+    NO_MATCH = [] of Match
+
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      tokens = [] of Token
-      actions.each do |action|
-        tokens += action.emit([] of Match, lexer)
-      end
-      return true, pos, tokens
+      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
    end
 
     def initialize(node : XML::Node)
-      @xml = node.to_s
       add_actions(node)
     end
   end
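
The thread running through this whole compare: `Action`, `State`, and the rules went from classes to structs, i.e. from heap-allocated references to inline value types, which matters when a lexer instantiates thousands of them. Because Crystal only allows inheriting from an abstract struct, the old concrete `Rule` base class had to become the `abstract struct BaseRule` above. The language rule in miniature, with hypothetical types:

    # Concrete structs cannot be subclassed; an abstract struct can.
    abstract struct BasePiece
      abstract def weight : Int32
    end

    struct Pawn < BasePiece
      def weight : Int32
        1
      end
    end

    Pawn.new.weight # => 1
    # struct Promoted < Pawn; end  # Error: can't extend non-abstract struct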