Compare commits

cb09dff9f16261661e23d79639ec1ad2e140e854..ad664d9f934b879b393092497492eac526b52254

No commits in common. "cb09dff9f16261661e23d79639ec1ad2e140e854" and "ad664d9f934b879b393092497492eac526b52254" have entirely different histories.

10 changed files with 119 additions and 118 deletions

.gitignore vendored

@@ -7,4 +7,3 @@ chroma/
 pygments/
 shard.lock
 .vscode/
-.crystal/

@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-The CLI tool can be used to highlight many things in many styles.
+It's not currently usable because it's not finished, but:
+
+* The lexers work for the implemented languages
+* The provided styles work
+* There is a very very simple HTML formatter
 
 # A port of what? Why "kind of"?
 
-Pygments is a staple of the Python ecosystem, and it's great.
-It lets you highlight code in many languages, and it has many
-themes. Chroma is "Pygments for Go", it's actually a port of
-Pygments to Go, and it's great too.
-
-I wanted that in Crystal, so I started this project. But I did
-not read much of the Pygments code. Or much of Chroma's.
+Because I did not read the Pygments code. And this is actually
+based on [Chroma](https://github.com/alecthomas/chroma) ...
+although I did not read that code either.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma
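
For context: the "XML descriptions" mentioned in the README are Chroma's machine-generated lexer definitions. A minimal sketch of their general shape, inferred from the parsing code in the hunks below (state, rule, token, and the pattern/type attributes are the names that code reads; the wrapper elements and concrete values here are illustrative, not copied from any real Chroma file):

    <lexer>
      <config>
        <name>MyLang</name>
      </config>
      <rules>
        <state name="root">
          <rule pattern="\bdef\b">
            <token type="Keyword"/>
          </rule>
        </state>
      </rules>
    </lexer>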

@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.4.0
+version: 0.3.0
 authors:
 - Roberto Alsina <roberto.alsina@gmail.com>

@@ -14,7 +14,6 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
-  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize

@@ -8,20 +8,12 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  struct Action
-    property actions : Array(Action) = [] of Action
+  class Action
     property type : String
+    property xml : XML::Node
+    property actions : Array(Action) = [] of Action
 
-    @depth : Int32 = 0
-    @lexer_name : String = ""
-    @states : Array(String) = [] of String
-    @states_to_push : Array(String) = [] of String
-    @token_type : String = ""
-
-    def initialize(@type : String, xml : XML::Node?)
-      known_types = %w(token push pop combined bygroups include using usingself)
-      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
+    def initialize(@type : String, @xml : XML::Node?)
       # Some actions may have actions in them, like this:
       # <bygroups>
       #   <token type="GenericPrompt"/>
@@ -31,28 +23,10 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      xml.children.each do |node|
+      @xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
-
-      # Prefetch the attributes we need from the XML and keep them
-      case type
-      when "token"
-        @token_type = xml["type"]
-      when "push"
-        @states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-      when "pop"
-        @depth = xml["depth"].to_i
-      when "using"
-        @lexer_name = xml["lexer"].downcase
-      when "combined"
-        @states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
@@ -60,22 +34,35 @@ module Tartrazine
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
+        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
       when "push"
-        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
-        to_push.each do |state|
-          if state == "#pop" && lexer.state_stack.size > 1
+        states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+        if states_to_push.empty?
+          # Push without a state means push the current state
+          states_to_push = [lexer.state_stack.last]
+        end
+        states_to_push.each do |state|
+          if state == "#pop"
             # Pop the state
+            Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
             # Really push
             lexer.state_stack << state
+            Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        to_pop = [@depth, lexer.state_stack.size - 1].min
-        lexer.state_stack.pop(to_pop)
+        depth = xml["depth"].to_i
+        Log.trace { "Popping #{depth} states" }
+        if lexer.state_stack.size <= depth
+          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
+        else
+          lexer.state_stack.pop(depth)
+        end
         [] of Token
       when "bygroups"
         # FIXME: handle
@@ -105,15 +92,22 @@ module Tartrazine
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        lexer_name = xml["lexer"].downcase
+        Log.trace { "to tokenize: #{match[match_group]}" }
+        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = lexer.copy
+        new_lexer = Lexer.from_xml(lexer.xml)
+        Log.trace { "to tokenize: #{match[match_group]}" }
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        new_state = @states.map { |name|
+        states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+        new_state = states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
@@ -122,7 +116,7 @@ module Tartrazine
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}")
+        raise Exception.new("Unknown action type: #{type}: #{xml}")
       end
     end
   end
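
The case arms above read specific attributes off the action's XML node: token reads type, pop reads depth, push and combined collect state attributes, and using reads lexer. A hypothetical state exercising several of them (the pattern and token types are made up, but the attribute names match what the code reads):

    <state name="comment">
      <rule pattern="[^*]+">
        <token type="CommentMultiline"/>
      </rule>
      <rule pattern="\*/">
        <token type="CommentMultiline"/>
        <pop depth="1"/>
      </rule>
    </state>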

@@ -3,7 +3,7 @@ module BytesRegex
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase
@@ -22,26 +22,27 @@ module BytesRegex
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
-      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
-      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
+      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        LibPCRE2::NO_UTF_CHECK,
-        @match_data,
+        0,
+        match_data,
         nil)
-      if rc > 0
-        ovector = LibPCRE2.get_ovector_pointer(@match_data)
+      if rc < 0
+        # No match, do nothing
+      else
+        ovector = LibPCRE2.get_ovector_pointer(match_data)
         (0...rc).each do |i|
           m_start = ovector[2 * i]
           m_size = ovector[2 * i + 1] - m_start
@@ -53,6 +54,7 @@ module BytesRegex
           match << Match.new(m_value, m_start, m_size)
         end
       end
+      LibPCRE2.match_data_free(match_data)
       match
     end
   end
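
Based only on the signatures visible in this file (the initializer's keyword flags, match(str : Bytes, pos) : Array(Match), and the Match.new(value, start, size) call, where value is Bytes), a usage sketch of this wrapper might look like:

    re = BytesRegex::Regex.new("[a-z]+", ignorecase: true)
    matches = re.match("Hello world".to_slice, 0)
    # matches[0] is the whole match; further entries would be capture groups
    puts String.new(matches[0].value) unless matches.empty?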

@ -1,6 +1,5 @@
require "../constants/token_abbrevs.cr" require "../constants/token_abbrevs.cr"
require "../formatter" require "../formatter"
require "html"
module Tartrazine module Tartrazine
class Html < Formatter class Html < Formatter
@ -68,7 +67,8 @@ module Tartrazine
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : "" line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>" outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
line.each do |token| line.each do |token|
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>" fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
outp << fragment
end end
end end
outp << "</code></pre>" outp << "</code></pre>"
@@ -104,17 +104,15 @@ module Tartrazine
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      if !theme.styles.has_key? token
+      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
       # Themes don't contain information for each specific
       # token type. However, they may contain information
       # for a parent style. Worst case, we go to the root
       # (Background) style.
-        parent = theme.style_parents(token).reverse.find { |dad|
-          theme.styles.has_key?(dad)
-        }
-        theme.styles[token] = theme.styles[parent]
-      end
-      class_prefix + Abbreviations[token]
+      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
+        theme.styles.has_key?(parent)
+      }]
     end
 
     # Is this line in the highlighted ranges?
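
The net effect of get_css_class: a token type the theme styles directly maps straight to its abbreviated CSS class, otherwise the method walks theme.style_parents(token) until it finds a styled ancestor (ultimately the root Background style). The format loop above then emits one span per token, producing markup roughly like the following (the abbreviated class names are hypothetical; they are whatever Abbreviations maps each token type to):

    <pre><code><span class="k">def</span><span class="t"> </span><span class="nf">foo</span></code></pre>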

@@ -56,17 +56,11 @@ module Tartrazine
       not_multiline: false,
       ensure_nl: false,
     }
-    # property xml : String = ""
-    property states = {} of String => State
-    property state_stack = ["root"]
+    property xml : String = ""
 
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = state_stack[0..-1]
-      new_lexer
-    end
+    property states = {} of String => State
+
+    property state_stack = ["root"]
 
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string
@@ -93,10 +87,12 @@ module Tartrazine
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
+          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
+        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched
@@ -162,6 +158,7 @@ module Tartrazine
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
+      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|
@@ -225,9 +222,9 @@ module Tartrazine
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  struct State
+  class State
     property name : String = ""
-    property rules = [] of BaseRule
+    property rules = [] of Rule
 
     def +(other : State)
       new_state = State.new
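
Putting the pieces together, the lexer API that appears across these hunks can be driven roughly like this (a sketch: Tartrazine.lexer and tokenize are visible in the actions.cr hunks above, and the token fields are inferred from the formatter's token[:type] / token[:value] accesses):

    lexer = Tartrazine.lexer("crystal")
    tokens = lexer.tokenize("puts 1 + 2")
    tokens.each do |token|
      puts "#{token[:type]} => #{token[:value].inspect}"
    end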

@@ -77,7 +77,7 @@ if options["-f"]
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf << formatter.style_defs
+      outf.puts formatter.style_defs
     end
     exit 0
   end
@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf << output
+      outf.puts output
     end
   end
 end
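
The whole behavioral difference in this hunk is that Crystal's IO#puts appends a trailing newline while IO#<< writes the bare value. Given the flags visible here (-f formatter, -t theme, --css, -o output file), a hypothetical invocation of the CLI might be (the binary name, positional source file, and theme name are assumptions):

    tartrazine source.cr -f html -t monokai -o source.html
    tartrazine source.cr -f html -t monokai --css    # writes monokai.css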

@@ -15,11 +15,41 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-    abstract def initialize(node : XML::Node)
+  class Rule
+    property pattern : Regex = Regex.new ""
     property actions : Array(Action) = [] of Action
+    property xml : String = "foo"
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # We don't match if the match doesn't move the cursor
+      # because that causes infinite loops
+      return false, pos, [] of Token if match.empty? || match[0].size == 0
+      # p! match, String.new(text[pos..pos+20])
+      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
+      tokens = [] of Token
+      # Emit the tokens
+      actions.each do |action|
+        # Emit the token
+        tokens += action.emit(match, lexer)
+      end
+      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
+      return true, pos + match[0].size, tokens
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      @xml = node.to_s
+      pattern = node["pattern"]
+      # flags = Regex::Options::ANCHORED
+      # MULTILINE implies DOTALL which we don't want, so we
+      # use in-pattern flag (?m) instead
+      # flags |= Regex::Options::MULTILINE if multiline
+      pattern = "(?m)" + pattern if multiline
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|
@@ -29,44 +59,23 @@ module Tartrazine
       end
     end
 
-  struct Rule < BaseRule
-    property pattern : Regex = Regex.new ""
-    property actions : Array(Action) = [] of Action
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # No match
-      return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
-    end
-
-    def initialize(node : XML::Node)
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      pattern = node["pattern"]
-      pattern = "(?m)" + pattern if multiline
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
-  end
-
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  struct IncludeStateRule < BaseRule
+  class IncludeStateRule < Rule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
+      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }
@@ -76,14 +85,17 @@ module Tartrazine
   end
 
   # This rule always matches, unconditionally
-  struct UnconditionalRule < BaseRule
-    NO_MATCH = [] of Match
-
+  class UnconditionalRule < Rule
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
+      tokens = [] of Token
+      actions.each do |action|
+        tokens += action.emit([] of Match, lexer)
+      end
+      return true, pos, tokens
    end
 
     def initialize(node : XML::Node)
+      @xml = node.to_s
       add_actions(node)
     end
   end
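
The three rule classes in this file correspond to three shapes of XML rule: a pattern rule with actions, an include rule, and a rule with no pattern at all. An illustrative state mixing them (the attribute names match what the initializers read; the patterns, token types, and state names are made up):

    <state name="root">
      <rule pattern="\s+">
        <token type="TextWhitespace"/>
      </rule>
      <rule>
        <include state="comment"/>
      </rule>
      <rule>
        <push state="body"/>
      </rule>
    </state>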