9 Commits

8 changed files with 134 additions and 42 deletions

View File

@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-It's not currently usable because it's not finished, but:
-
-* The lexers work for the implemented languages
-* The provided styles work
-* There is a very very simple HTML formatter
+The CLI tool can be used to highlight many things in many styles.
 
 # A port of what? Why "kind of"?
 
-Because I did not read the Pygments code. And this is actually
-based on [Chroma](https://github.com/alecthomas/chroma) ...
-although I did not read that code either.
+Pygments is a staple of the Python ecosystem, and it's great.
+It lets you highlight code in many languages, and it has many
+themes. Chroma is "Pygments for Go"; it's actually a port of
+Pygments to Go, and it's great too.
+
+I wanted that in Crystal, so I started this project. But I did
+not read much of the Pygments code. Or much of Chroma's.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma

View File

@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.3.0
+version: 0.4.0
 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>

View File

@@ -14,15 +14,18 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
+  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize
 # but tartrazine is correct
 bad_in_chroma = {
   "#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
+  "#{__DIR__}/tests/html/javascript_backtracking.txt",
   "#{__DIR__}/tests/java/test_default.txt",
   "#{__DIR__}/tests/java/test_multiline_string.txt",
   "#{__DIR__}/tests/java/test_numeric_literals.txt",
+  "#{__DIR__}/tests/octave/test_multilinecomment.txt",
   "#{__DIR__}/tests/php/test_string_escaping_run.txt",
   "#{__DIR__}/tests/python_2/test_cls_builtin.txt",
 }

@@ -30,19 +33,14 @@ bad_in_chroma = {
 
 known_bad = {
   "#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
   "#{__DIR__}/tests/bash_session/prompt_in_output.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
   "#{__DIR__}/tests/bash_session/ps2_prompt.txt",
-  "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
-  "#{__DIR__}/tests/bash_session/test_virtualenv.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
   "#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
-  "#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
-  "#{__DIR__}/tests/html/css_backtracking.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
+  "#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
+  "#{__DIR__}/tests/bash_session/test_virtualenv.txt",
   "#{__DIR__}/tests/mcfunction/data.txt",
   "#{__DIR__}/tests/mcfunction/selectors.txt",
-  "#{__DIR__}/tests/php/anonymous_class.txt",
-  "#{__DIR__}/tests/html/javascript_unclosed.txt",
 }
 
 # Tests that fail because of a limitation in PCRE2
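
These tuples act as exclusion lists when the spec walks the test corpus: anything listed is skipped rather than compared against chroma's reference output. A minimal sketch of that loop, assuming a glob over the tests directory (the exact spec wiring is not shown in this diff):

Dir.glob("#{__DIR__}/tests/**/*.txt").each do |testcase|
  # Skip cases that are known mismatches between chroma and tartrazine
  next if known_bad.includes?(testcase) || bad_in_chroma.includes?(testcase)
  # ... tokenize testcase and compare the output against chroma's
end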

View File

@@ -30,11 +30,11 @@ module Tartrazine
     end
 
     # ameba:disable Metrics/CyclomaticComplexity
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+    def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
       case type
       when "token"
-        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
+        raise Exception.new "Can't have a token without a match" if match.empty?
+        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
       when "push"
         states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"

@@ -79,23 +79,29 @@ module Tartrazine
         # the action is skipped.
         result = [] of Token
         @actions.each_with_index do |e, i|
-          next if match[i + 1]?.nil?
+          begin
+            next if match[i + 1].size == 0
+          rescue IndexError
+            # FIXME: This should not actually happen
+            # No match for this group
+            next
+          end
           result += e.emit(match, lexer, i + 1)
         end
         result
       when "using"
         # Shunt to another lexer entirely
-        return [] of Token if match.nil?
+        return [] of Token if match.empty?
         lexer_name = xml["lexer"].downcase
         Log.trace { "to tokenize: #{match[match_group]}" }
-        Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
+        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
-        return [] of Token if match.nil?
+        return [] of Token if match.empty?
         new_lexer = Lexer.from_xml(lexer.xml)
         Log.trace { "to tokenize: #{match[match_group]}" }
-        new_lexer.tokenize(match[match_group], usingself: true)
+        new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
         states = xml.attributes.select { |attrib|
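
With this change emit never receives nil: a non-matching rule now passes an empty MatchData, and per-group participation is checked through each group's byte size (the begin/rescue above papers over groups that are absent entirely). A sketch of the new convention, using the MatchData alias introduced in rules.cr; group_text is a hypothetical helper, not part of this PR:

# Hypothetical helper: a group is usable when the rule matched,
# the group exists, and it spans at least one byte.
def group_text(match : MatchData, group : Int32) : String?
  return nil if match.empty?          # the rule did not match at all
  return nil if group >= match.size   # this group did not participate
  String.new(match[group].value)      # convert the Bytes span to String
end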

src/bytes_regex.cr (new file, 75 lines)
View File

@@ -0,0 +1,75 @@
+module BytesRegex
+  extend self
+
+  class Regex
+    def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
+      flags |= LibPCRE2::MULTILINE if multiline
+      flags |= LibPCRE2::DOTALL if dotall
+      flags |= LibPCRE2::CASELESS if ignorecase
+      flags |= LibPCRE2::ANCHORED if anchored
+      if @re = LibPCRE2.compile(
+           pattern,
+           pattern.bytesize,
+           flags,
+           out errorcode,
+           out erroroffset,
+           nil)
+      else
+        msg = String.new(256) do |buffer|
+          bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
+          {bytesize, 0}
+        end
+        raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
+      end
+    end
+
+    def finalize
+      LibPCRE2.code_free(@re)
+    end
+
+    def match(str : Bytes, pos = 0) : Array(Match)
+      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
+      match = [] of Match
+      rc = LibPCRE2.match(
+        @re,
+        str,
+        str.size,
+        pos,
+        LibPCRE2::NO_UTF_CHECK,
+        match_data,
+        nil)
+      if rc < 0
+        # No match, do nothing
+      else
+        ovector = LibPCRE2.get_ovector_pointer(match_data)
+        (0...rc).each do |i|
+          m_start = ovector[2 * i]
+          m_size = ovector[2 * i + 1] - m_start
+          if m_size == 0
+            m_value = Bytes.new(0)
+          else
+            m_value = str[m_start...m_start + m_size]
+          end
+          match << Match.new(m_value, m_start, m_size)
+        end
+      end
+      LibPCRE2.match_data_free(match_data)
+      match
+    end
+  end
+
+  class Match
+    property value : Bytes
+    property start : UInt64
+    property size : UInt64
+
+    def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
+    end
+  end
+end
+
+# pattern = "foo"
+# str = "foo bar"
+# re = BytesRegex::Regex.new(pattern)
+# p! String.new(re.match(str.to_slice)[0].value)
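
This expands on the commented example at the end of the file: match returns the full match at index 0 followed by one Match per participating capture group, since PCRE2 reports one ovector pair for each. A usage sketch (pattern and input are made up):

require "./bytes_regex"

re = BytesRegex::Regex.new("(foo)(bar)?")
re.match("foobaz".to_slice, 0).each_with_index do |m, i|
  # m.value is a Bytes slice of the input; m.start is a byte offset
  puts "group #{i}: #{String.new(m.value).inspect} at byte #{m.start}"
end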

View File

@@ -30,7 +30,7 @@ module Tartrazine
       @standalone : Bool = false,
       @surrounding_pre : Bool = true,
       @wrap_long_lines : Bool = false,
-      @weight_of_bold : Int32 = 600,)
+      @weight_of_bold : Int32 = 600)
     end
 
     def format(text : String, lexer : Lexer) : String
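
The only change here is dropping the stray trailing comma before the closing parenthesis. For orientation, a usage sketch of the options this constructor exposes; the formatter's class name does not appear in this hunk, so HtmlFormatter below is a stand-in, and the lexer name is illustrative:

# HtmlFormatter stands in for the real class name (not shown in this hunk)
formatter = Tartrazine::HtmlFormatter.new(standalone: true, weight_of_bold: 500)
puts formatter.format(File.read("program.cr"), Tartrazine.lexer("crystal"))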

View File

@@ -1,3 +1,4 @@
+require "baked_file_system"
 require "./constants/lexers"
 
 module Tartrazine

@@ -65,7 +66,7 @@ module Tartrazine
     # is true when the lexer is being used to tokenize a string
     # from a larger text that is already being tokenized.
     # So, when it's true, we don't modify the text.
-    def tokenize(text, usingself = false) : Array(Token)
+    def tokenize(text : String, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0

@@ -76,12 +77,13 @@ module Tartrazine
         text += "\n"
       end
 
+      text_bytes = text.to_slice
       # Loop through the text, applying rules
-      while pos < text.size
+      while pos < text_bytes.size
         state = states[@state_stack.last]
         # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text, pos, self)
+          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
           if matched
             # Move position forward, save the tokens,
             # tokenize from the new position

@@ -94,8 +96,13 @@ module Tartrazine
         end
         # If no rule matches, emit an error token
         unless matched
-          # Log.trace { "Error at #{pos}" }
-          tokens << {type: "Error", value: "#{text[pos]}"}
+          if text_bytes[pos] == 10u8
+            # at EOL, reset state to "root"
+            tokens << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
+          end
           pos += 1
         end
       end
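
Taken together: tokenize now walks byte offsets over text.to_slice, and an unmatched newline byte (10u8) resets the state stack to "root" instead of producing a cascade of Error tokens. A minimal driving sketch (the "java" lexer name and file are illustrative; Token is the {type, value} named tuple used above):

lexer = Tartrazine.lexer("java")
tokens = lexer.tokenize(File.read("Example.java"))
tokens.each do |token|
  # "Error" marks bytes no rule could match
  puts "#{token[:type]}\t#{token[:value].inspect}"
end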

View File

@@ -1,8 +1,9 @@
 require "./actions"
+require "./bytes_regex"
 require "./formatter"
+require "./lexer"
 require "./rules"
 require "./styles"
-require "./lexer"
 
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the

@@ -10,16 +11,21 @@ require "./lexer"
 module Tartrazine
 
   # This rule matches via a regex pattern
+  alias Regex = BytesRegex::Regex
+  alias Match = BytesRegex::Match
+  alias MatchData = Array(Match)
+
   class Rule
     property pattern : Regex = Regex.new ""
     property actions : Array(Action) = [] of Action
     property xml : String = "foo"
 
-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)
       # We don't match if the match doesn't move the cursor
       # because that causes infinite loops
-      return false, pos, [] of Token if match.nil? || match.end == 0
+      return false, pos, [] of Token if match.empty? || match[0].size == 0
+      # p! match, String.new(text[pos..pos+20])
       # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
       tokens = [] of Token
       # Emit the tokens

@@ -27,21 +33,21 @@ module Tartrazine
         # Emit the token
         tokens += action.emit(match, lexer)
       end
-      Log.trace { "#{xml}, #{match.end}, #{tokens}" }
-      return true, match.end, tokens
+      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
+      return true, pos + match[0].size, tokens
     end
 
     def initialize(node : XML::Node, multiline, dotall, ignorecase)
       @xml = node.to_s
       pattern = node["pattern"]
-      flags = Regex::Options::ANCHORED
+      # flags = Regex::Options::ANCHORED
       # MULTILINE implies DOTALL which we don't want, so we
       # use in-pattern flag (?m) instead
       # flags |= Regex::Options::MULTILINE if multiline
       pattern = "(?m)" + pattern if multiline
-      flags |= Regex::Options::DOTALL if dotall
-      flags |= Regex::Options::IGNORE_CASE if ignorecase
-      @pattern = Regex.new(pattern, flags)
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
       add_actions(node)
     end

@@ -83,7 +89,7 @@ module Tartrazine
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       tokens = [] of Token
       actions.each do |action|
-        tokens += action.emit(nil, lexer)
+        tokens += action.emit([] of Match, lexer)
       end
       return true, pos, tokens
     end
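
The contract between the lexer loop and Rule#match is the three-tuple shown above: whether the rule matched, the new byte position, and the tokens produced. Because the pattern is now compiled with anchored set to true, a match always starts at pos, so the new position is pos plus the full match's byte size. A sketch of a caller honoring that contract (names borrowed from the lexer loop):

matched, new_pos, new_tokens = rule.match(text_bytes, pos, lexer)
if matched
  pos = new_pos          # advance by the byte size of match[0]
  tokens += new_tokens   # accumulate tokens emitted by the rule's actions
end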