mirror of https://github.com/ralsina/tartrazine.git
synced 2025-06-08 12:40:25 -03:00

Compare commits

No commits in common. "cb09dff9f16261661e23d79639ec1ad2e140e854" and "ad664d9f934b879b393092497492eac526b52254" have entirely different histories.

cb09dff9f1 ... ad664d9f93
.gitignore (vendored): 1 line changed
@@ -7,4 +7,3 @@ chroma/
 pygments/
 shard.lock
 .vscode/
-.crystal/
README.md: 16 lines changed
@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
 a port of [Pygments](https://pygments.org/) to
 [Crystal](https://crystal-lang.org/). Kind of.
 
-The CLI tool can be used to highlight many things in many styles.
+It's not currently usable because it's not finished, but:
 
+* The lexers work for the implemented languages
+* The provided styles work
+* There is a very very simple HTML formatter
+
 # A port of what? Why "kind of"?
 
-Pygments is a staple of the Python ecosystem, and it's great.
-It lets you highlight code in many languages, and it has many
-themes. Chroma is "Pygments for Go", it's actually a port of
-Pygments to Go, and it's great too.
-
-I wanted that in Crystal, so I started this project. But I did
-not read much of the Pygments code. Or much of Chroma's.
+Because I did not read the Pygments code. And this is actually
+based on [Chroma](https://github.com/alecthomas/chroma) ...
+although I did not read that code either.
 
 Chroma has taken most of the Pygments lexers and turned them into
 XML descriptions. What I did was take those XML files from Chroma
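
For context on the "XML descriptions" mentioned above: Lexer.from_xml in src/lexer.cr (see its hunk below) consumes exactly such files. A minimal sketch, assuming a hypothetical trimmed-down lexer definition; the element structure here is illustrative, mirroring only the <rule>/<token> shapes visible in this diff:

require "tartrazine"

# Hypothetical miniature lexer definition, Chroma-style.
xml = <<-XML
  <lexer>
    <config>
      <name>mini</name>
    </config>
    <rules>
      <state name="root">
        <rule pattern="\\d+"><token type="LiteralNumberInteger"/></rule>
        <rule pattern="\\s+"><token type="Text"/></rule>
      </state>
    </rules>
  </lexer>
  XML

# Build a lexer from the XML and tokenize a string, as the CLI
# does for the shipped lexer definitions.
lexer = Tartrazine::Lexer.from_xml(xml)
p lexer.tokenize("42 7")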
shard.yml

@@ -1,5 +1,5 @@
 name: tartrazine
-version: 0.4.0
+version: 0.3.0
 
 authors:
   - Roberto Alsina <roberto.alsina@gmail.com>
@@ -14,7 +14,6 @@ unicode_problems = {
   "#{__DIR__}/tests/java/test_string_literals.txt",
   "#{__DIR__}/tests/json/test_strings.txt",
   "#{__DIR__}/tests/systemd/example1.txt",
-  "#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
 }
 
 # These testcases fail because of differences in the way chroma and tartrazine tokenize
src/actions.cr

@@ -8,20 +8,12 @@ require "./tartrazine"
 # perform a list of actions. These actions can emit tokens
 # or change the state machine.
 module Tartrazine
-  struct Action
-    property actions : Array(Action) = [] of Action
-
-    @depth : Int32 = 0
-    @lexer_name : String = ""
-    @states : Array(String) = [] of String
-    @states_to_push : Array(String) = [] of String
-    @token_type : String = ""
-
-    def initialize(@type : String, xml : XML::Node?)
-      known_types = %w(token push pop combined bygroups include using usingself)
-      raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
-
+  class Action
+    property type : String
+    property xml : XML::Node
+    property actions : Array(Action) = [] of Action
+
+    def initialize(@type : String, @xml : XML::Node?)
       # Some actions may have actions in them, like this:
       # <bygroups>
       # <token type="GenericPrompt"/>

@@ -31,28 +23,10 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      xml.children.each do |node|
+      @xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end
-
-      # Prefetch the attributes we need from the XML and keep them
-      case type
-      when "token"
-        @token_type = xml["type"]
-      when "push"
-        @states_to_push = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-      when "pop"
-        @depth = xml["depth"].to_i
-      when "using"
-        @lexer_name = xml["lexer"].downcase
-      when "combined"
-        @states = xml.attributes.select { |attrib|
-          attrib.name == "state"
-        }.map &.content
-      end
     end
 
     # ameba:disable Metrics/CyclomaticComplexity

@@ -60,22 +34,35 @@ module Tartrazine
       case type
       when "token"
         raise Exception.new "Can't have a token without a match" if match.empty?
-        [Token.new(type: @token_type, value: String.new(match[match_group].value))]
+        [Token.new(type: xml["type"], value: String.new(match[match_group].value))]
       when "push"
-        to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
-        to_push.each do |state|
-          if state == "#pop" && lexer.state_stack.size > 1
+        states_to_push = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+        if states_to_push.empty?
+          # Push without a state means push the current state
+          states_to_push = [lexer.state_stack.last]
+        end
+        states_to_push.each do |state|
+          if state == "#pop"
+            # Pop the state
+            Log.trace { "Popping state" }
             lexer.state_stack.pop
           else
+            # Really push
             lexer.state_stack << state
+            Log.trace { "Pushed #{lexer.state_stack}" }
           end
         end
         [] of Token
       when "pop"
-        to_pop = [@depth, lexer.state_stack.size - 1].min
-        lexer.state_stack.pop(to_pop)
+        depth = xml["depth"].to_i
+        Log.trace { "Popping #{depth} states" }
+        if lexer.state_stack.size <= depth
+          Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
+        else
+          lexer.state_stack.pop(depth)
+        end
         [] of Token
       when "bygroups"
         # FIXME: handle

@@ -105,15 +92,22 @@ module Tartrazine
       when "using"
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
+        lexer_name = xml["lexer"].downcase
+        Log.trace { "to tokenize: #{match[match_group]}" }
+        Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
       when "usingself"
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = lexer.copy
-
+        new_lexer = Lexer.from_xml(lexer.xml)
+        Log.trace { "to tokenize: #{match[match_group]}" }
         new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
       when "combined"
         # Combine two states into one anonymous state
-        new_state = @states.map { |name|
+        states = xml.attributes.select { |attrib|
+          attrib.name == "state"
+        }.map &.content
+        new_state = states.map { |name|
           lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2

@@ -122,7 +116,7 @@ module Tartrazine
         lexer.state_stack << new_state.name
         [] of Token
       else
-        raise Exception.new("Unknown action type: #{type}")
+        raise Exception.new("Unknown action type: #{type}: #{xml}")
       end
     end
   end
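
The substance of the Action change above is where XML attributes get read: the cb09dff side prefetches them once in the constructor into instance variables, while the ad664d side stores the node and re-reads attributes on every emit. A minimal standalone sketch of the prefetch pattern (class and method names are illustrative, not from the repo):

require "xml"

class PrefetchedAction
  @token_type : String

  def initialize(node : XML::Node)
    # Read the attribute once, at parse time...
    @token_type = node["type"]
  end

  def emit_type : String
    # ...so the hot path never touches the XML again.
    @token_type
  end
end

node = XML.parse(%(<token type="GenericPrompt"/>)).first_element_child.not_nil!
p PrefetchedAction.new(node).emit_type # => "GenericPrompt"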
src/bytes_regex.cr

@@ -3,7 +3,7 @@ module BytesRegex
 
   class Regex
     def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
-      flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
+      flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
       flags |= LibPCRE2::MULTILINE if multiline
       flags |= LibPCRE2::DOTALL if dotall
       flags |= LibPCRE2::CASELESS if ignorecase

@@ -22,26 +22,27 @@ module BytesRegex
         end
         raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
       end
-      @match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
     end
 
     def finalize
-      LibPCRE2.match_data_free(@match_data)
       LibPCRE2.code_free(@re)
     end
 
     def match(str : Bytes, pos = 0) : Array(Match)
+      match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
       match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
         str.size,
         pos,
-        LibPCRE2::NO_UTF_CHECK,
-        @match_data,
+        0,
+        match_data,
         nil)
-      if rc > 0
-        ovector = LibPCRE2.get_ovector_pointer(@match_data)
+      if rc < 0
+        # No match, do nothing
+      else
+        ovector = LibPCRE2.get_ovector_pointer(match_data)
         (0...rc).each do |i|
           m_start = ovector[2 * i]
           m_size = ovector[2 * i + 1] - m_start

@@ -53,6 +54,7 @@ module BytesRegex
           match << Match.new(m_value, m_start, m_size)
         end
       end
+      LibPCRE2.match_data_free(match_data)
       match
     end
   end
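
The ovector reads above are the heart of match: for capture group i, PCRE2 stores half-open [start, end) byte offsets at ovector positions 2*i and 2*i + 1, and rc is the number of captured pairs. A worked sketch with hand-filled offsets (the values are what a pattern like ([a-z]+)(\d+) would produce against "abc123"):

str = "abc123".to_slice
rc = 3 # whole match plus two capture groups
ovector = [0_u64, 6_u64, 0_u64, 3_u64, 3_u64, 6_u64]
(0...rc).each do |i|
  m_start = ovector[2 * i]
  m_size = ovector[2 * i + 1] - m_start
  puts String.new(str[m_start, m_size])
end
# Prints "abc123", "abc", "123" — group 0 is the whole match.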
src/formatters/html.cr

@@ -1,6 +1,5 @@
-require "../constants/token_abbrevs.cr"
 require "../formatter"
 require "html"
 
 module Tartrazine
   class Html < Formatter

@@ -68,7 +67,8 @@ module Tartrazine
         line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
         outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
         line.each do |token|
-          outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
+          fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
+          outp << fragment
         end
       end
       outp << "</code></pre>"

@@ -104,17 +104,15 @@ module Tartrazine
 
     # Given a token type, return the CSS class to use.
     def get_css_class(token : String) : String
-      if !theme.styles.has_key? token
+      return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
+
       # Themes don't contain information for each specific
       # token type. However, they may contain information
       # for a parent style. Worst case, we go to the root
       # (Background) style.
-        parent = theme.style_parents(token).reverse.find { |dad|
-          theme.styles.has_key?(dad)
-        }
-        theme.styles[token] = theme.styles[parent]
-      end
-      class_prefix + Abbreviations[token]
+      class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
+        theme.styles.has_key?(parent)
+      }]
     end
 
     # Is this line in the highlighted ranges?
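
Both sides of get_css_class implement the same fallback described in the comment: walk the token's ancestry until the theme defines a style. A small illustration, assuming style_parents returns root-first ancestry for a Pygments-style token name (names here are illustrative, not from a real theme):

styles = Set{"Background", "LiteralString"}
# Hypothetical style_parents("LiteralStringDouble"), root first:
parents = ["Background", "Literal", "LiteralString"]
# .reverse.find picks the most specific ancestor with a style:
p parents.reverse.find { |ancestor| styles.includes?(ancestor) }
# => "LiteralString"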
src/lexer.cr: 21 lines changed
@@ -56,17 +56,11 @@ module Tartrazine
       not_multiline: false,
       ensure_nl: false,
     }
-    # property xml : String = ""
-    property states = {} of String => State
-    property state_stack = ["root"]
+    property xml : String = ""
 
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = state_stack[0..-1]
-      new_lexer
-    end
+    property states = {} of String => State
+
+    property state_stack = ["root"]
 
     # Turn the text into a list of tokens. The `usingself` parameter
     # is true when the lexer is being used to tokenize a string

@@ -93,10 +87,12 @@ module Tartrazine
         if matched
           # Move position forward, save the tokens,
           # tokenize from the new position
+          # Log.trace { "MATCHED: #{rule.xml}" }
           pos = new_pos
           tokens += new_tokens
           break
         end
+        # Log.trace { "NOT MATCHED: #{rule.xml}" }
       end
       # If no rule matches, emit an error token
       unless matched

@@ -162,6 +158,7 @@ module Tartrazine
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
+      l.xml = xml
       lexer = XML.parse(xml).first_element_child
       if lexer
         config = lexer.children.find { |node|

@@ -225,9 +222,9 @@ module Tartrazine
   # A Lexer state. A state has a name and a list of rules.
   # The state machine has a state stack containing references
   # to states to decide which rules to apply.
-  struct State
+  class State
     property name : String = ""
-    property rules = [] of BaseRule
+    property rules = [] of Rule
 
     def +(other : State)
       new_state = State.new
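
One detail worth noting in the removed copy method: new_lexer.state_stack = state_stack[0..-1] uses Array#[0..-1] to build a new array, so the copied lexer can push and pop states without mutating the original's stack (states, by contrast, is deliberately shared). In isolation:

a = ["root"]
b = a[0..-1] # shallow copy, not an alias
b << "string"
p a # => ["root"]
p b # => ["root", "string"]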
src/main.cr

@@ -77,7 +77,7 @@ if options["-f"]
 
   if formatter.is_a?(Tartrazine::Html) && options["--css"]
     File.open("#{options["-t"].as(String)}.css", "w") do |outf|
-      outf << formatter.style_defs
+      outf.puts formatter.style_defs
     end
     exit 0
   end

@@ -91,7 +91,7 @@ if options["-f"]
     puts output
   else
     File.open(options["-o"].as(String), "w") do |outf|
-      outf << output
+      outf.puts output
     end
   end
 end
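
The only change in these two hunks is IO#<< versus IO#puts: << writes the string as-is, while puts appends a trailing newline, so the written CSS and output files end with "\n". In isolation:

io = IO::Memory.new
io << "body {}"
io.puts "body {}"
p io.to_s # => "body {}body {}\n"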
src/rules.cr: 76 lines changed
@@ -15,11 +15,41 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)
 
-  abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-    abstract def initialize(node : XML::Node)
-
+  class Rule
+    property pattern : Regex = Regex.new ""
+    property actions : Array(Action) = [] of Action
+    property xml : String = "foo"
+
+    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+      match = pattern.match(text, pos)
+      # We don't match if the match doesn't move the cursor
+      # because that causes infinite loops
+      return false, pos, [] of Token if match.empty? || match[0].size == 0
+      # p! match, String.new(text[pos..pos+20])
+      # Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
+      tokens = [] of Token
+      # Emit the tokens
+      actions.each do |action|
+        # Emit the token
+        tokens += action.emit(match, lexer)
+      end
+      Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
+      return true, pos + match[0].size, tokens
+    end
+
+    def initialize(node : XML::Node, multiline, dotall, ignorecase)
+      @xml = node.to_s
+      pattern = node["pattern"]
+      # flags = Regex::Options::ANCHORED
+      # MULTILINE implies DOTALL which we don't want, so we
+      # use in-pattern flag (?m) instead
+      # flags |= Regex::Options::MULTILINE if multiline
+      pattern = "(?m)" + pattern if multiline
+      # flags |= Regex::Options::DOTALL if dotall
+      # flags |= Regex::Options::IGNORE_CASE if ignorecase
+      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
+      add_actions(node)
+    end
 
     def add_actions(node : XML::Node)
       node.children.each do |child|

@@ -29,44 +59,23 @@ module Tartrazine
       end
     end
 
-  struct Rule < BaseRule
-    property pattern : Regex = Regex.new ""
-    property actions : Array(Action) = [] of Action
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-
-      # No match
-      return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
-    end
-
-    def initialize(node : XML::Node)
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      pattern = node["pattern"]
-      pattern = "(?m)" + pattern if multiline
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
-  end
-
   # This rule includes another state. If any of the rules of the
   # included state matches, this rule matches.
-  struct IncludeStateRule < BaseRule
+  class IncludeStateRule < Rule
     property state : String = ""
 
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
       lexer.states[state].rules.each do |rule|
         matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+        Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
     end
 
     def initialize(node : XML::Node)
+      @xml = node.to_s
       include_node = node.children.find { |child|
         child.name == "include"
       }

@@ -76,14 +85,17 @@ module Tartrazine
     end
 
   # This rule always matches, unconditionally
-  struct UnconditionalRule < BaseRule
-    NO_MATCH = [] of Match
-
+  class UnconditionalRule < Rule
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
+      tokens = [] of Token
+      actions.each do |action|
+        tokens += action.emit([] of Match, lexer)
+      end
+      return true, pos, tokens
     end
 
     def initialize(node : XML::Node)
+      @xml = node.to_s
       add_actions(node)
     end
   end
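
The comment in Rule#match ("the match doesn't move the cursor... causes infinite loops") is easy to verify: a zero-width pattern succeeds without consuming input, so without the match[0].size == 0 guard, tokenize would retry the same rule at the same pos forever. With Crystal's built-in Regex:

# /a*/ matches the empty string at position 0 of "bbb":
md = /a*/.match("bbb", 0).not_nil!
p md[0].size # => 0 — a "successful" match that consumed nothing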