5 Commits

12 changed files with 261 additions and 62 deletions

View File

@@ -1,5 +1,5 @@
name: tartrazine
version: 0.3.0
version: 0.2.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>

View File

@@ -42,6 +42,9 @@ known_bad = {
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
# BAD FOR ONIGMO
"#{__DIR__}/tests/json/test_backtracking.txt",
}
@@ -58,6 +61,7 @@ describe Tartrazine do
end
else
it "parses #{testcase}".split("/")[-2...].join("/") do
p! testcase
text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
lexer_name = File.basename(File.dirname(testcase)).downcase
unless failing_lexers.includes?(lexer_name) ||

View File

@@ -30,11 +30,11 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[match_group])]
[Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
@@ -88,14 +88,14 @@ module Tartrazine
return [] of Token if match.nil?
lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.nil?
new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group], usingself: true)
new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|

View File

@@ -9,15 +9,12 @@ module Tartrazine
# This is the base class for all formatters.
abstract class Formatter
property name : String = ""
property theme : Theme = Tartrazine.theme("default-dark")
# Format the text using the given lexer.
def format(text : String, lexer : Lexer) : String
def format(text : String, lexer : Lexer, theme : Theme) : String
raise Exception.new("Not implemented")
end
# Return the styles, if the formatter supports it.
def style_defs : String
def get_style_defs(theme : Theme) : String
raise Exception.new("Not implemented")
end
end

View File

@@ -4,23 +4,20 @@ module Tartrazine
class Ansi < Formatter
property? line_numbers : Bool = false
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
end
def format(text : String, lexer : Lexer) : String
def format(text : String, lexer : Lexer, theme : Theme) : String
output = String.build do |outp|
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
outp << label
line.each do |token|
outp << colorize(token[:value], token[:type])
outp << colorize(token[:value], token[:type], theme)
end
end
end
output
end
def colorize(text : String, token : String) : String
def colorize(text : String, token : String, theme : Theme) : String
style = theme.styles.fetch(token, nil)
return text if style.nil?
if theme.styles.has_key?(token)

View File

@@ -15,37 +15,21 @@ module Tartrazine
property? standalone : Bool = false
property? surrounding_pre : Bool = true
property? wrap_long_lines : Bool = false
property weight_of_bold : Int32 = 600
property? weight_of_bold : Int32 = 600
property theme : Theme
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), *,
@highlight_lines = [] of Range(Int32, Int32),
@class_prefix : String = "",
@line_number_id_prefix = "line-",
@line_number_start = 1,
@tab_width = 8,
@line_numbers : Bool = false,
@linkable_line_numbers : Bool = true,
@standalone : Bool = false,
@surrounding_pre : Bool = true,
@wrap_long_lines : Bool = false,
@weight_of_bold : Int32 = 600,)
end
def format(text : String, lexer : Lexer) : String
text = format_text(text, lexer)
def format(text : String, lexer : Lexer, theme : Theme) : String
text = format_text(text, lexer, theme)
if standalone?
text = wrap_standalone(text)
text = wrap_standalone(text, theme)
end
text
end
# Wrap text into a full HTML document, including the CSS for the theme
def wrap_standalone(text) : String
def wrap_standalone(text, theme) : String
output = String.build do |outp|
outp << "<!DOCTYPE html><html><head><style>"
outp << style_defs
outp << get_style_defs(theme)
outp << "</style></head><body>"
outp << text
outp << "</body></html>"
@@ -53,21 +37,21 @@ module Tartrazine
output
end
def format_text(text : String, lexer : Lexer) : String
def format_text(text : String, lexer : Lexer, theme : Theme) : String
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
output = String.build do |outp|
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
outp << "<pre class=\"#{get_css_class("Background", theme)}\" #{pre_style}>"
end
outp << "<code class=\"#{get_css_class("Background")}\">"
outp << "<code class=\"#{get_css_class("Background", theme)}\">"
lines.each_with_index(offset: line_number_start - 1) do |line, i|
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight", theme)}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
line.each do |token|
fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
outp << fragment
end
end
@@ -77,10 +61,10 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def style_defs : String
def get_style_defs(theme : Theme) : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token)} {"
outp << ".#{get_css_class(token, theme)} {"
# These are set or nil
outp << "color: ##{style.color.try &.hex};" if style.color
outp << "background-color: ##{style.background.try &.hex};" if style.background
@@ -103,7 +87,7 @@ module Tartrazine
end
# Given a token type, return the CSS class to use.
def get_css_class(token : String) : String
def get_css_class(token, theme)
return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
@@ -115,7 +99,6 @@ module Tartrazine
}]
end
# Is this line in the highlighted ranges?
def highlighted?(line : Int) : Bool
highlight_lines.any?(&.includes?(line))
end

View File

@@ -54,8 +54,6 @@ if options["--list-formatters"]
exit 0
end
theme = Tartrazine.theme(options["-t"].as(String))
if options["-f"]
formatter = options["-f"].as(String)
case formatter
@@ -63,11 +61,9 @@ if options["-f"]
formatter = Tartrazine::Html.new
formatter.standalone = options["--standalone"] != nil
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "terminal"
formatter = Tartrazine::Ansi.new
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "json"
formatter = Tartrazine::Json.new
else
@@ -75,9 +71,11 @@ if options["-f"]
exit 1
end
theme = Tartrazine.theme(options["-t"].as(String))
if formatter.is_a?(Tartrazine::Html) && options["--css"]
File.open("#{options["-t"].as(String)}.css", "w") do |outf|
outf.puts formatter.style_defs
outf.puts formatter.get_style_defs(theme)
end
exit 0
end
@@ -85,7 +83,7 @@ if options["-f"]
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
input = File.open(options["FILE"].as(String)).gets_to_end
output = formatter.format(input, lexer)
output = formatter.format(input, lexer, theme)
if options["-o"].nil?
puts output

85
src/onigmo.cr Normal file
View File

@@ -0,0 +1,85 @@
# Raw FFI bindings for the Onigmo regex engine, routed through the
# onigwrap C shim (src/onigmo/onigwrap.c) compiled to onigwrap.o.
@[Link("onigmo")]
@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
lib LibOnigmo
# Opaque handles: the wrapper hands back raw pointers to onig structures.
type Regex = Pointer(Void)
type Region = Pointer(Void)
# Compile a pattern of `len` bytes; the three flags are passed as 0/1 ints.
fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
ignoreCase : Int32,
multiline : Int32,
dotall : Int32) : Regex
# Release a compiled regex created by `create`.
fun free = onigwrap_free(re : Regex)
# Release a Region returned by `search` — the caller owns it and must free it.
fun region_free = onigwrap_region_free(region : Region)
# Search `str` starting at byte `offset`; returns a caller-owned Region.
fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
# Region accessors: capture-group count, and the byte position / byte
# length of the nth group (pos returns -1 for a non-participating group).
fun num_regs = onigwrap_num_regs(region : Region) : Int32
fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
fun len = onigwrap_len(region : Region, index : Int32) : Int32
end
module Onigmo
# A single regex capture: character offsets into the haystack plus the
# captured text itself.
class Match
  property begin : Int32
  property end : Int32
  property value : String

  def initialize(@begin : Int32, @end : Int32, @value : String)
  end

  # The string form of a match is simply its captured text.
  def to_s
    value
  end
end
# A thin, Crystal-friendly wrapper around a compiled Onigmo regex.
class Regex
  def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
    @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize,
      @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
  end

  # GC finalizer releases the C-side compiled regex.
  def finalize
    LibOnigmo.free(@re)
  end

  # Search `str` starting at *character* index `offset`.
  #
  # Returns one entry per capture group (index 0 is the whole match);
  # groups that did not participate are nil. Returns an empty array when
  # the pattern does not match at all. Raises if `offset` is out of range.
  def match(str : String, offset = 0)
    # The offset argument is a character index, but Onigmo expects a byte index
    byte_offset = str.char_index_to_byte_index(offset)
    if byte_offset.nil?
      raise Exception.new "Invalid offset"
    end
    region = LibOnigmo.search(@re, str.to_unsafe, byte_offset, str.bytesize)
    begin
      num_regs = LibOnigmo.num_regs(region)
      # BUG FIX: the no-match path previously returned before freeing
      # `region`, leaking it on every failed match; `ensure` now frees it
      # on every exit path (including the raise below).
      return [] of Match if num_regs <= 0
      result = [] of Match?
      (0...num_regs).each do |i|
        pos = LibOnigmo.pos(region, i)
        l = LibOnigmo.len(region, i)
        if pos == -1 || l == -1
          # Group did not participate in the match.
          result << nil
        else
          # Translate byte positions back into character indices.
          b = str.byte_index_to_char_index(pos)
          e = str.byte_index_to_char_index(pos + l)
          if b.nil? || e.nil?
            raise Exception.new "Invalid substring"
          end
          v = str[b...e]
          result << Match.new(b, b + v.size, v)
        end
      end
      result
    ensure
      LibOnigmo.region_free(region)
    end
  end
end
end
# pattern = "\\w"
# str = "α"
# re = Onigmo::Regex.new(pattern, false, false, false)
# p! re.match(str)

94
src/onigmo/onigwrap.c Normal file
View File

@@ -0,0 +1,94 @@
#include "onigmo.h"
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
{
regex_t *reg;
OnigErrorInfo einfo;
OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
if (ignoreCase == 1)
onigOptions |= ONIG_OPTION_IGNORECASE;
if (multiline == 1)
onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
if (dotall == 1)
onigOptions |= ONIG_OPTION_DOTALL;
OnigUChar *stringStart = (OnigUChar*) pattern;
OnigUChar *stringEnd = (OnigUChar*) pattern + len;
int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
return reg;
}
/* Release an OnigRegion returned by onigwrap_search; the second
 * argument of 1 also frees the region struct itself. */
void onigwrap_region_free(OnigRegion *region)
{
onig_region_free(region, 1);
}
/* Release a compiled regex created by onigwrap_create. */
void onigwrap_free(regex_t *reg)
{
onig_free(reg);
}
/* Search for the first match of `reg` in the buffer, scanning from byte
 * `offset` up to `length`. Returns the match position shifted right by
 * one, -1 on no match, -2 on an Onigmo error.
 *
 * NOTE(review): the `result >> 1` halves a byte offset — that matches
 * the original onigwrap's UTF-16 convention (2 bytes per code unit) and
 * looks wrong for the UTF-8 encoding this project compiles with; the
 * Crystal bindings in src/onigmo.cr do not call this function, so the
 * discrepancy is currently harmless. Confirm before using it. */
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
{
OnigUChar *stringStart = (OnigUChar*) charPtr;
OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
OnigUChar *stringRange = (OnigUChar*) stringEnd;
OnigRegion *region = onig_region_new();
int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
onig_region_free(region, 1);
if (result >= 0)
return result >> 1;
if (result == ONIG_MISMATCH)
return -1;
return -2;
}
/* Run onig_search over the buffer (from byte `offset` to `length`) and
 * return the OnigRegion describing the captures. The caller owns the
 * region and must release it with onigwrap_region_free.
 *
 * The onig_search return code is deliberately discarded: callers detect
 * "no match" by inspecting the region through onigwrap_num_regs /
 * onigwrap_pos, which report -1 for absent groups. */
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
{
OnigUChar *stringStart = (OnigUChar*) charPtr;
OnigUChar *stringEnd = (OnigUChar*) (charPtr + length);
OnigUChar *stringOffset = (OnigUChar*) (charPtr + offset);
OnigUChar *stringRange = (OnigUChar*) stringEnd;
OnigRegion *region = onig_region_new();
int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
return region;
}
/* Number of capture slots recorded in the region (slot 0 is the whole
 * match). */
int onigwrap_num_regs(OnigRegion *region)
{
    return region->num_regs;
}

/* Byte offset where capture `nth` begins, or -1 when the index is out
 * of range or the group did not participate in the match. */
int onigwrap_pos(OnigRegion *region, int nth)
{
    if (nth >= region->num_regs)
        return -1;

    int begin = region->beg[nth];
    return (begin < 0) ? -1 : begin;
}

/* Byte length of capture `nth`, or -1 when the index is out of range.
 * NOTE(review): unlike onigwrap_pos, this does not normalize a
 * non-participating group (beg == end == ONIG_REGION_NOTPOS yields 0);
 * callers are expected to check onigwrap_pos first. */
int onigwrap_len(OnigRegion *region, int nth)
{
    if (nth >= region->num_regs)
        return -1;

    return region->end[nth] - region->beg[nth];
}

32
src/onigmo/onigwrap.h Normal file
View File

@@ -0,0 +1,32 @@
/* Public interface of the onigwrap shim around the Onigmo regex engine.
 * Added an include guard so the header is safe to include repeatedly. */
#ifndef ONIGWRAP_H
#define ONIGWRAP_H

#include "onigmo.h"

#if defined(_WIN32)
#define ONIGWRAP_EXTERN extern __declspec(dllexport)
#else
#define ONIGWRAP_EXTERN extern
#endif

/* Compile `pattern` (`len` bytes); the flags are 0/1 ints.
 * BUG FIX: this prototype previously omitted the `dotall` parameter
 * that the definition in onigwrap.c takes — any caller compiled against
 * the old declaration passed one argument too few. */
ONIGWRAP_EXTERN
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall);

/* Release an OnigRegion returned by onigwrap_search. */
ONIGWRAP_EXTERN
void onigwrap_region_free(OnigRegion *region);

/* Release a compiled regex. */
ONIGWRAP_EXTERN
void onigwrap_free(regex_t *reg);

/* First-match position helper (see onigwrap.c for caveats). */
ONIGWRAP_EXTERN
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);

/* Search and return a caller-owned OnigRegion. */
ONIGWRAP_EXTERN
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);

/* Region accessors: group count, byte position and byte length of the
 * nth capture group. */
ONIGWRAP_EXTERN
int onigwrap_num_regs(OnigRegion *region);

ONIGWRAP_EXTERN
int onigwrap_pos(OnigRegion *region, int nth);

ONIGWRAP_EXTERN
int onigwrap_len(OnigRegion *region, int nth);

#endif /* ONIGWRAP_H */

View File

@@ -3,6 +3,7 @@ require "./formatter"
require "./rules"
require "./styles"
require "./lexer"
require "./onigmo"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@@ -10,16 +11,22 @@ require "./lexer"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = Onigmo::Regex
class Rule
property pattern : Regex = Regex.new ""
property pattern2 : ::Regex = ::Regex.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
match2 = pattern2.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
return false, pos, [] of Token if match.nil? || match.end == 0
# The `match.begin > pos` is the same as the ANCHORED option
return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
# p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
@@ -27,21 +34,23 @@ module Tartrazine
# Emit the token
tokens += action.emit(match, lexer)
end
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match.end, tokens
# Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
return true, pos + match[0].as(Onigmo::Match).value.size, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
pattern = node["pattern"]
flags = Regex::Options::ANCHORED
# flags = Regex::Options::ANCHORED
flags = ::Regex::Options::NO_UTF_CHECK
# MULTILINE implies DOTALL which we don't want, so we
# use in-pattern flag (?m) instead
# flags |= Regex::Options::MULTILINE if multiline
flags |= ::Regex::Options::MULTILINE if multiline
pattern = "(?m)" + pattern if multiline
flags |= Regex::Options::DOTALL if dotall
flags |= Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, flags)
flags |= ::Regex::Options::DOTALL if dotall
flags |= ::Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, ignorecase, multiline, dotall)
@pattern2 = ::Regex.new(pattern, flags)
add_actions(node)
end

View File

@@ -11,7 +11,7 @@ require "xml"
module Tartrazine
extend self
VERSION = {{ `shards version #{__DIR__}`.chomp.stringify }}
VERSION = "0.2.0"
Log = ::Log.for("tartrazine")
end