9 Commits

8 changed files with 348 additions and 39 deletions

View File

@@ -1,5 +1,5 @@
# This configuration file was generated by `ameba --gen-config`
# on 2024-08-04 23:09:09 UTC using Ameba version 1.6.1.
# on 2024-08-12 22:00:49 UTC using Ameba version 1.6.1.
# The point is for the user to remove these configuration records
# one by one as the reported problems are removed from the code base.
@@ -9,7 +9,7 @@ Documentation/DocumentationAdmonition:
Description: Reports documentation admonitions
Timezone: UTC
Excluded:
- src/tartrazine.cr
- src/lexer.cr
- src/actions.cr
Admonitions:
- TODO
@@ -17,3 +17,105 @@ Documentation/DocumentationAdmonition:
- BUG
Enabled: true
Severity: Warning
# Problems found: 22
# Run `ameba --only Lint/MissingBlockArgument` for details
Lint/MissingBlockArgument:
Description: Disallows yielding method definitions without block argument
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/NotNil` for details
Lint/NotNil:
Description: Identifies usage of `not_nil!` calls
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 34
# Run `ameba --only Lint/ShadowingOuterLocalVar` for details
Lint/ShadowingOuterLocalVar:
Description: Disallows the usage of the same name as outer local variables for block
or proc arguments
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/UnreachableCode` for details
Lint/UnreachableCode:
Description: Reports unreachable code
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 6
# Run `ameba --only Lint/UselessAssign` for details
Lint/UselessAssign:
Description: Disallows useless variable assignments
ExcludeTypeDeclarations: false
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 3
# Run `ameba --only Naming/BlockParameterName` for details
Naming/BlockParameterName:
Description: Disallows non-descriptive block parameter names
MinNameLength: 3
AllowNamesEndingInNumbers: true
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- _
- e
- i
- j
- k
- v
- x
- y
- ex
- io
- ws
- op
- tx
- id
- ip
- k1
- k2
- v1
- v2
ForbiddenNames: []
Enabled: true
Severity: Convention
# Problems found: 1
# Run `ameba --only Naming/RescuedExceptionsVariableName` for details
Naming/RescuedExceptionsVariableName:
Description: Makes sure that rescued exceptions variables are named as expected
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- e
- ex
- exception
- error
Enabled: true
Severity: Convention
# Problems found: 6
# Run `ameba --only Naming/TypeNames` for details
Naming/TypeNames:
Description: Enforces type names in camelcase manner
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Convention

View File

@@ -42,6 +42,9 @@ known_bad = {
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
# BAD FOR ONIGMO
"#{__DIR__}/tests/json/test_backtracking.txt",
}
@@ -58,6 +61,7 @@ describe Tartrazine do
end
else
it "parses #{testcase}".split("/")[-2...].join("/") do
p! testcase
text = File.read(testcase).split("---input---\n").last.split("---tokens---").first
lexer_name = File.basename(File.dirname(testcase)).downcase
unless failing_lexers.includes?(lexer_name) ||

View File

@@ -30,11 +30,11 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
def emit(match, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[match_group])]
[Token.new(type: xml["type"], value: match[match_group].as(Onigmo::Match).value)]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
@@ -88,14 +88,14 @@ module Tartrazine
return [] of Token if match.nil?
lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
Tartrazine.lexer(lexer_name).tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.nil?
new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group], usingself: true)
new_lexer.tokenize(match[match_group].as(Onigmo::Match).value, usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|

View File

@@ -15,6 +15,7 @@ module Tartrazine
property? standalone : Bool = false
property? surrounding_pre : Bool = true
property? wrap_long_lines : Bool = false
property? weight_of_bold : Int32 = 600
def format(text : String, lexer : Lexer, theme : Theme) : String
text = format_text(text, lexer, theme)
@@ -43,7 +44,7 @@ module Tartrazine
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background", theme)}\" #{pre_style}>"
end
"<code class=\"#{get_css_class("Background", theme)}\">"
outp << "<code class=\"#{get_css_class("Background", theme)}\">"
lines.each_with_index(offset: line_number_start - 1) do |line, i|
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight", theme)}\"" : ""
@@ -72,7 +73,7 @@ module Tartrazine
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: 400;" if style.bold == false
outp << "font-weight: #{@weight_of_bold};" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline

85
src/onigmo.cr Normal file
View File

@@ -0,0 +1,85 @@
# Bindings to the `onigwrap_*` C shim that sits in front of the Onigmo
# regex library. Links against the system `onigmo` library and against the
# prebuilt `onigwrap.o` object located next to this file.
@[Link("onigmo")]
@[Link(ldflags: "#{__DIR__}/onigmo/onigwrap.o")]
lib LibOnigmo
# Opaque handle to a compiled regex, as returned by `create`.
type Regex = Pointer(Void)
# Opaque handle to the capture-group table returned by `search`.
type Region = Pointer(Void)
# Compiles *pattern* (*len* is passed as the pattern's byte size by the
# wrapper in this file). The three flags are passed as 1/0 by that wrapper.
fun create = onigwrap_create(pattern : LibC::Char*, len : UInt32,
ignoreCase : Int32,
multiline : Int32,
dotall : Int32) : Regex
# Releases a regex obtained from `create`.
fun free = onigwrap_free(re : Regex)
# Releases a region obtained from `search`.
fun region_free = onigwrap_region_free(region : Region)
# Searches *str* starting at *offset*; the wrapper in this file passes a byte
# offset and the string's byte size as *length*.
fun search = onigwrap_search(re : Regex, str : LibC::Char*, offset : UInt32, length : UInt32) : Region
# Number of capture-group slots held by *region*.
fun num_regs = onigwrap_num_regs(region : Region) : Int32
# Start position of group *index*; the wrapper in this file treats -1 as
# "no value for this group" and uses the position as a byte index.
fun pos = onigwrap_pos(region : Region, index : Int32) : Int32
# Length of group *index*; the wrapper in this file treats -1 as
# "no value for this group" and uses the length as a byte count.
fun len = onigwrap_len(region : Region, index : Int32) : Int32
end
# Thin Crystal wrapper around the Onigmo regex engine, reached through the
# `LibOnigmo` C bindings.
module Onigmo
  # One capture group of a search result.
  #
  # `begin` and `end` are character (not byte) indexes into the searched
  # string and `value` is the matched text.
  class Match
    property begin : Int32
    property end : Int32
    property value : String

    def initialize(@begin, @end, @value)
    end

    # Appends the matched text to *io*.
    #
    # Implementing the `IO` overload (rather than a zero-argument `to_s`)
    # makes both `match.to_s` and string interpolation (`"#{match}"`) yield
    # the matched text instead of the default object representation.
    def to_s(io : IO) : Nil
      io << @value
    end
  end

  # A compiled Onigmo pattern. The native regex is released in `finalize`.
  class Regex
    # Compiles *pattern*; each boolean option is forwarded to the C shim
    # as 1 (enabled) or 0 (disabled).
    def initialize(@pattern : String, @ignorecase = false, @multiline = false, @dotall = false)
      @re = LibOnigmo.create(@pattern.to_unsafe, @pattern.bytesize, @ignorecase ? 1 : 0, @multiline ? 1 : 0, @dotall ? 1 : 0)
    end

    # Releases the native regex when this object is garbage collected.
    def finalize
      LibOnigmo.free(@re)
    end

    # Searches *str* starting at character index *offset*.
    #
    # Returns one entry per capture-group slot reported by the native
    # region (entry 0 is the overall match). An entry is `nil` when the
    # native side reports -1 for its position or length. The array is empty
    # when the region reports no slots at all.
    #
    # Raises `Exception` when *offset* is not a valid character index of
    # *str*, or when a reported byte position does not fall on a character
    # boundary.
    def match(str : String, offset = 0) : Array(Match?)
      # The offset argument is a character index, but Onigmo expects a byte index
      byte_offset = str.char_index_to_byte_index(offset)
      raise Exception.new "Invalid offset" if byte_offset.nil?

      region = LibOnigmo.search(@re, str.to_unsafe, byte_offset, str.bytesize)
      begin
        groups = [] of Match?
        LibOnigmo.num_regs(region).times do |i|
          pos = LibOnigmo.pos(region, i)
          len = LibOnigmo.len(region, i)
          if pos == -1 || len == -1
            groups << nil
            next
          end
          # Native positions are byte based; convert them to character indexes.
          first = str.byte_index_to_char_index(pos)
          last = str.byte_index_to_char_index(pos + len)
          if first.nil? || last.nil?
            raise Exception.new "Invalid substring"
          end
          text = str[first...last]
          groups << Match.new(first, first + text.size, text)
        end
        groups
      ensure
        # Always hand the region back, including when an exception escapes,
        # so the native allocation is never leaked.
        LibOnigmo.region_free(region)
      end
    end
  end
end
# pattern = "\\w"
# str = "α"
# re = Onigmo::Regex.new(pattern, false, false, false)
# p! re.match(str)

94
src/onigmo/onigwrap.c Normal file
View File

@@ -0,0 +1,94 @@
#include "onigmo.h"
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall)
{
regex_t *reg;
OnigErrorInfo einfo;
OnigOptionType onigOptions = ONIG_OPTION_DEFAULT;
if (ignoreCase == 1)
onigOptions |= ONIG_OPTION_IGNORECASE;
if (multiline == 1)
onigOptions |= ONIG_OPTION_NEGATE_SINGLELINE;
if (dotall == 1)
onigOptions |= ONIG_OPTION_DOTALL;
OnigUChar *stringStart = (OnigUChar*) pattern;
OnigUChar *stringEnd = (OnigUChar*) pattern + len;
int res = onig_new(&reg, stringStart, stringEnd, onigOptions, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PYTHON, &einfo);
return reg;
}
/*
 * Release a region previously returned by onigwrap_search().
 */
void onigwrap_region_free(OnigRegion *region)
{
    /* Second argument of onig_region_free(): 1 also frees the region
     * object itself (see the Onigmo API documentation). */
    const int free_self = 1;

    onig_region_free(region, free_self);
}
/*
 * Release a regex previously returned by onigwrap_create().
 */
void onigwrap_free(regex_t *reg)
{
    onig_free(reg);
}
/*
 * Search `length` bytes at `charPtr`, starting at byte `offset`, and report
 * only where the first match begins.
 *
 * Returns the byte offset of the match (>= 0), -1 when nothing matches, or
 * -2 on any other error (including a null regex).
 */
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length)
{
    if (!reg)
        return -2;

    OnigUChar *stringStart = (OnigUChar *) charPtr;
    OnigUChar *stringEnd = (OnigUChar *) (charPtr + length);
    OnigUChar *stringOffset = (OnigUChar *) (charPtr + offset);
    OnigUChar *stringRange = stringEnd;

    OnigRegion *region = onig_region_new();
    int result = onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);
    onig_region_free(region, 1);

    /* Regexes in this file are compiled with ONIG_ENCODING_UTF8, so the
     * position is already a byte offset into the input. It must not be
     * halved (that conversion only makes sense for 2-byte code units). */
    if (result >= 0)
        return result;
    if (result == ONIG_MISMATCH)
        return -1;
    return -2;
}
/*
 * Search `length` bytes at `charPtr`, starting at byte `offset`, and return
 * the capture-group region filled in by onig_search().
 *
 * The caller owns the returned region and must release it with
 * onigwrap_region_free(). The search status itself is not reported; callers
 * inspect the region through onigwrap_num_regs(), onigwrap_pos() and
 * onigwrap_len().
 *
 * When `reg` is null (onigwrap_create() failed) no search is attempted and
 * the freshly allocated, untouched region is returned.
 */
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length)
{
    OnigRegion *region = onig_region_new();

    if (!reg)
        return region;

    OnigUChar *stringStart = (OnigUChar *) charPtr;
    OnigUChar *stringEnd = (OnigUChar *) (charPtr + length);
    OnigUChar *stringOffset = (OnigUChar *) (charPtr + offset);
    OnigUChar *stringRange = stringEnd;

    /* The return code is intentionally discarded: results are read from
     * the region. */
    (void) onig_search(reg, stringStart, stringEnd, stringOffset, stringRange, region, ONIG_OPTION_NONE);

    return region;
}
/*
 * Return the number of capture-group slots stored in `region`.
 */
int onigwrap_num_regs(OnigRegion *region)
{
    const int count = region->num_regs;

    return count;
}
/*
 * Return the start position of capture group `nth` in `region`.
 *
 * Returns -1 when `nth` is outside [0, num_regs) or when the stored start
 * position is negative.
 */
int onigwrap_pos(OnigRegion *region, int nth)
{
    /* Reject negative indexes too; they would read before the array. */
    if (nth < 0 || nth >= region->num_regs)
        return -1;

    int start = region->beg[nth];
    return start < 0 ? -1 : start;
}
/*
 * Return the length (end - begin) of capture group `nth` in `region`.
 *
 * Returns -1 when `nth` is outside [0, num_regs). No separate check is made
 * for groups without a value; use onigwrap_pos() to detect those.
 */
int onigwrap_len(OnigRegion *region, int nth)
{
    /* Reject negative indexes too; they would read before the arrays. */
    if (nth < 0 || nth >= region->num_regs)
        return -1;

    return region->end[nth] - region->beg[nth];
}

32
src/onigmo/onigwrap.h Normal file
View File

@@ -0,0 +1,32 @@
/*
 * Public declarations for the onigwrap shim around Onigmo.
 */
#ifndef ONIGWRAP_H
#define ONIGWRAP_H

#include "onigmo.h"

#if defined(_WIN32)
#define ONIGWRAP_EXTERN extern __declspec(dllexport)
#else
#define ONIGWRAP_EXTERN extern
#endif

/* Compile a pattern of `len` bytes; the three int arguments are boolean
 * option flags. The prototype carries the `dotall` parameter so that it
 * matches the definition in onigwrap.c. */
ONIGWRAP_EXTERN
regex_t *onigwrap_create(char *pattern, int len, int ignoreCase, int multiline, int dotall);

/* Release a region returned by onigwrap_search(). */
ONIGWRAP_EXTERN
void onigwrap_region_free(OnigRegion *region);

/* Release a regex returned by onigwrap_create(). */
ONIGWRAP_EXTERN
void onigwrap_free(regex_t *reg);

/* Search and return only the position of the first match
 * (-1: no match, -2: error). */
ONIGWRAP_EXTERN
int onigwrap_index_in(regex_t *reg, char *charPtr, int offset, int length);

/* Search and return the capture-group region; the caller frees it. */
ONIGWRAP_EXTERN
OnigRegion *onigwrap_search(regex_t *reg, char *charPtr, int offset, int length);

/* Number of capture-group slots in a region. */
ONIGWRAP_EXTERN
int onigwrap_num_regs(OnigRegion *region);

/* Start position of group `nth`, or -1. */
ONIGWRAP_EXTERN
int onigwrap_pos(OnigRegion *region, int nth);

/* Length of group `nth`, or -1 when `nth` is out of range. */
ONIGWRAP_EXTERN
int onigwrap_len(OnigRegion *region, int nth);

#endif /* ONIGWRAP_H */

View File

@@ -3,6 +3,7 @@ require "./formatter"
require "./rules"
require "./styles"
require "./lexer"
require "./onigmo"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@@ -10,16 +11,22 @@ require "./lexer"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = Onigmo::Regex
class Rule
property pattern : Regex = Re2.new ""
property pattern : Regex = Regex.new ""
property pattern2 : ::Regex = ::Regex.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
match2 = pattern2.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
return false, pos, [] of Token if match.nil? || match.end == 0
# The `match.begin > pos` is the same as the ANCHORED option
return false, pos, [] of Token if match.empty? || match[0].nil? || match[0].try { |m| m.begin > pos }
# p! match.map(&.to_s), match2, text[pos-1..pos + 20],"----------------------"
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
@@ -27,18 +34,23 @@ module Tartrazine
# Emit the token
tokens += action.emit(match, lexer)
end
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match.end, tokens
# Log.trace { "#{xml}, #{match[0].end}, #{tokens}" }
return true, pos + match[0].as(Onigmo::Match).value.size, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
@pattern = Re2.new(
node["pattern"],
multiline,
dotall,
ignorecase,
anchored: true)
pattern = node["pattern"]
# flags = Regex::Options::ANCHORED
flags = ::Regex::Options::NO_UTF_CHECK
# MULTILINE implies DOTALL which we don't want, so we
# use in-pattern flag (?m) instead
flags |= ::Regex::Options::MULTILINE if multiline
pattern = "(?m)" + pattern if multiline
flags |= ::Regex::Options::DOTALL if dotall
flags |= ::Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, ignorecase, multiline, dotall)
@pattern2 = ::Regex.new(pattern, flags)
add_actions(node)
end
@@ -90,25 +102,4 @@ module Tartrazine
add_actions(node)
end
end
# This is a hack to workaround that Crystal seems to disallow
# having regexes multiline but not dot_all
#
# The subclass builds the PCRE2 flag set by hand so that MULTILINE and
# DOTALL can be toggled independently of each other.
class Re2 < Regex
# Placeholder values for instance variables; presumably the ones the
# parent Regex class expects to be set - TODO confirm.
@source = "fa"
@options = Regex::Options::None
@jit = true
# Compiles *pattern* with PCRE2. UTF, DUPNAMES, UCP and NO_UTF_CHECK are
# always set; MULTILINE, DOTALL, CASELESS and ANCHORED are added only when
# the matching argument is truthy.
#
# Raises `Exception` carrying the compiler's error message when the
# pattern does not compile.
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
flags |= LibPCRE2::NO_UTF_CHECK
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
end
end