mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-27 14:47:50 -03:00
heuristics regexp engine configurable #2, skip rules at runtime
This commit is contained in:
@ -3,6 +3,15 @@
|
||||
// with colliding extensions, based on regexps from Linguist data.
|
||||
package rule
|
||||
|
||||
import "github.com/go-enry/go-enry/v2/regex"
|
||||
|
||||
// Matcher checks whether the data matches one or more patterns.
|
||||
// Every heuristic rule below implements this interface.
|
||||
// A *regexp.Regexp satisfies this interface and can be used instead.
|
||||
type Matcher interface {
|
||||
// Match reports whether data matches.
Match(data []byte) bool
|
||||
}
|
||||
|
||||
// Heuristic consists of (a number of) rules where each, if it matches,
|
||||
// identifies content as belonging to one or more programming languages.
|
||||
type Heuristic interface {
|
||||
@ -10,15 +19,7 @@ type Heuristic interface {
|
||||
Languages() []string
|
||||
}
|
||||
|
||||
// Matcher checks if the data matches (number of) pattern.
|
||||
// Every heuristic rule below implements this interface.
|
||||
// A regexp.Regexp satisfies this interface and can be used instead.
|
||||
type Matcher interface {
|
||||
Match(data []byte) bool
|
||||
}
|
||||
|
||||
// languages struct encapsulates data common to every Matcher: all languages
|
||||
// that it identifies.
|
||||
// languages base struct with all the languages that a Matcher identifies.
|
||||
type languages struct {
|
||||
langs []string
|
||||
}
|
||||
@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
|
||||
return languages{langs}
|
||||
}
|
||||
|
||||
func noLanguages() languages {
|
||||
return MatchingLanguages([]string{}...)
|
||||
}
|
||||
|
||||
// Implements a Heuristic.
|
||||
type or struct {
|
||||
languages
|
||||
@ -40,14 +45,19 @@ type or struct {
|
||||
}
|
||||
|
||||
// Or rule matches, if a single matching pattern exists.
|
||||
// It receives only one pattern as it relies on compile-time optimization that
|
||||
// represents union with | inside a single regexp.
|
||||
func Or(l languages, r Matcher) Heuristic {
|
||||
return or{l, r}
|
||||
// It receives only one pattern as it relies on optimization that
|
||||
// represents union with | inside a single regexp during code generation.
|
||||
func Or(l languages, p Matcher) Heuristic {
|
||||
//FIXME(bzz): this will not be the case as only some of the patterns may
|
||||
// be non-RE2 => we shouldn't collate them, so as not to lose the (accuracy of the) whole rule
|
||||
return or{l, p}
|
||||
}
|
||||
|
||||
// Match implements rule.Matcher.
|
||||
func (r or) Match(data []byte) bool {
|
||||
if runOnRE2AndRegexNotAccepted(r.pattern) {
|
||||
return false
|
||||
}
|
||||
return r.pattern.Match(data)
|
||||
}
|
||||
|
||||
@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
|
||||
// Match implements rule.Matcher.
|
||||
func (r and) Match(data []byte) bool {
|
||||
for _, p := range r.patterns {
|
||||
if runOnRE2AndRegexNotAccepted(p) {
|
||||
continue
|
||||
}
|
||||
if !p.Match(data) {
|
||||
return false
|
||||
}
|
||||
@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
|
||||
// Match implements rule.Matcher.
|
||||
func (r not) Match(data []byte) bool {
|
||||
for _, p := range r.Patterns {
|
||||
if runOnRE2AndRegexNotAccepted(p) {
|
||||
continue
|
||||
}
|
||||
if p.Match(data) {
|
||||
return false
|
||||
}
|
||||
@ -107,3 +123,9 @@ func Always(l languages) Heuristic {
|
||||
// Match implements rule.Matcher; it reports true for any data.
func (r always) Match(data []byte) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// checks if regular expression syntax isn't accepted by RE2 engine
|
||||
func runOnRE2AndRegexNotAccepted(re Matcher) bool {
|
||||
v, ok := re.(regex.EnryRegexp)
|
||||
return ok && v == nil
|
||||
}
|
||||
|
@ -1,39 +1,71 @@
|
||||
package rule
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
const lang = "ActionScript"
|
||||
|
||||
var fixtures = []struct {
|
||||
type fixture struct {
|
||||
name string
|
||||
rule Heuristic
|
||||
numLangs int
|
||||
matching string
|
||||
match string
|
||||
noMatch string
|
||||
}{
|
||||
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
|
||||
{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"},
|
||||
{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"},
|
||||
{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"},
|
||||
}
|
||||
|
||||
func TestRules(t *testing.T) {
|
||||
for _, f := range fixtures {
|
||||
t.Run(f.name, func(t *testing.T) {
|
||||
assert.NotNil(t, f.rule)
|
||||
assert.NotNil(t, f.rule.Languages())
|
||||
assert.Equal(t, f.numLangs, len(f.rule.Languages()))
|
||||
assert.Truef(t, f.rule.Match([]byte(f.matching)),
|
||||
"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule)
|
||||
if f.noMatch != "" {
|
||||
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
|
||||
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
|
||||
}
|
||||
var specificFixtures = map[string][]fixture{
|
||||
"": { // cases that don't vary between the engines
|
||||
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
|
||||
{"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
|
||||
{"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
|
||||
{"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
|
||||
// the results of these depend on the regex engine
|
||||
// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
|
||||
// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
|
||||
},
|
||||
regex.RE2: {
|
||||
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
|
||||
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
|
||||
},
|
||||
regex.Oniguruma: {
|
||||
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
|
||||
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
|
||||
{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
|
||||
},
|
||||
}
|
||||
|
||||
func testRulesForEngine(t *testing.T, engine string) {
|
||||
if engine != "" && regex.Name != engine {
|
||||
return
|
||||
}
|
||||
for _, f := range specificFixtures[engine] {
|
||||
t.Run(engine+f.name, func(t *testing.T) {
|
||||
check(t, f)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRules(t *testing.T) {
|
||||
//TODO(bzz): can all be run in parallel
|
||||
testRulesForEngine(t, "")
|
||||
testRulesForEngine(t, regex.RE2)
|
||||
testRulesForEngine(t, regex.Oniguruma)
|
||||
}
|
||||
|
||||
func check(t *testing.T, f fixture) {
|
||||
assert.NotNil(t, f.rule)
|
||||
assert.NotNil(t, f.rule.Languages())
|
||||
assert.Equal(t, f.numLangs, len(f.rule.Languages()))
|
||||
if f.match != "" {
|
||||
assert.Truef(t, f.rule.Match([]byte(f.match)),
|
||||
"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
|
||||
}
|
||||
if f.noMatch != "" {
|
||||
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
|
||||
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user