mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00
heuristics regexp engine configurable #2, skip rules at runtime
This commit is contained in:
parent
d8913b00e9
commit
3aeb9879da
@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample
|
|||||||
|
|
||||||
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
|
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
|
||||||
|
|
||||||
|
> All the issues related to heuristics' regexp syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instructions](#misc))
|
||||||
|
|
||||||
## Benchmarks
|
## Benchmarks
|
||||||
|
|
||||||
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
|
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
|
||||||
|
@ -3,6 +3,15 @@
|
|||||||
// with colliding extensions, based on regexps from Linguist data.
|
// with colliding extensions, based on regexps from Linguist data.
|
||||||
package rule
|
package rule
|
||||||
|
|
||||||
|
import "github.com/go-enry/go-enry/v2/regex"
|
||||||
|
|
||||||
|
// Matcher checks if the data matches (number of) pattern(s).
|
||||||
|
// Every heuristic rule below implements this interface.
|
||||||
|
// A regexp.Regexp satisfies this interface and can be used instead.
|
||||||
|
type Matcher interface {
|
||||||
|
Match(data []byte) bool
|
||||||
|
}
|
||||||
|
|
||||||
// Heuristic consist of (a number of) rules where each, if matches,
|
// Heuristic consist of (a number of) rules where each, if matches,
|
||||||
// identifies content as belonging to a programming language(s).
|
// identifies content as belonging to a programming language(s).
|
||||||
type Heuristic interface {
|
type Heuristic interface {
|
||||||
@ -10,15 +19,7 @@ type Heuristic interface {
|
|||||||
Languages() []string
|
Languages() []string
|
||||||
}
|
}
|
||||||
|
|
||||||
// Matcher checks if the data matches (number of) pattern.
|
// languages base struct with all the languages that a Matcher identifies.
|
||||||
// Every heuristic rule below implements this interface.
|
|
||||||
// A regexp.Regexp satisfies this interface and can be used instead.
|
|
||||||
type Matcher interface {
|
|
||||||
Match(data []byte) bool
|
|
||||||
}
|
|
||||||
|
|
||||||
// languages struct incapsulate data common to every Matcher: all languages
|
|
||||||
// that it identifies.
|
|
||||||
type languages struct {
|
type languages struct {
|
||||||
langs []string
|
langs []string
|
||||||
}
|
}
|
||||||
@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
|
|||||||
return languages{langs}
|
return languages{langs}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func noLanguages() languages {
|
||||||
|
return MatchingLanguages([]string{}...)
|
||||||
|
}
|
||||||
|
|
||||||
// Implements a Heuristic.
|
// Implements a Heuristic.
|
||||||
type or struct {
|
type or struct {
|
||||||
languages
|
languages
|
||||||
@ -40,14 +45,19 @@ type or struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Or rule matches, if a single matching pattern exists.
|
// Or rule matches, if a single matching pattern exists.
|
||||||
// It receives only one pattern as it relies on compile-time optimization that
|
// It receives only one pattern as it relies on optimization that
|
||||||
// represtes union with | inside a single regexp.
|
// represents union with | inside a single regexp during code generation.
|
||||||
func Or(l languages, r Matcher) Heuristic {
|
func Or(l languages, p Matcher) Heuristic {
|
||||||
return or{l, r}
|
//FIXME(bzz): this will not be the case as only some of the patterns may
|
||||||
|
// be non-RE2 => we shouldn't collate them, so as not to lose the (accuracy of the) whole rule
|
||||||
|
return or{l, p}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Match implements rule.Matcher.
|
// Match implements rule.Matcher.
|
||||||
func (r or) Match(data []byte) bool {
|
func (r or) Match(data []byte) bool {
|
||||||
|
if runOnRE2AndRegexNotAccepted(r.pattern) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
return r.pattern.Match(data)
|
return r.pattern.Match(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
|
|||||||
// Match implements data.Matcher.
|
// Match implements data.Matcher.
|
||||||
func (r and) Match(data []byte) bool {
|
func (r and) Match(data []byte) bool {
|
||||||
for _, p := range r.patterns {
|
for _, p := range r.patterns {
|
||||||
|
if runOnRE2AndRegexNotAccepted(p) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if !p.Match(data) {
|
if !p.Match(data) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
|
|||||||
// Match implements data.Matcher.
|
// Match implements data.Matcher.
|
||||||
func (r not) Match(data []byte) bool {
|
func (r not) Match(data []byte) bool {
|
||||||
for _, p := range r.Patterns {
|
for _, p := range r.Patterns {
|
||||||
|
if runOnRE2AndRegexNotAccepted(p) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if p.Match(data) {
|
if p.Match(data) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
@ -107,3 +123,9 @@ func Always(l languages) Heuristic {
|
|||||||
func (r always) Match(data []byte) bool {
|
func (r always) Match(data []byte) bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// checks if regular expression syntax isn't accepted by RE2 engine
|
||||||
|
func runOnRE2AndRegexNotAccepted(re Matcher) bool {
|
||||||
|
v, ok := re.(regex.EnryRegexp)
|
||||||
|
return ok && v == nil
|
||||||
|
}
|
||||||
|
@ -1,39 +1,71 @@
|
|||||||
package rule
|
package rule
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"regexp"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/go-enry/go-enry/v2/regex"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
)
|
)
|
||||||
|
|
||||||
const lang = "ActionScript"
|
const lang = "ActionScript"
|
||||||
|
|
||||||
var fixtures = []struct {
|
type fixture struct {
|
||||||
name string
|
name string
|
||||||
rule Heuristic
|
rule Heuristic
|
||||||
numLangs int
|
numLangs int
|
||||||
matching string
|
match string
|
||||||
noMatch string
|
noMatch string
|
||||||
}{
|
|
||||||
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
|
|
||||||
{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"},
|
|
||||||
{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"},
|
|
||||||
{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRules(t *testing.T) {
|
var specificFixtures = map[string][]fixture{
|
||||||
for _, f := range fixtures {
|
"": { // cases that don't vary between the engines
|
||||||
t.Run(f.name, func(t *testing.T) {
|
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
|
||||||
assert.NotNil(t, f.rule)
|
{"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
|
||||||
assert.NotNil(t, f.rule.Languages())
|
{"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
|
||||||
assert.Equal(t, f.numLangs, len(f.rule.Languages()))
|
{"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
|
||||||
assert.Truef(t, f.rule.Match([]byte(f.matching)),
|
// the results of these depend on the regex engine
|
||||||
"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule)
|
// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
|
||||||
if f.noMatch != "" {
|
// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
|
||||||
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
|
},
|
||||||
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
|
regex.RE2: {
|
||||||
}
|
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
|
||||||
|
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
|
||||||
|
},
|
||||||
|
regex.Oniguruma: {
|
||||||
|
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
|
||||||
|
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
|
||||||
|
{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
func testRulesForEngine(t *testing.T, engine string) {
|
||||||
|
if engine != "" && regex.Name != engine {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, f := range specificFixtures[engine] {
|
||||||
|
t.Run(engine+f.name, func(t *testing.T) {
|
||||||
|
check(t, f)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRules(t *testing.T) {
|
||||||
|
//TODO(bzz): can all be run in parallel
|
||||||
|
testRulesForEngine(t, "")
|
||||||
|
testRulesForEngine(t, regex.RE2)
|
||||||
|
testRulesForEngine(t, regex.Oniguruma)
|
||||||
|
}
|
||||||
|
|
||||||
|
func check(t *testing.T, f fixture) {
|
||||||
|
assert.NotNil(t, f.rule)
|
||||||
|
assert.NotNil(t, f.rule.Languages())
|
||||||
|
assert.Equal(t, f.numLangs, len(f.rule.Languages()))
|
||||||
|
if f.match != "" {
|
||||||
|
assert.Truef(t, f.rule.Match([]byte(f.match)),
|
||||||
|
"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
|
||||||
|
}
|
||||||
|
if f.noMatch != "" {
|
||||||
|
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
|
||||||
|
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -72,9 +72,12 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
|
|||||||
}
|
}
|
||||||
result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
|
result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
|
||||||
} else if len(rule.Pattern) != 0 { // OrPattern
|
} else if len(rule.Pattern) != 0 { // OrPattern
|
||||||
|
// FIXME(bzz): this optimization should only be applied if each pattern isRE2!
|
||||||
pattern := strings.Join(rule.Pattern, orPipe)
|
pattern := strings.Join(rule.Pattern, orPipe)
|
||||||
// TODO(bzz): handle len(Languages)==0 better e.g. by emiting rule.Rule
|
|
||||||
// instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )`
|
// TODO(bzz): handle the common case Or(len(Languages)==0) better
|
||||||
|
// e.g. by emitting `rule.Rule(...)` instead of
|
||||||
|
// an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )`
|
||||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
|
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||||
} else if rule.NegativePattern != "" { // NotPattern
|
} else if rule.NegativePattern != "" { // NotPattern
|
||||||
pattern := rule.NegativePattern
|
pattern := rule.NegativePattern
|
||||||
@ -87,7 +90,7 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !isRE2(result.Pattern) {
|
if !isRE2(result.Pattern) {
|
||||||
log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
|
log.Printf("RE2 incompatible rule: language:'%s', rule:'%s'\n", rule.Languages, result.Pattern)
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,8 @@ import (
|
|||||||
rubex "github.com/go-enry/go-oniguruma"
|
rubex "github.com/go-enry/go-oniguruma"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const Name = Oniguruma
|
||||||
|
|
||||||
type EnryRegexp = *rubex.Regexp
|
type EnryRegexp = *rubex.Regexp
|
||||||
|
|
||||||
func MustCompile(s string) EnryRegexp {
|
func MustCompile(s string) EnryRegexp {
|
||||||
|
9
regex/regex.go
Normal file
9
regex/regex.go
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
package regex
|
||||||
|
|
||||||
|
// Package regex abstracts regular expression engine
|
||||||
|
// that can be chosen at compile-time by a build tag.
|
||||||
|
|
||||||
|
const (
|
||||||
|
RE2 = "RE2"
|
||||||
|
Oniguruma = "Oniguruma"
|
||||||
|
)
|
@ -7,6 +7,8 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const Name = RE2
|
||||||
|
|
||||||
type EnryRegexp = *regexp.Regexp
|
type EnryRegexp = *regexp.Regexp
|
||||||
|
|
||||||
func MustCompile(str string) EnryRegexp {
|
func MustCompile(str string) EnryRegexp {
|
||||||
@ -21,9 +23,13 @@ func MustCompileMultiline(s string) EnryRegexp {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// MustCompileRuby used for expressions with syntax not supported by RE2.
|
// MustCompileRuby used for expressions with syntax not supported by RE2.
|
||||||
|
// Now it's confusing as we use the result as [data/rule.Matcher] and
|
||||||
|
//
|
||||||
|
// (*Matcher)(nil) != nil
|
||||||
|
//
|
||||||
|
// What is a better way for an expression to indicate unsupported syntax?
|
||||||
|
// e.g. add .IsValidSyntax() to both, Matcher interface and EnryRegexp implementations?
|
||||||
func MustCompileRuby(s string) EnryRegexp {
|
func MustCompileRuby(s string) EnryRegexp {
|
||||||
// TODO(bzz): find a bettee way?
|
|
||||||
// This will only trigger a panic on .Match() for the clients
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
27
regex/standard_test.go
Normal file
27
regex/standard_test.go
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
//go:build !oniguruma
|
||||||
|
// +build !oniguruma
|
||||||
|
|
||||||
|
package regex
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestMustCompileMultiline(t *testing.T) {
|
||||||
|
const re = `^\.(.*)!$`
|
||||||
|
want := MustCompileMultiline(re)
|
||||||
|
assert.Equal(t, "(?m)"+re, want.String())
|
||||||
|
|
||||||
|
const s = `.one
|
||||||
|
.two!
|
||||||
|
thre!`
|
||||||
|
if !want.MatchString(s) {
|
||||||
|
t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMustCompileRuby(t *testing.T) {
|
||||||
|
assert.Nil(t, MustCompileRuby(``))
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user