diff --git a/README.md b/README.md index b38277d..8011bfc 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample In all the cases above that have an issue number - we plan to update enry to match Linguist behavior. +> All the issues related to heuristics' regexp syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instuctions](#misc)) + ## Benchmarks Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples). diff --git a/data/rule/rule.go b/data/rule/rule.go index 7dc13bc..50da701 100644 --- a/data/rule/rule.go +++ b/data/rule/rule.go @@ -3,6 +3,15 @@ // with colliding extensions, based on regexps from Linguist data. package rule +import "github.com/go-enry/go-enry/v2/regex" + +// Matcher checks if the data matches (number of) pattern(s). +// Every heuristic rule below implements this interface. +// A regexp.Regexp satisfies this interface and can be used instead. +type Matcher interface { + Match(data []byte) bool +} + // Heuristic consist of (a number of) rules where each, if matches, // identifies content as belonging to a programming language(s). type Heuristic interface { @@ -10,15 +19,7 @@ type Heuristic interface { Languages() []string } -// Matcher checks if the data matches (number of) pattern. -// Every heuristic rule below implements this interface. -// A regexp.Regexp satisfies this interface and can be used instead. -type Matcher interface { - Match(data []byte) bool -} - -// languages struct incapsulate data common to every Matcher: all languages -// that it identifies. +// languages base struct with all the languages that a Matcher identifies. type languages struct { langs []string } @@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages { return languages{langs} } +func noLanguages() languages { + return MatchingLanguages([]string{}...) +} + // Implements a Heuristic. type or struct { languages @@ -40,14 +45,19 @@ type or struct { } // Or rule matches, if a single matching pattern exists. -// It receives only one pattern as it relies on compile-time optimization that -// represtes union with | inside a single regexp. -func Or(l languages, r Matcher) Heuristic { - return or{l, r} +// It receives only one pattern as it relies on optimization that +// represtes union with | inside a single regexp during code generation. +func Or(l languages, p Matcher) Heuristic { + //FIXME(bzz): this will not be the case as only some of the patterns may + // be non-RE2 => we shouldn't collate them not to loose the (accuracty of) whole rule + return or{l, p} } // Match implements rule.Matcher. func (r or) Match(data []byte) bool { + if runOnRE2AndRegexNotAccepted(r.pattern) { + return false + } return r.pattern.Match(data) } @@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic { // Match implements data.Matcher. func (r and) Match(data []byte) bool { for _, p := range r.patterns { + if runOnRE2AndRegexNotAccepted(p) { + continue + } if !p.Match(data) { return false } @@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic { // Match implements data.Matcher. func (r not) Match(data []byte) bool { for _, p := range r.Patterns { + if runOnRE2AndRegexNotAccepted(p) { + continue + } if p.Match(data) { return false } @@ -107,3 +123,9 @@ func Always(l languages) Heuristic { func (r always) Match(data []byte) bool { return true } + +// checks if regular expression syntax isn't accepted by RE2 engine +func runOnRE2AndRegexNotAccepted(re Matcher) bool { + v, ok := re.(regex.EnryRegexp) + return ok && v == nil +} diff --git a/data/rule/rule_test.go b/data/rule/rule_test.go index 208fc07..ce53b67 100644 --- a/data/rule/rule_test.go +++ b/data/rule/rule_test.go @@ -1,39 +1,71 @@ package rule import ( - "regexp" "testing" + "github.com/go-enry/go-enry/v2/regex" "github.com/stretchr/testify/assert" ) const lang = "ActionScript" -var fixtures = []struct { +type fixture struct { name string rule Heuristic numLangs int - matching string + match string noMatch string -}{ - {"Always", Always(MatchingLanguages(lang)), 1, "a", ""}, - {"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"}, - {"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"}, - {"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"}, } -func TestRules(t *testing.T) { - for _, f := range fixtures { - t.Run(f.name, func(t *testing.T) { - assert.NotNil(t, f.rule) - assert.NotNil(t, f.rule.Languages()) - assert.Equal(t, f.numLangs, len(f.rule.Languages())) - assert.Truef(t, f.rule.Match([]byte(f.matching)), - "'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule) - if f.noMatch != "" { - assert.Falsef(t, f.rule.Match([]byte(f.noMatch)), - "'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule) - } +var specificFixtures = map[string][]fixture{ + "": { // cases that don't vary between the engines + {"Always", Always(MatchingLanguages(lang)), 1, "a", ""}, + {"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"}, + {"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"}, + {"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"}, + // the results of these depend on the regex engine + // {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"}, + // {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"}, + }, + regex.RE2: { + {"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"}, + {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"}, + }, + regex.Oniguruma: { + {"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"}, + {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"}, + {"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"}, + }, +} + +func testRulesForEngine(t *testing.T, engine string) { + if engine != "" && regex.Name != engine { + return + } + for _, f := range specificFixtures[engine] { + t.Run(engine+f.name, func(t *testing.T) { + check(t, f) }) } } + +func TestRules(t *testing.T) { + //TODO(bzz): can all be run in parallel + testRulesForEngine(t, "") + testRulesForEngine(t, regex.RE2) + testRulesForEngine(t, regex.Oniguruma) +} + +func check(t *testing.T, f fixture) { + assert.NotNil(t, f.rule) + assert.NotNil(t, f.rule.Languages()) + assert.Equal(t, f.numLangs, len(f.rule.Languages())) + if f.match != "" { + assert.Truef(t, f.rule.Match([]byte(f.match)), + "'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule) + } + if f.noMatch != "" { + assert.Falsef(t, f.rule.Match([]byte(f.noMatch)), + "'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule) + } +} diff --git a/internal/code-generator/generator/heuristics.go b/internal/code-generator/generator/heuristics.go index 5a3475e..ac89b97 100644 --- a/internal/code-generator/generator/heuristics.go +++ b/internal/code-generator/generator/heuristics.go @@ -72,9 +72,12 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern } result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true} } else if len(rule.Pattern) != 0 { // OrPattern + // FIXME(bzz): this optimization should only be applied if each pattern isRE2! pattern := strings.Join(rule.Pattern, orPipe) - // TODO(bzz): handle len(Languages)==0 better e.g. by emiting rule.Rule - // instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )` + + // TODO(bzz): handle the common case Or(len(Languages)==0) better + // e.g. by emiting `rule.Rule(...)` instead of + // an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )` result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)} } else if rule.NegativePattern != "" { // NotPattern pattern := rule.NegativePattern @@ -87,7 +90,7 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern } if !isRE2(result.Pattern) { - log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern) + log.Printf("RE2 incompatible rule: language:'%s', rule:'%s'\n", rule.Languages, result.Pattern) } return result } diff --git a/regex/oniguruma.go b/regex/oniguruma.go index 40462da..c152739 100644 --- a/regex/oniguruma.go +++ b/regex/oniguruma.go @@ -7,6 +7,8 @@ import ( rubex "github.com/go-enry/go-oniguruma" ) +const Name = Oniguruma + type EnryRegexp = *rubex.Regexp func MustCompile(s string) EnryRegexp { diff --git a/regex/regex.go b/regex/regex.go new file mode 100644 index 0000000..6d38f3f --- /dev/null +++ b/regex/regex.go @@ -0,0 +1,9 @@ +package regex + +// Package regex abstracts regular expression engine +// that can be chosen at compile-time by a build tag. + +const ( + RE2 = "RE2" + Oniguruma = "Oniguruma" +) diff --git a/regex/standard.go b/regex/standard.go index 5ca9607..e9ccdb9 100644 --- a/regex/standard.go +++ b/regex/standard.go @@ -7,6 +7,8 @@ import ( "regexp" ) +const Name = RE2 + type EnryRegexp = *regexp.Regexp func MustCompile(str string) EnryRegexp { @@ -21,9 +23,13 @@ func MustCompileMultiline(s string) EnryRegexp { } // MustCompileRuby used for expressions with syntax not supported by RE2. +// Now it's confusing as we use the result as [data/rule.Matcher] and +// +// (*Matcher)(nil) != nil +// +// What is a better way for an expression to indicate unsupported syntax? +// e.g. add .IsValidSyntax() to both, Matcher interface and EnryRegexp implementations? func MustCompileRuby(s string) EnryRegexp { - // TODO(bzz): find a bettee way? - // This will only trigger a panic on .Match() for the clients return nil } diff --git a/regex/standard_test.go b/regex/standard_test.go new file mode 100644 index 0000000..9e7755b --- /dev/null +++ b/regex/standard_test.go @@ -0,0 +1,27 @@ +//go:build !oniguruma +// +build !oniguruma + +package regex + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMustCompileMultiline(t *testing.T) { + const re = `^\.(.*)!$` + want := MustCompileMultiline(re) + assert.Equal(t, "(?m)"+re, want.String()) + + const s = `.one +.two! +thre!` + if !want.MatchString(s) { + t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s) + } +} + +func TestMustCompileRuby(t *testing.T) { + assert.Nil(t, MustCompileRuby(``)) +}