heuristics regexp engine configurable #2, skip rules at runtime

2025-09-28 16:12:26 +00:00 · 2023-01-19 15:58:43 +01:00
parent d8913b00e9
commit 3aeb9879da
8 changed files with 141 additions and 38 deletions
--- a/README.md
+++ b/README.md
@@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample

 In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.

+> All the issues related to heuristics' regexp  syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instuctions](#misc))
+
 ## Benchmarks

 Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
--- a/data/rule/rule.go
+++ b/data/rule/rule.go
@@ -3,6 +3,15 @@
 // with colliding extensions, based on regexps from Linguist data.
 package rule

+import "github.com/go-enry/go-enry/v2/regex"
+
+// Matcher checks if the data matches (number of) pattern(s).
+// Every heuristic rule below implements this interface.
+// A regexp.Regexp satisfies this interface and can be used instead.
+type Matcher interface {
+	Match(data []byte) bool
+}
+
 // Heuristic consist of (a number of) rules where each, if matches,
 // identifies content as belonging to a programming language(s).
 type Heuristic interface {
@@ -10,15 +19,7 @@ type Heuristic interface {
 	Languages() []string
 }

-// Matcher checks if the data matches (number of) pattern.
-// Every heuristic rule below implements this interface.
-// A regexp.Regexp satisfies this interface and can be used instead.
-type Matcher interface {
-	Match(data []byte) bool
-}
-
-// languages struct incapsulate data common to every Matcher: all languages
-// that it identifies.
+// languages base struct with all the languages that a Matcher identifies.
 type languages struct {
 	langs []string
 }
@@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
 	return languages{langs}
 }

+func noLanguages() languages {
+	return MatchingLanguages([]string{}...)
+}
+
 // Implements a Heuristic.
 type or struct {
 	languages
@@ -40,14 +45,19 @@ type or struct {
 }

 // Or rule matches, if a single matching pattern exists.
-// It receives only one pattern as it relies on compile-time optimization that
-// represtes union with | inside a single regexp.
-func Or(l languages, r Matcher) Heuristic {
-	return or{l, r}
+// It receives only one pattern as it relies on optimization that
+// represtes union with | inside a single regexp during code generation.
+func Or(l languages, p Matcher) Heuristic {
+	//FIXME(bzz): this will not be the case as only some of the patterns may
+	// be non-RE2 => we shouldn't collate them not to loose the (accuracty of) whole rule
+	return or{l, p}
 }

 // Match implements rule.Matcher.
 func (r or) Match(data []byte) bool {
+	if runOnRE2AndRegexNotAccepted(r.pattern) {
+		return false
+	}
 	return r.pattern.Match(data)
 }

@@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
 // Match implements data.Matcher.
 func (r and) Match(data []byte) bool {
 	for _, p := range r.patterns {
+		if runOnRE2AndRegexNotAccepted(p) {
+			continue
+		}
 		if !p.Match(data) {
 			return false
 		}
@@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
 // Match implements data.Matcher.
 func (r not) Match(data []byte) bool {
 	for _, p := range r.Patterns {
+		if runOnRE2AndRegexNotAccepted(p) {
+			continue
+		}
 		if p.Match(data) {
 			return false
 		}
@@ -107,3 +123,9 @@ func Always(l languages) Heuristic {
 func (r always) Match(data []byte) bool {
 	return true
 }
+
+// checks if regular expression syntax isn't accepted by RE2 engine
+func runOnRE2AndRegexNotAccepted(re Matcher) bool {
+	v, ok := re.(regex.EnryRegexp)
+	return ok && v == nil
+}
--- a/data/rule/rule_test.go
+++ b/data/rule/rule_test.go
@@ -1,39 +1,71 @@
 package rule

 import (
-	"regexp"
 	"testing"

+	"github.com/go-enry/go-enry/v2/regex"
 	"github.com/stretchr/testify/assert"
 )

 const lang = "ActionScript"

-var fixtures = []struct {
+type fixture struct {
 	name     string
 	rule     Heuristic
 	numLangs int
-	matching string
+	match    string
 	noMatch  string
-}{
-	{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
-	{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"},
-	{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"},
-	{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"},
 }

-func TestRules(t *testing.T) {
-	for _, f := range fixtures {
-		t.Run(f.name, func(t *testing.T) {
-			assert.NotNil(t, f.rule)
-			assert.NotNil(t, f.rule.Languages())
-			assert.Equal(t, f.numLangs, len(f.rule.Languages()))
-			assert.Truef(t, f.rule.Match([]byte(f.matching)),
-				"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule)
-			if f.noMatch != "" {
-				assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
-					"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
-			}
+var specificFixtures = map[string][]fixture{
+	"": { // cases that don't vary between the engines
+		{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
+		{"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
+		{"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
+		{"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
+		// the results of these depend on the regex engine
+		// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
+		// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
+	},
+	regex.RE2: {
+		{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
+		{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
+	},
+	regex.Oniguruma: {
+		{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
+		{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
+		{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
+	},
+}
+
+func testRulesForEngine(t *testing.T, engine string) {
+	if engine != "" && regex.Name != engine {
+		return
+	}
+	for _, f := range specificFixtures[engine] {
+		t.Run(engine+f.name, func(t *testing.T) {
+			check(t, f)
 		})
 	}
 }
+
+func TestRules(t *testing.T) {
+	//TODO(bzz): can all be run in parallel
+	testRulesForEngine(t, "")
+	testRulesForEngine(t, regex.RE2)
+	testRulesForEngine(t, regex.Oniguruma)
+}
+
+func check(t *testing.T, f fixture) {
+	assert.NotNil(t, f.rule)
+	assert.NotNil(t, f.rule.Languages())
+	assert.Equal(t, f.numLangs, len(f.rule.Languages()))
+	if f.match != "" {
+		assert.Truef(t, f.rule.Match([]byte(f.match)),
+			"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
+	}
+	if f.noMatch != "" {
+		assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
+			"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
+	}
+}
--- a/internal/code-generator/generator/heuristics.go
+++ b/internal/code-generator/generator/heuristics.go
@@ -72,9 +72,12 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
 		}
 		result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
 	} else if len(rule.Pattern) != 0 { // OrPattern
+		// FIXME(bzz): this optimization should only be applied if each pattern isRE2!
 		pattern := strings.Join(rule.Pattern, orPipe)
-		// TODO(bzz): handle len(Languages)==0 better e.g. by emiting rule.Rule
-		// instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )`
+
+		// TODO(bzz): handle the common case Or(len(Languages)==0) better
+		// e.g. by emiting `rule.Rule(...)` instead of
+		// an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )`
 		result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
 	} else if rule.NegativePattern != "" { // NotPattern
 		pattern := rule.NegativePattern
@@ -87,7 +90,7 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
 	}

 	if !isRE2(result.Pattern) {
-		log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
+		log.Printf("RE2 incompatible rule: language:'%s', rule:'%s'\n", rule.Languages, result.Pattern)
 	}
 	return result
 }
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -7,6 +7,8 @@ import (
 	rubex "github.com/go-enry/go-oniguruma"
 )

+const Name = Oniguruma
+
 type EnryRegexp = *rubex.Regexp

 func MustCompile(s string) EnryRegexp {
--- a/regex/regex.go
+++ b/regex/regex.go
@@ -0,0 +1,9 @@
+package regex
+
+// Package regex abstracts regular expression engine
+// that can be chosen at compile-time by a build tag.
+
+const (
+	RE2       = "RE2"
+	Oniguruma = "Oniguruma"
+)
--- a/regex/standard.go
+++ b/regex/standard.go
@@ -7,6 +7,8 @@ import (
 	"regexp"
 )

+const Name = RE2
+
 type EnryRegexp = *regexp.Regexp

 func MustCompile(str string) EnryRegexp {
@@ -21,9 +23,13 @@ func MustCompileMultiline(s string) EnryRegexp {
 }

 // MustCompileRuby used for expressions with syntax not supported by RE2.
+// Now it's confusing as we use the result as [data/rule.Matcher] and
+//
+//	(*Matcher)(nil) != nil
+//
+// What is a better way for an expression to indicate unsupported syntax?
+// e.g. add .IsValidSyntax() to both, Matcher interface and EnryRegexp implementations?
 func MustCompileRuby(s string) EnryRegexp {
-	// TODO(bzz): find a bettee way?
-	// This will only trigger a panic on .Match() for the clients
 	return nil
 }

--- a/regex/standard_test.go
+++ b/regex/standard_test.go
@@ -0,0 +1,27 @@
+//go:build !oniguruma
+// +build !oniguruma
+
+package regex
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestMustCompileMultiline(t *testing.T) {
+	const re = `^\.(.*)!$`
+	want := MustCompileMultiline(re)
+	assert.Equal(t, "(?m)"+re, want.String())
+
+	const s = `.one
+.two!
+thre!`
+	if !want.MatchString(s) {
+		t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s)
+	}
+}
+
+func TestMustCompileRuby(t *testing.T) {
+	assert.Nil(t, MustCompileRuby(``))
+}