Merge pull request #151 from go-enry/distinguish-re-syntax

Syntax-aware regexp generation for configurable engines
2025-08-29 17:57:30 +00:00 · 2023-03-03 13:57:49 +01:00
parent ef6c17997c 8246efecce
commit 0e58945703
20 changed files with 1045 additions and 760 deletions
--- a/README.md
+++ b/README.md
@@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample
 In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
 > All the issues related to heuristics' regexp syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instructions](#misc))
 ## Benchmarks
 Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
--- a/data/content.go
+++ b/data/content.go
--- a/data/rule/rule.go
+++ b/data/rule/rule.go
@@ -3,6 +3,15 @@
 // with colliding extensions, based on regexps from Linguist data.
 package rule
 import "github.com/go-enry/go-enry/v2/regex"
 // Matcher checks if the data matches (number of) pattern(s).
 // Every heuristic rule below implements this interface.
 // A regexp.Regexp satisfies this interface and can be used instead.
 type Matcher interface {
 	Match(data []byte) bool
 }
 // Heuristic consists of (a number of) rules where each, if it matches,
 // identifies content as belonging to a programming language(s).
 type Heuristic interface {
@@ -10,15 +19,7 @@ type Heuristic interface {
 	Languages() []string
 }
-// Matcher checks if the data matches (number of) pattern.
+// languages base struct with all the languages that a Matcher identifies.
 // Every heuristic rule below implements this interface.
 // A regexp.Regexp satisfies this interface and can be used instead.
 type Matcher interface {
 	Match(data []byte) bool
 }
 // languages struct encapsulates data common to every Matcher: all languages
 // that it identifies.
 type languages struct {
 	langs []string
 }
@@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
 	return languages{langs}
 }
 func noLanguages() languages {
 	return MatchingLanguages([]string{}...)
 }
 // Implements a Heuristic.
 type or struct {
 	languages
@@ -40,14 +45,19 @@ type or struct {
 }
 // Or rule matches, if a single matching pattern exists.
-// It receives only one pattern as it relies on compile-time optimization that
+// It receives only one pattern as it relies on optimization that
-// represtes union with | inside a single regexp.
+// represents union with | inside a single regexp during code generation.
-func Or(l languages, r Matcher) Heuristic {
+func Or(l languages, p Matcher) Heuristic {
-	return or{l, r}
+	//FIXME(bzz): this will not be the case as only some of the patterns may
 	// be non-RE2 => we shouldn't collate them, so as not to lose the (accuracy of the) whole rule
 	return or{l, p}
 }
 // Match implements rule.Matcher.
 func (r or) Match(data []byte) bool {
 	if runOnRE2AndRegexNotAccepted(r.pattern) {
 		return false
 	}
 	return r.pattern.Match(data)
 }
@@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
 // Match implements data.Matcher.
 func (r and) Match(data []byte) bool {
 	for _, p := range r.patterns {
 		if runOnRE2AndRegexNotAccepted(p) {
 			continue
 		}
 		if !p.Match(data) {
 			return false
 		}
@@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
 // Match implements data.Matcher.
 func (r not) Match(data []byte) bool {
 	for _, p := range r.Patterns {
 		if runOnRE2AndRegexNotAccepted(p) {
 			continue
 		}
 		if p.Match(data) {
 			return false
 		}
@@ -107,3 +123,11 @@ func Always(l languages) Heuristic {
 func (r always) Match(data []byte) bool {
 	return true
 }
 // Checks if a regex syntax isn't accepted by RE2 engine.
 // It's nil by construction from regex.MustCompileRuby but
 // is used here as a Matcher interface which itself is non-nil.
 func runOnRE2AndRegexNotAccepted(re Matcher) bool {
 	v, ok := re.(regex.EnryRegexp)
 	return ok && v == nil
 }
--- a/data/rule/rule_test.go
+++ b/data/rule/rule_test.go
@@ -1,39 +1,71 @@
 package rule
 import (
 	"regexp"
 	"testing"
 	"github.com/go-enry/go-enry/v2/regex"
 	"github.com/stretchr/testify/assert"
 )
 const lang = "ActionScript"
-var fixtures = []struct {
+type fixture struct {
 	name     string
 	rule     Heuristic
 	numLangs int
-	matching string
+	match    string
 	noMatch  string
-}{
+}
 var specificFixtures = map[string][]fixture{
 	"": { // cases that don't vary between the engines
 		{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
-	{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"},
+		{"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
-	{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"},
+		{"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
-	{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"},
+		{"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
 		// the results of these depend on the regex engine
 		// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
 		// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
 	},
 	regex.RE2: {
 		{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
 		{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
 	},
 	regex.Oniguruma: {
 		{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
 		{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
 		{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
 	},
 }
 func testRulesForEngine(t *testing.T, engine string) {
 	if engine != "" && regex.Name != engine {
 		return
 	}
 	for _, f := range specificFixtures[engine] {
 		t.Run(engine+f.name, func(t *testing.T) {
 			check(t, f)
 		})
 	}
 }
 func TestRules(t *testing.T) {
-	for _, f := range fixtures {
+	//TODO(bzz): can all be run in parallel
-		t.Run(f.name, func(t *testing.T) {
+	testRulesForEngine(t, "")
 	testRulesForEngine(t, regex.RE2)
 	testRulesForEngine(t, regex.Oniguruma)
 }
 func check(t *testing.T, f fixture) {
 	assert.NotNil(t, f.rule)
 	assert.NotNil(t, f.rule.Languages())
 	assert.Equal(t, f.numLangs, len(f.rule.Languages()))
-			assert.Truef(t, f.rule.Match([]byte(f.matching)),
+	if f.match != "" {
-				"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule)
+		assert.Truef(t, f.rule.Match([]byte(f.match)),
 			"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
 	}
 	if f.noMatch != "" {
 		assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
 			"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
 	}
 		})
 	}
 }
--- a/enry.go
+++ b/enry.go
@@ -1,15 +1,15 @@
 /*
-	Package enry implements multiple strategies for programming language identification.
+Package enry identifies programming languages.
-	Identification is made based on file name and file content using a service
+Identification is based on file name and content using a series
-	of strategies to narrow down possible option.
+of strategies to narrow down possible options.
-	Each strategy is available as a separate API call, as well as a main enty point
+Each strategy is available as a separate API call, as well as through the main entry point:
 	GetLanguage(filename string, content []byte) (language string)
-	It is a port of the https://github.com/github/linguist from Ruby.
+It is a port of the https://github.com/github/linguist from Ruby.
-	Upstream Linguist YAML files are used to generate datastructures for data
+Upstream Linguist YAML files are used to generate datastructures for data
-	package.
+package.
 */
 package enry // import "github.com/go-enry/go-enry/v2"
--- a/go.mod
+++ b/go.mod
@@ -4,6 +4,6 @@ go 1.14
 require (
 	github.com/go-enry/go-oniguruma v1.2.1
-	github.com/stretchr/testify v1.3.0
+	github.com/stretchr/testify v1.8.1
 	gopkg.in/yaml.v2 v2.2.8
 )
--- a/go.sum
+++ b/go.sum
@@ -1,16 +1,21 @@
 github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
 github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
-github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
 gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/internal/code-generator/assets/content.go.tmpl
+++ b/internal/code-generator/assets/content.go.tmpl
@@ -1,9 +1,8 @@
 package data
 import (
 	"regexp"
 	"github.com/go-enry/go-enry/v2/data/rule"
 	"github.com/go-enry/go-enry/v2/regex"
 )
 var ContentHeuristics = map[string]*Heuristics{
@@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
 	{{- else if eq .Op "Or" -}}
 		rule.Or(
 			{{ template "Languages" .Langs -}}
-			regexp.MustCompile({{ .Pattern | stringVal }}),
+			{{ template "mustCompile" . }}
 		),
 	{{- else if eq .Op "Not" -}}
 		rule.Not(
 			{{ template "Languages" .Langs -}}
-			regexp.MustCompile({{ .Pattern | stringVal }}),
+			{{ template "mustCompile" . }}
 		),
 	{{- else if eq .Op "Always" -}}
 		rule.Always(
@@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
 		rule.MatchingLanguages(""),
 	{{end -}}
 {{end}}
 {{define "mustCompile" -}}
 	{{ if .IsRE2  -}}
 		regex.MustCompileMultiline({{ .Pattern | stringVal }}),
 	{{- else -}}
 		regex.MustCompileRuby({{ .Pattern | stringVal }}),
 	{{ end -}}
 {{end}}
--- a/internal/code-generator/assets/vendor.go.tmpl
+++ b/internal/code-generator/assets/vendor.go.tmpl
@@ -2,11 +2,21 @@ package data
 import "github.com/go-enry/go-enry/v2/regex"
 {{define "mustCompile" -}}
 	{{ if isRE2 .  -}}
 		regex.MustCompile({{ . | stringVal }})
 	{{- else -}}
 		regex.MustCompileRuby({{ . | stringVal }})
 	{{- end -}}
 {{end}}
 var VendorMatchers = []regex.EnryRegexp{
-	{{range $regexp := . -}}
+	{{range $re := . -}}
-	regex.MustCompile(`{{ $regexp }}`),
+		{{ template "mustCompile" $re }},
 	{{end -}}
 }
 // FastVendorMatcher is equivalent to matching any of the VendorMatchers.
-var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`)
+{{with $singleRE := collateAllRegexps . -}}
 var FastVendorMatcher = {{template "mustCompile" $singleRE}}
 {{end}}
--- a/internal/code-generator/generator/generator.go
+++ b/internal/code-generator/generator/generator.go
@@ -3,7 +3,6 @@
 package generator
 import (
 	"bytes"
 	"fmt"
 	"go/format"
 	"io"
@@ -22,12 +21,15 @@ type File func(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit stri
 func formatedWrite(outPath string, source []byte) error {
 	formatedSource, err := format.Source(source)
 	if err != nil {
-		return err
+		err = fmt.Errorf("'go fmt' fails on %v", err)
 		// write the unformatted source to simplify debugging
 		formatedSource = source
 	}
 	if err := ioutil.WriteFile(outPath, formatedSource, 0666); err != nil {
 		return err
 	}
-	return nil
+	return err
 }
 func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncMap, data interface{}) error {
@@ -40,35 +42,21 @@ func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncM
 		val = strings.ReplaceAll(val, "`", "`+\"`\"+`")
 		return fmt.Sprintf("`%s`", val)
 	}
 	const headerTmpl = "header.go.tmpl"
 	headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
 	h := template.Must(template.New(headerTmpl).Funcs(template.FuncMap{
 		"getCommit": getCommit,
 		"stringVal": stringVal,
 	}).ParseFiles(headerPath))
 	buf := bytes.NewBuffer(nil)
 	if err := h.Execute(buf, data); err != nil {
 		return err
 	}
 	if fmap == nil {
 		fmap = make(template.FuncMap)
 	}
 	fmap["getCommit"] = getCommit
 	fmap["stringVal"] = stringVal
 	fmap["isRE2"] = isRE2
 	const headerTmpl = "header.go.tmpl"
 	headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
 	h := template.Must(template.New(headerTmpl).Funcs(fmap).ParseFiles(headerPath))
 	if err := h.Execute(w, data); err != nil {
 		return err
 	}
 	t := template.Must(template.New(name).Funcs(fmap).ParseFiles(path))
-	if err := t.Execute(buf, data); err != nil {
+	return t.Execute(w, data)
 		return err
 	}
 	src, err := format.Source(buf.Bytes())
 	if err != nil {
 		return err
 	}
 	_, err = w.Write(src)
 	return err
 }
--- a/internal/code-generator/generator/heuristics.go
+++ b/internal/code-generator/generator/heuristics.go
@@ -70,25 +70,27 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
 			subp := loadRule(namedPatterns, r)
 			subPatterns = append(subPatterns, subp)
 		}
-		result = &LanguagePattern{"And", rule.Languages, "", subPatterns}
+		result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
 	} else if len(rule.Pattern) != 0 { // OrPattern
-		conjunction := strings.Join(rule.Pattern, orPipe)
+		// FIXME(bzz): this optimization should only be applied if each pattern isRE2!
-		pattern := convertToValidRegexp(conjunction)
+		pattern := strings.Join(rule.Pattern, orPipe)
-		result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
+
 		// TODO(bzz): handle the common case Or(len(Languages)==0) better
 		// e.g. by emitting `rule.Rule(...)` instead of
 		// an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )`
 		result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
 	} else if rule.NegativePattern != "" { // NotPattern
-		pattern := convertToValidRegexp(rule.NegativePattern)
+		pattern := rule.NegativePattern
-		result = &LanguagePattern{"Not", rule.Languages, pattern, nil}
+		result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
 	} else if rule.NamedPattern != "" { // Named OrPattern
-		conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
+		pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
-		pattern := convertToValidRegexp(conjunction)
+		result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
 		result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
 	} else { // AlwaysPattern
-		result = &LanguagePattern{"Always", rule.Languages, "", nil}
+		result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
 	}
-	if isUnsupportedRegexpSyntax(result.Pattern) {
+	if !isRE2(result.Pattern) {
-		log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
+		log.Printf("RE2 incompatible syntax for heuristic language:'%s', rule:'%s'\n", rule.Languages, result.Pattern)
 		return nil
 	}
 	return result
 }
@@ -100,6 +102,7 @@ type LanguagePattern struct {
 	Langs   []string
 	Pattern string
 	Rules   []*LanguagePattern
 	IsRE2   bool
 }
 type Heuristics struct {
@@ -125,7 +128,7 @@ type Patterns struct {
 }
 // StringArray is workaround for parsing named_pattern,
-// wich is sometimes arry and sometimes not.
+// which is sometimes an array and sometimes is not.
 // See https://github.com/go-yaml/yaml/issues/100
 type StringArray []string
@@ -173,8 +176,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
 		(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
 }
-// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent.
+func isRE2(s string) bool {
-// Does not work with Ruby regexp literals.
+	return !isUnsupportedRegexpSyntax(s)
 func convertToValidRegexp(rubyRegexp string) string {
 	return multilinePrefix + rubyRegexp
 }
--- a/internal/code-generator/generator/test_files/content.gold
+++ b/internal/code-generator/generator/test_files/content.gold
--- a/internal/code-generator/generator/vendor.go
+++ b/internal/code-generator/generator/vendor.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
 	"log"
 	"sort"
 	"strings"
 	"text/template"
@@ -25,6 +26,12 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
 		return fmt.Errorf("failed to parse YAML %s, %q", fileToParse, err)
 	}
 	for _, re := range regexps {
 		if !isRE2(re) {
 			log.Printf("RE2 incompatible syntax for vendor:'%s'\n", re)
 		}
 	}
 	buf := &bytes.Buffer{}
 	if err := executeVendorTemplate(buf, regexps, tmplPath, tmplName, commit); err != nil {
 		return err
@@ -34,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
 }
 func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error {
-	funcs := template.FuncMap{"optimize": collateAllMatchers}
+	funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps}
 	return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps)
 }
-func collateAllMatchers(regexps []string) string {
+// collateAllRegexps collates all regexps into a single large regexp.
-	// We now collate all regexps from VendorMatchers to a single large regexp
+func collateAllRegexps(regexps []string) string {
 	// which is at least twice as fast to test than simply iterating & matching.
 	//
 	// ---
 	//
 	// We could test each matcher from VendorMatchers in turn i.e.
 	//
 	//  	func IsVendor(filename string) bool {
 	// 			for _, matcher := range data.VendorMatchers {
 	// 				if matcher.MatchString(filename) {
 	//					return true
 	//				}
 	//			}
 	//			return false
 	//		}
 	//
 	// Or naïvely concatenate all these regexps using groups i.e.
 	//
 	//		`(regexp1)|(regexp2)|(regexp3)|...`
 	//
 	// However, both of these are relatively slow and don't take advantage
 	// of the inherent structure within our regexps.
 	//
 	// Empirical observation: by looking at the regexps, we only have 3 types.
 	//  1. Those that start with `^`
 	//  2. Those that start with `(^|/)`
@@ -81,8 +68,8 @@ func collateAllMatchers(regexps []string) string {
 	sort.Strings(regexps)
 	// Check prefix, group expressions
 	var caretPrefixed, caretOrSlashPrefixed, theRest []string
 	// Check prefix, add to the respective group slices
 	for _, re := range regexps {
 		if strings.HasPrefix(re, caret) {
 			caretPrefixed = append(caretPrefixed, re[len(caret):])
@@ -92,6 +79,7 @@ func collateAllMatchers(regexps []string) string {
 			theRest = append(theRest, re)
 		}
 	}
 	var sb strings.Builder
 	appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
 	sb.WriteString("|")
--- a/internal/code-generator/main.go
+++ b/internal/code-generator/main.go
@@ -134,7 +134,7 @@ func main() {
 	for _, file := range fileList {
 		if err := file.generate(file.fileToParse, file.samplesDir, file.outPath, file.tmplPath, file.tmplName, file.commit); err != nil {
-			log.Fatalf("error generating template %q to %q: %+v", file.tmplPath, file.outPath, err)
+			log.Fatalf("failed to generate %q from %q - %+v", file.outPath, file.tmplPath, err)
 		}
 	}
 }
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -1,3 +1,4 @@
 //go:build oniguruma
 // +build oniguruma
 package regex
@@ -6,10 +7,21 @@ import (
 	rubex "github.com/go-enry/go-oniguruma"
 )
 const Name = Oniguruma
 type EnryRegexp = *rubex.Regexp
-func MustCompile(str string) EnryRegexp {
+func MustCompile(s string) EnryRegexp {
-	return rubex.MustCompileASCII(str)
+	return rubex.MustCompileASCII(s)
 }
 // MustCompileMultiline matches in multi-line mode by default with Oniguruma.
 func MustCompileMultiline(s string) EnryRegexp {
 	return MustCompile(s)
 }
 func MustCompileRuby(s string) EnryRegexp {
 	return MustCompile(s)
 }
 func QuoteMeta(s string) string {
--- a/regex/regex.go
+++ b/regex/regex.go
@@ -0,0 +1,9 @@
 package regex
 // Package regex abstracts regular expression engine
 // that can be chosen at compile-time by a build tag.
 const (
 	RE2       = "RE2"
 	Oniguruma = "Oniguruma"
 )
--- a/regex/standard.go
+++ b/regex/standard.go
@@ -1,3 +1,4 @@
 //go:build !oniguruma
 // +build !oniguruma
 package regex
@@ -6,12 +7,32 @@ import (
 	"regexp"
 )
 const Name = RE2
 type EnryRegexp = *regexp.Regexp
 func MustCompile(str string) EnryRegexp {
 	return regexp.MustCompile(str)
 }
 // MustCompileMultiline mimics Ruby defaults for regexp, where ^$ matches begin/end of line.
 // I.e. it converts Ruby regexp syntax to its RE2 equivalent.
 func MustCompileMultiline(s string) EnryRegexp {
 	const multilineModeFlag = "(?m)"
 	return regexp.MustCompile(multilineModeFlag + s)
 }
 // MustCompileRuby used for expressions with syntax not supported by RE2.
 // Now it's confusing as we use the result as [data/rule.Matcher] and
 //
 //	(*Matcher)(nil) != nil
 //
 // What is a better way for an expression to indicate unsupported syntax?
 // e.g. add .IsValidSyntax() to both, Matcher interface and EnryRegexp implementations?
 func MustCompileRuby(s string) EnryRegexp {
 	return nil
 }
 func QuoteMeta(s string) string {
 	return regexp.QuoteMeta(s)
 }
--- a/regex/standard_test.go
+++ b/regex/standard_test.go
@@ -0,0 +1,27 @@
 //go:build !oniguruma
 // +build !oniguruma
 package regex
 import (
 	"testing"
 	"github.com/stretchr/testify/assert"
 )
 func TestMustCompileMultiline(t *testing.T) {
 	const re = `^\.(.*)!$`
 	want := MustCompileMultiline(re)
 	assert.Equal(t, "(?m)"+re, want.String())
 	const s = `.one
 .two!
 thre!`
 	if !want.MatchString(s) {
 		t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s)
 	}
 }
 func TestMustCompileRuby(t *testing.T) {
 	assert.Nil(t, MustCompileRuby(``))
 }
--- a/utils.go
+++ b/utils.go
@@ -63,7 +63,21 @@ func IsDotFile(path string) bool {
 // IsVendor returns whether or not path is a vendor path.
 func IsVendor(path string) bool {
 	// fast path: single collated regex, if the engine supports its syntax
 	if data.FastVendorMatcher != nil {
 		return data.FastVendorMatcher.MatchString(path)
 	}
 	// slow path: skip individual rules with unsupported syntax
 	for _, matcher := range data.VendorMatchers {
 		if matcher == nil {
 			continue
 		}
 		if matcher.MatchString(path) {
 			return true
 		}
 	}
 	return false
 }
 // IsTest returns whether or not path is a test path.
--- a/utils_test.go
+++ b/utils_test.go
@@ -7,57 +7,62 @@ import (
 	"path/filepath"
 	"testing"
 	"github.com/go-enry/go-enry/v2/regex"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
-//TODO(bzz): port all from test/test_file_blob.rb test_vendored()
+// TODO(bzz): port all from test/test_file_blob.rb test_vendored()
-//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
+// https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
 var vendorTests = []struct {
 	skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2
 	path      string
 	expected  bool
 }{
-	{"cache/", true},
+	{path: "cache/", expected: true},
-	{"something_cache/", false},
+	{false, "something_cache/", false},
-	{"random/cache/", true},
+	{false, "random/cache/", true},
-	{"cache", false},
+	{false, "cache", false},
-	{"dependencies/", true},
+	{false, "dependencies/", true},
-	{"Dependencies/", true},
+	{false, "Dependencies/", true},
-	{"dependency/", false},
+	{false, "dependency/", false},
-	{"dist/", true},
+	{false, "dist/", true},
-	{"dist", false},
+	{false, "dist", false},
-	{"random/dist/", true},
+	{false, "random/dist/", true},
-	{"random/dist", false},
+	{false, "random/dist", false},
-	{"deps/", true},
+	{false, "deps/", true},
-	{"foodeps/", false},
+	{false, "foodeps/", false},
-	{"configure", true},
+	{false, "configure", true},
-	{"a/configure", true},
+	{false, "a/configure", true},
-	{"config.guess", true},
+	{false, "config.guess", true},
-	{"config.guess/", false},
+	{false, "config.guess/", false},
-	{".vscode/", true},
+	{false, ".vscode/", true},
-	{"doc/_build/", true},
+	{false, "doc/_build/", true},
-	{"a/docs/_build/", true},
+	{false, "a/docs/_build/", true},
-	{"a/dasdocs/_build-vsdoc.js", true},
+	{false, "a/dasdocs/_build-vsdoc.js", true},
-	{"a/dasdocs/_build-vsdoc.j", false},
+	{false, "a/dasdocs/_build-vsdoc.j", false},
-	{"foo/bar", false},
+	{false, "foo/bar", false},
-	{".sublime-project", true},
+	{false, ".sublime-project", true},
-	{"foo/vendor/foo", true},
+	{false, "foo/vendor/foo", true},
-	{"leaflet.draw-src.js", true},
+	{false, "leaflet.draw-src.js", true},
-	{"foo/bar/MochiKit.js", true},
+	{false, "foo/bar/MochiKit.js", true},
-	{"foo/bar/dojo.js", true},
+	{false, "foo/bar/dojo.js", true},
-	{"foo/env/whatever", true},
+	{false, "foo/env/whatever", true},
-	{"some/python/venv/", false},
+	{false, "some/python/venv/", false},
-	{"foo/.imageset/bar", true},
+	{false, "foo/.imageset/bar", true},
-	{"Vagrantfile", true},
+	{false, "Vagrantfile", true},
-	{"src/bootstrap-custom.js", true},
+	{true, "src/bootstrap-custom.js", true},
-	// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23
+	// {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23
 }
 func TestIsVendor(t *testing.T) {
-	for _, tt := range vendorTests {
+	for _, test := range vendorTests {
-		t.Run(tt.path, func(t *testing.T) {
+		t.Run(test.path, func(t *testing.T) {
-			if got := IsVendor(tt.path); got != tt.expected {
+			if got := IsVendor(test.path); got != test.expected {
-				t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected)
+				if regex.Name == regex.RE2 && test.skipOnRE2 {
 					return // skip
 				}
 				t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name)
 			}
 		})
 	}