Merge pull request #151 from go-enry/distinguish-re-syntax

Syntax-aware regexp generation for configurable engines
2025-08-08 17:17:52 +00:00 · 2023-03-03 13:57:49 +01:00
parent ef6c17997c 8246efecce
commit 0e58945703
20 changed files with 1045 additions and 760 deletions
--- a/README.md
+++ b/README.md
@@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample

 In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.

+> All the issues related to heuristics' regexp syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instructions](#misc)).
+
 ## Benchmarks

 Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
--- a/data/content.go
+++ b/data/content.go
--- a/data/rule/rule.go
+++ b/data/rule/rule.go
@@ -3,6 +3,15 @@
 // with colliding extensions, based on regexps from Linguist data.
 package rule

+import "github.com/go-enry/go-enry/v2/regex"
+
+// Matcher checks if the data matches (number of) pattern(s).
+// Every heuristic rule below implements this interface.
+// A regexp.Regexp satisfies this interface and can be used instead.
+type Matcher interface {
+	Match(data []byte) bool
+}
+
 // Heuristic consist of (a number of) rules where each, if matches,
 // identifies content as belonging to a programming language(s).
 type Heuristic interface {
@@ -10,15 +19,7 @@ type Heuristic interface {
 	Languages() []string
 }

-// Matcher checks if the data matches (number of) pattern.
-// Every heuristic rule below implements this interface.
-// A regexp.Regexp satisfies this interface and can be used instead.
-type Matcher interface {
-	Match(data []byte) bool
-}
-
-// languages struct incapsulate data common to every Matcher: all languages
-// that it identifies.
+// languages is the base struct holding all the languages that a Matcher identifies.
 type languages struct {
 	langs []string
 }
@@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
 	return languages{langs}
 }

+func noLanguages() languages {
+	return MatchingLanguages([]string{}...)
+}
+
 // Implements a Heuristic.
 type or struct {
 	languages
@@ -40,14 +45,19 @@ type or struct {
 }

 // Or rule matches, if a single matching pattern exists.
-// It receives only one pattern as it relies on compile-time optimization that
-// represtes union with | inside a single regexp.
-func Or(l languages, r Matcher) Heuristic {
-	return or{l, r}
+// It receives only one pattern as it relies on optimization that
+// represents union with | inside a single regexp during code generation.
+func Or(l languages, p Matcher) Heuristic {
+	//FIXME(bzz): this will not be the case as only some of the patterns may
+	// be non-RE2 => we shouldn't collate them, so as not to lose the (accuracy of the) whole rule
+	return or{l, p}
 }

 // Match implements rule.Matcher.
 func (r or) Match(data []byte) bool {
+	if runOnRE2AndRegexNotAccepted(r.pattern) {
+		return false
+	}
 	return r.pattern.Match(data)
 }

@@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
 // Match implements data.Matcher.
 func (r and) Match(data []byte) bool {
 	for _, p := range r.patterns {
+		if runOnRE2AndRegexNotAccepted(p) {
+			continue
+		}
 		if !p.Match(data) {
 			return false
 		}
@@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
 // Match implements data.Matcher.
 func (r not) Match(data []byte) bool {
 	for _, p := range r.Patterns {
+		if runOnRE2AndRegexNotAccepted(p) {
+			continue
+		}
 		if p.Match(data) {
 			return false
 		}
@@ -107,3 +123,11 @@ func Always(l languages) Heuristic {
 func (r always) Match(data []byte) bool {
 	return true
 }
+
+// Checks if a regex syntax isn't accepted by the RE2 engine.
+// Such a regex is nil by construction from regex.MustCompileRuby but
+// is used here as a Matcher interface, which itself is non-nil.
+func runOnRE2AndRegexNotAccepted(re Matcher) bool {
+	v, ok := re.(regex.EnryRegexp)
+	return ok && v == nil
+}
--- a/data/rule/rule_test.go
+++ b/data/rule/rule_test.go
@@ -1,39 +1,71 @@
 package rule

 import (
-	"regexp"
 	"testing"

+	"github.com/go-enry/go-enry/v2/regex"
 	"github.com/stretchr/testify/assert"
 )

 const lang = "ActionScript"

-var fixtures = []struct {
+type fixture struct {
 	name     string
 	rule     Heuristic
 	numLangs int
-	matching string
+	match    string
 	noMatch  string
-}{
-	{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
-	{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"},
-	{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"},
-	{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"},
 }

-func TestRules(t *testing.T) {
-	for _, f := range fixtures {
-		t.Run(f.name, func(t *testing.T) {
-			assert.NotNil(t, f.rule)
-			assert.NotNil(t, f.rule.Languages())
-			assert.Equal(t, f.numLangs, len(f.rule.Languages()))
-			assert.Truef(t, f.rule.Match([]byte(f.matching)),
-				"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule)
-			if f.noMatch != "" {
-				assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
-					"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
-			}
+var specificFixtures = map[string][]fixture{
+	"": { // cases that don't vary between the engines
+		{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
+		{"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
+		{"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
+		{"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
+		// the results of these depend on the regex engine
+		// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
+		// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
+	},
+	regex.RE2: {
+		{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
+		{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
+	},
+	regex.Oniguruma: {
+		{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
+		{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
+		{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
+	},
+}
+
+func testRulesForEngine(t *testing.T, engine string) {
+	if engine != "" && regex.Name != engine {
+		return
+	}
+	for _, f := range specificFixtures[engine] {
+		t.Run(engine+f.name, func(t *testing.T) {
+			check(t, f)
 		})
 	}
 }
+
+func TestRules(t *testing.T) {
+	//TODO(bzz): can all be run in parallel
+	testRulesForEngine(t, "")
+	testRulesForEngine(t, regex.RE2)
+	testRulesForEngine(t, regex.Oniguruma)
+}
+
+func check(t *testing.T, f fixture) {
+	assert.NotNil(t, f.rule)
+	assert.NotNil(t, f.rule.Languages())
+	assert.Equal(t, f.numLangs, len(f.rule.Languages()))
+	if f.match != "" {
+		assert.Truef(t, f.rule.Match([]byte(f.match)),
+			"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
+	}
+	if f.noMatch != "" {
+		assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
+			"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
+	}
+}
--- a/enry.go
+++ b/enry.go
@@ -1,15 +1,15 @@
 /*
-	Package enry implements multiple strategies for programming language identification.
+Package enry identifies programming languages.

-	Identification is made based on file name and file content using a service
-	of strategies to narrow down possible option.
-	Each strategy is available as a separate API call, as well as a main enty point
+Identification is based on file name and content using a series
+of strategies to narrow down possible options.
+Each strategy is available as a separate API call, as well as through the main entry point:

-		GetLanguage(filename string, content []byte) (language string)
+	GetLanguage(filename string, content []byte) (language string)

-	It is a port of the https://github.com/github/linguist from Ruby.
-	Upstream Linguist YAML files are used to generate datastructures for data
-	package.
+It is a port of the https://github.com/github/linguist from Ruby.
+Upstream Linguist YAML files are used to generate datastructures for data
+package.
 */
 package enry // import "github.com/go-enry/go-enry/v2"

--- a/go.mod
+++ b/go.mod
@@ -4,6 +4,6 @@ go 1.14

 require (
 	github.com/go-enry/go-oniguruma v1.2.1
-	github.com/stretchr/testify v1.3.0
+	github.com/stretchr/testify v1.8.1
 	gopkg.in/yaml.v2 v2.2.8
 )
--- a/go.sum
+++ b/go.sum
@@ -1,16 +1,21 @@
-github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
-github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
 github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
-github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
 gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/internal/code-generator/assets/content.go.tmpl
+++ b/internal/code-generator/assets/content.go.tmpl
@@ -1,9 +1,8 @@
 package data

 import (
-	"regexp"
-
 	"github.com/go-enry/go-enry/v2/data/rule"
+	"github.com/go-enry/go-enry/v2/regex"
 )

 var ContentHeuristics = map[string]*Heuristics{
@@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
 	{{- else if eq .Op "Or" -}}
 		rule.Or(
 			{{ template "Languages" .Langs -}}
-			regexp.MustCompile({{ .Pattern | stringVal }}),
+			{{ template "mustCompile" . }}
 		),
 	{{- else if eq .Op "Not" -}}
 		rule.Not(
 			{{ template "Languages" .Langs -}}
-			regexp.MustCompile({{ .Pattern | stringVal }}),
+			{{ template "mustCompile" . }}
 		),
 	{{- else if eq .Op "Always" -}}
 		rule.Always(
@@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
 		rule.MatchingLanguages(""),
 	{{end -}}
 {{end}}
+
+{{define "mustCompile" -}}
+	{{ if .IsRE2  -}}
+		regex.MustCompileMultiline({{ .Pattern | stringVal }}),
+	{{- else -}}
+		regex.MustCompileRuby({{ .Pattern | stringVal }}),
+	{{ end -}}
+{{end}}
--- a/internal/code-generator/assets/vendor.go.tmpl
+++ b/internal/code-generator/assets/vendor.go.tmpl
@@ -2,11 +2,21 @@ package data

 import "github.com/go-enry/go-enry/v2/regex"

+{{define "mustCompile" -}}
+	{{ if isRE2 .  -}}
+		regex.MustCompile({{ . | stringVal }})
+	{{- else -}}
+		regex.MustCompileRuby({{ . | stringVal }})
+	{{- end -}}
+{{end}}
+
 var VendorMatchers = []regex.EnryRegexp{
-	{{range $regexp := . -}}
-	regex.MustCompile(`{{ $regexp }}`),
+	{{range $re := . -}}
+		{{ template "mustCompile" $re }},
 	{{end -}}
 }

 // FastVendorMatcher is equivalent to matching any of the VendorMatchers.
-var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`)
+{{with $singleRE := collateAllRegexps . -}}
+var FastVendorMatcher = {{template "mustCompile" $singleRE}}
+{{end}}
--- a/internal/code-generator/generator/generator.go
+++ b/internal/code-generator/generator/generator.go
@@ -3,7 +3,6 @@
 package generator

 import (
-	"bytes"
 	"fmt"
 	"go/format"
 	"io"
@@ -22,12 +21,15 @@ type File func(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit stri
 func formatedWrite(outPath string, source []byte) error {
 	formatedSource, err := format.Source(source)
 	if err != nil {
-		return err
+		err = fmt.Errorf("'go fmt' fails on %v", err)
+		// write unformatted source to simplify debugging
+		formatedSource = source
 	}
+
 	if err := ioutil.WriteFile(outPath, formatedSource, 0666); err != nil {
 		return err
 	}
-	return nil
+	return err
 }

 func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncMap, data interface{}) error {
@@ -40,35 +42,21 @@ func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncM
 		val = strings.ReplaceAll(val, "`", "`+\"`\"+`")
 		return fmt.Sprintf("`%s`", val)
 	}
-
-	const headerTmpl = "header.go.tmpl"
-	headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
-
-	h := template.Must(template.New(headerTmpl).Funcs(template.FuncMap{
-		"getCommit": getCommit,
-		"stringVal": stringVal,
-	}).ParseFiles(headerPath))
-
-	buf := bytes.NewBuffer(nil)
-	if err := h.Execute(buf, data); err != nil {
-		return err
-	}
-
 	if fmap == nil {
 		fmap = make(template.FuncMap)
 	}
 	fmap["getCommit"] = getCommit
 	fmap["stringVal"] = stringVal
+	fmap["isRE2"] = isRE2
+
+	const headerTmpl = "header.go.tmpl"
+	headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
+
+	h := template.Must(template.New(headerTmpl).Funcs(fmap).ParseFiles(headerPath))
+	if err := h.Execute(w, data); err != nil {
+		return err
+	}

 	t := template.Must(template.New(name).Funcs(fmap).ParseFiles(path))
-	if err := t.Execute(buf, data); err != nil {
-		return err
-	}
-
-	src, err := format.Source(buf.Bytes())
-	if err != nil {
-		return err
-	}
-	_, err = w.Write(src)
-	return err
+	return t.Execute(w, data)
 }
--- a/internal/code-generator/generator/heuristics.go
+++ b/internal/code-generator/generator/heuristics.go
@@ -70,25 +70,27 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
 			subp := loadRule(namedPatterns, r)
 			subPatterns = append(subPatterns, subp)
 		}
-		result = &LanguagePattern{"And", rule.Languages, "", subPatterns}
+		result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
 	} else if len(rule.Pattern) != 0 { // OrPattern
-		conjunction := strings.Join(rule.Pattern, orPipe)
-		pattern := convertToValidRegexp(conjunction)
-		result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
+		// FIXME(bzz): this optimization should only be applied if each pattern isRE2!
+		pattern := strings.Join(rule.Pattern, orPipe)
+
+		// TODO(bzz): handle the common case Or(len(Languages)==0) better
+		// e.g. by emitting `rule.Rule(...)` instead of
+		// an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )`
+		result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
 	} else if rule.NegativePattern != "" { // NotPattern
-		pattern := convertToValidRegexp(rule.NegativePattern)
-		result = &LanguagePattern{"Not", rule.Languages, pattern, nil}
+		pattern := rule.NegativePattern
+		result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
 	} else if rule.NamedPattern != "" { // Named OrPattern
-		conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
-		pattern := convertToValidRegexp(conjunction)
-		result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
+		pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
+		result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
 	} else { // AlwaysPattern
-		result = &LanguagePattern{"Always", rule.Languages, "", nil}
+		result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
 	}

-	if isUnsupportedRegexpSyntax(result.Pattern) {
-		log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
-		return nil
+	if !isRE2(result.Pattern) {
+		log.Printf("RE2 incompatible syntax for heuristic language:'%s', rule:'%s'\n", rule.Languages, result.Pattern)
 	}
 	return result
 }
@@ -100,6 +102,7 @@ type LanguagePattern struct {
 	Langs   []string
 	Pattern string
 	Rules   []*LanguagePattern
+	IsRE2   bool
 }

 type Heuristics struct {
@@ -125,7 +128,7 @@ type Patterns struct {
 }

 // StringArray is workaround for parsing named_pattern,
-// wich is sometimes arry and sometimes not.
+// which is sometimes an array and sometimes is not.
 // See https://github.com/go-yaml/yaml/issues/100
 type StringArray []string

@@ -173,8 +176,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
 		(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
 }

-// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent.
-// Does not work with Ruby regexp literals.
-func convertToValidRegexp(rubyRegexp string) string {
-	return multilinePrefix + rubyRegexp
+func isRE2(s string) bool {
+	return !isUnsupportedRegexpSyntax(s)
 }
--- a/internal/code-generator/generator/test_files/content.gold
+++ b/internal/code-generator/generator/test_files/content.gold
--- a/internal/code-generator/generator/vendor.go
+++ b/internal/code-generator/generator/vendor.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"log"
 	"sort"
 	"strings"
 	"text/template"
@@ -25,6 +26,12 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
 		return fmt.Errorf("failed to parse YAML %s, %q", fileToParse, err)
 	}

+	for _, re := range regexps {
+		if !isRE2(re) {
+			log.Printf("RE2 incompatible syntax for vendor:'%s'\n", re)
+		}
+	}
+
 	buf := &bytes.Buffer{}
 	if err := executeVendorTemplate(buf, regexps, tmplPath, tmplName, commit); err != nil {
 		return err
@@ -34,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
 }

 func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error {
-	funcs := template.FuncMap{"optimize": collateAllMatchers}
+	funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps}
 	return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps)
 }

-func collateAllMatchers(regexps []string) string {
-	// We now collate all regexps from VendorMatchers to a single large regexp
+// collateAllRegexps collates all regexps into a single large regexp.
+func collateAllRegexps(regexps []string) string {
 	// which is at least twice as fast to test than simply iterating & matching.
 	//
-	// ---
-	//
-	// We could test each matcher from VendorMatchers in turn i.e.
-	//
-	//  	func IsVendor(filename string) bool {
-	// 			for _, matcher := range data.VendorMatchers {
-	// 				if matcher.MatchString(filename) {
-	//					return true
-	//				}
-	//			}
-	//			return false
-	//		}
-	//
-	// Or naïvely concatentate all these regexps using groups i.e.
-	//
-	//		`(regexp1)|(regexp2)|(regexp3)|...`
-	//
-	// However, both of these are relatively slow and don't take advantage
-	// of the inherent structure within our regexps.
-	//
 	// Imperical observation: by looking at the regexps, we only have 3 types.
 	//  1. Those that start with `^`
 	//  2. Those that start with `(^|/)`
@@ -81,8 +68,8 @@ func collateAllMatchers(regexps []string) string {

 	sort.Strings(regexps)

+	// Check prefix, group expressions
 	var caretPrefixed, caretOrSlashPrefixed, theRest []string
-	// Check prefix, add to the respective group slices
 	for _, re := range regexps {
 		if strings.HasPrefix(re, caret) {
 			caretPrefixed = append(caretPrefixed, re[len(caret):])
@@ -92,6 +79,7 @@ func collateAllMatchers(regexps []string) string {
 			theRest = append(theRest, re)
 		}
 	}
+
 	var sb strings.Builder
 	appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
 	sb.WriteString("|")
--- a/internal/code-generator/main.go
+++ b/internal/code-generator/main.go
@@ -134,7 +134,7 @@ func main() {

 	for _, file := range fileList {
 		if err := file.generate(file.fileToParse, file.samplesDir, file.outPath, file.tmplPath, file.tmplName, file.commit); err != nil {
-			log.Fatalf("error generating template %q to %q: %+v", file.tmplPath, file.outPath, err)
+			log.Fatalf("failed to generate %q from %q - %+v", file.outPath, file.tmplPath, err)
 		}
 	}
 }
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -1,3 +1,4 @@
+//go:build oniguruma
 // +build oniguruma

 package regex
@@ -6,10 +7,21 @@ import (
 	rubex "github.com/go-enry/go-oniguruma"
 )

+const Name = Oniguruma
+
 type EnryRegexp = *rubex.Regexp

-func MustCompile(str string) EnryRegexp {
-	return rubex.MustCompileASCII(str)
+func MustCompile(s string) EnryRegexp {
+	return rubex.MustCompileASCII(s)
+}
+
+// MustCompileMultiline matches in multi-line mode by default with Oniguruma.
+func MustCompileMultiline(s string) EnryRegexp {
+	return MustCompile(s)
+}
+
+func MustCompileRuby(s string) EnryRegexp {
+	return MustCompile(s)
 }

 func QuoteMeta(s string) string {
--- a/regex/regex.go
+++ b/regex/regex.go
@@ -0,0 +1,9 @@
+package regex
+
+// Package regex abstracts regular expression engine
+// that can be chosen at compile-time by a build tag.
+
+const (
+	RE2       = "RE2"
+	Oniguruma = "Oniguruma"
+)
--- a/regex/standard.go
+++ b/regex/standard.go
@@ -1,3 +1,4 @@
+//go:build !oniguruma
 // +build !oniguruma

 package regex
@@ -6,12 +7,32 @@ import (
 	"regexp"
 )

+const Name = RE2
+
 type EnryRegexp = *regexp.Regexp

 func MustCompile(str string) EnryRegexp {
 	return regexp.MustCompile(str)
 }

+// MustCompileMultiline mimics Ruby defaults for regexp, where ^$ matches begin/end of line.
+// I.e. it converts Ruby regexp syntax to the RE2 equivalent.
+func MustCompileMultiline(s string) EnryRegexp {
+	const multilineModeFlag = "(?m)"
+	return regexp.MustCompile(multilineModeFlag + s)
+}
+
+// MustCompileRuby is used for expressions with syntax not supported by RE2.
+// Now it's confusing as we use the result as [data/rule.Matcher] and
+//
+//	rule.Matcher(EnryRegexp(nil)) != nil
+//
+// What is a better way for an expression to indicate unsupported syntax?
+// e.g. add .IsValidSyntax() to both, Matcher interface and EnryRegexp implementations?
+func MustCompileRuby(s string) EnryRegexp {
+	return nil
+}
+
 func QuoteMeta(s string) string {
 	return regexp.QuoteMeta(s)
 }
--- a/regex/standard_test.go
+++ b/regex/standard_test.go
@@ -0,0 +1,27 @@
+//go:build !oniguruma
+// +build !oniguruma
+
+package regex
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestMustCompileMultiline(t *testing.T) {
+	const re = `^\.(.*)!$`
+	want := MustCompileMultiline(re)
+	assert.Equal(t, "(?m)"+re, want.String())
+
+	const s = `.one
+.two!
+thre!`
+	if !want.MatchString(s) {
+		t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s)
+	}
+}
+
+func TestMustCompileRuby(t *testing.T) {
+	assert.Nil(t, MustCompileRuby(``))
+}
--- a/utils.go
+++ b/utils.go
@@ -63,7 +63,21 @@ func IsDotFile(path string) bool {

 // IsVendor returns whether or not path is a vendor path.
 func IsVendor(path string) bool {
-	return data.FastVendorMatcher.MatchString(path)
+	// fast path: single collated regex, if the engine supports its syntax
+	if data.FastVendorMatcher != nil {
+		return data.FastVendorMatcher.MatchString(path)
+	}
+
+	// slow path: skip individual rules with unsupported syntax
+	for _, matcher := range data.VendorMatchers {
+		if matcher == nil {
+			continue
+		}
+		if matcher.MatchString(path) {
+			return true
+		}
+	}
+	return false
 }

 // IsTest returns whether or not path is a test path.
--- a/utils_test.go
+++ b/utils_test.go
@@ -7,57 +7,62 @@ import (
 	"path/filepath"
 	"testing"

+	"github.com/go-enry/go-enry/v2/regex"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )

-//TODO(bzz): port all from test/test_file_blob.rb test_vendored()
-//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
+// TODO(bzz): port all from test/test_file_blob.rb test_vendored()
+// https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
 var vendorTests = []struct {
-	path     string
-	expected bool
+	skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2
+	path      string
+	expected  bool
 }{
-	{"cache/", true},
-	{"something_cache/", false},
-	{"random/cache/", true},
-	{"cache", false},
-	{"dependencies/", true},
-	{"Dependencies/", true},
-	{"dependency/", false},
-	{"dist/", true},
-	{"dist", false},
-	{"random/dist/", true},
-	{"random/dist", false},
-	{"deps/", true},
-	{"foodeps/", false},
-	{"configure", true},
-	{"a/configure", true},
-	{"config.guess", true},
-	{"config.guess/", false},
-	{".vscode/", true},
-	{"doc/_build/", true},
-	{"a/docs/_build/", true},
-	{"a/dasdocs/_build-vsdoc.js", true},
-	{"a/dasdocs/_build-vsdoc.j", false},
-	{"foo/bar", false},
-	{".sublime-project", true},
-	{"foo/vendor/foo", true},
-	{"leaflet.draw-src.js", true},
-	{"foo/bar/MochiKit.js", true},
-	{"foo/bar/dojo.js", true},
-	{"foo/env/whatever", true},
-	{"some/python/venv/", false},
-	{"foo/.imageset/bar", true},
-	{"Vagrantfile", true},
-	{"src/bootstrap-custom.js", true},
-	// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23
+	{path: "cache/", expected: true},
+	{false, "something_cache/", false},
+	{false, "random/cache/", true},
+	{false, "cache", false},
+	{false, "dependencies/", true},
+	{false, "Dependencies/", true},
+	{false, "dependency/", false},
+	{false, "dist/", true},
+	{false, "dist", false},
+	{false, "random/dist/", true},
+	{false, "random/dist", false},
+	{false, "deps/", true},
+	{false, "foodeps/", false},
+	{false, "configure", true},
+	{false, "a/configure", true},
+	{false, "config.guess", true},
+	{false, "config.guess/", false},
+	{false, ".vscode/", true},
+	{false, "doc/_build/", true},
+	{false, "a/docs/_build/", true},
+	{false, "a/dasdocs/_build-vsdoc.js", true},
+	{false, "a/dasdocs/_build-vsdoc.j", false},
+	{false, "foo/bar", false},
+	{false, ".sublime-project", true},
+	{false, "foo/vendor/foo", true},
+	{false, "leaflet.draw-src.js", true},
+	{false, "foo/bar/MochiKit.js", true},
+	{false, "foo/bar/dojo.js", true},
+	{false, "foo/env/whatever", true},
+	{false, "some/python/venv/", false},
+	{false, "foo/.imageset/bar", true},
+	{false, "Vagrantfile", true},
+	{true, "src/bootstrap-custom.js", true},
+	// {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23
 }

 func TestIsVendor(t *testing.T) {
-	for _, tt := range vendorTests {
-		t.Run(tt.path, func(t *testing.T) {
-			if got := IsVendor(tt.path); got != tt.expected {
-				t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected)
+	for _, test := range vendorTests {
+		t.Run(test.path, func(t *testing.T) {
+			if got := IsVendor(test.path); got != test.expected {
+				if regex.Name == regex.RE2 && test.skipOnRE2 {
+					return // skip
+				}
+				t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name)
 			}
 		})
 	}