heuristics regexp engine configurable #3, adapt IsVendor optimization & tests

Regex collation optimization for IsVendor now fails gracefully.
Tests that are affected by non-RE2 syntax are explicitly marked.
This commit is contained in:
Alex Bezzubov 2023-02-16 17:55:57 +01:00
parent 8df9e1ecf2
commit 8246efecce
4 changed files with 79 additions and 76 deletions

View File

@ -2,15 +2,21 @@ package data
import "github.com/go-enry/go-enry/v2/regex"
{{define "mustCompile" -}}
{{ if isRE2 . -}}
regex.MustCompile({{ . | stringVal }})
{{- else -}}
regex.MustCompileRuby({{ . | stringVal }})
{{- end -}}
{{end}}
var VendorMatchers = []regex.EnryRegexp{
{{range $re := . -}}
{{ if isRE2 $re -}}
regex.MustCompile({{ $re | stringVal }}),
{{- else -}}
regex.MustCompileRuby({{ $re | stringVal }}),
{{ end }}
{{ template "mustCompile" $re }},
{{end -}}
}
// FastVendorMatcher is equivalent to matching any of the VendorMatchers.
var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`)
{{with $singleRE := collateAllRegexps . -}}
var FastVendorMatcher = {{template "mustCompile" $singleRE}}
{{end}}

View File

@ -41,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
}
func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error {
funcs := template.FuncMap{"optimize": collateAllMatchers}
funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps}
return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps)
}
func collateAllMatchers(regexps []string) string {
// We now collate all regexps from VendorMatchers to a single large regexp
// collateAllRegexps all regexps to a single large regexp.
func collateAllRegexps(regexps []string) string {
// which is at least twice as fast to test than simply iterating & matching.
//
// ---
//
// We could test each matcher from VendorMatchers in turn i.e.
//
// func IsVendor(filename string) bool {
// for _, matcher := range data.VendorMatchers {
// if matcher.MatchString(filename) {
// return true
// }
// }
// return false
// }
//
// Or naïvely concatentate all these regexps using groups i.e.
//
// `(regexp1)|(regexp2)|(regexp3)|...`
//
// However, both of these are relatively slow and don't take advantage
// of the inherent structure within our regexps.
//
// Imperical observation: by looking at the regexps, we only have 3 types.
// 1. Those that start with `^`
// 2. Those that start with `(^|/)`
@ -88,12 +68,9 @@ func collateAllMatchers(regexps []string) string {
sort.Strings(regexps)
// Check prefix, group expressions
var caretPrefixed, caretOrSlashPrefixed, theRest []string
// Check prefix, add to the respective group slices
for _, re := range regexps {
if !isRE2(re) {
continue
}
if strings.HasPrefix(re, caret) {
caretPrefixed = append(caretPrefixed, re[len(caret):])
} else if strings.HasPrefix(re, caretOrSlash) {
@ -102,6 +79,7 @@ func collateAllMatchers(regexps []string) string {
theRest = append(theRest, re)
}
}
var sb strings.Builder
appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
sb.WriteString("|")

View File

@ -63,7 +63,21 @@ func IsDotFile(path string) bool {
// IsVendor returns whether or not path is a vendor path.
func IsVendor(path string) bool {
return data.FastVendorMatcher.MatchString(path)
// fast path: single collatated regex, if the engine supports its syntax
if data.FastVendorMatcher != nil {
return data.FastVendorMatcher.MatchString(path)
}
// slow path: skip individual rules with unsupported syntax
for _, matcher := range data.VendorMatchers {
if matcher == nil {
continue
}
if matcher.MatchString(path) {
return true
}
}
return false
}
// IsTest returns whether or not path is a test path.

View File

@ -7,57 +7,62 @@ import (
"path/filepath"
"testing"
"github.com/go-enry/go-enry/v2/regex"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
//TODO(bzz): port all from test/test_file_blob.rb test_vendored()
//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
// TODO(bzz): port all from test/test_file_blob.rb test_vendored()
// https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
var vendorTests = []struct {
path string
expected bool
skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2
path string
expected bool
}{
{"cache/", true},
{"something_cache/", false},
{"random/cache/", true},
{"cache", false},
{"dependencies/", true},
{"Dependencies/", true},
{"dependency/", false},
{"dist/", true},
{"dist", false},
{"random/dist/", true},
{"random/dist", false},
{"deps/", true},
{"foodeps/", false},
{"configure", true},
{"a/configure", true},
{"config.guess", true},
{"config.guess/", false},
{".vscode/", true},
{"doc/_build/", true},
{"a/docs/_build/", true},
{"a/dasdocs/_build-vsdoc.js", true},
{"a/dasdocs/_build-vsdoc.j", false},
{"foo/bar", false},
{".sublime-project", true},
{"foo/vendor/foo", true},
{"leaflet.draw-src.js", true},
{"foo/bar/MochiKit.js", true},
{"foo/bar/dojo.js", true},
{"foo/env/whatever", true},
{"some/python/venv/", false},
{"foo/.imageset/bar", true},
{"Vagrantfile", true},
{"src/bootstrap-custom.js", true},
// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23
{path: "cache/", expected: true},
{false, "something_cache/", false},
{false, "random/cache/", true},
{false, "cache", false},
{false, "dependencies/", true},
{false, "Dependencies/", true},
{false, "dependency/", false},
{false, "dist/", true},
{false, "dist", false},
{false, "random/dist/", true},
{false, "random/dist", false},
{false, "deps/", true},
{false, "foodeps/", false},
{false, "configure", true},
{false, "a/configure", true},
{false, "config.guess", true},
{false, "config.guess/", false},
{false, ".vscode/", true},
{false, "doc/_build/", true},
{false, "a/docs/_build/", true},
{false, "a/dasdocs/_build-vsdoc.js", true},
{false, "a/dasdocs/_build-vsdoc.j", false},
{false, "foo/bar", false},
{false, ".sublime-project", true},
{false, "foo/vendor/foo", true},
{false, "leaflet.draw-src.js", true},
{false, "foo/bar/MochiKit.js", true},
{false, "foo/bar/dojo.js", true},
{false, "foo/env/whatever", true},
{false, "some/python/venv/", false},
{false, "foo/.imageset/bar", true},
{false, "Vagrantfile", true},
{true, "src/bootstrap-custom.js", true},
// {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23
}
func TestIsVendor(t *testing.T) {
for _, tt := range vendorTests {
t.Run(tt.path, func(t *testing.T) {
if got := IsVendor(tt.path); got != tt.expected {
t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected)
for _, test := range vendorTests {
t.Run(test.path, func(t *testing.T) {
if got := IsVendor(test.path); got != test.expected {
if regex.Name == regex.RE2 && test.skipOnRE2 {
return // skip
}
t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name)
}
})
}