heuristics regexp engine configurable #3, adapt IsVendor optimization & tests

Regex collation optimization for IsVendor now fails gracefully.
Tests that are affected by non-RE2 syntax are explicitly marked.
This commit is contained in:
Alex Bezzubov 2023-02-16 17:55:57 +01:00
parent 8df9e1ecf2
commit 8246efecce
4 changed files with 79 additions and 76 deletions

View File

@ -2,15 +2,21 @@ package data
import "github.com/go-enry/go-enry/v2/regex" import "github.com/go-enry/go-enry/v2/regex"
{{define "mustCompile" -}}
{{ if isRE2 . -}}
regex.MustCompile({{ . | stringVal }})
{{- else -}}
regex.MustCompileRuby({{ . | stringVal }})
{{- end -}}
{{end}}
var VendorMatchers = []regex.EnryRegexp{ var VendorMatchers = []regex.EnryRegexp{
{{range $re := . -}} {{range $re := . -}}
{{ if isRE2 $re -}} {{ template "mustCompile" $re }},
regex.MustCompile({{ $re | stringVal }}),
{{- else -}}
regex.MustCompileRuby({{ $re | stringVal }}),
{{ end }}
{{end -}} {{end -}}
} }
// FastVendorMatcher is equivalent to matching any of the VendorMatchers. // FastVendorMatcher is equivalent to matching any of the VendorMatchers.
var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`) {{with $singleRE := collateAllRegexps . -}}
var FastVendorMatcher = {{template "mustCompile" $singleRE}}
{{end}}

View File

@ -41,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
} }
func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error { func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error {
funcs := template.FuncMap{"optimize": collateAllMatchers} funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps}
return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps) return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps)
} }
func collateAllMatchers(regexps []string) string { // collateAllRegexps collates all regexps into a single large regexp
// We now collate all regexps from VendorMatchers to a single large regexp func collateAllRegexps(regexps []string) string {
// which is at least twice as fast to test than simply iterating & matching. // which is at least twice as fast to test than simply iterating & matching.
// //
// ---
//
// We could test each matcher from VendorMatchers in turn i.e.
//
// func IsVendor(filename string) bool {
// for _, matcher := range data.VendorMatchers {
// if matcher.MatchString(filename) {
// return true
// }
// }
// return false
// }
//
// Or naïvely concatenate all these regexps using groups i.e.
//
// `(regexp1)|(regexp2)|(regexp3)|...`
//
// However, both of these are relatively slow and don't take advantage
// of the inherent structure within our regexps.
//
// Empirical observation: by looking at the regexps, we only have 3 types. // Empirical observation: by looking at the regexps, we only have 3 types.
// 1. Those that start with `^` // 1. Those that start with `^`
// 2. Those that start with `(^|/)` // 2. Those that start with `(^|/)`
@ -88,12 +68,9 @@ func collateAllMatchers(regexps []string) string {
sort.Strings(regexps) sort.Strings(regexps)
// Check prefix, group expressions
var caretPrefixed, caretOrSlashPrefixed, theRest []string var caretPrefixed, caretOrSlashPrefixed, theRest []string
// Check prefix, add to the respective group slices
for _, re := range regexps { for _, re := range regexps {
if !isRE2(re) {
continue
}
if strings.HasPrefix(re, caret) { if strings.HasPrefix(re, caret) {
caretPrefixed = append(caretPrefixed, re[len(caret):]) caretPrefixed = append(caretPrefixed, re[len(caret):])
} else if strings.HasPrefix(re, caretOrSlash) { } else if strings.HasPrefix(re, caretOrSlash) {
@ -102,6 +79,7 @@ func collateAllMatchers(regexps []string) string {
theRest = append(theRest, re) theRest = append(theRest, re)
} }
} }
var sb strings.Builder var sb strings.Builder
appendGroupWithCommonPrefix(&sb, "^", caretPrefixed) appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
sb.WriteString("|") sb.WriteString("|")

View File

@ -63,7 +63,21 @@ func IsDotFile(path string) bool {
// IsVendor returns whether or not path is a vendor path. // IsVendor returns whether or not path is a vendor path.
func IsVendor(path string) bool { func IsVendor(path string) bool {
return data.FastVendorMatcher.MatchString(path) // fast path: single collated regex, if the engine supports its syntax
if data.FastVendorMatcher != nil {
return data.FastVendorMatcher.MatchString(path)
}
// slow path: skip individual rules with unsupported syntax
for _, matcher := range data.VendorMatchers {
if matcher == nil {
continue
}
if matcher.MatchString(path) {
return true
}
}
return false
} }
// IsTest returns whether or not path is a test path. // IsTest returns whether or not path is a test path.

View File

@ -7,57 +7,62 @@ import (
"path/filepath" "path/filepath"
"testing" "testing"
"github.com/go-enry/go-enry/v2/regex"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
//TODO(bzz): port all from test/test_file_blob.rb test_vendored() // TODO(bzz): port all from test/test_file_blob.rb test_vendored()
//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583 // https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
var vendorTests = []struct { var vendorTests = []struct {
path string skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2
expected bool path string
expected bool
}{ }{
{"cache/", true}, {path: "cache/", expected: true},
{"something_cache/", false}, {false, "something_cache/", false},
{"random/cache/", true}, {false, "random/cache/", true},
{"cache", false}, {false, "cache", false},
{"dependencies/", true}, {false, "dependencies/", true},
{"Dependencies/", true}, {false, "Dependencies/", true},
{"dependency/", false}, {false, "dependency/", false},
{"dist/", true}, {false, "dist/", true},
{"dist", false}, {false, "dist", false},
{"random/dist/", true}, {false, "random/dist/", true},
{"random/dist", false}, {false, "random/dist", false},
{"deps/", true}, {false, "deps/", true},
{"foodeps/", false}, {false, "foodeps/", false},
{"configure", true}, {false, "configure", true},
{"a/configure", true}, {false, "a/configure", true},
{"config.guess", true}, {false, "config.guess", true},
{"config.guess/", false}, {false, "config.guess/", false},
{".vscode/", true}, {false, ".vscode/", true},
{"doc/_build/", true}, {false, "doc/_build/", true},
{"a/docs/_build/", true}, {false, "a/docs/_build/", true},
{"a/dasdocs/_build-vsdoc.js", true}, {false, "a/dasdocs/_build-vsdoc.js", true},
{"a/dasdocs/_build-vsdoc.j", false}, {false, "a/dasdocs/_build-vsdoc.j", false},
{"foo/bar", false}, {false, "foo/bar", false},
{".sublime-project", true}, {false, ".sublime-project", true},
{"foo/vendor/foo", true}, {false, "foo/vendor/foo", true},
{"leaflet.draw-src.js", true}, {false, "leaflet.draw-src.js", true},
{"foo/bar/MochiKit.js", true}, {false, "foo/bar/MochiKit.js", true},
{"foo/bar/dojo.js", true}, {false, "foo/bar/dojo.js", true},
{"foo/env/whatever", true}, {false, "foo/env/whatever", true},
{"some/python/venv/", false}, {false, "some/python/venv/", false},
{"foo/.imageset/bar", true}, {false, "foo/.imageset/bar", true},
{"Vagrantfile", true}, {false, "Vagrantfile", true},
{"src/bootstrap-custom.js", true}, {true, "src/bootstrap-custom.js", true},
// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23 // {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23
} }
func TestIsVendor(t *testing.T) { func TestIsVendor(t *testing.T) {
for _, tt := range vendorTests { for _, test := range vendorTests {
t.Run(tt.path, func(t *testing.T) { t.Run(test.path, func(t *testing.T) {
if got := IsVendor(tt.path); got != tt.expected { if got := IsVendor(test.path); got != test.expected {
t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected) if regex.Name == regex.RE2 && test.skipOnRE2 {
return // skip
}
t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name)
} }
}) })
} }