From 8246efeccef260550eceedacc13938502b17175c Mon Sep 17 00:00:00 2001 From: Alex Bezzubov Date: Thu, 16 Feb 2023 17:55:57 +0100 Subject: [PATCH] heuristics regexp engine configurable #3, adapt IsVendor optimization & tests Regex collation optimization for IsVendor now fails gracefully. Tests that are affected by non-RE2 syntax are explicitly marked. --- internal/code-generator/assets/vendor.go.tmpl | 18 ++-- internal/code-generator/generator/vendor.go | 32 ++----- utils.go | 16 +++- utils_test.go | 89 ++++++++++--------- 4 files changed, 79 insertions(+), 76 deletions(-) diff --git a/internal/code-generator/assets/vendor.go.tmpl b/internal/code-generator/assets/vendor.go.tmpl index a5d0c9f..19eb2b3 100644 --- a/internal/code-generator/assets/vendor.go.tmpl +++ b/internal/code-generator/assets/vendor.go.tmpl @@ -2,15 +2,21 @@ package data import "github.com/go-enry/go-enry/v2/regex" +{{define "mustCompile" -}} + {{ if isRE2 . -}} + regex.MustCompile({{ . | stringVal }}) + {{- else -}} + regex.MustCompileRuby({{ . | stringVal }}) + {{- end -}} +{{end}} + var VendorMatchers = []regex.EnryRegexp{ {{range $re := . -}} - {{ if isRE2 $re -}} - regex.MustCompile({{ $re | stringVal }}), - {{- else -}} - regex.MustCompileRuby({{ $re | stringVal }}), - {{ end }} + {{ template "mustCompile" $re }}, {{end -}} } // FastVendorMatcher is equivalent to matching any of the VendorMatchers. -var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`) \ No newline at end of file +{{with $singleRE := collateAllRegexps . -}} +var FastVendorMatcher = {{template "mustCompile" $singleRE}} +{{end}} \ No newline at end of file diff --git a/internal/code-generator/generator/vendor.go b/internal/code-generator/generator/vendor.go index cc3facc..fff86e2 100644 --- a/internal/code-generator/generator/vendor.go +++ b/internal/code-generator/generator/vendor.go @@ -41,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) } func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error { - funcs := template.FuncMap{"optimize": collateAllMatchers} + funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps} return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps) } -func collateAllMatchers(regexps []string) string { - // We now collate all regexps from VendorMatchers to a single large regexp +// collateAllRegexps all regexps to a single large regexp. +func collateAllRegexps(regexps []string) string { // which is at least twice as fast to test than simply iterating & matching. // - // --- - // - // We could test each matcher from VendorMatchers in turn i.e. - // - // func IsVendor(filename string) bool { - // for _, matcher := range data.VendorMatchers { - // if matcher.MatchString(filename) { - // return true - // } - // } - // return false - // } - // - // Or naïvely concatentate all these regexps using groups i.e. - // - // `(regexp1)|(regexp2)|(regexp3)|...` - // - // However, both of these are relatively slow and don't take advantage - // of the inherent structure within our regexps. - // // Imperical observation: by looking at the regexps, we only have 3 types. // 1. Those that start with `^` // 2. Those that start with `(^|/)` @@ -88,12 +68,9 @@ func collateAllMatchers(regexps []string) string { sort.Strings(regexps) + // Check prefix, group expressions var caretPrefixed, caretOrSlashPrefixed, theRest []string - // Check prefix, add to the respective group slices for _, re := range regexps { - if !isRE2(re) { - continue - } if strings.HasPrefix(re, caret) { caretPrefixed = append(caretPrefixed, re[len(caret):]) } else if strings.HasPrefix(re, caretOrSlash) { @@ -102,6 +79,7 @@ func collateAllMatchers(regexps []string) string { theRest = append(theRest, re) } } + var sb strings.Builder appendGroupWithCommonPrefix(&sb, "^", caretPrefixed) sb.WriteString("|") diff --git a/utils.go b/utils.go index fb3d6f4..5e18300 100644 --- a/utils.go +++ b/utils.go @@ -63,7 +63,21 @@ func IsDotFile(path string) bool { // IsVendor returns whether or not path is a vendor path. func IsVendor(path string) bool { - return data.FastVendorMatcher.MatchString(path) + // fast path: single collatated regex, if the engine supports its syntax + if data.FastVendorMatcher != nil { + return data.FastVendorMatcher.MatchString(path) + } + + // slow path: skip individual rules with unsupported syntax + for _, matcher := range data.VendorMatchers { + if matcher == nil { + continue + } + if matcher.MatchString(path) { + return true + } + } + return false } // IsTest returns whether or not path is a test path. diff --git a/utils_test.go b/utils_test.go index 0a26069..75f2a22 100644 --- a/utils_test.go +++ b/utils_test.go @@ -7,57 +7,62 @@ import ( "path/filepath" "testing" + "github.com/go-enry/go-enry/v2/regex" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -//TODO(bzz): port all from test/test_file_blob.rb test_vendored() -//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583 +// TODO(bzz): port all from test/test_file_blob.rb test_vendored() +// https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583 var vendorTests = []struct { - path string - expected bool + skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2 + path string + expected bool }{ - {"cache/", true}, - {"something_cache/", false}, - {"random/cache/", true}, - {"cache", false}, - {"dependencies/", true}, - {"Dependencies/", true}, - {"dependency/", false}, - {"dist/", true}, - {"dist", false}, - {"random/dist/", true}, - {"random/dist", false}, - {"deps/", true}, - {"foodeps/", false}, - {"configure", true}, - {"a/configure", true}, - {"config.guess", true}, - {"config.guess/", false}, - {".vscode/", true}, - {"doc/_build/", true}, - {"a/docs/_build/", true}, - {"a/dasdocs/_build-vsdoc.js", true}, - {"a/dasdocs/_build-vsdoc.j", false}, - {"foo/bar", false}, - {".sublime-project", true}, - {"foo/vendor/foo", true}, - {"leaflet.draw-src.js", true}, - {"foo/bar/MochiKit.js", true}, - {"foo/bar/dojo.js", true}, - {"foo/env/whatever", true}, - {"some/python/venv/", false}, - {"foo/.imageset/bar", true}, - {"Vagrantfile", true}, - {"src/bootstrap-custom.js", true}, - // {"/css/bootstrap.rtl.css", true}, // from linguist v7.23 + {path: "cache/", expected: true}, + {false, "something_cache/", false}, + {false, "random/cache/", true}, + {false, "cache", false}, + {false, "dependencies/", true}, + {false, "Dependencies/", true}, + {false, "dependency/", false}, + {false, "dist/", true}, + {false, "dist", false}, + {false, "random/dist/", true}, + {false, "random/dist", false}, + {false, "deps/", true}, + {false, "foodeps/", false}, + {false, "configure", true}, + {false, "a/configure", true}, + {false, "config.guess", true}, + {false, "config.guess/", false}, + {false, ".vscode/", true}, + {false, "doc/_build/", true}, + {false, "a/docs/_build/", true}, + {false, "a/dasdocs/_build-vsdoc.js", true}, + {false, "a/dasdocs/_build-vsdoc.j", false}, + {false, "foo/bar", false}, + {false, ".sublime-project", true}, + {false, "foo/vendor/foo", true}, + {false, "leaflet.draw-src.js", true}, + {false, "foo/bar/MochiKit.js", true}, + {false, "foo/bar/dojo.js", true}, + {false, "foo/env/whatever", true}, + {false, "some/python/venv/", false}, + {false, "foo/.imageset/bar", true}, + {false, "Vagrantfile", true}, + {true, "src/bootstrap-custom.js", true}, + // {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23 } func TestIsVendor(t *testing.T) { - for _, tt := range vendorTests { - t.Run(tt.path, func(t *testing.T) { - if got := IsVendor(tt.path); got != tt.expected { - t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected) + for _, test := range vendorTests { + t.Run(test.path, func(t *testing.T) { + if got := IsVendor(test.path); got != test.expected { + if regex.Name == regex.RE2 && test.skipOnRE2 { + return // skip + } + t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name) } }) }