From 95f30f0db4f7128c3c09ca77a82ac87a5d8fed0c Mon Sep 17 00:00:00 2001 From: Alex Bezzubov Date: Fri, 14 Oct 2022 13:32:06 +0200 Subject: [PATCH] IsVendor: refactor RE collation optimization The same optimization still happens during package initialization at runtime, but an effort was made to make it more transparent and self-documented. Both the test & the benchmark were updated. Old version (usefull for benchmark) was ```go func IsVendor(filename string) bool { for _, matcher := range data.VendorMatchers { if matcher.MatchString(filename) { return true } } return false } ``` Test plan: * go test -run ^TestIsVendor$ github.com/go-enry/go-enry/v2 --- utils.go | 213 +++++++++++++++++++++----------------------------- utils_test.go | 90 +++++++++++---------- 2 files changed, 138 insertions(+), 165 deletions(-) diff --git a/utils.go b/utils.go index f48f29a..3a23c9b 100644 --- a/utils.go +++ b/utils.go @@ -2,8 +2,8 @@ package enry import ( "bytes" + "fmt" "path/filepath" - "regexp" "sort" "strings" @@ -63,11 +63,89 @@ func IsDotFile(path string) bool { return strings.HasPrefix(base, ".") && base != "." } -var isVendorRegExp *regexp.Regexp +var allVendorRegExp regex.EnryRegexp // IsVendor returns whether or not path is a vendor path. func IsVendor(path string) bool { - return isVendorRegExp.MatchString(path) + return allVendorRegExp.MatchString(path) +} + +func init() { + // We now collate all regexps from VendorMatchers to a single large regexp + // which is at least twice as fast to test than simply iterating & matching. + // + // --- + // + // We could test each matcher from VendorMatchers in turn i.e. + // + // func IsVendor(filename string) bool { + // for _, matcher := range data.VendorMatchers { + // if matcher.MatchString(filename) { + // return true + // } + // } + // return false + // } + // + // Or naïvely concatentate all these regexps using groups i.e. + // + // `(regexp1)|(regexp2)|(regexp3)|...` + // + // However, both of these are relatively slow and don't take advantage + // of the inherent structure within our regexps. + // + // Imperical observation: by looking at the regexps, we only have 3 types. + // 1. Those that start with `^` + // 2. Those that start with `(^|/)` + // 3. All the rest + // + // If we collate our regexps into these 3 groups - that will significantly + // reduce the likelihood of backtracking within the regexp trie matcher. + // + // A further improvement is to use non-capturing groups (?:) as otherwise + // the regexp parser, whilst matching, will have to allocate slices for + // matching positions. (A future improvement left out could be to + // enforce non-capturing groups within the sub-regexps.) + + matchers := data.VendorMatchers + sort.SliceStable(matchers, func(i, j int) bool { + return matchers[i].String() < matchers[j].String() + }) + + var caretPrefixed, caretOrSlashPrefixed, theRest []string + // Check prefix, add to the respective group slices + for _, matcher := range matchers { + str := matcher.String() + if strings.HasPrefix(str, "^") { + caretPrefixed = append(caretPrefixed, str[1:]) + } else if strings.HasPrefix(str, "(^|/)") { + caretOrSlashPrefixed = append(caretOrSlashPrefixed, str[5:]) + } else { + theRest = append(theRest, str) + } + } + var sb strings.Builder + // group 1 - start with `^` + appendGroupWithCommonPrefix(&sb, "^", caretPrefixed) + sb.WriteString("|") + // group 2 - start with `(^|/)` + appendGroupWithCommonPrefix(&sb, "(?:^|/)", caretOrSlashPrefixed) + sb.WriteString("|") + // grou 3, all rest. + appendGroupWithCommonPrefix(&sb, "", theRest) + allVendorRegExp = regex.MustCompile(sb.String()) +} + +func appendGroupWithCommonPrefix(sb *strings.Builder, commonPrefix string, res []string) { + sb.WriteString("(?:") + if commonPrefix != "" { + sb.WriteString(fmt.Sprintf("%s(?:(?:", commonPrefix)) + } + sb.WriteString(strings.Join(res, ")|(?:")) + if commonPrefix != "" { + sb.WriteString("))") + } + sb.WriteString(")") } // IsTest returns whether or not path is a test path. @@ -75,6 +153,16 @@ func IsTest(path string) bool { return matchRegexSlice(data.TestMatchers, path) } +func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool { + for _, expr := range exprs { + if expr.MatchString(str) { + return true + } + } + + return false +} + // IsBinary detects if data is a binary value based on: // http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198 func IsBinary(data []byte) bool { @@ -102,16 +190,6 @@ func GetColor(language string) string { return "#cccccc" } -func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool { - for _, expr := range exprs { - if expr.MatchString(str) { - return true - } - } - - return false -} - // IsGenerated returns whether the file with the given path and content is a // generated file. func IsGenerated(path string, content []byte) bool { @@ -135,112 +213,3 @@ func IsGenerated(path string, content []byte) bool { return false } - -func init() { - // We now collate the individual regexps that make up the VendorMatchers to - // produce a single large regexp which is around twice as fast to test than - // simply iterating through all the regexps or naïvely collating the - // regexps. - // - // --- - // - // data.VendorMatchers here is a slice containing individual regexps that - // match a vendor file therefore if we want to test if a filename is a - // Vendor we need to test whether that filename matches one or more of - // those regexps. - // - // Now we could test each matcher in turn using a shortcircuiting test i.e. - // - // func IsVendor(filename string) bool { - // for _, matcher := range data.VendorMatchers { - // if matcher.Match(filename) { - // return true - // } - // } - // return false - // } - // - // Or concatentate all these regexps using groups i.e. - // - // `(regexp1)|(regexp2)|(regexp3)|...` - // - // However both of these are relatively slow and they don't take advantage - // of the inherent structure within our regexps... - // - // If we look at our regexps there are essentially three types of regexp: - // - // 1. Those that start with `^` - // 2. Those that start with `(^|/)` - // 3. Others - // - // If we collate our regexps into these groups that will significantly - // reduce the likelihood of backtracking within the regexp trie matcher. - // - // A further improvement is to use non-capturing groups as otherwise the - // regexp parser, whilst matching, will have to allocate slices for - // matching positions. (A future improvement here could be in the use of - // enforcing non-capturing groups within the sub-regexps too.) - // - // Finally if we sort the segments we can help the matcher build a more - // efficient matcher and trie. - - // alias the VendorMatchers to simplify things - matchers := data.VendorMatchers - - // Create three temporary string slices for our three groups above - prefixes removed - caretStrings := make([]string, 0, 10) - caretSegmentStrings := make([]string, 0, 10) - matcherStrings := make([]string, 0, len(matchers)) - - // Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices - for _, matcher := range matchers { - str := matcher.String() - if str[0] == '^' { - caretStrings = append(caretStrings, str[1:]) - } else if str[0:5] == "(^|/)" { - caretSegmentStrings = append(caretSegmentStrings, str[5:]) - } else { - matcherStrings = append(matcherStrings, str) - } - } - - // Sort the strings within each group - a potential further improvement could be in simplifying within these groups - sort.Strings(caretSegmentStrings) - sort.Strings(caretStrings) - sort.Strings(matcherStrings) - - // Now build the collated regexp - sb := &strings.Builder{} - - // Start with group 1 - those that started with `^` - sb.WriteString("(?:^(?:(?:") - sb.WriteString(caretStrings[0]) - for _, matcher := range caretStrings[1:] { - sb.WriteString(")|(?:") - sb.WriteString(matcher) - } - sb.WriteString(")))") - sb.WriteString("|") - - // Now add group 2 - those that started with `(^|/)` - sb.WriteString("(?:(?:^|/)(?:(?:") - sb.WriteString(caretSegmentStrings[0]) - for _, matcher := range caretSegmentStrings[1:] { - sb.WriteString(")|(?:") - sb.WriteString(matcher) - } - sb.WriteString(")))") - sb.WriteString("|") - - // Finally add the rest - sb.WriteString("(?:") - sb.WriteString(matcherStrings[0]) - for _, matcher := range matcherStrings[1:] { - sb.WriteString(")|(?:") - sb.WriteString(matcher) - } - sb.WriteString(")") - - // Compile the whole thing as the isVendorRegExp - isVendorRegExp = regexp.MustCompile(sb.String()) -} diff --git a/utils_test.go b/utils_test.go index fd031ab..0a26069 100644 --- a/utils_test.go +++ b/utils_test.go @@ -11,48 +11,53 @@ import ( "github.com/stretchr/testify/require" ) +//TODO(bzz): port all from test/test_file_blob.rb test_vendored() +//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583 +var vendorTests = []struct { + path string + expected bool +}{ + {"cache/", true}, + {"something_cache/", false}, + {"random/cache/", true}, + {"cache", false}, + {"dependencies/", true}, + {"Dependencies/", true}, + {"dependency/", false}, + {"dist/", true}, + {"dist", false}, + {"random/dist/", true}, + {"random/dist", false}, + {"deps/", true}, + {"foodeps/", false}, + {"configure", true}, + {"a/configure", true}, + {"config.guess", true}, + {"config.guess/", false}, + {".vscode/", true}, + {"doc/_build/", true}, + {"a/docs/_build/", true}, + {"a/dasdocs/_build-vsdoc.js", true}, + {"a/dasdocs/_build-vsdoc.j", false}, + {"foo/bar", false}, + {".sublime-project", true}, + {"foo/vendor/foo", true}, + {"leaflet.draw-src.js", true}, + {"foo/bar/MochiKit.js", true}, + {"foo/bar/dojo.js", true}, + {"foo/env/whatever", true}, + {"some/python/venv/", false}, + {"foo/.imageset/bar", true}, + {"Vagrantfile", true}, + {"src/bootstrap-custom.js", true}, + // {"/css/bootstrap.rtl.css", true}, // from linguist v7.23 +} + func TestIsVendor(t *testing.T) { - tests := []struct { - path string - expected bool - }{ - {"cache/", true}, - {"something_cache/", false}, - {"random/cache/", true}, - {"cache", false}, - {"dependencies/", true}, - {"Dependencies/", true}, - {"dependency/", false}, - {"dist/", true}, - {"dist", false}, - {"random/dist/", true}, - {"random/dist", false}, - {"deps/", true}, - {"foodeps/", false}, - {"configure", true}, - {"a/configure", true}, - {"config.guess", true}, - {"config.guess/", false}, - {".vscode/", true}, - {"doc/_build/", true}, - {"a/docs/_build/", true}, - {"a/dasdocs/_build-vsdoc.js", true}, - {"a/dasdocs/_build-vsdoc.j", false}, - {"foo/bar", false}, - {".sublime-project", true}, - {"foo/vendor/foo", true}, - {"leaflet.draw-src.js", true}, - {"foo/bar/MochiKit.js", true}, - {"foo/bar/dojo.js", true}, - {"foo/env/whatever", true}, - {"some/python/venv/", false}, - {"foo/.imageset/bar", true}, - {"Vagrantfile", true}, - } - for _, tt := range tests { + for _, tt := range vendorTests { t.Run(tt.path, func(t *testing.T) { if got := IsVendor(tt.path); got != tt.expected { - t.Errorf("IsVendor() = %v, expected %v", got, tt.expected) + t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected) } }) } @@ -60,10 +65,9 @@ func TestIsVendor(t *testing.T) { func BenchmarkIsVendor(b *testing.B) { for i := 0; i < b.N; i++ { - IsVendor(".vscode/") - IsVendor("cache/") - IsVendor("foo/bar") - IsVendor("foo/bar/MochiKit.js") + for _, t := range vendorTests { + IsVendor(t.path) + } } }