Make IsVendor quicker

Although iterating across the regexps is quicker than naively concatenating them,
it is still quite slow.

This PR proposes a slightly cleverer solution.

First instead of just concatenating with groups this PR uses non-capturing groups.
This speeds up the regexp processing.

Secondly, we group the regexps into three groups: those that must match at the start
of the path (`^`), those that must match at the start of the path or of a path segment
(`(^|/)`), and the rest. This makes a considerable speed improvement.

Thirdly the regexps are sorted within those groups - which also speeds things up.

All in all for a non-vendored file this makes IsVendor around twice as fast.

Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
Andrew Thornton 2021-03-31 20:34:43 +01:00
parent d2d4c32d4d
commit 20726a1de3
No known key found for this signature in database
GPG Key ID: 3CDE74631F13A748

115
utils.go
View File

@ -3,6 +3,8 @@ package enry
import (
"bytes"
"path/filepath"
"regexp"
"sort"
"strings"
"github.com/go-enry/go-enry/v2/data"
@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
return strings.HasPrefix(base, ".") && base != "."
}
// isVendorRegExp is the single collated regexp that IsVendor matches paths
// against. It is compiled once, in init, from data.VendorMatchers.
var isVendorRegExp *regexp.Regexp
// IsVendor returns whether or not path is a vendor path.
func IsVendor(path string) bool {
return matchRegexSlice(data.VendorMatchers, path)
return isVendorRegExp.MatchString(path)
}
// IsTest returns whether or not path is a test path.
@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
return false
}
// init collates the individual regexps in data.VendorMatchers into a single
// large regexp, isVendorRegExp, which IsVendor then matches against.
func init() {
	// data.VendorMatchers is a slice of individual regexps, each of which
	// matches a vendor file. A path is a vendor path if it matches one or
	// more of them.
	//
	// We could test each matcher in turn using a short-circuiting loop, i.e.
	//
	//	func IsVendor(filename string) bool {
	//		for _, matcher := range data.VendorMatchers {
	//			if matcher.MatchString(filename) {
	//				return true
	//			}
	//		}
	//		return false
	//	}
	//
	// or concatenate all the regexps using capturing groups, i.e.
	//
	//	`(regexp1)|(regexp2)|(regexp3)|...`
	//
	// However, both of these are relatively slow and neither takes advantage
	// of the structure shared by our regexps. The original author measured
	// the collated form as roughly twice as fast for non-vendored files.
	//
	// There are essentially three types of regexp:
	//
	//	1. Those that start with `^`
	//	2. Those that start with `(^|/)`
	//	3. Others
	//
	// Collating the regexps into these groups lets the shared anchor be
	// written (and tested) once per group instead of once per pattern.
	//
	// Non-capturing groups are used throughout so that the matcher does not
	// have to track submatch positions. (A future improvement could be to
	// enforce non-capturing groups within the sub-regexps too.)
	//
	// Finally, the patterns are sorted within each group so that patterns
	// with common prefixes sit next to each other.
	//
	// NOTE(review): stripping the anchor and re-applying it to the whole
	// group assumes no pattern contains a top-level `|` after its anchor -
	// confirm against the generated data.VendorMatchers.

	// Split the matchers into the three groups above, removing the group
	// prefix. HasPrefix is used rather than indexing/slicing so that an
	// empty pattern, or one shorter than the prefix, cannot panic.
	var caretStrings, caretSegmentStrings, matcherStrings []string
	for _, matcher := range data.VendorMatchers {
		str := matcher.String()
		switch {
		case strings.HasPrefix(str, "^"):
			caretStrings = append(caretStrings, str[len("^"):])
		case strings.HasPrefix(str, "(^|/)"):
			caretSegmentStrings = append(caretSegmentStrings, str[len("(^|/)"):])
		default:
			matcherStrings = append(matcherStrings, str)
		}
	}

	// Sort the strings within each group - a potential further improvement
	// could be in simplifying within these groups.
	sort.Strings(caretStrings)
	sort.Strings(caretSegmentStrings)
	sort.Strings(matcherStrings)

	// Build one alternative per non-empty group. The alternatives of a group
	// are wrapped in their own group *after* the anchor: alternation binds
	// more loosely than concatenation, so `^(?:a)|(?:b)` would anchor only
	// `a`, whereas `^(?:(?:a)|(?:b))` anchors both.
	groups := make([]string, 0, 3)
	if alts := joinVendorAlternatives(caretStrings); alts != "" {
		// Group 1 - those that started with `^`.
		groups = append(groups, "(?:^"+alts+")")
	}
	if alts := joinVendorAlternatives(caretSegmentStrings); alts != "" {
		// Group 2 - those that started with `(^|/)`.
		groups = append(groups, "(?:(?:^|/)"+alts+")")
	}
	if alts := joinVendorAlternatives(matcherStrings); alts != "" {
		// Group 3 - the rest, unanchored.
		groups = append(groups, alts)
	}

	if len(groups) == 0 {
		// No matchers at all: nothing is a vendor path. An empty pattern
		// would match every path, so compile an empty character class,
		// which can never match.
		isVendorRegExp = regexp.MustCompile(`[^\x00-\x{10FFFF}]`)
		return
	}

	// Compile the whole thing as the isVendorRegExp.
	isVendorRegExp = regexp.MustCompile(strings.Join(groups, "|"))
}

// joinVendorAlternatives wraps each pattern in a non-capturing group and
// joins them into a single non-capturing alternation, e.g.
// `(?:(?:p0)|(?:p1))`. It returns "" when patterns is empty.
func joinVendorAlternatives(patterns []string) string {
	if len(patterns) == 0 {
		return ""
	}
	return "(?:(?:" + strings.Join(patterns, ")|(?:") + "))"
}