From 20726a1de3cdff1d64bf0a6e5f6e2475a2c96913 Mon Sep 17 00:00:00 2001
From: Andrew Thornton
Date: Wed, 31 Mar 2021 20:34:43 +0100
Subject: [PATCH] Make IsVendor quicker

Although iterating across the regexps is quicker than naively concatenating
them, it is still quite slow. This PR proposes a slightly cleverer solution.

First instead of just concatenating with groups this PR uses non-capturing
groups. This speeds up the regexp processing.

Secondly we group the regexps into 3 groups - those that have to be at the
start, those that are segments or at the start and the rest. This makes a
considerable speed improvement.

Thirdly the regexps are sorted within those groups - which also speeds things
up.

All in all for a non-vendored file this makes IsVendor around twice as fast.

Signed-off-by: Andrew Thornton
---
 utils.go | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 1 deletion(-)

diff --git a/utils.go b/utils.go
index 34da250..c3ff632 100644
--- a/utils.go
+++ b/utils.go
@@ -3,6 +3,8 @@ package enry
 import (
 	"bytes"
 	"path/filepath"
+	"regexp"
+	"sort"
 	"strings"
 
 	"github.com/go-enry/go-enry/v2/data"
@@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
 	return strings.HasPrefix(base, ".") && base != "."
 }
 
+var isVendorRegExp *regexp.Regexp
+
 // IsVendor returns whether or not path is a vendor path.
 func IsVendor(path string) bool {
-	return matchRegexSlice(data.VendorMatchers, path)
+	return isVendorRegExp.MatchString(path)
 }
 
 // IsTest returns whether or not path is a test path.
@@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
 
 	return false
 }
+
+func init() {
+	// We now collate the individual regexps that make up the VendorMatchers to
+	// produce a single large regexp which is around twice as fast to test as
+	// simply iterating through all the regexps or naïvely collating the
+	// regexps.
+	//
+	// ---
+	//
+	// data.VendorMatchers here is a slice containing individual regexps that
+	// match a vendor file therefore if we want to test if a filename is a
+	// Vendor we need to test whether that filename matches one or more of
+	// those regexps.
+	//
+	// Now we could test each matcher in turn using a short-circuiting test i.e.
+	//
+	//	func IsVendor(filename string) bool {
+	//		for _, matcher := range data.VendorMatchers {
+	//			if matcher.MatchString(filename) {
+	//				return true
+	//			}
+	//		}
+	//		return false
+	//	}
+	//
+	// Or concatenate all these regexps using groups i.e.
+	//
+	//	`(regexp1)|(regexp2)|(regexp3)|...`
+	//
+	// However both of these are relatively slow and they don't take advantage
+	// of the inherent structure within our regexps...
+	//
+	// If we look at our regexps there are essentially three types of regexp:
+	//
+	// 1. Those that start with `^`
+	// 2. Those that start with `(^|/)`
+	// 3. Others
+	//
+	// If we collate our regexps into these groups that will significantly
+	// reduce the likelihood of backtracking within the regexp trie matcher.
+	//
+	// A further improvement is to use non-capturing groups as otherwise the
+	// regexp parser, whilst matching, will have to allocate slices for
+	// matching positions. (A future improvement here could be in the use of
+	// enforcing non-capturing groups within the sub-regexps too.)
+	//
+	// Finally if we sort the segments we can help the matcher build a more
+	// efficient matcher and trie.
+
+	// alias the VendorMatchers to simplify things
+	matchers := data.VendorMatchers
+
+	// Create three temporary string slices for our three groups above - prefixes removed
+	caretStrings := make([]string, 0, 10)
+	caretSegmentStrings := make([]string, 0, 10)
+	matcherStrings := make([]string, 0, len(matchers))
+
+	// Walk the matchers, detect each group prefix with HasPrefix (no out-of-range panic on short or empty patterns), strip it and add to the respective slice
+	for _, matcher := range matchers {
+		str := matcher.String()
+		if strings.HasPrefix(str, "^") {
+			caretStrings = append(caretStrings, str[1:])
+		} else if strings.HasPrefix(str, "(^|/)") {
+			caretSegmentStrings = append(caretSegmentStrings, str[5:])
+		} else {
+			matcherStrings = append(matcherStrings, str)
+		}
+	}
+
+	// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
+	sort.Strings(caretSegmentStrings)
+	sort.Strings(caretStrings)
+	sort.Strings(matcherStrings)
+
+	// Now build the collated regexp - each group is assumed to be non-empty, indexing [0] below panics during init otherwise
+	sb := &strings.Builder{}
+
+	// Start with group 1 - those that started with `^` - the alternatives get a group of their own so that `^` anchors all of them, not just the first
+	sb.WriteString("(?:^(?:(?:")
+	sb.WriteString(caretStrings[0])
+	for _, matcher := range caretStrings[1:] {
+		sb.WriteString(")|(?:")
+		sb.WriteString(matcher)
+	}
+	sb.WriteString(")))")
+	sb.WriteString("|")
+
+	// Now add group 2 - those that started with `(^|/)` - again the alternatives are grouped so that the `(?:^|/)` prefix applies to every one of them
+	sb.WriteString("(?:(?:^|/)(?:(?:")
+	sb.WriteString(caretSegmentStrings[0])
+	for _, matcher := range caretSegmentStrings[1:] {
+		sb.WriteString(")|(?:")
+		sb.WriteString(matcher)
+	}
+	sb.WriteString(")))")
+	sb.WriteString("|")
+
+	// Finally add the rest - these carry no shared prefix so a plain alternation of non-capturing groups is enough
+	sb.WriteString("(?:")
+	sb.WriteString(matcherStrings[0])
+	for _, matcher := range matcherStrings[1:] {
+		sb.WriteString(")|(?:")
+		sb.WriteString(matcher)
+	}
+	sb.WriteString(")")
+
+	// Compile the whole thing as the isVendorRegExp
+	isVendorRegExp = regexp.MustCompile(sb.String())
+}