mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00
IsVendor: refactor RE collation optimization
The same optimization still happens during package initialization at runtime, but an effort was made to make it more transparent and self-documented. Both the test & the benchmark were updated. Old version (usefull for benchmark) was ```go func IsVendor(filename string) bool { for _, matcher := range data.VendorMatchers { if matcher.MatchString(filename) { return true } } return false } ``` Test plan: * go test -run ^TestIsVendor$ github.com/go-enry/go-enry/v2
This commit is contained in:
parent
9b19067edc
commit
95f30f0db4
213
utils.go
213
utils.go
@ -2,8 +2,8 @@ package enry
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
@ -63,11 +63,89 @@ func IsDotFile(path string) bool {
|
||||
return strings.HasPrefix(base, ".") && base != "."
|
||||
}
|
||||
|
||||
var isVendorRegExp *regexp.Regexp
|
||||
var allVendorRegExp regex.EnryRegexp
|
||||
|
||||
// IsVendor returns whether or not path is a vendor path.
|
||||
func IsVendor(path string) bool {
|
||||
return isVendorRegExp.MatchString(path)
|
||||
return allVendorRegExp.MatchString(path)
|
||||
}
|
||||
|
||||
func init() {
|
||||
// We now collate all regexps from VendorMatchers to a single large regexp
|
||||
// which is at least twice as fast to test than simply iterating & matching.
|
||||
//
|
||||
// ---
|
||||
//
|
||||
// We could test each matcher from VendorMatchers in turn i.e.
|
||||
//
|
||||
// func IsVendor(filename string) bool {
|
||||
// for _, matcher := range data.VendorMatchers {
|
||||
// if matcher.MatchString(filename) {
|
||||
// return true
|
||||
// }
|
||||
// }
|
||||
// return false
|
||||
// }
|
||||
//
|
||||
// Or naïvely concatentate all these regexps using groups i.e.
|
||||
//
|
||||
// `(regexp1)|(regexp2)|(regexp3)|...`
|
||||
//
|
||||
// However, both of these are relatively slow and don't take advantage
|
||||
// of the inherent structure within our regexps.
|
||||
//
|
||||
// Imperical observation: by looking at the regexps, we only have 3 types.
|
||||
// 1. Those that start with `^`
|
||||
// 2. Those that start with `(^|/)`
|
||||
// 3. All the rest
|
||||
//
|
||||
// If we collate our regexps into these 3 groups - that will significantly
|
||||
// reduce the likelihood of backtracking within the regexp trie matcher.
|
||||
//
|
||||
// A further improvement is to use non-capturing groups (?:) as otherwise
|
||||
// the regexp parser, whilst matching, will have to allocate slices for
|
||||
// matching positions. (A future improvement left out could be to
|
||||
// enforce non-capturing groups within the sub-regexps.)
|
||||
|
||||
matchers := data.VendorMatchers
|
||||
sort.SliceStable(matchers, func(i, j int) bool {
|
||||
return matchers[i].String() < matchers[j].String()
|
||||
})
|
||||
|
||||
var caretPrefixed, caretOrSlashPrefixed, theRest []string
|
||||
// Check prefix, add to the respective group slices
|
||||
for _, matcher := range matchers {
|
||||
str := matcher.String()
|
||||
if strings.HasPrefix(str, "^") {
|
||||
caretPrefixed = append(caretPrefixed, str[1:])
|
||||
} else if strings.HasPrefix(str, "(^|/)") {
|
||||
caretOrSlashPrefixed = append(caretOrSlashPrefixed, str[5:])
|
||||
} else {
|
||||
theRest = append(theRest, str)
|
||||
}
|
||||
}
|
||||
var sb strings.Builder
|
||||
// group 1 - start with `^`
|
||||
appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
|
||||
sb.WriteString("|")
|
||||
// group 2 - start with `(^|/)`
|
||||
appendGroupWithCommonPrefix(&sb, "(?:^|/)", caretOrSlashPrefixed)
|
||||
sb.WriteString("|")
|
||||
// grou 3, all rest.
|
||||
appendGroupWithCommonPrefix(&sb, "", theRest)
|
||||
allVendorRegExp = regex.MustCompile(sb.String())
|
||||
}
|
||||
|
||||
func appendGroupWithCommonPrefix(sb *strings.Builder, commonPrefix string, res []string) {
|
||||
sb.WriteString("(?:")
|
||||
if commonPrefix != "" {
|
||||
sb.WriteString(fmt.Sprintf("%s(?:(?:", commonPrefix))
|
||||
}
|
||||
sb.WriteString(strings.Join(res, ")|(?:"))
|
||||
if commonPrefix != "" {
|
||||
sb.WriteString("))")
|
||||
}
|
||||
sb.WriteString(")")
|
||||
}
|
||||
|
||||
// IsTest returns whether or not path is a test path.
|
||||
@ -75,6 +153,16 @@ func IsTest(path string) bool {
|
||||
return matchRegexSlice(data.TestMatchers, path)
|
||||
}
|
||||
|
||||
func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool {
|
||||
for _, expr := range exprs {
|
||||
if expr.MatchString(str) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// IsBinary detects if data is a binary value based on:
|
||||
// http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198
|
||||
func IsBinary(data []byte) bool {
|
||||
@ -102,16 +190,6 @@ func GetColor(language string) string {
|
||||
return "#cccccc"
|
||||
}
|
||||
|
||||
func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool {
|
||||
for _, expr := range exprs {
|
||||
if expr.MatchString(str) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// IsGenerated returns whether the file with the given path and content is a
|
||||
// generated file.
|
||||
func IsGenerated(path string, content []byte) bool {
|
||||
@ -135,112 +213,3 @@ func IsGenerated(path string, content []byte) bool {
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func init() {
|
||||
// We now collate the individual regexps that make up the VendorMatchers to
|
||||
// produce a single large regexp which is around twice as fast to test than
|
||||
// simply iterating through all the regexps or naïvely collating the
|
||||
// regexps.
|
||||
//
|
||||
// ---
|
||||
//
|
||||
// data.VendorMatchers here is a slice containing individual regexps that
|
||||
// match a vendor file therefore if we want to test if a filename is a
|
||||
// Vendor we need to test whether that filename matches one or more of
|
||||
// those regexps.
|
||||
//
|
||||
// Now we could test each matcher in turn using a shortcircuiting test i.e.
|
||||
//
|
||||
// func IsVendor(filename string) bool {
|
||||
// for _, matcher := range data.VendorMatchers {
|
||||
// if matcher.Match(filename) {
|
||||
// return true
|
||||
// }
|
||||
// }
|
||||
// return false
|
||||
// }
|
||||
//
|
||||
// Or concatentate all these regexps using groups i.e.
|
||||
//
|
||||
// `(regexp1)|(regexp2)|(regexp3)|...`
|
||||
//
|
||||
// However both of these are relatively slow and they don't take advantage
|
||||
// of the inherent structure within our regexps...
|
||||
//
|
||||
// If we look at our regexps there are essentially three types of regexp:
|
||||
//
|
||||
// 1. Those that start with `^`
|
||||
// 2. Those that start with `(^|/)`
|
||||
// 3. Others
|
||||
//
|
||||
// If we collate our regexps into these groups that will significantly
|
||||
// reduce the likelihood of backtracking within the regexp trie matcher.
|
||||
//
|
||||
// A further improvement is to use non-capturing groups as otherwise the
|
||||
// regexp parser, whilst matching, will have to allocate slices for
|
||||
// matching positions. (A future improvement here could be in the use of
|
||||
// enforcing non-capturing groups within the sub-regexps too.)
|
||||
//
|
||||
// Finally if we sort the segments we can help the matcher build a more
|
||||
// efficient matcher and trie.
|
||||
|
||||
// alias the VendorMatchers to simplify things
|
||||
matchers := data.VendorMatchers
|
||||
|
||||
// Create three temporary string slices for our three groups above - prefixes removed
|
||||
caretStrings := make([]string, 0, 10)
|
||||
caretSegmentStrings := make([]string, 0, 10)
|
||||
matcherStrings := make([]string, 0, len(matchers))
|
||||
|
||||
// Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
|
||||
for _, matcher := range matchers {
|
||||
str := matcher.String()
|
||||
if str[0] == '^' {
|
||||
caretStrings = append(caretStrings, str[1:])
|
||||
} else if str[0:5] == "(^|/)" {
|
||||
caretSegmentStrings = append(caretSegmentStrings, str[5:])
|
||||
} else {
|
||||
matcherStrings = append(matcherStrings, str)
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
|
||||
sort.Strings(caretSegmentStrings)
|
||||
sort.Strings(caretStrings)
|
||||
sort.Strings(matcherStrings)
|
||||
|
||||
// Now build the collated regexp
|
||||
sb := &strings.Builder{}
|
||||
|
||||
// Start with group 1 - those that started with `^`
|
||||
sb.WriteString("(?:^(?:(?:")
|
||||
sb.WriteString(caretStrings[0])
|
||||
for _, matcher := range caretStrings[1:] {
|
||||
sb.WriteString(")|(?:")
|
||||
sb.WriteString(matcher)
|
||||
}
|
||||
sb.WriteString(")))")
|
||||
sb.WriteString("|")
|
||||
|
||||
// Now add group 2 - those that started with `(^|/)`
|
||||
sb.WriteString("(?:(?:^|/)(?:(?:")
|
||||
sb.WriteString(caretSegmentStrings[0])
|
||||
for _, matcher := range caretSegmentStrings[1:] {
|
||||
sb.WriteString(")|(?:")
|
||||
sb.WriteString(matcher)
|
||||
}
|
||||
sb.WriteString(")))")
|
||||
sb.WriteString("|")
|
||||
|
||||
// Finally add the rest
|
||||
sb.WriteString("(?:")
|
||||
sb.WriteString(matcherStrings[0])
|
||||
for _, matcher := range matcherStrings[1:] {
|
||||
sb.WriteString(")|(?:")
|
||||
sb.WriteString(matcher)
|
||||
}
|
||||
sb.WriteString(")")
|
||||
|
||||
// Compile the whole thing as the isVendorRegExp
|
||||
isVendorRegExp = regexp.MustCompile(sb.String())
|
||||
}
|
||||
|
@ -11,48 +11,53 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
//TODO(bzz): port all from test/test_file_blob.rb test_vendored()
|
||||
//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
|
||||
var vendorTests = []struct {
|
||||
path string
|
||||
expected bool
|
||||
}{
|
||||
{"cache/", true},
|
||||
{"something_cache/", false},
|
||||
{"random/cache/", true},
|
||||
{"cache", false},
|
||||
{"dependencies/", true},
|
||||
{"Dependencies/", true},
|
||||
{"dependency/", false},
|
||||
{"dist/", true},
|
||||
{"dist", false},
|
||||
{"random/dist/", true},
|
||||
{"random/dist", false},
|
||||
{"deps/", true},
|
||||
{"foodeps/", false},
|
||||
{"configure", true},
|
||||
{"a/configure", true},
|
||||
{"config.guess", true},
|
||||
{"config.guess/", false},
|
||||
{".vscode/", true},
|
||||
{"doc/_build/", true},
|
||||
{"a/docs/_build/", true},
|
||||
{"a/dasdocs/_build-vsdoc.js", true},
|
||||
{"a/dasdocs/_build-vsdoc.j", false},
|
||||
{"foo/bar", false},
|
||||
{".sublime-project", true},
|
||||
{"foo/vendor/foo", true},
|
||||
{"leaflet.draw-src.js", true},
|
||||
{"foo/bar/MochiKit.js", true},
|
||||
{"foo/bar/dojo.js", true},
|
||||
{"foo/env/whatever", true},
|
||||
{"some/python/venv/", false},
|
||||
{"foo/.imageset/bar", true},
|
||||
{"Vagrantfile", true},
|
||||
{"src/bootstrap-custom.js", true},
|
||||
// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23
|
||||
}
|
||||
|
||||
func TestIsVendor(t *testing.T) {
|
||||
tests := []struct {
|
||||
path string
|
||||
expected bool
|
||||
}{
|
||||
{"cache/", true},
|
||||
{"something_cache/", false},
|
||||
{"random/cache/", true},
|
||||
{"cache", false},
|
||||
{"dependencies/", true},
|
||||
{"Dependencies/", true},
|
||||
{"dependency/", false},
|
||||
{"dist/", true},
|
||||
{"dist", false},
|
||||
{"random/dist/", true},
|
||||
{"random/dist", false},
|
||||
{"deps/", true},
|
||||
{"foodeps/", false},
|
||||
{"configure", true},
|
||||
{"a/configure", true},
|
||||
{"config.guess", true},
|
||||
{"config.guess/", false},
|
||||
{".vscode/", true},
|
||||
{"doc/_build/", true},
|
||||
{"a/docs/_build/", true},
|
||||
{"a/dasdocs/_build-vsdoc.js", true},
|
||||
{"a/dasdocs/_build-vsdoc.j", false},
|
||||
{"foo/bar", false},
|
||||
{".sublime-project", true},
|
||||
{"foo/vendor/foo", true},
|
||||
{"leaflet.draw-src.js", true},
|
||||
{"foo/bar/MochiKit.js", true},
|
||||
{"foo/bar/dojo.js", true},
|
||||
{"foo/env/whatever", true},
|
||||
{"some/python/venv/", false},
|
||||
{"foo/.imageset/bar", true},
|
||||
{"Vagrantfile", true},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
for _, tt := range vendorTests {
|
||||
t.Run(tt.path, func(t *testing.T) {
|
||||
if got := IsVendor(tt.path); got != tt.expected {
|
||||
t.Errorf("IsVendor() = %v, expected %v", got, tt.expected)
|
||||
t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
@ -60,10 +65,9 @@ func TestIsVendor(t *testing.T) {
|
||||
|
||||
func BenchmarkIsVendor(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
IsVendor(".vscode/")
|
||||
IsVendor("cache/")
|
||||
IsVendor("foo/bar")
|
||||
IsVendor("foo/bar/MochiKit.js")
|
||||
for _, t := range vendorTests {
|
||||
IsVendor(t.path)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user