Merge pull request #151 from go-enry/distinguish-re-syntax

Syntax-aware regexp generation for configurable engines
This commit is contained in:
Alex 2023-03-03 13:57:49 +01:00 committed by GitHub
commit 0e58945703
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1045 additions and 760 deletions

View File

@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior. In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
> All the issues related to heuristics' regexp syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instructions](#misc)).
## Benchmarks ## Benchmarks
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples). Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).

File diff suppressed because it is too large Load Diff

View File

@ -3,6 +3,15 @@
// with colliding extensions, based on regexps from Linguist data. // with colliding extensions, based on regexps from Linguist data.
package rule package rule
import "github.com/go-enry/go-enry/v2/regex"
// Matcher reports whether the data matches one or more patterns.
// Every heuristic rule below implements this interface.
// A regexp.Regexp satisfies this interface as well, so a compiled
// expression can be used wherever a Matcher is expected.
type Matcher interface {
// Match returns true if data matches.
Match(data []byte) bool
}
// Heuristic consist of (a number of) rules where each, if matches, // Heuristic consist of (a number of) rules where each, if matches,
// identifies content as belonging to a programming language(s). // identifies content as belonging to a programming language(s).
type Heuristic interface { type Heuristic interface {
@ -10,15 +19,7 @@ type Heuristic interface {
Languages() []string Languages() []string
} }
// Matcher checks if the data matches (number of) pattern. // languages base struct with all the languages that a Matcher identifies.
// Every heuristic rule below implements this interface.
// A regexp.Regexp satisfies this interface and can be used instead.
type Matcher interface {
Match(data []byte) bool
}
// languages struct incapsulate data common to every Matcher: all languages
// that it identifies.
type languages struct { type languages struct {
langs []string langs []string
} }
@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
return languages{langs} return languages{langs}
} }
// noLanguages builds a languages value that identifies no language.
// An explicit empty slice is passed, so the stored list is empty
// but not nil.
func noLanguages() languages {
	none := []string{}
	return MatchingLanguages(none...)
}
// Implements a Heuristic. // Implements a Heuristic.
type or struct { type or struct {
languages languages
@ -40,14 +45,19 @@ type or struct {
} }
// Or rule matches, if a single matching pattern exists. // Or rule matches, if a single matching pattern exists.
// It receives only one pattern as it relies on compile-time optimization that // It receives only one pattern as it relies on optimization that
// represtes union with | inside a single regexp. // represtes union with | inside a single regexp during code generation.
func Or(l languages, r Matcher) Heuristic { func Or(l languages, p Matcher) Heuristic {
return or{l, r} //FIXME(bzz): this will not be the case as only some of the patterns may
// be non-RE2 => we shouldn't collate them, so as not to lose the (accuracy of the) whole rule
return or{l, p}
} }
// Match implements rule.Matcher. // Match implements rule.Matcher.
func (r or) Match(data []byte) bool { func (r or) Match(data []byte) bool {
if runOnRE2AndRegexNotAccepted(r.pattern) {
return false
}
return r.pattern.Match(data) return r.pattern.Match(data)
} }
@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
// Match implements data.Matcher. // Match implements data.Matcher.
func (r and) Match(data []byte) bool { func (r and) Match(data []byte) bool {
for _, p := range r.patterns { for _, p := range r.patterns {
if runOnRE2AndRegexNotAccepted(p) {
continue
}
if !p.Match(data) { if !p.Match(data) {
return false return false
} }
@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
// Match implements data.Matcher. // Match implements data.Matcher.
func (r not) Match(data []byte) bool { func (r not) Match(data []byte) bool {
for _, p := range r.Patterns { for _, p := range r.Patterns {
if runOnRE2AndRegexNotAccepted(p) {
continue
}
if p.Match(data) { if p.Match(data) {
return false return false
} }
@ -107,3 +123,11 @@ func Always(l languages) Heuristic {
func (r always) Match(data []byte) bool { func (r always) Match(data []byte) bool {
return true return true
} }
// runOnRE2AndRegexNotAccepted reports whether re is an expression whose
// syntax is not accepted by the RE2 engine.
// Such an expression is a nil regex.EnryRegexp by construction (see
// regex.MustCompileRuby), but it arrives here wrapped in a Matcher
// interface value, which itself is non-nil. A type assertion is therefore
// needed to look at the pointer inside.
func runOnRE2AndRegexNotAccepted(re Matcher) bool {
	if compiled, isRegexp := re.(regex.EnryRegexp); isRegexp {
		return compiled == nil
	}
	// Any other Matcher implementation is never treated as unsupported.
	return false
}

View File

@ -1,39 +1,71 @@
package rule package rule
import ( import (
"regexp"
"testing" "testing"
"github.com/go-enry/go-enry/v2/regex"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
) )
const lang = "ActionScript" const lang = "ActionScript"
var fixtures = []struct { type fixture struct {
name string name string
rule Heuristic rule Heuristic
numLangs int numLangs int
matching string match string
noMatch string noMatch string
}{ }
var specificFixtures = map[string][]fixture{
"": { // cases that don't vary between the engines
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""}, {"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"}, {"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"}, {"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"}, {"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
// the results of these depend on the regex engine
// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
},
regex.RE2: {
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
},
regex.Oniguruma: {
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
},
}
func testRulesForEngine(t *testing.T, engine string) {
if engine != "" && regex.Name != engine {
return
}
for _, f := range specificFixtures[engine] {
t.Run(engine+f.name, func(t *testing.T) {
check(t, f)
})
}
} }
func TestRules(t *testing.T) { func TestRules(t *testing.T) {
for _, f := range fixtures { //TODO(bzz): can all be run in parallel
t.Run(f.name, func(t *testing.T) { testRulesForEngine(t, "")
testRulesForEngine(t, regex.RE2)
testRulesForEngine(t, regex.Oniguruma)
}
func check(t *testing.T, f fixture) {
assert.NotNil(t, f.rule) assert.NotNil(t, f.rule)
assert.NotNil(t, f.rule.Languages()) assert.NotNil(t, f.rule.Languages())
assert.Equal(t, f.numLangs, len(f.rule.Languages())) assert.Equal(t, f.numLangs, len(f.rule.Languages()))
assert.Truef(t, f.rule.Match([]byte(f.matching)), if f.match != "" {
"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule) assert.Truef(t, f.rule.Match([]byte(f.match)),
"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
}
if f.noMatch != "" { if f.noMatch != "" {
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)), assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule) "'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
} }
})
}
} }

14
enry.go
View File

@ -1,15 +1,15 @@
/* /*
Package enry implements multiple strategies for programming language identification. Package enry identifies programming languages.
Identification is made based on file name and file content using a service Identification is based on file name and content using a series
of strategies to narrow down possible option. of strategies to narrow down possible options.
Each strategy is available as a separate API call, as well as a main enty point Each strategy is available as a separate API call, as well as though the main enty point:
GetLanguage(filename string, content []byte) (language string) GetLanguage(filename string, content []byte) (language string)
It is a port of the https://github.com/github/linguist from Ruby. It is a port of the https://github.com/github/linguist from Ruby.
Upstream Linguist YAML files are used to generate datastructures for data Upstream Linguist YAML files are used to generate datastructures for data
package. package.
*/ */
package enry // import "github.com/go-enry/go-enry/v2" package enry // import "github.com/go-enry/go-enry/v2"

2
go.mod
View File

@ -4,6 +4,6 @@ go 1.14
require ( require (
github.com/go-enry/go-oniguruma v1.2.1 github.com/go-enry/go-oniguruma v1.2.1
github.com/stretchr/testify v1.3.0 github.com/stretchr/testify v1.8.1
gopkg.in/yaml.v2 v2.2.8 gopkg.in/yaml.v2 v2.2.8
) )

17
go.sum
View File

@ -1,16 +1,21 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo= github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -1,9 +1,8 @@
package data package data
import ( import (
"regexp"
"github.com/go-enry/go-enry/v2/data/rule" "github.com/go-enry/go-enry/v2/data/rule"
"github.com/go-enry/go-enry/v2/regex"
) )
var ContentHeuristics = map[string]*Heuristics{ var ContentHeuristics = map[string]*Heuristics{
@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
{{- else if eq .Op "Or" -}} {{- else if eq .Op "Or" -}}
rule.Or( rule.Or(
{{ template "Languages" .Langs -}} {{ template "Languages" .Langs -}}
regexp.MustCompile({{ .Pattern | stringVal }}), {{ template "mustCompile" . }}
), ),
{{- else if eq .Op "Not" -}} {{- else if eq .Op "Not" -}}
rule.Not( rule.Not(
{{ template "Languages" .Langs -}} {{ template "Languages" .Langs -}}
regexp.MustCompile({{ .Pattern | stringVal }}), {{ template "mustCompile" . }}
), ),
{{- else if eq .Op "Always" -}} {{- else if eq .Op "Always" -}}
rule.Always( rule.Always(
@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
rule.MatchingLanguages(""), rule.MatchingLanguages(""),
{{end -}} {{end -}}
{{end}} {{end}}
{{define "mustCompile" -}}
{{ if .IsRE2 -}}
regex.MustCompileMultiline({{ .Pattern | stringVal }}),
{{- else -}}
regex.MustCompileRuby({{ .Pattern | stringVal }}),
{{ end -}}
{{end}}

View File

@ -2,11 +2,21 @@ package data
import "github.com/go-enry/go-enry/v2/regex" import "github.com/go-enry/go-enry/v2/regex"
{{define "mustCompile" -}}
{{ if isRE2 . -}}
regex.MustCompile({{ . | stringVal }})
{{- else -}}
regex.MustCompileRuby({{ . | stringVal }})
{{- end -}}
{{end}}
var VendorMatchers = []regex.EnryRegexp{ var VendorMatchers = []regex.EnryRegexp{
{{range $regexp := . -}} {{range $re := . -}}
regex.MustCompile(`{{ $regexp }}`), {{ template "mustCompile" $re }},
{{end -}} {{end -}}
} }
// FastVendorMatcher is equivalent to matching any of the VendorMatchers. // FastVendorMatcher is equivalent to matching any of the VendorMatchers.
var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`) {{with $singleRE := collateAllRegexps . -}}
var FastVendorMatcher = {{template "mustCompile" $singleRE}}
{{end}}

View File

@ -3,7 +3,6 @@
package generator package generator
import ( import (
"bytes"
"fmt" "fmt"
"go/format" "go/format"
"io" "io"
@ -22,12 +21,15 @@ type File func(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit stri
func formatedWrite(outPath string, source []byte) error { func formatedWrite(outPath string, source []byte) error {
formatedSource, err := format.Source(source) formatedSource, err := format.Source(source)
if err != nil { if err != nil {
return err err = fmt.Errorf("'go fmt' fails on %v", err)
// write the unformatted source to simplify debugging
formatedSource = source
} }
if err := ioutil.WriteFile(outPath, formatedSource, 0666); err != nil { if err := ioutil.WriteFile(outPath, formatedSource, 0666); err != nil {
return err return err
} }
return nil return err
} }
func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncMap, data interface{}) error { func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncMap, data interface{}) error {
@ -40,35 +42,21 @@ func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncM
val = strings.ReplaceAll(val, "`", "`+\"`\"+`") val = strings.ReplaceAll(val, "`", "`+\"`\"+`")
return fmt.Sprintf("`%s`", val) return fmt.Sprintf("`%s`", val)
} }
const headerTmpl = "header.go.tmpl"
headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
h := template.Must(template.New(headerTmpl).Funcs(template.FuncMap{
"getCommit": getCommit,
"stringVal": stringVal,
}).ParseFiles(headerPath))
buf := bytes.NewBuffer(nil)
if err := h.Execute(buf, data); err != nil {
return err
}
if fmap == nil { if fmap == nil {
fmap = make(template.FuncMap) fmap = make(template.FuncMap)
} }
fmap["getCommit"] = getCommit fmap["getCommit"] = getCommit
fmap["stringVal"] = stringVal fmap["stringVal"] = stringVal
fmap["isRE2"] = isRE2
const headerTmpl = "header.go.tmpl"
headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
h := template.Must(template.New(headerTmpl).Funcs(fmap).ParseFiles(headerPath))
if err := h.Execute(w, data); err != nil {
return err
}
t := template.Must(template.New(name).Funcs(fmap).ParseFiles(path)) t := template.Must(template.New(name).Funcs(fmap).ParseFiles(path))
if err := t.Execute(buf, data); err != nil { return t.Execute(w, data)
return err
}
src, err := format.Source(buf.Bytes())
if err != nil {
return err
}
_, err = w.Write(src)
return err
} }

View File

@ -70,25 +70,27 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
subp := loadRule(namedPatterns, r) subp := loadRule(namedPatterns, r)
subPatterns = append(subPatterns, subp) subPatterns = append(subPatterns, subp)
} }
result = &LanguagePattern{"And", rule.Languages, "", subPatterns} result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
} else if len(rule.Pattern) != 0 { // OrPattern } else if len(rule.Pattern) != 0 { // OrPattern
conjunction := strings.Join(rule.Pattern, orPipe) // FIXME(bzz): this optimization should only be applied if each pattern isRE2!
pattern := convertToValidRegexp(conjunction) pattern := strings.Join(rule.Pattern, orPipe)
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
// TODO(bzz): handle the common case Or(len(Languages)==0) better
// e.g. by emiting `rule.Rule(...)` instead of
// an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )`
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
} else if rule.NegativePattern != "" { // NotPattern } else if rule.NegativePattern != "" { // NotPattern
pattern := convertToValidRegexp(rule.NegativePattern) pattern := rule.NegativePattern
result = &LanguagePattern{"Not", rule.Languages, pattern, nil} result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
} else if rule.NamedPattern != "" { // Named OrPattern } else if rule.NamedPattern != "" { // Named OrPattern
conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe) pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
pattern := convertToValidRegexp(conjunction) result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
} else { // AlwaysPattern } else { // AlwaysPattern
result = &LanguagePattern{"Always", rule.Languages, "", nil} result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
} }
if isUnsupportedRegexpSyntax(result.Pattern) { if !isRE2(result.Pattern) {
log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern) log.Printf("RE2 incompatible syntax for heuristic language:'%s', rule:'%s'\n", rule.Languages, result.Pattern)
return nil
} }
return result return result
} }
@ -100,6 +102,7 @@ type LanguagePattern struct {
Langs []string Langs []string
Pattern string Pattern string
Rules []*LanguagePattern Rules []*LanguagePattern
IsRE2 bool
} }
type Heuristics struct { type Heuristics struct {
@ -125,7 +128,7 @@ type Patterns struct {
} }
// StringArray is workaround for parsing named_pattern, // StringArray is workaround for parsing named_pattern,
// wich is sometimes arry and sometimes not. // wich is sometimes an array and sometimes is not.
// See https://github.com/go-yaml/yaml/issues/100 // See https://github.com/go-yaml/yaml/issues/100
type StringArray []string type StringArray []string
@ -173,8 +176,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`)) (strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
} }
// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent. func isRE2(s string) bool {
// Does not work with Ruby regexp literals. return !isUnsupportedRegexpSyntax(s)
func convertToValidRegexp(rubyRegexp string) string {
return multilinePrefix + rubyRegexp
} }

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"log"
"sort" "sort"
"strings" "strings"
"text/template" "text/template"
@ -25,6 +26,12 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
return fmt.Errorf("failed to parse YAML %s, %q", fileToParse, err) return fmt.Errorf("failed to parse YAML %s, %q", fileToParse, err)
} }
for _, re := range regexps {
if !isRE2(re) {
log.Printf("RE2 incompatible syntax for vendor:'%s'\n", re)
}
}
buf := &bytes.Buffer{} buf := &bytes.Buffer{}
if err := executeVendorTemplate(buf, regexps, tmplPath, tmplName, commit); err != nil { if err := executeVendorTemplate(buf, regexps, tmplPath, tmplName, commit); err != nil {
return err return err
@ -34,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
} }
func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error { func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error {
funcs := template.FuncMap{"optimize": collateAllMatchers} funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps}
return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps) return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps)
} }
func collateAllMatchers(regexps []string) string { // collateAllRegexps all regexps to a single large regexp.
// We now collate all regexps from VendorMatchers to a single large regexp func collateAllRegexps(regexps []string) string {
// which is at least twice as fast to test than simply iterating & matching. // which is at least twice as fast to test than simply iterating & matching.
// //
// ---
//
// We could test each matcher from VendorMatchers in turn i.e.
//
// func IsVendor(filename string) bool {
// for _, matcher := range data.VendorMatchers {
// if matcher.MatchString(filename) {
// return true
// }
// }
// return false
// }
//
// Or naïvely concatentate all these regexps using groups i.e.
//
// `(regexp1)|(regexp2)|(regexp3)|...`
//
// However, both of these are relatively slow and don't take advantage
// of the inherent structure within our regexps.
//
// Imperical observation: by looking at the regexps, we only have 3 types. // Imperical observation: by looking at the regexps, we only have 3 types.
// 1. Those that start with `^` // 1. Those that start with `^`
// 2. Those that start with `(^|/)` // 2. Those that start with `(^|/)`
@ -81,8 +68,8 @@ func collateAllMatchers(regexps []string) string {
sort.Strings(regexps) sort.Strings(regexps)
// Check prefix, group expressions
var caretPrefixed, caretOrSlashPrefixed, theRest []string var caretPrefixed, caretOrSlashPrefixed, theRest []string
// Check prefix, add to the respective group slices
for _, re := range regexps { for _, re := range regexps {
if strings.HasPrefix(re, caret) { if strings.HasPrefix(re, caret) {
caretPrefixed = append(caretPrefixed, re[len(caret):]) caretPrefixed = append(caretPrefixed, re[len(caret):])
@ -92,6 +79,7 @@ func collateAllMatchers(regexps []string) string {
theRest = append(theRest, re) theRest = append(theRest, re)
} }
} }
var sb strings.Builder var sb strings.Builder
appendGroupWithCommonPrefix(&sb, "^", caretPrefixed) appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
sb.WriteString("|") sb.WriteString("|")

View File

@ -134,7 +134,7 @@ func main() {
for _, file := range fileList { for _, file := range fileList {
if err := file.generate(file.fileToParse, file.samplesDir, file.outPath, file.tmplPath, file.tmplName, file.commit); err != nil { if err := file.generate(file.fileToParse, file.samplesDir, file.outPath, file.tmplPath, file.tmplName, file.commit); err != nil {
log.Fatalf("error generating template %q to %q: %+v", file.tmplPath, file.outPath, err) log.Fatalf("failed to generate %q from %q - %+v", file.outPath, file.tmplPath, err)
} }
} }
} }

View File

@ -1,3 +1,4 @@
//go:build oniguruma
// +build oniguruma // +build oniguruma
package regex package regex
@ -6,10 +7,21 @@ import (
rubex "github.com/go-enry/go-oniguruma" rubex "github.com/go-enry/go-oniguruma"
) )
const Name = Oniguruma
type EnryRegexp = *rubex.Regexp type EnryRegexp = *rubex.Regexp
func MustCompile(str string) EnryRegexp { func MustCompile(s string) EnryRegexp {
return rubex.MustCompileASCII(str) return rubex.MustCompileASCII(s)
}
// MustCompileMultiline compiles s by delegating to MustCompile.
// Oniguruma matches in multi-line mode by default, so no extra flag is
// added to the expression in this build.
func MustCompileMultiline(s string) EnryRegexp {
return MustCompile(s)
}
// MustCompileRuby compiles s by delegating to MustCompile.
// In this (oniguruma) build it behaves exactly like MustCompile.
func MustCompileRuby(s string) EnryRegexp {
return MustCompile(s)
}
func QuoteMeta(s string) string { func QuoteMeta(s string) string {

9
regex/regex.go Normal file
View File

@ -0,0 +1,9 @@
// Package regex abstracts the regular expression engine,
// which is chosen at compile time by a build tag.
package regex
const (
// RE2 is the value of Name in the build without the oniguruma tag.
RE2 = "RE2"
// Oniguruma is the value of Name in the build with the oniguruma tag.
Oniguruma = "Oniguruma"
)

View File

@ -1,3 +1,4 @@
//go:build !oniguruma
// +build !oniguruma // +build !oniguruma
package regex package regex
@ -6,12 +7,32 @@ import (
"regexp" "regexp"
) )
const Name = RE2
type EnryRegexp = *regexp.Regexp type EnryRegexp = *regexp.Regexp
func MustCompile(str string) EnryRegexp { func MustCompile(str string) EnryRegexp {
return regexp.MustCompile(str) return regexp.MustCompile(str)
} }
// MustCompileMultiline compiles s in multi-line mode, mimicking the Ruby
// regexp defaults where ^ and $ match at the beginning and end of a line.
// It converts Ruby regexp syntax to the RE2 equivalent by prepending
// the (?m) flag to s.
func MustCompileMultiline(s string) EnryRegexp {
	return regexp.MustCompile("(?m)" + s)
}
// MustCompileRuby is used for expressions with syntax not supported by RE2.
// In this build it does not compile s and always returns nil.
//
// NOTE: this is confusing, because the result is used as a
// [data/rule.Matcher]. A nil EnryRegexp stored in an interface value
// produces an interface that is itself non-nil, so comparing the Matcher
// to nil does not detect it.
//
// What is a better way for an expression to indicate unsupported syntax?
// E.g. add .IsValidSyntax() to both the Matcher interface and the
// EnryRegexp implementations?
func MustCompileRuby(s string) EnryRegexp {
return nil
}
func QuoteMeta(s string) string { func QuoteMeta(s string) string {
return regexp.QuoteMeta(s) return regexp.QuoteMeta(s)
} }

27
regex/standard_test.go Normal file
View File

@ -0,0 +1,27 @@
//go:build !oniguruma
// +build !oniguruma
package regex
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestMustCompileMultiline checks that the compiled expression carries the
// (?m) flag and that its ^ and $ anchors match on an inner line of a
// multi-line input.
func TestMustCompileMultiline(t *testing.T) {
	const (
		re = `^\.(.*)!$`
		s  = `.one
.two!
thre!`
	)

	compiled := MustCompileMultiline(re)
	assert.Equal(t, "(?m)"+re, compiled.String())

	if compiled.MatchString(s) {
		return
	}
	t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s)
}
func TestMustCompileRuby(t *testing.T) {
assert.Nil(t, MustCompileRuby(``))
}

View File

@ -63,7 +63,21 @@ func IsDotFile(path string) bool {
// IsVendor returns whether or not path is a vendor path. // IsVendor returns whether or not path is a vendor path.
func IsVendor(path string) bool { func IsVendor(path string) bool {
// fast path: single collated regex, if the engine supports its syntax
if data.FastVendorMatcher != nil {
return data.FastVendorMatcher.MatchString(path) return data.FastVendorMatcher.MatchString(path)
}
// slow path: skip individual rules with unsupported syntax
for _, matcher := range data.VendorMatchers {
if matcher == nil {
continue
}
if matcher.MatchString(path) {
return true
}
}
return false
} }
// IsTest returns whether or not path is a test path. // IsTest returns whether or not path is a test path.

View File

@ -7,57 +7,62 @@ import (
"path/filepath" "path/filepath"
"testing" "testing"
"github.com/go-enry/go-enry/v2/regex"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
//TODO(bzz): port all from test/test_file_blob.rb test_vendored() // TODO(bzz): port all from test/test_file_blob.rb test_vendored()
//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583 // https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
var vendorTests = []struct { var vendorTests = []struct {
skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2
path string path string
expected bool expected bool
}{ }{
{"cache/", true}, {path: "cache/", expected: true},
{"something_cache/", false}, {false, "something_cache/", false},
{"random/cache/", true}, {false, "random/cache/", true},
{"cache", false}, {false, "cache", false},
{"dependencies/", true}, {false, "dependencies/", true},
{"Dependencies/", true}, {false, "Dependencies/", true},
{"dependency/", false}, {false, "dependency/", false},
{"dist/", true}, {false, "dist/", true},
{"dist", false}, {false, "dist", false},
{"random/dist/", true}, {false, "random/dist/", true},
{"random/dist", false}, {false, "random/dist", false},
{"deps/", true}, {false, "deps/", true},
{"foodeps/", false}, {false, "foodeps/", false},
{"configure", true}, {false, "configure", true},
{"a/configure", true}, {false, "a/configure", true},
{"config.guess", true}, {false, "config.guess", true},
{"config.guess/", false}, {false, "config.guess/", false},
{".vscode/", true}, {false, ".vscode/", true},
{"doc/_build/", true}, {false, "doc/_build/", true},
{"a/docs/_build/", true}, {false, "a/docs/_build/", true},
{"a/dasdocs/_build-vsdoc.js", true}, {false, "a/dasdocs/_build-vsdoc.js", true},
{"a/dasdocs/_build-vsdoc.j", false}, {false, "a/dasdocs/_build-vsdoc.j", false},
{"foo/bar", false}, {false, "foo/bar", false},
{".sublime-project", true}, {false, ".sublime-project", true},
{"foo/vendor/foo", true}, {false, "foo/vendor/foo", true},
{"leaflet.draw-src.js", true}, {false, "leaflet.draw-src.js", true},
{"foo/bar/MochiKit.js", true}, {false, "foo/bar/MochiKit.js", true},
{"foo/bar/dojo.js", true}, {false, "foo/bar/dojo.js", true},
{"foo/env/whatever", true}, {false, "foo/env/whatever", true},
{"some/python/venv/", false}, {false, "some/python/venv/", false},
{"foo/.imageset/bar", true}, {false, "foo/.imageset/bar", true},
{"Vagrantfile", true}, {false, "Vagrantfile", true},
{"src/bootstrap-custom.js", true}, {true, "src/bootstrap-custom.js", true},
// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23 // {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23
} }
func TestIsVendor(t *testing.T) { func TestIsVendor(t *testing.T) {
for _, tt := range vendorTests { for _, test := range vendorTests {
t.Run(tt.path, func(t *testing.T) { t.Run(test.path, func(t *testing.T) {
if got := IsVendor(tt.path); got != tt.expected { if got := IsVendor(test.path); got != test.expected {
t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected) if regex.Name == regex.RE2 && test.skipOnRE2 {
return // skip
}
t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name)
} }
}) })
} }