mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-12-05 00:00:34 +00:00
Merge pull request #151 from go-enry/distinguish-re-syntax
Syntax-aware regexp generation for configurable engines
This commit is contained in:
commit
0e58945703
@ -184,6 +184,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample
|
||||
|
||||
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
|
||||
|
||||
> All the issues related to heuristics' regexp syntax incompatibilities with the RE2 engine can be avoided by using `oniguruma` instead (see [instructions](#misc)).
|
||||
|
||||
## Benchmarks
|
||||
|
||||
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
|
||||
|
660
data/content.go
660
data/content.go
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,15 @@
|
||||
// with colliding extensions, based on regexps from Linguist data.
|
||||
package rule
|
||||
|
||||
import "github.com/go-enry/go-enry/v2/regex"
|
||||
|
||||
// Matcher checks if the data matches (number of) pattern(s).
|
||||
// Every heuristic rule below implements this interface.
|
||||
// A regexp.Regexp satisfies this interface and can be used instead.
|
||||
type Matcher interface {
|
||||
Match(data []byte) bool
|
||||
}
|
||||
|
||||
// Heuristic consists of (a number of) rules where each, if it matches,
|
||||
// identifies content as belonging to a programming language(s).
|
||||
type Heuristic interface {
|
||||
@ -10,15 +19,7 @@ type Heuristic interface {
|
||||
Languages() []string
|
||||
}
|
||||
|
||||
// Matcher checks if the data matches (number of) pattern.
|
||||
// Every heuristic rule below implements this interface.
|
||||
// A regexp.Regexp satisfies this interface and can be used instead.
|
||||
type Matcher interface {
|
||||
Match(data []byte) bool
|
||||
}
|
||||
|
||||
// languages struct incapsulate data common to every Matcher: all languages
|
||||
// that it identifies.
|
||||
// languages base struct with all the languages that a Matcher identifies.
|
||||
type languages struct {
|
||||
langs []string
|
||||
}
|
||||
@ -33,6 +34,10 @@ func MatchingLanguages(langs ...string) languages {
|
||||
return languages{langs}
|
||||
}
|
||||
|
||||
func noLanguages() languages {
|
||||
return MatchingLanguages([]string{}...)
|
||||
}
|
||||
|
||||
// Implements a Heuristic.
|
||||
type or struct {
|
||||
languages
|
||||
@ -40,14 +45,19 @@ type or struct {
|
||||
}
|
||||
|
||||
// Or rule matches, if a single matching pattern exists.
|
||||
// It receives only one pattern as it relies on compile-time optimization that
|
||||
// represents union with | inside a single regexp.
|
||||
func Or(l languages, r Matcher) Heuristic {
|
||||
return or{l, r}
|
||||
// It receives only one pattern as it relies on optimization that
|
||||
// represents union with | inside a single regexp during code generation.
|
||||
func Or(l languages, p Matcher) Heuristic {
|
||||
//FIXME(bzz): this will not be the case as only some of the patterns may
|
||||
// be non-RE2 => we shouldn't collate them, so as not to lose the (accuracy of the) whole rule
|
||||
return or{l, p}
|
||||
}
|
||||
|
||||
// Match implements rule.Matcher.
|
||||
func (r or) Match(data []byte) bool {
|
||||
if runOnRE2AndRegexNotAccepted(r.pattern) {
|
||||
return false
|
||||
}
|
||||
return r.pattern.Match(data)
|
||||
}
|
||||
|
||||
@ -65,6 +75,9 @@ func And(l languages, m ...Matcher) Heuristic {
|
||||
// Match implements rule.Matcher.
|
||||
func (r and) Match(data []byte) bool {
|
||||
for _, p := range r.patterns {
|
||||
if runOnRE2AndRegexNotAccepted(p) {
|
||||
continue
|
||||
}
|
||||
if !p.Match(data) {
|
||||
return false
|
||||
}
|
||||
@ -86,6 +99,9 @@ func Not(l languages, r ...Matcher) Heuristic {
|
||||
// Match implements rule.Matcher.
|
||||
func (r not) Match(data []byte) bool {
|
||||
for _, p := range r.Patterns {
|
||||
if runOnRE2AndRegexNotAccepted(p) {
|
||||
continue
|
||||
}
|
||||
if p.Match(data) {
|
||||
return false
|
||||
}
|
||||
@ -107,3 +123,11 @@ func Always(l languages) Heuristic {
|
||||
func (r always) Match(data []byte) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// Checks if a regex syntax isn't accepted by RE2 engine.
|
||||
// It's nil by construction from regex.MustCompileRuby but
|
||||
// is used here as a Matcher interface which itself is non-nil.
|
||||
func runOnRE2AndRegexNotAccepted(re Matcher) bool {
|
||||
v, ok := re.(regex.EnryRegexp)
|
||||
return ok && v == nil
|
||||
}
|
||||
|
@ -1,39 +1,71 @@
|
||||
package rule
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
const lang = "ActionScript"
|
||||
|
||||
var fixtures = []struct {
|
||||
type fixture struct {
|
||||
name string
|
||||
rule Heuristic
|
||||
numLangs int
|
||||
matching string
|
||||
match string
|
||||
noMatch string
|
||||
}{
|
||||
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
|
||||
{"Not", Not(MatchingLanguages(lang), regexp.MustCompile(`a`)), 1, "b", "a"},
|
||||
{"And", And(MatchingLanguages(lang), regexp.MustCompile(`a`), regexp.MustCompile(`b`)), 1, "ab", "a"},
|
||||
{"Or", Or(MatchingLanguages(lang), regexp.MustCompile(`a|b`)), 1, "ab", "c"},
|
||||
}
|
||||
|
||||
func TestRules(t *testing.T) {
|
||||
for _, f := range fixtures {
|
||||
t.Run(f.name, func(t *testing.T) {
|
||||
assert.NotNil(t, f.rule)
|
||||
assert.NotNil(t, f.rule.Languages())
|
||||
assert.Equal(t, f.numLangs, len(f.rule.Languages()))
|
||||
assert.Truef(t, f.rule.Match([]byte(f.matching)),
|
||||
"'%s' is expected to .Match() by rule %s%v", f.matching, f.name, f.rule)
|
||||
if f.noMatch != "" {
|
||||
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
|
||||
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
|
||||
}
|
||||
var specificFixtures = map[string][]fixture{
|
||||
"": { // cases that don't vary between the engines
|
||||
{"Always", Always(MatchingLanguages(lang)), 1, "a", ""},
|
||||
{"Not", Not(MatchingLanguages(lang), regex.MustCompile(`a`)), 1, "b", "a"},
|
||||
{"And", And(MatchingLanguages(lang), regex.MustCompile(`a`), regex.MustCompile(`b`)), 1, "ab", "a"},
|
||||
{"Or", Or(MatchingLanguages(lang), regex.MustCompile(`a|b`)), 1, "ab", "c"},
|
||||
// the results of these depend on the regex engine
|
||||
// {"NilOr", Or(noLanguages(), regex.MustCompileRuby(``)), 0, "", "a"},
|
||||
// {"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`)), 0, "", "a"},
|
||||
},
|
||||
regex.RE2: {
|
||||
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "b", "a"},
|
||||
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "b"},
|
||||
},
|
||||
regex.Oniguruma: {
|
||||
{"NilAnd", And(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "ab", "c"},
|
||||
{"NilNot", Not(noLanguages(), regex.MustCompileRuby(`a`), regex.MustCompile(`b`)), 0, "c", "a"},
|
||||
{"NilOr", Or(noLanguages(), regex.MustCompileRuby(`a`) /*, regexp.MustCompile(`b`)*/), 0, "a", "b"},
|
||||
},
|
||||
}
|
||||
|
||||
func testRulesForEngine(t *testing.T, engine string) {
|
||||
if engine != "" && regex.Name != engine {
|
||||
return
|
||||
}
|
||||
for _, f := range specificFixtures[engine] {
|
||||
t.Run(engine+f.name, func(t *testing.T) {
|
||||
check(t, f)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRules(t *testing.T) {
|
||||
//TODO(bzz): can all be run in parallel
|
||||
testRulesForEngine(t, "")
|
||||
testRulesForEngine(t, regex.RE2)
|
||||
testRulesForEngine(t, regex.Oniguruma)
|
||||
}
|
||||
|
||||
func check(t *testing.T, f fixture) {
|
||||
assert.NotNil(t, f.rule)
|
||||
assert.NotNil(t, f.rule.Languages())
|
||||
assert.Equal(t, f.numLangs, len(f.rule.Languages()))
|
||||
if f.match != "" {
|
||||
assert.Truef(t, f.rule.Match([]byte(f.match)),
|
||||
"'%s' is expected to .Match() by rule %s%v", f.match, f.name, f.rule)
|
||||
}
|
||||
if f.noMatch != "" {
|
||||
assert.Falsef(t, f.rule.Match([]byte(f.noMatch)),
|
||||
"'%s' is expected NOT to .Match() by rule %s%v", f.noMatch, f.name, f.rule)
|
||||
}
|
||||
}
|
||||
|
16
enry.go
16
enry.go
@ -1,15 +1,15 @@
|
||||
/*
|
||||
Package enry implements multiple strategies for programming language identification.
|
||||
Package enry identifies programming languages.
|
||||
|
||||
Identification is made based on file name and file content using a service
|
||||
of strategies to narrow down possible option.
|
||||
Each strategy is available as a separate API call, as well as a main enty point
|
||||
Identification is based on file name and content using a series
|
||||
of strategies to narrow down possible options.
|
||||
Each strategy is available as a separate API call, as well as through the main entry point:
|
||||
|
||||
GetLanguage(filename string, content []byte) (language string)
|
||||
GetLanguage(filename string, content []byte) (language string)
|
||||
|
||||
It is a port of the https://github.com/github/linguist from Ruby.
|
||||
Upstream Linguist YAML files are used to generate datastructures for data
|
||||
package.
|
||||
It is a port of the https://github.com/github/linguist from Ruby.
|
||||
Upstream Linguist YAML files are used to generate datastructures for data
|
||||
package.
|
||||
*/
|
||||
package enry // import "github.com/go-enry/go-enry/v2"
|
||||
|
||||
|
2
go.mod
2
go.mod
@ -4,6 +4,6 @@ go 1.14
|
||||
|
||||
require (
|
||||
github.com/go-enry/go-oniguruma v1.2.1
|
||||
github.com/stretchr/testify v1.3.0
|
||||
github.com/stretchr/testify v1.8.1
|
||||
gopkg.in/yaml.v2 v2.2.8
|
||||
)
|
||||
|
17
go.sum
17
go.sum
@ -1,16 +1,21 @@
|
||||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
|
||||
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
|
||||
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
|
||||
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
@ -1,9 +1,8 @@
|
||||
package data
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/data/rule"
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
)
|
||||
|
||||
var ContentHeuristics = map[string]*Heuristics{
|
||||
@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
|
||||
{{- else if eq .Op "Or" -}}
|
||||
rule.Or(
|
||||
{{ template "Languages" .Langs -}}
|
||||
regexp.MustCompile({{ .Pattern | stringVal }}),
|
||||
{{ template "mustCompile" . }}
|
||||
),
|
||||
{{- else if eq .Op "Not" -}}
|
||||
rule.Not(
|
||||
{{ template "Languages" .Langs -}}
|
||||
regexp.MustCompile({{ .Pattern | stringVal }}),
|
||||
{{ template "mustCompile" . }}
|
||||
),
|
||||
{{- else if eq .Op "Always" -}}
|
||||
rule.Always(
|
||||
@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
|
||||
rule.MatchingLanguages(""),
|
||||
{{end -}}
|
||||
{{end}}
|
||||
|
||||
{{define "mustCompile" -}}
|
||||
{{ if .IsRE2 -}}
|
||||
regex.MustCompileMultiline({{ .Pattern | stringVal }}),
|
||||
{{- else -}}
|
||||
regex.MustCompileRuby({{ .Pattern | stringVal }}),
|
||||
{{ end -}}
|
||||
{{end}}
|
||||
|
@ -2,11 +2,21 @@ package data
|
||||
|
||||
import "github.com/go-enry/go-enry/v2/regex"
|
||||
|
||||
{{define "mustCompile" -}}
|
||||
{{ if isRE2 . -}}
|
||||
regex.MustCompile({{ . | stringVal }})
|
||||
{{- else -}}
|
||||
regex.MustCompileRuby({{ . | stringVal }})
|
||||
{{- end -}}
|
||||
{{end}}
|
||||
|
||||
var VendorMatchers = []regex.EnryRegexp{
|
||||
{{range $regexp := . -}}
|
||||
regex.MustCompile(`{{ $regexp }}`),
|
||||
{{range $re := . -}}
|
||||
{{ template "mustCompile" $re }},
|
||||
{{end -}}
|
||||
}
|
||||
|
||||
// FastVendorMatcher is equivalent to matching any of the VendorMatchers.
|
||||
var FastVendorMatcher = regex.MustCompile(`{{ optimize . }}`)
|
||||
{{with $singleRE := collateAllRegexps . -}}
|
||||
var FastVendorMatcher = {{template "mustCompile" $singleRE}}
|
||||
{{end}}
|
@ -3,7 +3,6 @@
|
||||
package generator
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"go/format"
|
||||
"io"
|
||||
@ -22,12 +21,15 @@ type File func(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit stri
|
||||
func formatedWrite(outPath string, source []byte) error {
|
||||
formatedSource, err := format.Source(source)
|
||||
if err != nil {
|
||||
return err
|
||||
err = fmt.Errorf("'go fmt' fails on %v", err)
|
||||
// write unformatted source to simplify debugging
|
||||
formatedSource = source
|
||||
}
|
||||
|
||||
if err := ioutil.WriteFile(outPath, formatedSource, 0666); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
return err
|
||||
}
|
||||
|
||||
func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncMap, data interface{}) error {
|
||||
@ -40,35 +42,21 @@ func executeTemplate(w io.Writer, name, path, commit string, fmap template.FuncM
|
||||
val = strings.ReplaceAll(val, "`", "`+\"`\"+`")
|
||||
return fmt.Sprintf("`%s`", val)
|
||||
}
|
||||
|
||||
const headerTmpl = "header.go.tmpl"
|
||||
headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
|
||||
|
||||
h := template.Must(template.New(headerTmpl).Funcs(template.FuncMap{
|
||||
"getCommit": getCommit,
|
||||
"stringVal": stringVal,
|
||||
}).ParseFiles(headerPath))
|
||||
|
||||
buf := bytes.NewBuffer(nil)
|
||||
if err := h.Execute(buf, data); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if fmap == nil {
|
||||
fmap = make(template.FuncMap)
|
||||
}
|
||||
fmap["getCommit"] = getCommit
|
||||
fmap["stringVal"] = stringVal
|
||||
fmap["isRE2"] = isRE2
|
||||
|
||||
const headerTmpl = "header.go.tmpl"
|
||||
headerPath := filepath.Join(filepath.Dir(path), headerTmpl)
|
||||
|
||||
h := template.Must(template.New(headerTmpl).Funcs(fmap).ParseFiles(headerPath))
|
||||
if err := h.Execute(w, data); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t := template.Must(template.New(name).Funcs(fmap).ParseFiles(path))
|
||||
if err := t.Execute(buf, data); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
src, err := format.Source(buf.Bytes())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = w.Write(src)
|
||||
return err
|
||||
return t.Execute(w, data)
|
||||
}
|
||||
|
@ -70,25 +70,27 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
|
||||
subp := loadRule(namedPatterns, r)
|
||||
subPatterns = append(subPatterns, subp)
|
||||
}
|
||||
result = &LanguagePattern{"And", rule.Languages, "", subPatterns}
|
||||
result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
|
||||
} else if len(rule.Pattern) != 0 { // OrPattern
|
||||
conjunction := strings.Join(rule.Pattern, orPipe)
|
||||
pattern := convertToValidRegexp(conjunction)
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
|
||||
// FIXME(bzz): this optimization should only be applied if each pattern isRE2!
|
||||
pattern := strings.Join(rule.Pattern, orPipe)
|
||||
|
||||
// TODO(bzz): handle the common case Or(len(Languages)==0) better
|
||||
// e.g. by emiting `rule.Rule(...)` instead of
|
||||
// an (ugly) `rule.Or( rule.MatchingLanguages(""), ... )`
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||
} else if rule.NegativePattern != "" { // NotPattern
|
||||
pattern := convertToValidRegexp(rule.NegativePattern)
|
||||
result = &LanguagePattern{"Not", rule.Languages, pattern, nil}
|
||||
pattern := rule.NegativePattern
|
||||
result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||
} else if rule.NamedPattern != "" { // Named OrPattern
|
||||
conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
|
||||
pattern := convertToValidRegexp(conjunction)
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
|
||||
pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||
} else { // AlwaysPattern
|
||||
result = &LanguagePattern{"Always", rule.Languages, "", nil}
|
||||
result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
|
||||
}
|
||||
|
||||
if isUnsupportedRegexpSyntax(result.Pattern) {
|
||||
log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
|
||||
return nil
|
||||
if !isRE2(result.Pattern) {
|
||||
log.Printf("RE2 incompatible syntax for heuristic language:'%s', rule:'%s'\n", rule.Languages, result.Pattern)
|
||||
}
|
||||
return result
|
||||
}
|
||||
@ -100,6 +102,7 @@ type LanguagePattern struct {
|
||||
Langs []string
|
||||
Pattern string
|
||||
Rules []*LanguagePattern
|
||||
IsRE2 bool
|
||||
}
|
||||
|
||||
type Heuristics struct {
|
||||
@ -125,7 +128,7 @@ type Patterns struct {
|
||||
}
|
||||
|
||||
// StringArray is workaround for parsing named_pattern,
|
||||
// wich is sometimes arry and sometimes not.
|
||||
// which is sometimes an array and sometimes is not.
|
||||
// See https://github.com/go-yaml/yaml/issues/100
|
||||
type StringArray []string
|
||||
|
||||
@ -173,8 +176,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
|
||||
(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
|
||||
}
|
||||
|
||||
// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent.
|
||||
// Does not work with Ruby regexp literals.
|
||||
func convertToValidRegexp(rubyRegexp string) string {
|
||||
return multilinePrefix + rubyRegexp
|
||||
func isRE2(s string) bool {
|
||||
return !isUnsupportedRegexpSyntax(s)
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"sort"
|
||||
"strings"
|
||||
"text/template"
|
||||
@ -25,6 +26,12 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
|
||||
return fmt.Errorf("failed to parse YAML %s, %q", fileToParse, err)
|
||||
}
|
||||
|
||||
for _, re := range regexps {
|
||||
if !isRE2(re) {
|
||||
log.Printf("RE2 incompatible syntax for vendor:'%s'\n", re)
|
||||
}
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
if err := executeVendorTemplate(buf, regexps, tmplPath, tmplName, commit); err != nil {
|
||||
return err
|
||||
@ -34,34 +41,14 @@ func Vendor(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string)
|
||||
}
|
||||
|
||||
func executeVendorTemplate(out io.Writer, regexps []string, tmplPath, tmplName, commit string) error {
|
||||
funcs := template.FuncMap{"optimize": collateAllMatchers}
|
||||
funcs := template.FuncMap{"collateAllRegexps": collateAllRegexps}
|
||||
return executeTemplate(out, tmplName, tmplPath, commit, funcs, regexps)
|
||||
}
|
||||
|
||||
func collateAllMatchers(regexps []string) string {
|
||||
// We now collate all regexps from VendorMatchers to a single large regexp
|
||||
// collateAllRegexps collates all regexps into a single large regexp
|
||||
func collateAllRegexps(regexps []string) string {
|
||||
// which is at least twice as fast to test than simply iterating & matching.
|
||||
//
|
||||
// ---
|
||||
//
|
||||
// We could test each matcher from VendorMatchers in turn i.e.
|
||||
//
|
||||
// func IsVendor(filename string) bool {
|
||||
// for _, matcher := range data.VendorMatchers {
|
||||
// if matcher.MatchString(filename) {
|
||||
// return true
|
||||
// }
|
||||
// }
|
||||
// return false
|
||||
// }
|
||||
//
|
||||
// Or naïvely concatentate all these regexps using groups i.e.
|
||||
//
|
||||
// `(regexp1)|(regexp2)|(regexp3)|...`
|
||||
//
|
||||
// However, both of these are relatively slow and don't take advantage
|
||||
// of the inherent structure within our regexps.
|
||||
//
|
||||
// Empirical observation: by looking at the regexps, we only have 3 types.
|
||||
// 1. Those that start with `^`
|
||||
// 2. Those that start with `(^|/)`
|
||||
@ -81,8 +68,8 @@ func collateAllMatchers(regexps []string) string {
|
||||
|
||||
sort.Strings(regexps)
|
||||
|
||||
// Check prefix, group expressions
|
||||
var caretPrefixed, caretOrSlashPrefixed, theRest []string
|
||||
// Check prefix, add to the respective group slices
|
||||
for _, re := range regexps {
|
||||
if strings.HasPrefix(re, caret) {
|
||||
caretPrefixed = append(caretPrefixed, re[len(caret):])
|
||||
@ -92,6 +79,7 @@ func collateAllMatchers(regexps []string) string {
|
||||
theRest = append(theRest, re)
|
||||
}
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
appendGroupWithCommonPrefix(&sb, "^", caretPrefixed)
|
||||
sb.WriteString("|")
|
||||
|
@ -134,7 +134,7 @@ func main() {
|
||||
|
||||
for _, file := range fileList {
|
||||
if err := file.generate(file.fileToParse, file.samplesDir, file.outPath, file.tmplPath, file.tmplName, file.commit); err != nil {
|
||||
log.Fatalf("error generating template %q to %q: %+v", file.tmplPath, file.outPath, err)
|
||||
log.Fatalf("failed to generate %q from %q - %+v", file.outPath, file.tmplPath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
//go:build oniguruma
|
||||
// +build oniguruma
|
||||
|
||||
package regex
|
||||
@ -6,10 +7,21 @@ import (
|
||||
rubex "github.com/go-enry/go-oniguruma"
|
||||
)
|
||||
|
||||
const Name = Oniguruma
|
||||
|
||||
type EnryRegexp = *rubex.Regexp
|
||||
|
||||
func MustCompile(str string) EnryRegexp {
|
||||
return rubex.MustCompileASCII(str)
|
||||
func MustCompile(s string) EnryRegexp {
|
||||
return rubex.MustCompileASCII(s)
|
||||
}
|
||||
|
||||
// MustCompileMultiline matches in multi-line mode by default with Oniguruma.
|
||||
func MustCompileMultiline(s string) EnryRegexp {
|
||||
return MustCompile(s)
|
||||
}
|
||||
|
||||
func MustCompileRuby(s string) EnryRegexp {
|
||||
return MustCompile(s)
|
||||
}
|
||||
|
||||
func QuoteMeta(s string) string {
|
||||
|
9
regex/regex.go
Normal file
9
regex/regex.go
Normal file
@ -0,0 +1,9 @@
|
||||
package regex
|
||||
|
||||
// Package regex abstracts regular expression engine
|
||||
// that can be chosen at compile-time by a build tag.
|
||||
|
||||
const (
|
||||
RE2 = "RE2"
|
||||
Oniguruma = "Oniguruma"
|
||||
)
|
@ -1,3 +1,4 @@
|
||||
//go:build !oniguruma
|
||||
// +build !oniguruma
|
||||
|
||||
package regex
|
||||
@ -6,12 +7,32 @@ import (
|
||||
"regexp"
|
||||
)
|
||||
|
||||
const Name = RE2
|
||||
|
||||
type EnryRegexp = *regexp.Regexp
|
||||
|
||||
func MustCompile(str string) EnryRegexp {
|
||||
return regexp.MustCompile(str)
|
||||
}
|
||||
|
||||
// MustCompileMultiline mimics Ruby defaults for regexp, where ^$ matches begin/end of line.
|
||||
// I.e. it converts Ruby regexp syntax to its RE2 equivalent.
|
||||
func MustCompileMultiline(s string) EnryRegexp {
|
||||
const multilineModeFlag = "(?m)"
|
||||
return regexp.MustCompile(multilineModeFlag + s)
|
||||
}
|
||||
|
||||
// MustCompileRuby used for expressions with syntax not supported by RE2.
|
||||
// Now it's confusing as we use the result as [data/rule.Matcher] and
|
||||
//
|
||||
// (*Matcher)(nil) != nil
|
||||
//
|
||||
// What is a better way for an expression to indicate unsupported syntax?
|
||||
// e.g. add .IsValidSyntax() to both, Matcher interface and EnryRegexp implementations?
|
||||
func MustCompileRuby(s string) EnryRegexp {
|
||||
return nil
|
||||
}
|
||||
|
||||
func QuoteMeta(s string) string {
|
||||
return regexp.QuoteMeta(s)
|
||||
}
|
||||
|
27
regex/standard_test.go
Normal file
27
regex/standard_test.go
Normal file
@ -0,0 +1,27 @@
|
||||
//go:build !oniguruma
|
||||
// +build !oniguruma
|
||||
|
||||
package regex
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestMustCompileMultiline(t *testing.T) {
|
||||
const re = `^\.(.*)!$`
|
||||
want := MustCompileMultiline(re)
|
||||
assert.Equal(t, "(?m)"+re, want.String())
|
||||
|
||||
const s = `.one
|
||||
.two!
|
||||
thre!`
|
||||
if !want.MatchString(s) {
|
||||
t.Fatalf("MustCompileMultiline(`%s`) must match multiline %q\n", re, s)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMustCompileRuby(t *testing.T) {
|
||||
assert.Nil(t, MustCompileRuby(``))
|
||||
}
|
16
utils.go
16
utils.go
@ -63,7 +63,21 @@ func IsDotFile(path string) bool {
|
||||
|
||||
// IsVendor returns whether or not path is a vendor path.
|
||||
func IsVendor(path string) bool {
|
||||
return data.FastVendorMatcher.MatchString(path)
|
||||
// fast path: single collated regex, if the engine supports its syntax
|
||||
if data.FastVendorMatcher != nil {
|
||||
return data.FastVendorMatcher.MatchString(path)
|
||||
}
|
||||
|
||||
// slow path: skip individual rules with unsupported syntax
|
||||
for _, matcher := range data.VendorMatchers {
|
||||
if matcher == nil {
|
||||
continue
|
||||
}
|
||||
if matcher.MatchString(path) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// IsTest returns whether or not path is a test path.
|
||||
|
@ -7,57 +7,62 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
//TODO(bzz): port all from test/test_file_blob.rb test_vendored()
|
||||
//https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
|
||||
// TODO(bzz): port all from test/test_file_blob.rb test_vendored()
|
||||
// https://github.com/github/linguist/blob/86adc140d3e8903980565a2984f5532edf4ae875/test/test_file_blob.rb#L270-L583
|
||||
var vendorTests = []struct {
|
||||
path string
|
||||
expected bool
|
||||
skipOnRE2 bool // some rules are (present in code but) missing at runtime on RE2
|
||||
path string
|
||||
expected bool
|
||||
}{
|
||||
{"cache/", true},
|
||||
{"something_cache/", false},
|
||||
{"random/cache/", true},
|
||||
{"cache", false},
|
||||
{"dependencies/", true},
|
||||
{"Dependencies/", true},
|
||||
{"dependency/", false},
|
||||
{"dist/", true},
|
||||
{"dist", false},
|
||||
{"random/dist/", true},
|
||||
{"random/dist", false},
|
||||
{"deps/", true},
|
||||
{"foodeps/", false},
|
||||
{"configure", true},
|
||||
{"a/configure", true},
|
||||
{"config.guess", true},
|
||||
{"config.guess/", false},
|
||||
{".vscode/", true},
|
||||
{"doc/_build/", true},
|
||||
{"a/docs/_build/", true},
|
||||
{"a/dasdocs/_build-vsdoc.js", true},
|
||||
{"a/dasdocs/_build-vsdoc.j", false},
|
||||
{"foo/bar", false},
|
||||
{".sublime-project", true},
|
||||
{"foo/vendor/foo", true},
|
||||
{"leaflet.draw-src.js", true},
|
||||
{"foo/bar/MochiKit.js", true},
|
||||
{"foo/bar/dojo.js", true},
|
||||
{"foo/env/whatever", true},
|
||||
{"some/python/venv/", false},
|
||||
{"foo/.imageset/bar", true},
|
||||
{"Vagrantfile", true},
|
||||
{"src/bootstrap-custom.js", true},
|
||||
// {"/css/bootstrap.rtl.css", true}, // from linguist v7.23
|
||||
{path: "cache/", expected: true},
|
||||
{false, "something_cache/", false},
|
||||
{false, "random/cache/", true},
|
||||
{false, "cache", false},
|
||||
{false, "dependencies/", true},
|
||||
{false, "Dependencies/", true},
|
||||
{false, "dependency/", false},
|
||||
{false, "dist/", true},
|
||||
{false, "dist", false},
|
||||
{false, "random/dist/", true},
|
||||
{false, "random/dist", false},
|
||||
{false, "deps/", true},
|
||||
{false, "foodeps/", false},
|
||||
{false, "configure", true},
|
||||
{false, "a/configure", true},
|
||||
{false, "config.guess", true},
|
||||
{false, "config.guess/", false},
|
||||
{false, ".vscode/", true},
|
||||
{false, "doc/_build/", true},
|
||||
{false, "a/docs/_build/", true},
|
||||
{false, "a/dasdocs/_build-vsdoc.js", true},
|
||||
{false, "a/dasdocs/_build-vsdoc.j", false},
|
||||
{false, "foo/bar", false},
|
||||
{false, ".sublime-project", true},
|
||||
{false, "foo/vendor/foo", true},
|
||||
{false, "leaflet.draw-src.js", true},
|
||||
{false, "foo/bar/MochiKit.js", true},
|
||||
{false, "foo/bar/dojo.js", true},
|
||||
{false, "foo/env/whatever", true},
|
||||
{false, "some/python/venv/", false},
|
||||
{false, "foo/.imageset/bar", true},
|
||||
{false, "Vagrantfile", true},
|
||||
{true, "src/bootstrap-custom.js", true},
|
||||
// {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23
|
||||
}
|
||||
|
||||
func TestIsVendor(t *testing.T) {
|
||||
for _, tt := range vendorTests {
|
||||
t.Run(tt.path, func(t *testing.T) {
|
||||
if got := IsVendor(tt.path); got != tt.expected {
|
||||
t.Errorf("IsVendor(%q) = %v, expected %v", tt.path, got, tt.expected)
|
||||
for _, test := range vendorTests {
|
||||
t.Run(test.path, func(t *testing.T) {
|
||||
if got := IsVendor(test.path); got != test.expected {
|
||||
if regex.Name == regex.RE2 && test.skipOnRE2 {
|
||||
return // skip
|
||||
}
|
||||
t.Errorf("IsVendor(%q) = %v, expected %v (usuing %s)", test.path, got, test.expected, regex.Name)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user