code-gen: make content heuristics regexp engine configurable & generation syntax-aware

This commit is contained in:
Alex Bezzubov
2022-12-25 11:56:58 +01:00
parent 0b92f97b9c
commit 5e590f3554
4 changed files with 54 additions and 24 deletions

View File

@ -1,9 +1,8 @@
package data
import (
"regexp"
"github.com/go-enry/go-enry/v2/data/rule"
"github.com/go-enry/go-enry/v2/regex"
)
var ContentHeuristics = map[string]*Heuristics{
@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
{{- else if eq .Op "Or" -}}
rule.Or(
{{ template "Languages" .Langs -}}
regexp.MustCompile({{ .Pattern | stringVal }}),
{{ template "mustCompile" . }}
),
{{- else if eq .Op "Not" -}}
rule.Not(
{{ template "Languages" .Langs -}}
regexp.MustCompile({{ .Pattern | stringVal }}),
{{ template "mustCompile" . }}
),
{{- else if eq .Op "Always" -}}
rule.Always(
@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
rule.MatchingLanguages(""),
{{end -}}
{{end}}
{{define "mustCompile" -}}
{{- /* Emits the regex constructor call for the current pattern: regex.MustCompileMultiline when .IsRE2 is set, regex.MustCompileRuby otherwise. The trailing comma is part of the output because callers place this inside an argument list. */ -}}
{{ if .IsRE2 -}}
regex.MustCompileMultiline({{ .Pattern | stringVal }}),
{{- else -}}
regex.MustCompileRuby({{ .Pattern | stringVal }}),
{{ end -}}
{{end}}

View File

@ -70,25 +70,24 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
subp := loadRule(namedPatterns, r)
subPatterns = append(subPatterns, subp)
}
result = &LanguagePattern{"And", rule.Languages, "", subPatterns}
result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
} else if len(rule.Pattern) != 0 { // OrPattern
conjunction := strings.Join(rule.Pattern, orPipe)
pattern := convertToValidRegexp(conjunction)
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
pattern := strings.Join(rule.Pattern, orPipe)
// TODO(bzz): handle len(Languages)==0 better e.g. by emitting rule.Rule
// instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )`
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
} else if rule.NegativePattern != "" { // NotPattern
pattern := convertToValidRegexp(rule.NegativePattern)
result = &LanguagePattern{"Not", rule.Languages, pattern, nil}
pattern := rule.NegativePattern
result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
} else if rule.NamedPattern != "" { // Named OrPattern
conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
pattern := convertToValidRegexp(conjunction)
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
} else { // AlwaysPattern
result = &LanguagePattern{"Always", rule.Languages, "", nil}
result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
}
if isUnsupportedRegexpSyntax(result.Pattern) {
log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
return nil
if !isRE2(result.Pattern) {
log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
}
return result
}
@ -100,6 +99,7 @@ type LanguagePattern struct {
Langs []string
Pattern string
Rules []*LanguagePattern
IsRE2 bool
}
type Heuristics struct {
@ -125,7 +125,7 @@ type Patterns struct {
}
// StringArray is workaround for parsing named_pattern,
// wich is sometimes arry and sometimes not.
// which is sometimes an array and sometimes is not.
// See https://github.com/go-yaml/yaml/issues/100
type StringArray []string
@ -173,8 +173,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
}
// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent.
// Does not work with Ruby regexp literals.
func convertToValidRegexp(rubyRegexp string) string {
return multilinePrefix + rubyRegexp
// isRE2 reports whether s can be treated as an RE2-compatible pattern,
// i.e. isUnsupportedRegexpSyntax does not flag it.
func isRE2(s string) bool {
	unsupported := isUnsupportedRegexpSyntax(s)
	return !unsupported
}