code-gen: make content heuristics regexp engine configurable & generation syntax-aware

This commit is contained in:
Alex Bezzubov
2022-12-25 11:56:58 +01:00
parent 0b92f97b9c
commit 5e590f3554
4 changed files with 54 additions and 24 deletions

View File

@ -1,9 +1,8 @@
package data package data
import ( import (
"regexp"
"github.com/go-enry/go-enry/v2/data/rule" "github.com/go-enry/go-enry/v2/data/rule"
"github.com/go-enry/go-enry/v2/regex"
) )
var ContentHeuristics = map[string]*Heuristics{ var ContentHeuristics = map[string]*Heuristics{
@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
{{- else if eq .Op "Or" -}} {{- else if eq .Op "Or" -}}
rule.Or( rule.Or(
{{ template "Languages" .Langs -}} {{ template "Languages" .Langs -}}
regexp.MustCompile({{ .Pattern | stringVal }}), {{ template "mustCompile" . }}
), ),
{{- else if eq .Op "Not" -}} {{- else if eq .Op "Not" -}}
rule.Not( rule.Not(
{{ template "Languages" .Langs -}} {{ template "Languages" .Langs -}}
regexp.MustCompile({{ .Pattern | stringVal }}), {{ template "mustCompile" . }}
), ),
{{- else if eq .Op "Always" -}} {{- else if eq .Op "Always" -}}
rule.Always( rule.Always(
@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
rule.MatchingLanguages(""), rule.MatchingLanguages(""),
{{end -}} {{end -}}
{{end}} {{end}}
{{define "mustCompile" -}}
{{- /* Emits the regex constructor call for .Pattern (rendered via stringVal): regex.MustCompileMultiline when .IsRE2 is true, regex.MustCompileRuby otherwise. */ -}}
{{ if .IsRE2 -}}
regex.MustCompileMultiline({{ .Pattern | stringVal }}),
{{- else -}}
regex.MustCompileRuby({{ .Pattern | stringVal }}),
{{ end -}}
{{end}}

View File

@ -70,25 +70,24 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
subp := loadRule(namedPatterns, r) subp := loadRule(namedPatterns, r)
subPatterns = append(subPatterns, subp) subPatterns = append(subPatterns, subp)
} }
result = &LanguagePattern{"And", rule.Languages, "", subPatterns} result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
} else if len(rule.Pattern) != 0 { // OrPattern } else if len(rule.Pattern) != 0 { // OrPattern
conjunction := strings.Join(rule.Pattern, orPipe) pattern := strings.Join(rule.Pattern, orPipe)
pattern := convertToValidRegexp(conjunction) // TODO(bzz): handle len(Languages)==0 better e.g. by emiting rule.Rule
result = &LanguagePattern{"Or", rule.Languages, pattern, nil} // instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )`
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
} else if rule.NegativePattern != "" { // NotPattern } else if rule.NegativePattern != "" { // NotPattern
pattern := convertToValidRegexp(rule.NegativePattern) pattern := rule.NegativePattern
result = &LanguagePattern{"Not", rule.Languages, pattern, nil} result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
} else if rule.NamedPattern != "" { // Named OrPattern } else if rule.NamedPattern != "" { // Named OrPattern
conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe) pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
pattern := convertToValidRegexp(conjunction) result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
} else { // AlwaysPattern } else { // AlwaysPattern
result = &LanguagePattern{"Always", rule.Languages, "", nil} result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
} }
if isUnsupportedRegexpSyntax(result.Pattern) { if !isRE2(result.Pattern) {
log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern) log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
return nil
} }
return result return result
} }
@ -100,6 +99,7 @@ type LanguagePattern struct {
Langs []string Langs []string
Pattern string Pattern string
Rules []*LanguagePattern Rules []*LanguagePattern
IsRE2 bool
} }
type Heuristics struct { type Heuristics struct {
@ -125,7 +125,7 @@ type Patterns struct {
} }
// StringArray is workaround for parsing named_pattern, // StringArray is workaround for parsing named_pattern,
// wich is sometimes arry and sometimes not. // wich is sometimes an array and sometimes is not.
// See https://github.com/go-yaml/yaml/issues/100 // See https://github.com/go-yaml/yaml/issues/100
type StringArray []string type StringArray []string
@ -173,8 +173,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`)) (strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
} }
// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent. func isRE2(s string) bool {
// Does not work with Ruby regexp literals. return !isUnsupportedRegexpSyntax(s)
func convertToValidRegexp(rubyRegexp string) string {
return multilinePrefix + rubyRegexp
} }

View File

@ -1,3 +1,4 @@
//go:build oniguruma
// +build oniguruma // +build oniguruma
package regex package regex
@ -8,8 +9,17 @@ import (
type EnryRegexp = *rubex.Regexp type EnryRegexp = *rubex.Regexp
func MustCompile(str string) EnryRegexp { func MustCompile(s string) EnryRegexp {
return rubex.MustCompileASCII(str) return rubex.MustCompileASCII(s)
}
// MustCompileMultiline compiles s by delegating to MustCompile unchanged.
// No multi-line flag is prepended here: with Oniguruma, matching is in
// multi-line mode by default.
func MustCompileMultiline(s string) EnryRegexp {
return MustCompile(s)
}
// MustCompileRuby compiles s by delegating to MustCompile, exactly like
// MustCompileMultiline does in this oniguruma build.
func MustCompileRuby(s string) EnryRegexp {
return MustCompile(s)
} }
func QuoteMeta(s string) string { func QuoteMeta(s string) string {

View File

@ -1,3 +1,4 @@
//go:build !oniguruma
// +build !oniguruma // +build !oniguruma
package regex package regex
@ -12,6 +13,20 @@ func MustCompile(str string) EnryRegexp {
return regexp.MustCompile(str) return regexp.MustCompile(str)
} }
// MustCompileMultiline compiles s with the "(?m)" flag prepended, so that
// ^ and $ match at the beginning and end of every line. This mimics Ruby's
// regexp defaults, i.e. it adapts Ruby regexp syntax to its RE2 equivalent.
func MustCompileMultiline(s string) EnryRegexp {
	return regexp.MustCompile("(?m)" + s)
}
// MustCompileRuby is used for expressions with syntax not supported by RE2.
// In this (!oniguruma) build it does not compile s and always returns nil.
func MustCompileRuby(s string) EnryRegexp {
// TODO(bzz): find a better way?
// Returning nil does not fail here; it will only trigger a panic
// on .Match() for the clients.
return nil
}
func QuoteMeta(s string) string { func QuoteMeta(s string) string {
return regexp.QuoteMeta(s) return regexp.QuoteMeta(s)
} }