mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-27 14:47:50 -03:00
code-gen: make content heuristics regexp engine configurable & generation syntax-aware
This commit is contained in:
@ -1,9 +1,8 @@
|
||||
package data
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/data/rule"
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
)
|
||||
|
||||
var ContentHeuristics = map[string]*Heuristics{
|
||||
@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{
|
||||
{{- else if eq .Op "Or" -}}
|
||||
rule.Or(
|
||||
{{ template "Languages" .Langs -}}
|
||||
regexp.MustCompile({{ .Pattern | stringVal }}),
|
||||
{{ template "mustCompile" . }}
|
||||
),
|
||||
{{- else if eq .Op "Not" -}}
|
||||
rule.Not(
|
||||
{{ template "Languages" .Langs -}}
|
||||
regexp.MustCompile({{ .Pattern | stringVal }}),
|
||||
{{ template "mustCompile" . }}
|
||||
),
|
||||
{{- else if eq .Op "Always" -}}
|
||||
rule.Always(
|
||||
@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{
|
||||
rule.MatchingLanguages(""),
|
||||
{{end -}}
|
||||
{{end}}
|
||||
|
||||
{{define "mustCompile" -}}
|
||||
{{ if .IsRE2 -}}
|
||||
regex.MustCompileMultiline({{ .Pattern | stringVal }}),
|
||||
{{- else -}}
|
||||
regex.MustCompileRuby({{ .Pattern | stringVal }}),
|
||||
{{ end -}}
|
||||
{{end}}
|
||||
|
@ -70,25 +70,24 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern
|
||||
subp := loadRule(namedPatterns, r)
|
||||
subPatterns = append(subPatterns, subp)
|
||||
}
|
||||
result = &LanguagePattern{"And", rule.Languages, "", subPatterns}
|
||||
result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true}
|
||||
} else if len(rule.Pattern) != 0 { // OrPattern
|
||||
conjunction := strings.Join(rule.Pattern, orPipe)
|
||||
pattern := convertToValidRegexp(conjunction)
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
|
||||
pattern := strings.Join(rule.Pattern, orPipe)
|
||||
// TODO(bzz): handle len(Languages)==0 better e.g. by emiting rule.Rule
|
||||
// instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )`
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||
} else if rule.NegativePattern != "" { // NotPattern
|
||||
pattern := convertToValidRegexp(rule.NegativePattern)
|
||||
result = &LanguagePattern{"Not", rule.Languages, pattern, nil}
|
||||
pattern := rule.NegativePattern
|
||||
result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||
} else if rule.NamedPattern != "" { // Named OrPattern
|
||||
conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
|
||||
pattern := convertToValidRegexp(conjunction)
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
|
||||
pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
|
||||
result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)}
|
||||
} else { // AlwaysPattern
|
||||
result = &LanguagePattern{"Always", rule.Languages, "", nil}
|
||||
result = &LanguagePattern{"Always", rule.Languages, "", nil, true}
|
||||
}
|
||||
|
||||
if isUnsupportedRegexpSyntax(result.Pattern) {
|
||||
log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
|
||||
return nil
|
||||
if !isRE2(result.Pattern) {
|
||||
log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
|
||||
}
|
||||
return result
|
||||
}
|
||||
@ -100,6 +99,7 @@ type LanguagePattern struct {
|
||||
Langs []string
|
||||
Pattern string
|
||||
Rules []*LanguagePattern
|
||||
IsRE2 bool
|
||||
}
|
||||
|
||||
type Heuristics struct {
|
||||
@ -125,7 +125,7 @@ type Patterns struct {
|
||||
}
|
||||
|
||||
// StringArray is workaround for parsing named_pattern,
|
||||
// wich is sometimes arry and sometimes not.
|
||||
// which is sometimes an array and sometimes is not.
|
||||
// See https://github.com/go-yaml/yaml/issues/100
|
||||
type StringArray []string
|
||||
|
||||
@ -173,8 +173,6 @@ func isUnsupportedRegexpSyntax(reg string) bool {
|
||||
(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
|
||||
}
|
||||
|
||||
// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent.
|
||||
// Does not work with Ruby regexp literals.
|
||||
func convertToValidRegexp(rubyRegexp string) string {
|
||||
return multilinePrefix + rubyRegexp
|
||||
// isRE2 reports whether s can be handled as an RE2 pattern, which here means
// that isUnsupportedRegexpSyntax(s) is false.
func isRE2(s string) bool {
|
||||
return !isUnsupportedRegexpSyntax(s)
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
//go:build oniguruma
|
||||
// +build oniguruma
|
||||
|
||||
package regex
|
||||
@ -8,8 +9,17 @@ import (
|
||||
|
||||
type EnryRegexp = *rubex.Regexp
|
||||
|
||||
func MustCompile(str string) EnryRegexp {
|
||||
return rubex.MustCompileASCII(str)
|
||||
// MustCompile compiles s with rubex.MustCompileASCII and returns the result.
func MustCompile(s string) EnryRegexp {
|
||||
return rubex.MustCompileASCII(s)
|
||||
}
|
||||
|
||||
// MustCompileMultiline matches in multi-line mode by default with Oniguruma,
// so s is passed to MustCompile unchanged.
|
||||
func MustCompileMultiline(s string) EnryRegexp {
|
||||
return MustCompile(s)
|
||||
}
|
||||
|
||||
// MustCompileRuby compiles s by passing it to MustCompile unchanged.
func MustCompileRuby(s string) EnryRegexp {
|
||||
return MustCompile(s)
|
||||
}
|
||||
|
||||
func QuoteMeta(s string) string {
|
||||
|
@ -1,3 +1,4 @@
|
||||
//go:build !oniguruma
|
||||
// +build !oniguruma
|
||||
|
||||
package regex
|
||||
@ -12,6 +13,20 @@ func MustCompile(str string) EnryRegexp {
|
||||
return regexp.MustCompile(str)
|
||||
}
|
||||
|
||||
// MustCompileMultiline mimics Ruby defaults for regexp, where ^$ matches begin/end of line.
|
||||
// I.e. it converts Ruby regexp syntax to its RE2 equivalent by prefixing s with the (?m) flag.
|
||||
func MustCompileMultiline(s string) EnryRegexp {
|
||||
const multilineModeFlag = "(?m)"
|
||||
return regexp.MustCompile(multilineModeFlag + s)
|
||||
}
|
||||
|
||||
// MustCompileRuby is used for expressions with syntax not supported by RE2.
// In this (!oniguruma) build it does not compile s and always returns nil.
|
||||
func MustCompileRuby(s string) EnryRegexp {
|
||||
// TODO(bzz): find a better way?
|
||||
// This will only trigger a panic on .Match() for the clients.
|
||||
return nil
|
||||
}
|
||||
|
||||
// QuoteMeta returns s with regular expression metacharacters escaped,
// as produced by the standard library's regexp.QuoteMeta.
func QuoteMeta(s string) string {
|
||||
return regexp.QuoteMeta(s)
|
||||
}
|
||||
|
Reference in New Issue
Block a user