diff --git a/internal/code-generator/assets/content.go.tmpl b/internal/code-generator/assets/content.go.tmpl index f03669e..dd83166 100644 --- a/internal/code-generator/assets/content.go.tmpl +++ b/internal/code-generator/assets/content.go.tmpl @@ -1,9 +1,8 @@ package data import ( - "regexp" - "github.com/go-enry/go-enry/v2/data/rule" + "github.com/go-enry/go-enry/v2/regex" ) var ContentHeuristics = map[string]*Heuristics{ @@ -27,12 +26,12 @@ var ContentHeuristics = map[string]*Heuristics{ {{- else if eq .Op "Or" -}} rule.Or( {{ template "Languages" .Langs -}} - regexp.MustCompile({{ .Pattern | stringVal }}), + {{ template "mustCompile" . }} ), {{- else if eq .Op "Not" -}} rule.Not( {{ template "Languages" .Langs -}} - regexp.MustCompile({{ .Pattern | stringVal }}), + {{ template "mustCompile" . }} ), {{- else if eq .Op "Always" -}} rule.Always( @@ -49,3 +48,11 @@ var ContentHeuristics = map[string]*Heuristics{ rule.MatchingLanguages(""), {{end -}} {{end}} + +{{define "mustCompile" -}} + {{ if .IsRE2 -}} + regex.MustCompileMultiline({{ .Pattern | stringVal }}), + {{- else -}} + regex.MustCompileRuby({{ .Pattern | stringVal }}), + {{ end -}} +{{end}} diff --git a/internal/code-generator/generator/heuristics.go b/internal/code-generator/generator/heuristics.go index b226b8b..5a3475e 100644 --- a/internal/code-generator/generator/heuristics.go +++ b/internal/code-generator/generator/heuristics.go @@ -70,25 +70,24 @@ func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern subp := loadRule(namedPatterns, r) subPatterns = append(subPatterns, subp) } - result = &LanguagePattern{"And", rule.Languages, "", subPatterns} + result = &LanguagePattern{"And", rule.Languages, "", subPatterns, true} } else if len(rule.Pattern) != 0 { // OrPattern - conjunction := strings.Join(rule.Pattern, orPipe) - pattern := convertToValidRegexp(conjunction) - result = &LanguagePattern{"Or", rule.Languages, pattern, nil} + pattern := strings.Join(rule.Pattern, orPipe) + 
// TODO(bzz): handle len(Languages)==0 better e.g. by emitting rule.Rule + // instead of an ugly `rule.Or( rule.MatchingLanguages(""), ... )` + result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)} } else if rule.NegativePattern != "" { // NotPattern - pattern := convertToValidRegexp(rule.NegativePattern) - result = &LanguagePattern{"Not", rule.Languages, pattern, nil} + pattern := rule.NegativePattern + result = &LanguagePattern{"Not", rule.Languages, pattern, nil, isRE2(pattern)} } else if rule.NamedPattern != "" { // Named OrPattern - conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe) - pattern := convertToValidRegexp(conjunction) - result = &LanguagePattern{"Or", rule.Languages, pattern, nil} + pattern := strings.Join(namedPatterns[rule.NamedPattern], orPipe) + result = &LanguagePattern{"Or", rule.Languages, pattern, nil, isRE2(pattern)} } else { // AlwaysPattern - result = &LanguagePattern{"Always", rule.Languages, "", nil} + result = &LanguagePattern{"Always", rule.Languages, "", nil, true} } - if isUnsupportedRegexpSyntax(result.Pattern) { - log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern) - return nil + if !isRE2(result.Pattern) { + log.Printf("RE2 incompatible rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern) } return result } @@ -100,6 +99,7 @@ type LanguagePattern struct { Langs []string Pattern string Rules []*LanguagePattern + IsRE2 bool } type Heuristics struct { @@ -125,7 +125,7 @@ type Patterns struct { } // StringArray is workaround for parsing named_pattern, -// wich is sometimes arry and sometimes not. +// which is sometimes an array and sometimes is not. // See https://github.com/go-yaml/yaml/issues/100 type StringArray []string @@ -173,8 +173,6 @@ func isUnsupportedRegexpSyntax(reg string) bool { (strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`)) } -// convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent. 
-// Does not work with Ruby regexp literals. -func convertToValidRegexp(rubyRegexp string) string { - return multilinePrefix + rubyRegexp +func isRE2(s string) bool { + return !isUnsupportedRegexpSyntax(s) } diff --git a/regex/oniguruma.go b/regex/oniguruma.go index 0c9660e..40462da 100644 --- a/regex/oniguruma.go +++ b/regex/oniguruma.go @@ -1,3 +1,4 @@ +//go:build oniguruma // +build oniguruma package regex @@ -8,8 +9,17 @@ import ( type EnryRegexp = *rubex.Regexp -func MustCompile(str string) EnryRegexp { - return rubex.MustCompileASCII(str) +func MustCompile(s string) EnryRegexp { + return rubex.MustCompileASCII(s) +} + +// MustCompileMultiline matches in multi-line mode by default with Oniguruma. +func MustCompileMultiline(s string) EnryRegexp { + return MustCompile(s) +} + +func MustCompileRuby(s string) EnryRegexp { + return MustCompile(s) } func QuoteMeta(s string) string { diff --git a/regex/standard.go b/regex/standard.go index b242403..5ca9607 100644 --- a/regex/standard.go +++ b/regex/standard.go @@ -1,3 +1,4 @@ +//go:build !oniguruma // +build !oniguruma package regex @@ -12,6 +13,20 @@ func MustCompile(str string) EnryRegexp { return regexp.MustCompile(str) } +// MustCompileMultiline mimics Ruby defaults for regexp, where ^$ matches begin/end of line. +// I.e. it converts Ruby regexp syntax to RE2 equivalent +func MustCompileMultiline(s string) EnryRegexp { + const multilineModeFlag = "(?m)" + return regexp.MustCompile(multilineModeFlag + s) +} + +// MustCompileRuby used for expressions with syntax not supported by RE2. +func MustCompileRuby(s string) EnryRegexp { + // TODO(bzz): find a better way? + // This will only trigger a panic on .Match() for the clients + return nil +} + func QuoteMeta(s string) string { return regexp.QuoteMeta(s) }