diff --git a/README.md b/README.md index a9b19ef..a082a0b 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,8 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample - [Heuristics for ".as" extension](https://github.com/github/linguist/blob/223c00bb80eff04788e29010f98c5778993d2b2a/lib/linguist/heuristics.yml#L67) in ActionScript could not be parsed, due to unsupported positive lookahead in RE2 regexp engine. +- [Heuristics for ".csc", ".gsc" and ".gsh" extensions](https://github.com/github/linguist/blob/7469c7982d93f2ad922230d712f586a353dc1a42/lib/linguist/heuristics.yml#L650-L651) in GSC could not be parsed, due to unsupported non-backtracking subexpressions in RE2 regexp engine. + - As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). - Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194). 
diff --git a/data/content.go b/data/content.go index 00ea011..2872231 100644 --- a/data/content.go +++ b/data/content.go @@ -697,12 +697,7 @@ var ContentHeuristics = map[string]*Heuristics{ regexp.MustCompile(`(?m)^(\s*namespace\s*[\w\.]+\s*{|\s*\/\/)`), ), }, - ".csc": &Heuristics{ - rule.Or( - rule.MatchingLanguages("GSC"), - regexp.MustCompile(`(?m)^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w|^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(|\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;|^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(`), - ), - }, + ".csc": &Heuristics{}, ".csl": &Heuristics{ rule.Or( rule.MatchingLanguages("XML"), @@ -879,18 +874,8 @@ var ContentHeuristics = map[string]*Heuristics{ regexp.MustCompile(`(?m)^\[indent=[0-9]+\]`), ), }, - ".gsc": &Heuristics{ - rule.Or( - rule.MatchingLanguages("GSC"), - regexp.MustCompile(`(?m)^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w|^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(|\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;|^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(`), - ), - }, - ".gsh": &Heuristics{ - rule.Or( - rule.MatchingLanguages("GSC"), - regexp.MustCompile(`(?m)^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w|^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(|\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;|^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(`), - ), - }, + ".gsc": &Heuristics{}, + ".gsh": &Heuristics{}, ".h": &Heuristics{ rule.Or( rule.MatchingLanguages("Objective-C"), diff --git a/internal/code-generator/generator/heuristics.go b/internal/code-generator/generator/heuristics.go index bd8a9af..0b0c2f4 100644 --- a/internal/code-generator/generator/heuristics.go +++ 
b/internal/code-generator/generator/heuristics.go @@ -39,7 +39,7 @@ func GenHeuristics(fileToParse, _, outPath, tmplPath, tmplName, commit string) e // loadHeuristics transforms parsed YAML to map[".ext"]->IR for code generation. func loadHeuristics(yaml *Heuristics) (map[string][]*LanguagePattern, error) { - var patterns = make(map[string][]*LanguagePattern) + patterns := make(map[string][]*LanguagePattern) for _, disambiguation := range yaml.Disambiguations { var rules []*LanguagePattern for _, rule := range disambiguation.Rules { @@ -161,13 +161,14 @@ func parseYaml(file string) (*Heuristics, error) { // isUnsupportedRegexpSyntax filters regexp syntax that is not supported by RE2. // In particular, we stumbled up on usage of next cases: // - lookbehind & lookahead +// - non-backtracking subexpressions // - named & numbered capturing group/after text matching // - backreference // - possessive quantifier // For referece on supported syntax see https://github.com/google/re2/wiki/Syntax func isUnsupportedRegexpSyntax(reg string) bool { return strings.Contains(reg, `(?<`) || strings.Contains(reg, `(?=`) || strings.Contains(reg, `(?!`) || - strings.Contains(reg, `\1`) || strings.Contains(reg, `*+`) || + strings.Contains(reg, `(?>`) || strings.Contains(reg, `\1`) || strings.Contains(reg, `*+`) || // See https://github.com/github/linguist/pull/4243#discussion_r246105067 (strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`)) } diff --git a/internal/code-generator/generator/test_files/content.gold b/internal/code-generator/generator/test_files/content.gold index 00ea011..2872231 100644 --- a/internal/code-generator/generator/test_files/content.gold +++ b/internal/code-generator/generator/test_files/content.gold @@ -697,12 +697,7 @@ var ContentHeuristics = map[string]*Heuristics{ regexp.MustCompile(`(?m)^(\s*namespace\s*[\w\.]+\s*{|\s*\/\/)`), ), }, - ".csc": &Heuristics{ - rule.Or( - rule.MatchingLanguages("GSC"), - 
regexp.MustCompile(`(?m)^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w|^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(|\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;|^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(`), - ), - }, + ".csc": &Heuristics{}, ".csl": &Heuristics{ rule.Or( rule.MatchingLanguages("XML"), @@ -879,18 +874,8 @@ var ContentHeuristics = map[string]*Heuristics{ regexp.MustCompile(`(?m)^\[indent=[0-9]+\]`), ), }, - ".gsc": &Heuristics{ - rule.Or( - rule.MatchingLanguages("GSC"), - regexp.MustCompile(`(?m)^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w|^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(|\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;|^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(`), - ), - }, - ".gsh": &Heuristics{ - rule.Or( - rule.MatchingLanguages("GSC"), - regexp.MustCompile(`(?m)^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w|^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(|\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;|^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(`), - ), - }, + ".gsc": &Heuristics{}, + ".gsh": &Heuristics{}, ".h": &Heuristics{ rule.Or( rule.MatchingLanguages("Objective-C"),