new by content heuristics, more

This commit is contained in:
Máximo Cuadros 2016-07-14 18:12:12 +02:00
parent b1a3085e44
commit 52986d00fc
3 changed files with 107 additions and 10 deletions

View File

@ -929,13 +929,17 @@ var LanguagesByExtension = map[string][]string{
}
func init() {
LanguagesByExtension[".cgi"] = []string{OtherLanguage}
LanguagesByExtension[".fcgi"] = []string{OtherLanguage}
for _, l := range ignoredExtensions {
LanguagesByExtension[l] = []string{OtherLanguage}
}
ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension)
}
var (
	// ExtensionsByLanguage is indexed by language name and holds the
	// extensions recorded for that language. It is nil until init assigns it.
	ExtensionsByLanguage map[string][]string

	// ignoredExtensions lists the extensions that init registers as
	// OtherLanguage.
	ignoredExtensions = []string{".asc", ".cgi", ".fcgi", ".gml", ".vhost"}
)
// GetLanguageExtensions returns the extensions stored for language in
// ExtensionsByLanguage. The result is nil when the language has no entry.
func GetLanguageExtensions(language string) []string {
	extensions, known := ExtensionsByLanguage[language]
	if !known {
		return nil
	}
	return extensions
}

View File

@ -18,6 +18,8 @@ func GetLanguageByContent(filename string, content []byte) (lang string, safe bo
type languageMatcher func([]byte) (string, bool)
var matchers = map[string]languageMatcher{
".bf": bfExtLanguage,
".b": bExtLanguage,
".cl": clExtLanguage,
".inc": incExtLanguage,
".cls": clsExtLanguage,
@ -32,12 +34,15 @@ var matchers = map[string]languageMatcher{
".lisp": lispExtLanguage,
".lsp": lispExtLanguage,
".pm": pmExtLanguage,
".t": pmExtLanguage,
".t": tExtLanguage,
".ts": tsExtLanguage,
".tsx": tsxExtLanguage,
".rs": rsExtLanguage,
".pl": plExtLanguage,
".pro": proExtLanguage,
".toc": tocExtLanguage,
".sls": slsExtLanguage,
".sql": sqlExtLanguage,
}
var (
@ -52,6 +57,22 @@ var (
)
)
// bExtLanguage chooses between Limbo and Brainfuck for the given content.
//
// Content containing "include" or "modules" is reported as Limbo with true.
// Anything else falls back to Brainfuck with false, signalling that no
// heuristic actually matched.
func bExtLanguage(input []byte) (string, bool) {
	limboHint := substring.BytesRegexp(`(include|modules)`)
	if !limboHint.Match(input) {
		return "Brainfuck", false
	}
	return "Limbo", true
}
// bfExtLanguage chooses between HyPhy and Brainfuck for the given content.
//
// Content containing "fprintf", "function" or "return" is reported as HyPhy
// with true. Anything else falls back to Brainfuck with false, signalling that
// no heuristic actually matched.
func bfExtLanguage(input []byte) (string, bool) {
	switch {
	case substring.BytesRegexp(`(fprintf|function|return)`).Match(input):
		return "HyPhy", true
	default:
		return "Brainfuck", false
	}
}
func incExtLanguage(input []byte) (string, bool) {
if substring.BytesRegexp(`^<\?(?:php)?`).Match(input) {
return "PHP", true
@ -190,7 +211,7 @@ func clsExtLanguage(input []byte) (string, bool) {
}
var (
mathematicaMatcher = substring.BytesHas(`\s*\(\*`)
mathematicaMatcher = substring.BytesHas(`\n\s*\(\*`)
matlabMatcher = substring.BytesRegexp(`\b(function\s*[\[a-zA-Z]+|pcolor|classdef|figure|end|elseif)\b`)
objectiveCMatcher = substring.BytesRegexp(
`@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">]`)
@ -199,10 +220,18 @@ var (
// mExtLanguage tries the matchers below in order and returns the language of
// the first one that matches together with true. When none match it returns
// OtherLanguage and false.
// Presumably this is the matcher registered for the ".m" extension; the
// registration is not visible here, so confirm against the matchers map.
func mExtLanguage(input []byte) (string, bool) {
if objectiveCMatcher.Match(input) {
return "Objective-C", true
} else if matlabMatcher.Match(input) {
return "Matlab", true
} else if substring.BytesHas(`:- module`).Match(input) {
return "Mercury", true
} else if substring.BytesRegexp(`\n: `).Match(input) {
return "MUF", true
} else if substring.BytesRegexp(`^\s*;`).Match(input) {
return "M", true
} else if mathematicaMatcher.Match(input) {
return "Mathematica", true
// NOTE(review): matlabMatcher has already been tested in the second branch
// above, so the branch below can never be taken. One of the two checks looks
// like a leftover; confirm which position is intended.
} else if matlabMatcher.Match(input) {
return "Matlab", true
// NOTE(review): the pattern below reads like a module declaration, and GitHub
// Linguist's ".m" heuristic maps the same pattern to Limbo, yet this branch
// returns "Matlab". Confirm the label.
} else if substring.BytesRegexp(`^\w+\s*:\s*module\s*{`).Match(input) {
return "Matlab", true
}
return OtherLanguage, false
@ -244,6 +273,20 @@ func pmExtLanguage(input []byte) (string, bool) {
return "Perl", false
}
// tExtLanguage disambiguates content between Perl, Perl6 and Turing.
//
// The checks run in order and the first match wins; the returned bool is true
// whenever a heuristic matched. When nothing matches, "Perl" is returned with
// false so callers can tell the answer is only a fallback.
func tExtLanguage(input []byte) (string, bool) {
	switch {
	case perlMatcher.Match(input):
		return "Perl", true
	case perl6Matcher.Match(input):
		return "Perl6", true
	case substring.BytesRegexp(`^\s*%|^\s*var\s+\w+\s*:\s*\w+`).Match(input):
		// A leading "%" or a "var name : type" declaration is Turing syntax.
		// This branch used to return "RenderScript", which belongs to the
		// ".rs" heuristics.
		return "Turing", true
	case substring.BytesRegexp(`^\s*use\s+v6\s*;`).Match(input):
		// "use v6;" is the Perl 6 version pragma; it was also mislabelled
		// as "RenderScript".
		return "Perl6", true
	}
	return "Perl", false
}
func rsExtLanguage(input []byte) (string, bool) {
if substring.BytesRegexp(`(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(input) {
return "Rust", true
@ -281,3 +324,47 @@ func slsExtLanguage(input []byte) (string, bool) {
return OtherLanguage, false
}
// Matchers used by sqlExtLanguage to tell SQL dialects apart. Every pattern is
// case-insensitive because SQL keywords may be written in any case.
var (
	// pgSQLMatcher recognises the constructs that sqlExtLanguage reports as
	// PLpgSQL.
	pgSQLMatcher = substring.BytesOr(
		substring.BytesRegexp(`(?i)\\i\b|AS \$\$|LANGUAGE '?plpgsql'?`),
		substring.BytesRegexp(`(?i)SECURITY (DEFINER|INVOKER)`),
		// This was the only pattern without (?i), so "begin;" was missed
		// while "BEGIN;" matched.
		substring.BytesRegexp(`(?i)BEGIN( WORK| TRANSACTION)?;`),
	)

	// db2SQLMatcher recognises the constructs that sqlExtLanguage reports as
	// SQLPL.
	db2SQLMatcher = substring.BytesOr(
		substring.BytesRegexp(`(?i)(alter module)|(language sql)|(begin( NOT)+ atomic)`),
		substring.BytesRegexp(`(?i)signal SQLSTATE '[0-9]+'`),
	)

	// oracleSQLMatcher recognises the constructs that sqlExtLanguage reports
	// as PLSQL.
	oracleSQLMatcher = substring.BytesOr(
		substring.BytesRegexp(`(?i)\$\$PLSQL_|XMLTYPE|sysdate|systimestamp|\.nextval|connect by|AUTHID (DEFINER|CURRENT_USER)`),
		substring.BytesRegexp(`(?i)constructor\W+function`),
	)
)
// sqlExtLanguage picks an SQL dialect for the given content.
//
// pgSQLMatcher, db2SQLMatcher and oracleSQLMatcher are consulted in that order
// and the first match decides the dialect, returned with true. Content that
// matches none of them is reported as plain "SQL" with false.
func sqlExtLanguage(input []byte) (string, bool) {
	switch {
	case pgSQLMatcher.Match(input):
		return "PLpgSQL", true
	case db2SQLMatcher.Match(input):
		return "SQLPL", true
	case oracleSQLMatcher.Match(input):
		return "PLSQL", true
	default:
		return "SQL", false
	}
}
// tsExtLanguage chooses between XML and TypeScript for the given content.
//
// Content containing a closing "</TS>" tag is reported as XML; everything else
// is reported as TypeScript. The bool result is true in both cases.
func tsExtLanguage(input []byte) (string, bool) {
	hasClosingTS := substring.BytesHas("</TS>").Match(input)
	if !hasClosingTS {
		return "TypeScript", true
	}
	return "XML", true
}
// tsxExtLanguage chooses between XML and TypeScript for the given content.
//
// Content containing a closing "</tileset>" tag is reported as XML; everything
// else is reported as TypeScript. The bool result is true in both cases.
func tsxExtLanguage(input []byte) (string, bool) {
	language := "TypeScript"
	if substring.BytesHas("</tileset>").Match(input) {
		language = "XML"
	}
	return language, true
}

View File

@ -64,14 +64,20 @@ func (s *TSuite) TestGetLanguageByContentLinguistCorpus(c *C) {
if f.Name() == "filenames" {
return filepath.SkipDir
}
return nil
}
expected := filepath.Base(filepath.Dir(path))
filename := filepath.Base(path)
extension := filepath.Ext(path)
content, _ := ioutil.ReadFile(path)
if extension == "" {
return nil
}
total++
expected := filepath.Base(filepath.Dir(path))
filename := filepath.Base(path)
content, _ := ioutil.ReadFile(path)
obtained, safe := GetLanguageByContent(filename, content)
if obtained == OtherLanguage {
other++