mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 08:30:07 -03:00
new by content heuristics, more
This commit is contained in:
parent
b1a3085e44
commit
52986d00fc
@ -929,13 +929,17 @@ var LanguagesByExtension = map[string][]string{
|
||||
}
|
||||
|
||||
func init() {
|
||||
LanguagesByExtension[".cgi"] = []string{OtherLanguage}
|
||||
LanguagesByExtension[".fcgi"] = []string{OtherLanguage}
|
||||
for _, l := range ignoredExtensions {
|
||||
LanguagesByExtension[l] = []string{OtherLanguage}
|
||||
}
|
||||
|
||||
ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension)
|
||||
}
|
||||
|
||||
// ExtensionsByLanguage is keyed by language name and holds the extensions
// recorded for that language. It is assigned in init from
// reverseStringListMap(LanguagesByExtension).
var ExtensionsByLanguage map[string][]string
|
||||
|
||||
// ignoredExtensions lists the extensions that init forces to map to
// OtherLanguage in LanguagesByExtension.
var ignoredExtensions = []string{".asc", ".cgi", ".fcgi", ".gml", ".vhost"}
|
||||
|
||||
func GetLanguageExtensions(language string) []string {
|
||||
return ExtensionsByLanguage[language]
|
||||
}
|
||||
|
95
content.go
95
content.go
@ -18,6 +18,8 @@ func GetLanguageByContent(filename string, content []byte) (lang string, safe bo
|
||||
type languageMatcher func([]byte) (string, bool)
|
||||
|
||||
var matchers = map[string]languageMatcher{
|
||||
".bf": bfExtLanguage,
|
||||
".b": bExtLanguage,
|
||||
".cl": clExtLanguage,
|
||||
".inc": incExtLanguage,
|
||||
".cls": clsExtLanguage,
|
||||
@ -32,12 +34,15 @@ var matchers = map[string]languageMatcher{
|
||||
".lisp": lispExtLanguage,
|
||||
".lsp": lispExtLanguage,
|
||||
".pm": pmExtLanguage,
|
||||
".t": pmExtLanguage,
|
||||
".t": tExtLanguage,
|
||||
".ts": tsExtLanguage,
|
||||
".tsx": tsxExtLanguage,
|
||||
".rs": rsExtLanguage,
|
||||
".pl": plExtLanguage,
|
||||
".pro": proExtLanguage,
|
||||
".toc": tocExtLanguage,
|
||||
".sls": slsExtLanguage,
|
||||
".sql": sqlExtLanguage,
|
||||
}
|
||||
|
||||
var (
|
||||
@ -52,6 +57,22 @@ var (
|
||||
)
|
||||
)
|
||||
|
||||
func bExtLanguage(input []byte) (string, bool) {
|
||||
if substring.BytesRegexp(`(include|modules)`).Match(input) {
|
||||
return "Limbo", true
|
||||
}
|
||||
|
||||
return "Brainfuck", false
|
||||
}
|
||||
|
||||
func bfExtLanguage(input []byte) (string, bool) {
|
||||
if substring.BytesRegexp(`(fprintf|function|return)`).Match(input) {
|
||||
return "HyPhy", true
|
||||
}
|
||||
|
||||
return "Brainfuck", false
|
||||
}
|
||||
|
||||
func incExtLanguage(input []byte) (string, bool) {
|
||||
if substring.BytesRegexp(`^<\?(?:php)?`).Match(input) {
|
||||
return "PHP", true
|
||||
@ -190,7 +211,7 @@ func clsExtLanguage(input []byte) (string, bool) {
|
||||
}
|
||||
|
||||
var (
|
||||
mathematicaMatcher = substring.BytesHas(`\s*\(\*`)
|
||||
mathematicaMatcher = substring.BytesHas(`\n\s*\(\*`)
|
||||
matlabMatcher = substring.BytesRegexp(`\b(function\s*[\[a-zA-Z]+|pcolor|classdef|figure|end|elseif)\b`)
|
||||
objectiveCMatcher = substring.BytesRegexp(
|
||||
`@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">]`)
|
||||
@ -199,10 +220,18 @@ var (
|
||||
func mExtLanguage(input []byte) (string, bool) {
|
||||
if objectiveCMatcher.Match(input) {
|
||||
return "Objective-C", true
|
||||
} else if matlabMatcher.Match(input) {
|
||||
return "Matlab", true
|
||||
} else if substring.BytesHas(`:- module`).Match(input) {
|
||||
return "Mercury", true
|
||||
} else if substring.BytesRegexp(`\n: `).Match(input) {
|
||||
return "MUF", true
|
||||
} else if substring.BytesRegexp(`^\s*;`).Match(input) {
|
||||
return "M", true
|
||||
} else if mathematicaMatcher.Match(input) {
|
||||
return "Mathematica", true
|
||||
} else if matlabMatcher.Match(input) {
|
||||
return "Matlab", true
|
||||
} else if substring.BytesRegexp(`^\w+\s*:\s*module\s*{`).Match(input) {
|
||||
return "Matlab", true
|
||||
}
|
||||
|
||||
return OtherLanguage, false
|
||||
@ -244,6 +273,20 @@ func pmExtLanguage(input []byte) (string, bool) {
|
||||
return "Perl", false
|
||||
}
|
||||
|
||||
func tExtLanguage(input []byte) (string, bool) {
|
||||
if perlMatcher.Match(input) {
|
||||
return "Perl", true
|
||||
} else if perl6Matcher.Match(input) {
|
||||
return "Perl6", true
|
||||
} else if substring.BytesRegexp(`^\s*%|^\s*var\s+\w+\s*:\s*\w+`).Match(input) {
|
||||
return "RenderScript", true
|
||||
} else if substring.BytesRegexp(`^\s*use\s+v6\s*;`).Match(input) {
|
||||
return "RenderScript", true
|
||||
}
|
||||
|
||||
return "Perl", false
|
||||
}
|
||||
|
||||
func rsExtLanguage(input []byte) (string, bool) {
|
||||
if substring.BytesRegexp(`(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(input) {
|
||||
return "Rust", true
|
||||
@ -281,3 +324,47 @@ func slsExtLanguage(input []byte) (string, bool) {
|
||||
|
||||
return OtherLanguage, false
|
||||
}
|
||||
|
||||
var (
|
||||
pgSQLMatcher = substring.BytesOr(
|
||||
substring.BytesRegexp(`(?i)\\i\b|AS \$\$|LANGUAGE '?plpgsql'?`),
|
||||
substring.BytesRegexp(`(?i)SECURITY (DEFINER|INVOKER)`),
|
||||
substring.BytesRegexp(`BEGIN( WORK| TRANSACTION)?;`),
|
||||
)
|
||||
db2SQLMatcher = substring.BytesOr(
|
||||
substring.BytesRegexp(`(?i)(alter module)|(language sql)|(begin( NOT)+ atomic)`),
|
||||
substring.BytesRegexp(`(?i)signal SQLSTATE '[0-9]+'`),
|
||||
)
|
||||
oracleSQLMatcher = substring.BytesOr(
|
||||
substring.BytesRegexp(`(?i)\$\$PLSQL_|XMLTYPE|sysdate|systimestamp|\.nextval|connect by|AUTHID (DEFINER|CURRENT_USER)`),
|
||||
substring.BytesRegexp(`(?i)constructor\W+function`),
|
||||
)
|
||||
)
|
||||
|
||||
func sqlExtLanguage(input []byte) (string, bool) {
|
||||
if pgSQLMatcher.Match(input) {
|
||||
return "PLpgSQL", true
|
||||
} else if db2SQLMatcher.Match(input) {
|
||||
return "SQLPL", true
|
||||
} else if oracleSQLMatcher.Match(input) {
|
||||
return "PLSQL", true
|
||||
}
|
||||
|
||||
return "SQL", false
|
||||
}
|
||||
|
||||
func tsExtLanguage(input []byte) (string, bool) {
|
||||
if substring.BytesHas("</TS>").Match(input) {
|
||||
return "XML", true
|
||||
}
|
||||
|
||||
return "TypeScript", true
|
||||
}
|
||||
|
||||
func tsxExtLanguage(input []byte) (string, bool) {
|
||||
if substring.BytesHas("</tileset>").Match(input) {
|
||||
return "XML", true
|
||||
}
|
||||
|
||||
return "TypeScript", true
|
||||
}
|
||||
|
@ -64,14 +64,20 @@ func (s *TSuite) TestGetLanguageByContentLinguistCorpus(c *C) {
|
||||
if f.Name() == "filenames" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
expected := filepath.Base(filepath.Dir(path))
|
||||
filename := filepath.Base(path)
|
||||
extension := filepath.Ext(path)
|
||||
content, _ := ioutil.ReadFile(path)
|
||||
|
||||
if extension == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
total++
|
||||
expected := filepath.Base(filepath.Dir(path))
|
||||
filename := filepath.Base(path)
|
||||
content, _ := ioutil.ReadFile(path)
|
||||
|
||||
obtained, safe := GetLanguageByContent(filename, content)
|
||||
if obtained == OtherLanguage {
|
||||
other++
|
||||
|
Loading…
x
Reference in New Issue
Block a user