diff --git a/common.go b/common.go
index 55bb7d9..c171f9e 100644
--- a/common.go
+++ b/common.go
@@ -929,13 +929,18 @@ var LanguagesByExtension = map[string][]string{
 }
 
 func init() {
-	LanguagesByExtension[".cgi"] = []string{OtherLanguage}
-	LanguagesByExtension[".fcgi"] = []string{OtherLanguage}
+	for _, ext := range ignoredExtensions {
+		LanguagesByExtension[ext] = []string{OtherLanguage}
+	}
+
 	ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension)
 }
 
 var ExtensionsByLanguage map[string][]string
 
+// ignoredExtensions lists the extensions that init forces to OtherLanguage.
+var ignoredExtensions = []string{".asc", ".cgi", ".fcgi", ".gml", ".vhost"}
+
 func GetLanguageExtensions(language string) []string {
 	return ExtensionsByLanguage[language]
 }
diff --git a/content.go b/content.go
index a6523f1..ac5af03 100644
--- a/content.go
+++ b/content.go
@@ -18,6 +18,8 @@ func GetLanguageByContent(filename string, content []byte) (lang string, safe bo
 type languageMatcher func([]byte) (string, bool)
 
 var matchers = map[string]languageMatcher{
+	".bf":   bfExtLanguage,
+	".b":    bExtLanguage,
 	".cl":   clExtLanguage,
 	".inc":  incExtLanguage,
 	".cls":  clsExtLanguage,
@@ -32,12 +34,15 @@ var matchers = map[string]languageMatcher{
 	".lisp": lispExtLanguage,
 	".lsp":  lispExtLanguage,
 	".pm":   pmExtLanguage,
-	".t":    pmExtLanguage,
+	".t":    tExtLanguage,
+	".ts":   tsExtLanguage,
+	".tsx":  tsxExtLanguage,
 	".rs":   rsExtLanguage,
 	".pl":   plExtLanguage,
 	".pro":  proExtLanguage,
 	".toc":  tocExtLanguage,
 	".sls":  slsExtLanguage,
+	".sql":  sqlExtLanguage,
 }
 
 var (
@@ -52,6 +57,26 @@ var (
 	)
 )
 
+// bExtLanguage disambiguates ".b" files: content containing "include" or
+// "modules" is Limbo; otherwise Brainfuck is returned as an unsafe guess.
+func bExtLanguage(input []byte) (string, bool) {
+	if substring.BytesRegexp(`(include|modules)`).Match(input) {
+		return "Limbo", true
+	}
+
+	return "Brainfuck", false
+}
+
+// bfExtLanguage disambiguates ".bf" files: content containing "fprintf",
+// "function" or "return" is HyPhy; otherwise Brainfuck is an unsafe guess.
+func bfExtLanguage(input []byte) (string, bool) {
+	if substring.BytesRegexp(`(fprintf|function|return)`).Match(input) {
+		return "HyPhy", true
+	}
+
+	return "Brainfuck", false
+}
+
 func incExtLanguage(input []byte) (string, bool) {
 	if substring.BytesRegexp(`^<\?(?:php)?`).Match(input) {
 		return "PHP", true
@@ -190,7 +215,8 @@ func clsExtLanguage(input []byte) (string, bool) {
 }
 
 var (
-	mathematicaMatcher = substring.BytesHas(`\s*\(\*`)
+	// BytesHas matches its pattern literally, so the regexp needs BytesRegexp.
+	mathematicaMatcher = substring.BytesRegexp(`\n\s*\(\*`)
 	matlabMatcher      = substring.BytesRegexp(`\b(function\s*[\[a-zA-Z]+|pcolor|classdef|figure|end|elseif)\b`)
 	objectiveCMatcher  = substring.BytesRegexp(
 		`@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">]`)
@@ -199,10 +225,18 @@ var (
 func mExtLanguage(input []byte) (string, bool) {
 	if objectiveCMatcher.Match(input) {
 		return "Objective-C", true
-	} else if matlabMatcher.Match(input) {
-		return "Matlab", true
+	} else if substring.BytesHas(`:- module`).Match(input) {
+		return "Mercury", true
+	} else if substring.BytesRegexp(`\n: `).Match(input) {
+		return "MUF", true
+	} else if substring.BytesRegexp(`^\s*;`).Match(input) {
+		return "M", true
 	} else if mathematicaMatcher.Match(input) {
 		return "Mathematica", true
+	} else if matlabMatcher.Match(input) {
+		return "Matlab", true
+	} else if substring.BytesRegexp(`^\w+\s*:\s*module\s*{`).Match(input) {
+		return "Limbo", true
 	}
 
 	return OtherLanguage, false
@@ -244,6 +278,22 @@ func pmExtLanguage(input []byte) (string, bool) {
 	return "Perl", false
 }
 
+// tExtLanguage disambiguates ".t" files between Perl, Perl6 and Turing,
+// falling back to an unsafe Perl guess when no matcher fires.
+func tExtLanguage(input []byte) (string, bool) {
+	if perlMatcher.Match(input) {
+		return "Perl", true
+	} else if perl6Matcher.Match(input) {
+		return "Perl6", true
+	} else if substring.BytesRegexp(`^\s*%|^\s*var\s+\w+\s*:\s*\w+`).Match(input) {
+		return "Turing", true
+	} else if substring.BytesRegexp(`^\s*use\s+v6\s*;`).Match(input) {
+		return "Perl6", true
+	}
+
+	return "Perl", false
+}
+
 func rsExtLanguage(input []byte) (string, bool) {
 	if substring.BytesRegexp(`(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(input) {
 		return "Rust", true
@@ -281,3 +331,54 @@ func slsExtLanguage(input []byte) (string, bool) {
 
 	return OtherLanguage, false
 }
+
+// Matchers used by sqlExtLanguage to tell SQL dialects apart.
+var (
+	pgSQLMatcher = substring.BytesOr(
+		substring.BytesRegexp(`(?i)\\i\b|AS \$\$|LANGUAGE '?plpgsql'?`),
+		substring.BytesRegexp(`(?i)SECURITY (DEFINER|INVOKER)`),
+		substring.BytesRegexp(`BEGIN( WORK| TRANSACTION)?;`),
+	)
+	db2SQLMatcher = substring.BytesOr(
+		substring.BytesRegexp(`(?i)(alter module)|(language sql)|(begin( NOT)+ atomic)`),
+		substring.BytesRegexp(`(?i)signal SQLSTATE '[0-9]+'`),
+	)
+	oracleSQLMatcher = substring.BytesOr(
+		substring.BytesRegexp(`(?i)\$\$PLSQL_|XMLTYPE|sysdate|systimestamp|\.nextval|connect by|AUTHID (DEFINER|CURRENT_USER)`),
+		substring.BytesRegexp(`(?i)constructor\W+function`),
+	)
+)
+
+// sqlExtLanguage disambiguates ".sql" files between PLpgSQL, SQLPL and
+// PLSQL, falling back to an unsafe SQL guess when no matcher fires.
+func sqlExtLanguage(input []byte) (string, bool) {
+	if pgSQLMatcher.Match(input) {
+		return "PLpgSQL", true
+	} else if db2SQLMatcher.Match(input) {
+		return "SQLPL", true
+	} else if oracleSQLMatcher.Match(input) {
+		return "PLSQL", true
+	}
+
+	return "SQL", false
+}
+
+// tsExtLanguage disambiguates ".ts" files: content with a "<TS" tag is XML;
+// anything else is reported as TypeScript.
+func tsExtLanguage(input []byte) (string, bool) {
+	if substring.BytesRegexp(`<TS\b`).Match(input) {
+		return "XML", true
+	}
+
+	return "TypeScript", true
+}
+
+// tsxExtLanguage disambiguates ".tsx" files: content opening with an XML
+// declaration is XML; anything else is reported as TypeScript.
+func tsxExtLanguage(input []byte) (string, bool) {
+	if substring.BytesRegexp(`(?i)^\s*<\?xml\s+version`).Match(input) {
+		return "XML", true
+	}
+
+	return "TypeScript", true
+}
diff --git a/content_test.go b/content_test.go
index fb86275..7662fcb 100644
--- a/content_test.go
+++ b/content_test.go
@@ -64,14 +64,24 @@ func (s *TSuite) TestGetLanguageByContentLinguistCorpus(c *C) {
 			if f.Name() == "filenames" {
 				return filepath.SkipDir
 			}
+
 			return nil
 		}
 
+		// Skip files that have no extension before reading them.
+		extension := filepath.Ext(path)
+		if extension == "" {
+			return nil
+		}
+
+		expected := filepath.Base(filepath.Dir(path))
+		filename := filepath.Base(path)
+		content, err := ioutil.ReadFile(path)
+		if err != nil {
+			return err
+		}
+
 		total++
-		expected := filepath.Base(filepath.Dir(path))
-		filename := filepath.Base(path)
-		content, _ := ioutil.ReadFile(path)
-
 		obtained, safe := GetLanguageByContent(filename, content)
 		if obtained == OtherLanguage {
 			other++