new by content heuristics

This commit is contained in:
Máximo Cuadros 2016-07-14 15:14:32 +02:00
parent e0a990e4ea
commit b1a3085e44
3 changed files with 118 additions and 7 deletions

View File

@ -929,6 +929,8 @@ var LanguagesByExtension = map[string][]string{
} }
func init() { func init() {
LanguagesByExtension[".cgi"] = []string{OtherLanguage}
LanguagesByExtension[".fcgi"] = []string{OtherLanguage}
ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension) ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension)
} }

View File

@ -23,27 +23,32 @@ var matchers = map[string]languageMatcher{
".cls": clsExtLanguage, ".cls": clsExtLanguage,
".m": mExtLanguage, ".m": mExtLanguage,
".ms": msExtLanguage, ".ms": msExtLanguage,
".md": mdExtLanguage,
".fs": fsExtLanguage,
".h": hExtLanguage, ".h": hExtLanguage,
".hh": hhExtLanguage,
".l": lExtLanguage, ".l": lExtLanguage,
".n": nExtLanguage, ".n": nExtLanguage,
".lisp": lispExtLanguage, ".lisp": lispExtLanguage,
".lsp": lispExtLanguage, ".lsp": lispExtLanguage,
".pm": pmExtLanguage, ".pm": pmExtLanguage,
".t": pmExtLanguage, ".t": pmExtLanguage,
".rs": rsExtLanguage,
".pl": plExtLanguage, ".pl": plExtLanguage,
".pro": proExtLanguage, ".pro": proExtLanguage,
".toc": tocExtLanguage, ".toc": tocExtLanguage,
".sls": slsExtLanguage,
} }
var ( var (
cPlusPlusMatcher = substring.BytesOr( cPlusPlusMatcher = substring.BytesOr(
substring.BytesRegexp(`^\s*template\s*<`), substring.BytesRegexp(`\s*template\s*<`),
substring.BytesRegexp(`^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>`), substring.BytesRegexp(`\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>`),
substring.BytesRegexp(`^[ \t]*try`), substring.BytesRegexp(`\n[ \t]*try`),
substring.BytesRegexp(`^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+`), substring.BytesRegexp(`\n[ \t]*(class|(using[ \t]+)?namespace)\s+\w+`),
substring.BytesRegexp(`^[ \t]*(private|public|protected):$`), substring.BytesRegexp(`\n[ \t]*(private|public|protected):$`),
substring.BytesRegexp(`std::\w+`), substring.BytesRegexp(`std::\w+`),
substring.BytesRegexp(`^[ \t]*catch\s*`), substring.BytesRegexp(`[ \t]*catch\s*`),
) )
) )
@ -55,6 +60,20 @@ func incExtLanguage(input []byte) (string, bool) {
return OtherLanguage, true return OtherLanguage, true
} }
// fsExtLanguage disambiguates the content of a ".fs" file between Forth, F#,
// GLSL and Filterscript. The patterns are tried in that order and the first
// one that matches decides the result.
//
// It returns the chosen language name together with true. When no pattern
// matches it returns OtherLanguage, also together with true.
// NOTE(review): most other matchers in this file return false on their
// fallback path; confirm that true is intended here.
func fsExtLanguage(input []byte) (string, bool) {
	switch {
	case substring.BytesRegexp(`\n(: |new-device)`).Match(input):
		return "Forth", true
	case substring.BytesRegexp(`\s*(#light|import|let|module|namespace|open|type)`).Match(input):
		return "F#", true
	case substring.BytesRegexp(`(#version|precision|uniform|varying|vec[234])`).Match(input):
		return "GLSL", true
	case substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(input):
		return "Filterscript", true
	}

	return OtherLanguage, true
}
func hExtLanguage(input []byte) (string, bool) { func hExtLanguage(input []byte) (string, bool) {
if objectiveCMatcher.Match(input) { if objectiveCMatcher.Match(input) {
return "Objective-C", true return "Objective-C", true
@ -65,6 +84,16 @@ func hExtLanguage(input []byte) (string, bool) {
return "C", true return "C", true
} }
// hhExtLanguage disambiguates the content of a ".hh" file between Hack and
// C++. The Hack opening-tag pattern is checked first, then cPlusPlusMatcher.
//
// It returns the chosen language name and true on a match, or OtherLanguage
// and false when neither check matches.
func hhExtLanguage(input []byte) (string, bool) {
	if substring.BytesRegexp(`^<\?(?:hh)?`).Match(input) {
		return "Hack", true
	}

	if cPlusPlusMatcher.Match(input) {
		return "C++", true
	}

	return OtherLanguage, false
}
func msExtLanguage(input []byte) (string, bool) { func msExtLanguage(input []byte) (string, bool) {
if substring.BytesRegexp(`[.'][a-z][a-z](\s|$)`).Match(input) { if substring.BytesRegexp(`[.'][a-z][a-z](\s|$)`).Match(input) {
return "Groff", true return "Groff", true
@ -179,6 +208,16 @@ func mExtLanguage(input []byte) (string, bool) {
return OtherLanguage, false return OtherLanguage, false
} }
// mdExtLanguage disambiguates the content of a ".md" file between Markdown
// and GCC Machine Description. The Markdown pattern is tried first.
//
// It returns the chosen language name and true on a match, or OtherLanguage
// and false when neither pattern matches.
func mdExtLanguage(input []byte) (string, bool) {
	switch {
	case substring.BytesRegexp(`\n[-a-z0-9=#!\*\[|]`).Match(input):
		return "Markdown", true
	case substring.BytesRegexp(`\n(;;|\(define_)`).Match(input):
		return "GCC Machine Description", true
	}

	return OtherLanguage, false
}
var ( var (
prologMatcher = substring.BytesRegexp(`^[^#]+:-`) prologMatcher = substring.BytesRegexp(`^[^#]+:-`)
perlMatcher = substring.BytesRegexp(`use strict|use\s+v?5\.`) perlMatcher = substring.BytesRegexp(`use strict|use\s+v?5\.`)
@ -205,6 +244,16 @@ func pmExtLanguage(input []byte) (string, bool) {
return "Perl", false return "Perl", false
} }
// rsExtLanguage disambiguates the content of a ".rs" file between Rust and
// RenderScript. The Rust pattern is tried first.
//
// It returns the chosen language name and true on a match, or OtherLanguage
// and false when neither pattern matches.
func rsExtLanguage(input []byte) (string, bool) {
	if substring.BytesRegexp(`(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(input) {
		return "Rust", true
	}

	if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(input) {
		return "RenderScript", true
	}

	return OtherLanguage, false
}
func proExtLanguage(input []byte) (string, bool) { func proExtLanguage(input []byte) (string, bool) {
if prologMatcher.Match(input) { if prologMatcher.Match(input) {
return "Prolog", true return "Prolog", true
@ -222,3 +271,13 @@ func tocExtLanguage(input []byte) (string, bool) {
return OtherLanguage, false return OtherLanguage, false
} }
// slsExtLanguage disambiguates the content of a ".sls" file. It reports
// "World of Warcraft Addon Data" when the first pattern matches and "TeX"
// when the second one does.
//
// It returns the chosen language name and true on a match, or OtherLanguage
// and false when neither pattern matches.
// NOTE(review): these two candidate languages look unusual for the ".sls"
// extension; verify the patterns and names against the upstream heuristics.
func slsExtLanguage(input []byte) (string, bool) {
	switch {
	case substring.BytesRegexp("## |@no-lib-strip@").Match(input):
		return "World of Warcraft Addon Data", true
	case substring.BytesRegexp("(contentsline|defcounter|beamer|boolfalse)").Match(input):
		return "TeX", true
	}

	return OtherLanguage, false
}

View File

@ -1,10 +1,12 @@
package slinguist package slinguist
import ( import (
"fmt"
"io/ioutil" "io/ioutil"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
"text/tabwriter"
. "gopkg.in/check.v1" . "gopkg.in/check.v1"
) )
@ -28,6 +30,7 @@ func (s *TSuite) TestGetLanguageByContentH(c *C) {
s.testGetLanguageByContent(c, "Prolog") s.testGetLanguageByContent(c, "Prolog")
s.testGetLanguageByContent(c, "Perl") s.testGetLanguageByContent(c, "Perl")
s.testGetLanguageByContent(c, "Perl6") s.testGetLanguageByContent(c, "Perl6")
s.testGetLanguageByContent(c, "Hack")
} }
func (s *TSuite) testGetLanguageByContent(c *C, expected string) { func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
@ -41,7 +44,6 @@ func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
} }
content, _ := ioutil.ReadFile(file) content, _ := ioutil.ReadFile(file)
obtained, _ := GetLanguageByContent(path.Base(file), content) obtained, _ := GetLanguageByContent(path.Base(file), content)
if obtained == OtherLanguage { if obtained == OtherLanguage {
continue continue
@ -50,3 +52,51 @@ func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
c.Check(obtained, Equals, expected, Commentf(file)) c.Check(obtained, Equals, expected, Commentf(file))
} }
} }
// TestGetLanguageByContentLinguistCorpus runs GetLanguageByContent over every
// file found under .linguist/samples, skipping directories named "filenames".
// The name of the directory containing each file is taken as the expected
// language.
//
// A tab-separated line per file (filename, expected, obtained, safe, status)
// is written to stdout, followed by a summary of the counters. Detection
// mismatches are only reported, not asserted; the test fails only when the
// corpus cannot be walked or a sample cannot be read.
func (s *TSuite) TestGetLanguageByContentLinguistCorpus(c *C) {
	var total, failed, ok, other, unsafe int

	w := new(tabwriter.Writer)
	w.Init(os.Stdout, 0, 8, 0, '\t', 0)

	// The callback parameter is called file so it does not shadow the
	// imported "path" package.
	walkErr := filepath.Walk(".linguist/samples", func(file string, f os.FileInfo, err error) error {
		// Walk reports stat and directory-read failures through err, and f
		// can be nil in that case (for example when the samples directory
		// is missing), so stop before dereferencing it.
		if err != nil {
			return err
		}

		if f.IsDir() {
			if f.Name() == "filenames" {
				return filepath.SkipDir
			}

			return nil
		}

		content, err := ioutil.ReadFile(file)
		if err != nil {
			return err
		}

		total++
		expected := filepath.Base(filepath.Dir(file))
		filename := filepath.Base(file)

		obtained, safe := GetLanguageByContent(filename, content)
		if obtained == OtherLanguage {
			other++
		}

		var status string
		if expected == obtained {
			status = "ok"
			ok++
		} else {
			status = "failed"
			failed++
			if !safe {
				unsafe++
			}
		}

		fmt.Fprintf(w, "%s\t%s\t%s\t%v\t%s\n", filename, expected, obtained, safe, status)
		return nil
	})

	// Flush whatever was collected before reporting a walk failure.
	fmt.Fprintln(w)
	w.Flush()

	fmt.Printf("total files: %d, ok: %d, failed: %d, unsafe: %d, other: %d\n", total, ok, failed, unsafe, other)

	c.Assert(walkErr, IsNil)
}