mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 16:40:08 -03:00
new by content heuristics
This commit is contained in:
parent
e0a990e4ea
commit
b1a3085e44
@ -929,6 +929,8 @@ var LanguagesByExtension = map[string][]string{
|
|||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
LanguagesByExtension[".cgi"] = []string{OtherLanguage}
|
||||||
|
LanguagesByExtension[".fcgi"] = []string{OtherLanguage}
|
||||||
ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension)
|
ExtensionsByLanguage = reverseStringListMap(LanguagesByExtension)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
71
content.go
71
content.go
@ -23,27 +23,32 @@ var matchers = map[string]languageMatcher{
|
|||||||
".cls": clsExtLanguage,
|
".cls": clsExtLanguage,
|
||||||
".m": mExtLanguage,
|
".m": mExtLanguage,
|
||||||
".ms": msExtLanguage,
|
".ms": msExtLanguage,
|
||||||
|
".md": mdExtLanguage,
|
||||||
|
".fs": fsExtLanguage,
|
||||||
".h": hExtLanguage,
|
".h": hExtLanguage,
|
||||||
|
".hh": hhExtLanguage,
|
||||||
".l": lExtLanguage,
|
".l": lExtLanguage,
|
||||||
".n": nExtLanguage,
|
".n": nExtLanguage,
|
||||||
".lisp": lispExtLanguage,
|
".lisp": lispExtLanguage,
|
||||||
".lsp": lispExtLanguage,
|
".lsp": lispExtLanguage,
|
||||||
".pm": pmExtLanguage,
|
".pm": pmExtLanguage,
|
||||||
".t": pmExtLanguage,
|
".t": pmExtLanguage,
|
||||||
|
".rs": rsExtLanguage,
|
||||||
".pl": plExtLanguage,
|
".pl": plExtLanguage,
|
||||||
".pro": proExtLanguage,
|
".pro": proExtLanguage,
|
||||||
".toc": tocExtLanguage,
|
".toc": tocExtLanguage,
|
||||||
|
".sls": slsExtLanguage,
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
cPlusPlusMatcher = substring.BytesOr(
|
cPlusPlusMatcher = substring.BytesOr(
|
||||||
substring.BytesRegexp(`^\s*template\s*<`),
|
substring.BytesRegexp(`\s*template\s*<`),
|
||||||
substring.BytesRegexp(`^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>`),
|
substring.BytesRegexp(`\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>`),
|
||||||
substring.BytesRegexp(`^[ \t]*try`),
|
substring.BytesRegexp(`\n[ \t]*try`),
|
||||||
substring.BytesRegexp(`^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+`),
|
substring.BytesRegexp(`\n[ \t]*(class|(using[ \t]+)?namespace)\s+\w+`),
|
||||||
substring.BytesRegexp(`^[ \t]*(private|public|protected):$`),
|
substring.BytesRegexp(`\n[ \t]*(private|public|protected):$`),
|
||||||
substring.BytesRegexp(`std::\w+`),
|
substring.BytesRegexp(`std::\w+`),
|
||||||
substring.BytesRegexp(`^[ \t]*catch\s*`),
|
substring.BytesRegexp(`[ \t]*catch\s*`),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -55,6 +60,20 @@ func incExtLanguage(input []byte) (string, bool) {
|
|||||||
return OtherLanguage, true
|
return OtherLanguage, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fsExtLanguage(input []byte) (string, bool) {
|
||||||
|
if substring.BytesRegexp(`\n(: |new-device)`).Match(input) {
|
||||||
|
return "Forth", true
|
||||||
|
} else if substring.BytesRegexp(`\s*(#light|import|let|module|namespace|open|type)`).Match(input) {
|
||||||
|
return "F#", true
|
||||||
|
} else if substring.BytesRegexp(`(#version|precision|uniform|varying|vec[234])`).Match(input) {
|
||||||
|
return "GLSL", true
|
||||||
|
} else if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(input) {
|
||||||
|
return "Filterscript", true
|
||||||
|
}
|
||||||
|
|
||||||
|
return OtherLanguage, true
|
||||||
|
}
|
||||||
|
|
||||||
func hExtLanguage(input []byte) (string, bool) {
|
func hExtLanguage(input []byte) (string, bool) {
|
||||||
if objectiveCMatcher.Match(input) {
|
if objectiveCMatcher.Match(input) {
|
||||||
return "Objective-C", true
|
return "Objective-C", true
|
||||||
@ -65,6 +84,16 @@ func hExtLanguage(input []byte) (string, bool) {
|
|||||||
return "C", true
|
return "C", true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func hhExtLanguage(input []byte) (string, bool) {
|
||||||
|
if substring.BytesRegexp(`^<\?(?:hh)?`).Match(input) {
|
||||||
|
return "Hack", true
|
||||||
|
} else if cPlusPlusMatcher.Match(input) {
|
||||||
|
return "C++", true
|
||||||
|
}
|
||||||
|
|
||||||
|
return OtherLanguage, false
|
||||||
|
}
|
||||||
|
|
||||||
func msExtLanguage(input []byte) (string, bool) {
|
func msExtLanguage(input []byte) (string, bool) {
|
||||||
if substring.BytesRegexp(`[.'][a-z][a-z](\s|$)`).Match(input) {
|
if substring.BytesRegexp(`[.'][a-z][a-z](\s|$)`).Match(input) {
|
||||||
return "Groff", true
|
return "Groff", true
|
||||||
@ -179,6 +208,16 @@ func mExtLanguage(input []byte) (string, bool) {
|
|||||||
return OtherLanguage, false
|
return OtherLanguage, false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mdExtLanguage(input []byte) (string, bool) {
|
||||||
|
if substring.BytesRegexp(`\n[-a-z0-9=#!\*\[|]`).Match(input) {
|
||||||
|
return "Markdown", true
|
||||||
|
} else if substring.BytesRegexp(`\n(;;|\(define_)`).Match(input) {
|
||||||
|
return "GCC Machine Description", true
|
||||||
|
}
|
||||||
|
|
||||||
|
return OtherLanguage, false
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
prologMatcher = substring.BytesRegexp(`^[^#]+:-`)
|
prologMatcher = substring.BytesRegexp(`^[^#]+:-`)
|
||||||
perlMatcher = substring.BytesRegexp(`use strict|use\s+v?5\.`)
|
perlMatcher = substring.BytesRegexp(`use strict|use\s+v?5\.`)
|
||||||
@ -205,6 +244,16 @@ func pmExtLanguage(input []byte) (string, bool) {
|
|||||||
return "Perl", false
|
return "Perl", false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func rsExtLanguage(input []byte) (string, bool) {
|
||||||
|
if substring.BytesRegexp(`(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(input) {
|
||||||
|
return "Rust", true
|
||||||
|
} else if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(input) {
|
||||||
|
return "RenderScript", true
|
||||||
|
}
|
||||||
|
|
||||||
|
return OtherLanguage, false
|
||||||
|
}
|
||||||
|
|
||||||
func proExtLanguage(input []byte) (string, bool) {
|
func proExtLanguage(input []byte) (string, bool) {
|
||||||
if prologMatcher.Match(input) {
|
if prologMatcher.Match(input) {
|
||||||
return "Prolog", true
|
return "Prolog", true
|
||||||
@ -222,3 +271,13 @@ func tocExtLanguage(input []byte) (string, bool) {
|
|||||||
|
|
||||||
return OtherLanguage, false
|
return OtherLanguage, false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func slsExtLanguage(input []byte) (string, bool) {
|
||||||
|
if substring.BytesRegexp("## |@no-lib-strip@").Match(input) {
|
||||||
|
return "World of Warcraft Addon Data", true
|
||||||
|
} else if substring.BytesRegexp("(contentsline|defcounter|beamer|boolfalse)").Match(input) {
|
||||||
|
return "TeX", true
|
||||||
|
}
|
||||||
|
|
||||||
|
return OtherLanguage, false
|
||||||
|
}
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
package slinguist
|
package slinguist
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"text/tabwriter"
|
||||||
|
|
||||||
. "gopkg.in/check.v1"
|
. "gopkg.in/check.v1"
|
||||||
)
|
)
|
||||||
@ -28,6 +30,7 @@ func (s *TSuite) TestGetLanguageByContentH(c *C) {
|
|||||||
s.testGetLanguageByContent(c, "Prolog")
|
s.testGetLanguageByContent(c, "Prolog")
|
||||||
s.testGetLanguageByContent(c, "Perl")
|
s.testGetLanguageByContent(c, "Perl")
|
||||||
s.testGetLanguageByContent(c, "Perl6")
|
s.testGetLanguageByContent(c, "Perl6")
|
||||||
|
s.testGetLanguageByContent(c, "Hack")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
|
func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
|
||||||
@ -41,7 +44,6 @@ func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
content, _ := ioutil.ReadFile(file)
|
content, _ := ioutil.ReadFile(file)
|
||||||
|
|
||||||
obtained, _ := GetLanguageByContent(path.Base(file), content)
|
obtained, _ := GetLanguageByContent(path.Base(file), content)
|
||||||
if obtained == OtherLanguage {
|
if obtained == OtherLanguage {
|
||||||
continue
|
continue
|
||||||
@ -50,3 +52,51 @@ func (s *TSuite) testGetLanguageByContent(c *C, expected string) {
|
|||||||
c.Check(obtained, Equals, expected, Commentf(file))
|
c.Check(obtained, Equals, expected, Commentf(file))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *TSuite) TestGetLanguageByContentLinguistCorpus(c *C) {
|
||||||
|
var total, failed, ok, other, unsafe int
|
||||||
|
|
||||||
|
w := new(tabwriter.Writer)
|
||||||
|
w.Init(os.Stdout, 0, 8, 0, '\t', 0)
|
||||||
|
|
||||||
|
filepath.Walk(".linguist/samples", func(path string, f os.FileInfo, err error) error {
|
||||||
|
if f.IsDir() {
|
||||||
|
if f.Name() == "filenames" {
|
||||||
|
return filepath.SkipDir
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
total++
|
||||||
|
expected := filepath.Base(filepath.Dir(path))
|
||||||
|
filename := filepath.Base(path)
|
||||||
|
content, _ := ioutil.ReadFile(path)
|
||||||
|
|
||||||
|
obtained, safe := GetLanguageByContent(filename, content)
|
||||||
|
if obtained == OtherLanguage {
|
||||||
|
other++
|
||||||
|
}
|
||||||
|
|
||||||
|
var status string
|
||||||
|
if expected == obtained {
|
||||||
|
status = "ok"
|
||||||
|
ok++
|
||||||
|
} else {
|
||||||
|
status = "failed"
|
||||||
|
failed++
|
||||||
|
if !safe {
|
||||||
|
unsafe++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(w, "%s\t%s\t%s\t%v\t%s\n", filename, expected, obtained, safe, status)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
|
fmt.Fprintln(w)
|
||||||
|
w.Flush()
|
||||||
|
|
||||||
|
fmt.Printf("total files: %d, ok: %d, failed: %d, unsafe: %d, other: %d\n", total, ok, failed, unsafe, other)
|
||||||
|
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user