mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 00:20:09 -03:00
changed signatures for strategies
This commit is contained in:
parent
5f0e92b1a8
commit
beda5b73e7
@ -3,14 +3,15 @@ package enry
|
||||
import (
|
||||
"math"
|
||||
|
||||
"sort"
|
||||
|
||||
"gopkg.in/src-d/enry.v1/internal/tokenizer"
|
||||
)
|
||||
|
||||
// Classifier is the interface that contains the method Classify which is in charge to assign scores to the possibles candidates.
|
||||
// The scores must order the candidates so as the highest score be the most probably language of the content. The candidates is
|
||||
// a map which can be used to assign weights to languages dynamically.
|
||||
// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of
|
||||
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
|
||||
type Classifier interface {
|
||||
Classify(content []byte, candidates map[string]float64) map[string]float64
|
||||
Classify(content []byte, candidates map[string]float64) (languages []string)
|
||||
}
|
||||
|
||||
type classifier struct {
|
||||
@ -19,7 +20,13 @@ type classifier struct {
|
||||
tokensTotal float64
|
||||
}
|
||||
|
||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
|
||||
// scoredLanguage pairs a language name with the score computed for it by the classifier.
// sortLanguagesByScore orders these entries from the highest score to the lowest.
type scoredLanguage struct {
|
||||
language string
|
||||
score float64
|
||||
}
|
||||
|
||||
// Classify returns the possible languages for content, sorted by decreasing probability.
|
||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
|
||||
if len(content) == 0 {
|
||||
return nil
|
||||
}
|
||||
@ -39,12 +46,27 @@ func (c *classifier) Classify(content []byte, candidates map[string]float64) map
|
||||
}
|
||||
|
||||
tokens := tokenizer.Tokenize(content)
|
||||
scores := make(map[string]float64, len(languages))
|
||||
scoredLangs := make([]*scoredLanguage, 0, len(languages))
|
||||
for language := range languages {
|
||||
scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
|
||||
scoredLang := &scoredLanguage{
|
||||
language: language,
|
||||
score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language],
|
||||
}
|
||||
|
||||
scoredLangs = append(scoredLangs, scoredLang)
|
||||
}
|
||||
|
||||
return scores
|
||||
return sortLanguagesByScore(scoredLangs)
|
||||
}
|
||||
|
||||
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
|
||||
sort.SliceStable(scoredLangs, func(i, j int) bool { return scoredLangs[j].score < scoredLangs[i].score })
|
||||
sortedLanguages := make([]string, 0, len(scoredLangs))
|
||||
for _, scoredLang := range scoredLangs {
|
||||
sortedLanguages = append(sortedLanguages, scoredLang.language)
|
||||
}
|
||||
|
||||
return sortedLanguages
|
||||
}
|
||||
|
||||
func (c *classifier) knownLangs() map[string]float64 {
|
||||
|
393
common.go
393
common.go
@ -1,83 +1,333 @@
|
||||
package enry
|
||||
|
||||
import (
|
||||
"math"
|
||||
"bufio"
|
||||
"bytes"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// OtherLanguage is used as a zero value when a function can not return a specific language.
|
||||
const OtherLanguage = "Other"
|
||||
const OtherLanguage = ""
|
||||
|
||||
// Strategy is the signature that every function used as a detection strategy must have.
|
||||
type Strategy func(filename string, content []byte) (languages []string)
|
||||
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
|
||||
|
||||
var strategies = []Strategy{
|
||||
// DefaultStrategies is the strategies' sequence GetLanguage uses to detect languages.
|
||||
var DefaultStrategies = []Strategy{
|
||||
GetLanguagesByModeline,
|
||||
GetLanguagesByFilename,
|
||||
GetLanguagesByShebang,
|
||||
GetLanguagesByExtension,
|
||||
GetLanguagesByContent,
|
||||
GetLanguagesByClassifier,
|
||||
}
|
||||
|
||||
// GetLanguage applies a sequence of strategies based on the given filename and content
|
||||
// to find out the most probable language to return.
|
||||
func GetLanguage(filename string, content []byte) string {
|
||||
candidates := map[string]float64{}
|
||||
for _, strategy := range strategies {
|
||||
languages := strategy(filename, content)
|
||||
var languages []string
|
||||
candidates := []string{}
|
||||
for _, strategy := range DefaultStrategies {
|
||||
languages = strategy(filename, content, candidates)
|
||||
if len(languages) == 1 {
|
||||
return languages[0]
|
||||
}
|
||||
|
||||
if len(languages) > 0 {
|
||||
for _, language := range languages {
|
||||
candidates[language]++
|
||||
candidates = append(candidates, languages...)
|
||||
}
|
||||
}
|
||||
|
||||
return firstLanguage(languages)
|
||||
}
|
||||
|
||||
func firstLanguage(languages []string) string {
|
||||
if len(languages) == 0 {
|
||||
return OtherLanguage
|
||||
}
|
||||
|
||||
return languages[0]
|
||||
}
|
||||
|
||||
func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
|
||||
languages := strategy(filename, content, candidates)
|
||||
return getFirstLanguageAndSafe(languages)
|
||||
}
|
||||
|
||||
func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
|
||||
language = firstLanguage(languages)
|
||||
safe = len(languages) == 1
|
||||
return
|
||||
}
|
||||
|
||||
// GetLanguageByModeline returns the first language reported by GetLanguagesByModeline for content, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
|
||||
func GetLanguageByModeline(content []byte) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByEmacsModeline returns the first language reported by GetLanguagesByEmacsModeline for content, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
|
||||
func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByVimModeline returns the first language reported by GetLanguagesByVimModeline for content, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
|
||||
func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByFilename returns the first language reported by GetLanguagesByFilename for filename, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
|
||||
func GetLanguageByFilename(filename string) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByShebang returns the first language reported by GetLanguagesByShebang for content, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
|
||||
func GetLanguageByShebang(content []byte) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByExtension returns the first language reported by GetLanguagesByExtension for filename, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
|
||||
func GetLanguageByExtension(filename string) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByContent returns the first language reported by GetLanguagesByContent for content, or
|
||||
// OtherLanguage when none is detected. safe is true only when exactly one language was detected.
// NOTE(review): an empty filename is passed on, while GetLanguagesByContent picks its matcher from the
// filename extension - confirm this wrapper can actually detect anything.
|
||||
func GetLanguageByContent(content []byte) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByContent, "", content, nil)
|
||||
}
|
||||
|
||||
// GetLanguageByClassifier returns the first language reported by GetLanguagesByClassifier for content and
|
||||
// candidates, or OtherLanguage when none is reported. safe is true only when exactly one language was reported.
|
||||
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
|
||||
return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
|
||||
}
|
||||
|
||||
// GetLanguageBySpecificClassifier returns the most probably language for the given content using
|
||||
// classifier to detect language.
|
||||
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
|
||||
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
|
||||
return getFirstLanguageAndSafe(languages)
|
||||
}
|
||||
|
||||
// GetLanguagesByModeline returns a slice of possible languages for the given content, filename will be ignored.
|
||||
// It is comply with the signature to be a Strategy type.
|
||||
func GetLanguagesByModeline(filename string, content []byte, candidates []string) []string {
|
||||
headFoot := getHeaderAndFooter(content)
|
||||
var languages []string
|
||||
for _, getLang := range modelinesFunc {
|
||||
languages = getLang("", headFoot, candidates)
|
||||
if len(languages) > 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return languages
|
||||
}
|
||||
|
||||
// modelinesFunc lists the modeline strategies in the order GetLanguagesByModeline tries them.
var modelinesFunc = []Strategy{
|
||||
GetLanguagesByEmacsModeline,
|
||||
GetLanguagesByVimModeline,
|
||||
}
|
||||
|
||||
func getHeaderAndFooter(content []byte) []byte {
|
||||
const searchScope = 5
|
||||
if bytes.Count(content, []byte("\n")) < 2*searchScope {
|
||||
return content
|
||||
}
|
||||
|
||||
header := headScope(content, searchScope)
|
||||
footer := footScope(content, searchScope)
|
||||
headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
|
||||
headerAndFooter = append(headerAndFooter, content[:header]...)
|
||||
headerAndFooter = append(headerAndFooter, content[footer:]...)
|
||||
return headerAndFooter
|
||||
}
|
||||
|
||||
// headScope returns the index in content of the newline that terminates its scope-th line, so
// content[:index] holds the first scope lines without the last line terminator.
//
// When content has fewer than scope newlines the whole content is the header and len(content)
// is returned; a non-positive scope yields 0 (empty header). The original loop ignored a
// failed newline search and produced a meaningless index in those cases.
func headScope(content []byte, scope int) (index int) {
	if scope <= 0 {
		return 0
	}

	for i := 0; i < scope; i++ {
		eol := bytes.IndexByte(content[index:], '\n')
		if eol < 0 {
			return len(content)
		}
		// Step past the newline just found so the next search starts on the following line.
		index += eol + 1
	}

	// index now points one past the scope-th newline; report the newline itself.
	return index - 1
}
|
||||
|
||||
// footScope returns the index in content just past the scope-th newline counted from the end,
// so content[index:] holds the trailing part that follows that newline.
//
// When content has fewer than scope newlines the whole content is the footer and 0 is returned;
// the original sliced with the -1 of a failed search and panicked. A non-positive scope yields
// len(content) (empty footer).
func footScope(content []byte, scope int) (index int) {
	index = len(content)
	if scope <= 0 {
		return index
	}

	for i := 0; i < scope; i++ {
		// Search only before the previously found newline.
		index = bytes.LastIndexByte(content[:index], '\n')
		if index < 0 {
			return 0
		}
	}

	return index + 1
}
|
||||
|
||||
// Regular expressions used by the modeline strategies.
var (
|
||||
// reEmacsModeline captures the text enclosed between a pair of "-*-" markers.
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
|
||||
// reEmacsLang captures the value of a "mode:" entry; "mode" is matched case-insensitively.
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
|
||||
// reVimModeline captures the rest of the line following a "vi:", "vim:" (optionally versioned) or "ex:" prefix.
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
|
||||
// reVimLang captures the word assigned to a filetype, ft or syntax option.
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
|
||||
)
|
||||
|
||||
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content, filename and candidates
|
||||
// will be ignored. It is comply with the signature to be a Strategy type.
|
||||
func GetLanguagesByEmacsModeline(filename string, content []byte, candidates []string) []string {
|
||||
matched := reEmacsModeline.FindAllSubmatch(content, -1)
|
||||
if matched == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// only take the last matched line, discard previous lines
|
||||
lastLineMatched := matched[len(matched)-1][1]
|
||||
matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
|
||||
var alias string
|
||||
if matchedAlias != nil {
|
||||
alias = string(matchedAlias[1])
|
||||
} else {
|
||||
alias = string(lastLineMatched)
|
||||
}
|
||||
|
||||
language, ok := GetLanguageByAlias(alias)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
return []string{language}
|
||||
}
|
||||
|
||||
// GetLanguagesByVimModeline returns a slice of possible languages for the given content; filename and candidates
|
||||
// are ignored. It complies with the Strategy signature.
|
||||
func GetLanguagesByVimModeline(filename string, content []byte, candidates []string) []string {
|
||||
matched := reVimModeline.FindAllSubmatch(content, -1)
|
||||
if matched == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// only take the last matched line, discard previous lines
|
||||
lastLineMatched := matched[len(matched)-1][1]
|
||||
matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
|
||||
if matchedAlias == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
alias := string(matchedAlias[0][1])
|
||||
if len(matchedAlias) > 1 {
|
||||
// cases:
|
||||
// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
|
||||
// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
|
||||
for _, match := range matchedAlias {
|
||||
otherAlias := string(match[1])
|
||||
if otherAlias != alias {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(candidates) == 0 {
|
||||
return OtherLanguage
|
||||
language, ok := GetLanguageByAlias(alias)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
lang := GetLanguageByClassifier(content, candidates, nil)
|
||||
return lang
|
||||
return []string{language}
|
||||
}
|
||||
|
||||
// GetLanguageByModeline returns the language of the given content looking for the modeline,
|
||||
// and safe to indicate the sureness of returned language.
|
||||
func GetLanguageByModeline(content []byte) (lang string, safe bool) {
|
||||
return getLangAndSafe("", content, GetLanguagesByModeline)
|
||||
}
|
||||
|
||||
// GetLanguageByFilename returns a language based on the given filename, and safe to indicate
|
||||
// the sureness of returned language.
|
||||
func GetLanguageByFilename(filename string) (lang string, safe bool) {
|
||||
return getLangAndSafe(filename, nil, GetLanguagesByFilename)
|
||||
}
|
||||
|
||||
// GetLanguagesByFilename returns a slice of possible languages for the given filename, content will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByFilename(filename string, content []byte) []string {
|
||||
// GetLanguagesByFilename returns the languages registered in languagesByFilename for the given filename;
|
||||
// content and candidates are ignored. It complies with the Strategy signature.
|
||||
func GetLanguagesByFilename(filename string, content []byte, candidates []string) []string {
|
||||
return languagesByFilename[filename]
|
||||
}
|
||||
|
||||
// GetLanguageByShebang returns the language of the given content looking for the shebang line,
|
||||
// and safe to indicate the sureness of returned language.
|
||||
func GetLanguageByShebang(content []byte) (lang string, safe bool) {
|
||||
return getLangAndSafe("", content, GetLanguagesByShebang)
|
||||
// GetLanguagesByShebang returns a slice of possible languages for the given content, filename and candidates
|
||||
// will be ignored. It is comply with the signature to be a Strategy type.
|
||||
func GetLanguagesByShebang(filename string, content []byte, candidates []string) (languages []string) {
|
||||
interpreter := getInterpreter(content)
|
||||
return languagesByInterpreter[interpreter]
|
||||
}
|
||||
|
||||
// GetLanguageByExtension returns a language based on the given filename, and safe to indicate
|
||||
// the sureness of returned language.
|
||||
func GetLanguageByExtension(filename string) (lang string, safe bool) {
|
||||
return getLangAndSafe(filename, nil, GetLanguagesByExtension)
|
||||
// Regular expressions used while resolving the interpreter of a shebang line.
var (
|
||||
// shebangExecHack captures the word following "exec " on a line that also contains $0 and, later, $@.
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
|
||||
// pythonVersion matches "python" followed by a digit, a dot and one or more digits (e.g. "python2.7").
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
|
||||
)
|
||||
|
||||
func getInterpreter(data []byte) (interpreter string) {
|
||||
line := getFirstLine(data)
|
||||
if !hasShebang(line) {
|
||||
return ""
|
||||
}
|
||||
|
||||
// skip shebang
|
||||
line = bytes.TrimSpace(line[2:])
|
||||
|
||||
splitted := bytes.Fields(line)
|
||||
if bytes.Contains(splitted[0], []byte("env")) {
|
||||
if len(splitted) > 1 {
|
||||
interpreter = string(splitted[1])
|
||||
}
|
||||
} else {
|
||||
|
||||
splittedPath := bytes.Split(splitted[0], []byte{'/'})
|
||||
interpreter = string(splittedPath[len(splittedPath)-1])
|
||||
}
|
||||
|
||||
if interpreter == "sh" {
|
||||
interpreter = lookForMultilineExec(data)
|
||||
}
|
||||
|
||||
if pythonVersion.MatchString(interpreter) {
|
||||
interpreter = interpreter[:strings.Index(interpreter, `.`)]
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// GetLanguagesByExtension returns a slice of possible languages for the given filename, content will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByExtension(filename string, content []byte) []string {
|
||||
// getFirstLine returns the first line of data without its line terminator ("\n" or "\r\n").
// It returns nil for empty data.
//
// The line is located with a single byte search instead of a bufio.Scanner, so there is no
// per-call scan buffer and no failure on first lines longer than the scanner's token limit.
// The result shares memory with data (its capacity is capped at its length); callers must
// treat it as read-only.
func getFirstLine(data []byte) []byte {
	if len(data) == 0 {
		return nil
	}

	line := data
	if eol := bytes.IndexByte(data, '\n'); eol >= 0 {
		line = data[:eol]
	}

	// Same convention as bufio.ScanLines: drop one carriage return preceding the newline.
	line = bytes.TrimSuffix(line, []byte("\r"))
	return line[:len(line):len(line)]
}
|
||||
|
||||
// hasShebang reports whether line begins with the "#!" marker.
func hasShebang(line []byte) bool {
	return bytes.HasPrefix(line, []byte("#!"))
}
|
||||
|
||||
func lookForMultilineExec(data []byte) string {
|
||||
const magicNumOfLines = 5
|
||||
interpreter := "sh"
|
||||
|
||||
buf := bufio.NewScanner(bytes.NewReader(data))
|
||||
for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
|
||||
line := buf.Bytes()
|
||||
if shebangExecHack.Match(line) {
|
||||
interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if err := buf.Err(); err != nil {
|
||||
return interpreter
|
||||
}
|
||||
|
||||
return interpreter
|
||||
}
|
||||
|
||||
// GetLanguagesByExtension returns a slice of possible languages for the given filename; content and candidates
|
||||
// are ignored. It complies with the Strategy signature.
|
||||
func GetLanguagesByExtension(filename string, content []byte, candidates []string) []string {
|
||||
if !strings.Contains(filename, ".") {
|
||||
return nil
|
||||
}
|
||||
@ -106,15 +356,9 @@ func getDotIndexes(filename string) []int {
|
||||
return dots
|
||||
}
|
||||
|
||||
// GetLanguageByContent returns a language based on the filename and heuristics applies to the content,
|
||||
// and safe to indicate the sureness of returned language.
|
||||
func GetLanguageByContent(filename string, content []byte) (lang string, safe bool) {
|
||||
return getLangAndSafe(filename, content, GetLanguagesByContent)
|
||||
}
|
||||
|
||||
// GetLanguagesByContent returns a slice of possible languages for the given content, filename will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByContent(filename string, content []byte) []string {
|
||||
// GetLanguagesByContent returns a slice of possible languages for the given content. The extension of filename
|
||||
// selects the content matcher to apply; candidates are ignored. It complies with the Strategy signature.
|
||||
func GetLanguagesByContent(filename string, content []byte, candidates []string) []string {
|
||||
ext := strings.ToLower(filepath.Ext(filename))
|
||||
fnMatcher, ok := contentMatchers[ext]
|
||||
if !ok {
|
||||
@ -124,51 +368,24 @@ func GetLanguagesByContent(filename string, content []byte) []string {
|
||||
return fnMatcher(content)
|
||||
}
|
||||
|
||||
func getLangAndSafe(filename string, content []byte, getLanguageByStrategy Strategy) (lang string, safe bool) {
|
||||
languages := getLanguageByStrategy(filename, content)
|
||||
if len(languages) == 0 {
|
||||
lang = OtherLanguage
|
||||
return
|
||||
// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a slice of possible languages ordered by
|
||||
// decreasing probability. If there are no candidates it returns nil. It complies with the Strategy signature.
|
||||
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
|
||||
if len(candidates) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
lang = languages[0]
|
||||
safe = len(languages) == 1
|
||||
return
|
||||
return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
|
||||
}
|
||||
|
||||
// GetLanguageByClassifier takes in a content and a list of candidates, and apply the classifier's Classify method to
|
||||
// get the most probably language. If classifier is null then DefaultClassfier will be used. If there aren't candidates
|
||||
// OtherLanguage is returned.
|
||||
func GetLanguageByClassifier(content []byte, candidates map[string]float64, classifier Classifier) string {
|
||||
scores := GetLanguagesByClassifier(content, candidates, classifier)
|
||||
if len(scores) == 0 {
|
||||
return OtherLanguage
|
||||
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
|
||||
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
|
||||
mapCandidates := make(map[string]float64)
|
||||
for _, candidate := range candidates {
|
||||
mapCandidates[candidate]++
|
||||
}
|
||||
|
||||
return getLangugeHigherScore(scores)
|
||||
}
|
||||
|
||||
func getLangugeHigherScore(scores map[string]float64) string {
|
||||
var language string
|
||||
higher := -math.MaxFloat64
|
||||
for lang, score := range scores {
|
||||
if higher < score {
|
||||
language = lang
|
||||
higher = score
|
||||
}
|
||||
}
|
||||
|
||||
return language
|
||||
}
|
||||
|
||||
// GetLanguagesByClassifier returns a map of possible languages as keys and a score as value based on content and candidates. The values can be ordered
|
||||
// with the highest value as the most probably language. If classifier is null then DefaultClassfier will be used.
|
||||
func GetLanguagesByClassifier(content []byte, candidates map[string]float64, classifier Classifier) map[string]float64 {
|
||||
if classifier == nil {
|
||||
classifier = DefaultClassifier
|
||||
}
|
||||
|
||||
return classifier.Classify(content, candidates)
|
||||
return classifier.Classify(content, mapCandidates)
|
||||
}
|
||||
|
||||
// GetLanguageExtensions returns the different extensions being used by the language.
|
||||
@ -188,7 +405,7 @@ const (
|
||||
Prose
|
||||
)
|
||||
|
||||
// GetLanguageType returns the given language's type.
|
||||
// GetLanguageType returns the type of the given language.
|
||||
func GetLanguageType(language string) (langType Type) {
|
||||
langType, ok := languagesType[language]
|
||||
if !ok {
|
||||
|
256
common_test.go
256
common_test.go
@ -37,65 +37,64 @@ func (s *SimpleLinguistTestSuite) TestGetLanguage() {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguageByModelineLinguist() {
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesByModelineLinguist() {
|
||||
const (
|
||||
modelinesDir = ".linguist/test/fixtures/Data/Modelines"
|
||||
samplesDir = ".linguist/samples"
|
||||
)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
filename string
|
||||
expectedLang string
|
||||
expectedSafe bool
|
||||
name string
|
||||
filename string
|
||||
candidates []string
|
||||
expected []string
|
||||
}{
|
||||
// Emacs
|
||||
{name: "TestGetLanguageByModelineLinguist_1", filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expectedLang: "Smalltalk", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_2", filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expectedLang: "Text", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_3", filename: filepath.Join(modelinesDir, "iamphp.inc"), expectedLang: "PHP", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_4", filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_5", filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_6", filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_7", filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_8", filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_9", filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_10", filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_11", filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_12", filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_13", filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_14", filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguagesByModelineLinguist_1", filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expected: []string{"Smalltalk"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_2", filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expected: []string{"Text"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_3", filename: filepath.Join(modelinesDir, "iamphp.inc"), expected: []string{"PHP"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_4", filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_5", filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_6", filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_7", filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_8", filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_9", filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_10", filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_11", filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_12", filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_13", filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_14", filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expected: []string{"C++"}},
|
||||
|
||||
// Vim
|
||||
{name: "TestGetLanguageByModelineLinguist_15", filename: filepath.Join(modelinesDir, "seeplusplus"), expectedLang: "C++", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_16", filename: filepath.Join(modelinesDir, "iamjs.pl"), expectedLang: "JavaScript", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_17", filename: filepath.Join(modelinesDir, "iamjs2.pl"), expectedLang: "JavaScript", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_18", filename: filepath.Join(modelinesDir, "not_perl.pl"), expectedLang: "Prolog", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_19", filename: filepath.Join(modelinesDir, "ruby"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_20", filename: filepath.Join(modelinesDir, "ruby2"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_21", filename: filepath.Join(modelinesDir, "ruby3"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_22", filename: filepath.Join(modelinesDir, "ruby4"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_23", filename: filepath.Join(modelinesDir, "ruby5"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_24", filename: filepath.Join(modelinesDir, "ruby6"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_25", filename: filepath.Join(modelinesDir, "ruby7"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_26", filename: filepath.Join(modelinesDir, "ruby8"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_27", filename: filepath.Join(modelinesDir, "ruby9"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_28", filename: filepath.Join(modelinesDir, "ruby10"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_29", filename: filepath.Join(modelinesDir, "ruby11"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_30", filename: filepath.Join(modelinesDir, "ruby12"), expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModelineLinguist_31", filename: filepath.Join(samplesDir, "C/main.c"), expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguagesByModelineLinguist_15", filename: filepath.Join(modelinesDir, "seeplusplus"), expected: []string{"C++"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_16", filename: filepath.Join(modelinesDir, "iamjs.pl"), expected: []string{"JavaScript"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_17", filename: filepath.Join(modelinesDir, "iamjs2.pl"), expected: []string{"JavaScript"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_18", filename: filepath.Join(modelinesDir, "not_perl.pl"), expected: []string{"Prolog"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_19", filename: filepath.Join(modelinesDir, "ruby"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_20", filename: filepath.Join(modelinesDir, "ruby2"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_21", filename: filepath.Join(modelinesDir, "ruby3"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_22", filename: filepath.Join(modelinesDir, "ruby4"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_23", filename: filepath.Join(modelinesDir, "ruby5"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_24", filename: filepath.Join(modelinesDir, "ruby6"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_25", filename: filepath.Join(modelinesDir, "ruby7"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_26", filename: filepath.Join(modelinesDir, "ruby8"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_27", filename: filepath.Join(modelinesDir, "ruby9"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_28", filename: filepath.Join(modelinesDir, "ruby10"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_29", filename: filepath.Join(modelinesDir, "ruby11"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_30", filename: filepath.Join(modelinesDir, "ruby12"), expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByModelineLinguist_31", filename: filepath.Join(samplesDir, "C/main.c"), expected: nil},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
content, err := ioutil.ReadFile(test.filename)
|
||||
assert.NoError(s.T(), err)
|
||||
|
||||
lang, safe := GetLanguageByModeline(content)
|
||||
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
|
||||
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
|
||||
languages := GetLanguagesByModeline(test.filename, content, test.candidates)
|
||||
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguageByModeline() {
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesByModeline() {
|
||||
const (
|
||||
wrongVim = `# vim: set syntax=ruby ft =python filetype=perl :`
|
||||
rightVim = `/* vim: set syntax=python ft =python filetype=python */`
|
||||
@ -103,48 +102,48 @@ func (s *SimpleLinguistTestSuite) TestGetLanguageByModeline() {
|
||||
)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content []byte
|
||||
expectedLang string
|
||||
expectedSafe bool
|
||||
name string
|
||||
filename string
|
||||
content []byte
|
||||
candidates []string
|
||||
expected []string
|
||||
}{
|
||||
{name: "TestGetLanguageByModeline_1", content: []byte(wrongVim), expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguageByModeline_2", content: []byte(rightVim), expectedLang: "Python", expectedSafe: true},
|
||||
{name: "TestGetLanguageByModeline_3", content: []byte(noLangVim), expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguagesByModeline_1", content: []byte(wrongVim), expected: nil},
|
||||
{name: "TestGetLanguagesByModeline_2", content: []byte(rightVim), expected: []string{"Python"}},
|
||||
{name: "TestGetLanguagesByModeline_3", content: []byte(noLangVim), expected: nil},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
lang, safe := GetLanguageByModeline(test.content)
|
||||
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
|
||||
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
|
||||
languages := GetLanguagesByModeline(test.filename, test.content, test.candidates)
|
||||
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguageByFilename() {
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesByFilename() {
|
||||
tests := []struct {
|
||||
name string
|
||||
filename string
|
||||
expectedLang string
|
||||
expectedSafe bool
|
||||
name string
|
||||
filename string
|
||||
content []byte
|
||||
candidates []string
|
||||
expected []string
|
||||
}{
|
||||
{name: "TestGetLanguageByFilename_1", filename: "unknown.interpreter", expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguageByFilename_2", filename: ".bashrc", expectedLang: "Shell", expectedSafe: true},
|
||||
{name: "TestGetLanguageByFilename_3", filename: "Dockerfile", expectedLang: "Dockerfile", expectedSafe: true},
|
||||
{name: "TestGetLanguageByFilename_4", filename: "Makefile.frag", expectedLang: "Makefile", expectedSafe: true},
|
||||
{name: "TestGetLanguageByFilename_5", filename: "makefile", expectedLang: "Makefile", expectedSafe: true},
|
||||
{name: "TestGetLanguageByFilename_6", filename: "Vagrantfile", expectedLang: "Ruby", expectedSafe: true},
|
||||
{name: "TestGetLanguageByFilename_7", filename: "_vimrc", expectedLang: "Vim script", expectedSafe: true},
|
||||
{name: "TestGetLanguageByFilename_8", filename: "pom.xml", expectedLang: "Maven POM", expectedSafe: true},
|
||||
{name: "TestGetLanguagesByFilename_1", filename: "unknown.interpreter", expected: nil},
|
||||
{name: "TestGetLanguagesByFilename_2", filename: ".bashrc", expected: []string{"Shell"}},
|
||||
{name: "TestGetLanguagesByFilename_3", filename: "Dockerfile", expected: []string{"Dockerfile"}},
|
||||
{name: "TestGetLanguagesByFilename_4", filename: "Makefile.frag", expected: []string{"Makefile"}},
|
||||
{name: "TestGetLanguagesByFilename_5", filename: "makefile", expected: []string{"Makefile"}},
|
||||
{name: "TestGetLanguagesByFilename_6", filename: "Vagrantfile", expected: []string{"Ruby"}},
|
||||
{name: "TestGetLanguagesByFilename_7", filename: "_vimrc", expected: []string{"Vim script"}},
|
||||
{name: "TestGetLanguagesByFilename_8", filename: "pom.xml", expected: []string{"Maven POM"}},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
lang, safe := GetLanguageByFilename(test.filename)
|
||||
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
|
||||
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
|
||||
languages := GetLanguagesByFilename(test.filename, test.content, test.candidates)
|
||||
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguageByShebang() {
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesByShebang() {
|
||||
const (
|
||||
multilineExecHack = `#!/bin/sh
|
||||
# Next line is comment in Tcl, but not in sh... \
|
||||
@ -161,72 +160,112 @@ println("The shell script says ",vm.arglist.concat(" "));`
|
||||
)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content []byte
|
||||
expectedLang string
|
||||
expectedSafe bool
|
||||
name string
|
||||
filename string
|
||||
content []byte
|
||||
candidates []string
|
||||
expected []string
|
||||
}{
|
||||
{name: "TestGetLanguageByShebang_1", content: []byte(`#!/unknown/interpreter`), expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguageByShebang_2", content: []byte(`no shebang`), expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguageByShebang_3", content: []byte(`#!/usr/bin/env`), expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguageByShebang_4", content: []byte(`#!/usr/bin/python -tt`), expectedLang: "Python", expectedSafe: true},
|
||||
{name: "TestGetLanguageByShebang_5", content: []byte(`#!/usr/bin/env python2.6`), expectedLang: "Python", expectedSafe: true},
|
||||
{name: "TestGetLanguageByShebang_6", content: []byte(`#!/usr/bin/env perl`), expectedLang: "Perl", expectedSafe: true},
|
||||
{name: "TestGetLanguageByShebang_7", content: []byte(`#! /bin/sh`), expectedLang: "Shell", expectedSafe: true},
|
||||
{name: "TestGetLanguageByShebang_8", content: []byte(`#!bash`), expectedLang: "Shell", expectedSafe: true},
|
||||
{name: "TestGetLanguageByShebang_9", content: []byte(multilineExecHack), expectedLang: "Tcl", expectedSafe: true},
|
||||
{name: "TestGetLanguageByShebang_10", content: []byte(multilineNoExecHack), expectedLang: "Shell", expectedSafe: true},
|
||||
{name: "TestGetLanguagesByShebang_1", content: []byte(`#!/unknown/interpreter`), expected: nil},
|
||||
{name: "TestGetLanguagesByShebang_2", content: []byte(`no shebang`), expected: nil},
|
||||
{name: "TestGetLanguagesByShebang_3", content: []byte(`#!/usr/bin/env`), expected: nil},
|
||||
{name: "TestGetLanguagesByShebang_4", content: []byte(`#!/usr/bin/python -tt`), expected: []string{"Python"}},
|
||||
{name: "TestGetLanguagesByShebang_5", content: []byte(`#!/usr/bin/env python2.6`), expected: []string{"Python"}},
|
||||
{name: "TestGetLanguagesByShebang_6", content: []byte(`#!/usr/bin/env perl`), expected: []string{"Perl"}},
|
||||
{name: "TestGetLanguagesByShebang_7", content: []byte(`#! /bin/sh`), expected: []string{"Shell"}},
|
||||
{name: "TestGetLanguagesByShebang_8", content: []byte(`#!bash`), expected: []string{"Shell"}},
|
||||
{name: "TestGetLanguagesByShebang_9", content: []byte(multilineExecHack), expected: []string{"Tcl"}},
|
||||
{name: "TestGetLanguagesByShebang_10", content: []byte(multilineNoExecHack), expected: []string{"Shell"}},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
lang, safe := GetLanguageByShebang(test.content)
|
||||
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
|
||||
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
|
||||
languages := GetLanguagesByShebang(test.filename, test.content, test.candidates)
|
||||
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguageByExtension() {
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesByExtension() {
|
||||
tests := []struct {
|
||||
name string
|
||||
filename string
|
||||
expectedLang string
|
||||
expectedSafe bool
|
||||
name string
|
||||
filename string
|
||||
content []byte
|
||||
candidates []string
|
||||
expected []string
|
||||
}{
|
||||
{name: "TestGetLanguageByExtension_1", filename: "foo.foo", expectedLang: OtherLanguage, expectedSafe: false},
|
||||
{name: "TestGetLanguageByExtension_2", filename: "foo.go", expectedLang: "Go", expectedSafe: true},
|
||||
{name: "TestGetLanguageByExtension_3", filename: "foo.go.php", expectedLang: "Hack", expectedSafe: false},
|
||||
{name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil},
|
||||
{name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}},
|
||||
{name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
lang, safe := GetLanguageByExtension(test.filename)
|
||||
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
|
||||
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
|
||||
languages := GetLanguagesByExtension(test.filename, test.content, test.candidates)
|
||||
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguageByClassifier() {
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesByClassifier() {
|
||||
const samples = `.linguist/samples/`
|
||||
test := []struct {
|
||||
name string
|
||||
filename string
|
||||
candidates map[string]float64
|
||||
candidates []string
|
||||
expected string
|
||||
}{
|
||||
{name: "TestGetLanguageByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c": 1.00, "c++": 1.00}, expected: "C"},
|
||||
{name: "TestGetLanguageByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expected: "C"},
|
||||
{name: "TestGetLanguageByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: nil, expected: "C"},
|
||||
{name: "TestGetLanguageByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c++": 1.00}, expected: "C++"},
|
||||
{name: "TestGetLanguageByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"ruby": 1.00}, expected: "Ruby"},
|
||||
{name: "TestGetLanguageByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c": 1.00, "c++": 1.00}, expected: "Python"},
|
||||
{name: "TestGetLanguageByClassifier_7", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: nil, expected: "Python"},
|
||||
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "C"},
|
||||
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expected: OtherLanguage},
|
||||
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: []string{}, expected: OtherLanguage},
|
||||
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"},
|
||||
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"},
|
||||
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"},
|
||||
}
|
||||
|
||||
for _, test := range test {
|
||||
content, err := ioutil.ReadFile(test.filename)
|
||||
assert.NoError(s.T(), err)
|
||||
|
||||
lang := GetLanguageByClassifier(content, test.candidates, nil)
|
||||
assert.Equal(s.T(), test.expected, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expected))
|
||||
languages := GetLanguagesByClassifier(test.filename, content, test.candidates)
|
||||
var language string
|
||||
if len(languages) == 0 {
|
||||
language = OtherLanguage
|
||||
} else {
|
||||
language = languages[0]
|
||||
}
|
||||
|
||||
assert.Equal(s.T(), test.expected, language, fmt.Sprintf("%v: language = %v, expected: %v", test.name, language, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SimpleLinguistTestSuite) TestGetLanguagesBySpecificClassifier() {
|
||||
const samples = `.linguist/samples/`
|
||||
test := []struct {
|
||||
name string
|
||||
filename string
|
||||
candidates []string
|
||||
classifier Classifier
|
||||
expected string
|
||||
}{
|
||||
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "C"},
|
||||
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, classifier: DefaultClassifier, expected: "C"},
|
||||
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: []string{}, classifier: DefaultClassifier, expected: "C"},
|
||||
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
|
||||
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
|
||||
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
|
||||
{name: "TestGetLanguagesByClassifier_6", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: OtherLanguage},
|
||||
}
|
||||
|
||||
for _, test := range test {
|
||||
content, err := ioutil.ReadFile(test.filename)
|
||||
assert.NoError(s.T(), err)
|
||||
|
||||
languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
|
||||
var language string
|
||||
if len(languages) == 0 {
|
||||
language = OtherLanguage
|
||||
} else {
|
||||
language = languages[0]
|
||||
}
|
||||
|
||||
assert.Equal(s.T(), test.expected, language, fmt.Sprintf("%v: language = %v, expected: %v", test.name, language, test.expected))
|
||||
}
|
||||
}
|
||||
|
||||
@ -323,6 +362,7 @@ func (s *SimpleLinguistTestSuite) TestLinguistCorpus() {
|
||||
total++
|
||||
obtained := GetLanguage(filename, content)
|
||||
if obtained == OtherLanguage {
|
||||
obtained = "Other"
|
||||
other++
|
||||
}
|
||||
|
||||
@ -337,9 +377,9 @@ func (s *SimpleLinguistTestSuite) TestLinguistCorpus() {
|
||||
}
|
||||
|
||||
if _, ok := cornerCases[filename]; ok {
|
||||
fmt.Printf("\t\t[condidered corner case] %s\t%s\t%s\t%s\n", filename, expected, obtained, status)
|
||||
fmt.Printf("\t\t[considered corner case] %s\texpected: %s\tobtained: %s\tstatus: %s\n", filename, expected, obtained, status)
|
||||
} else {
|
||||
assert.Equal(s.T(), expected, obtained, fmt.Sprintf("%s\t%s\t%s\t%s\n", filename, expected, obtained, status))
|
||||
assert.Equal(s.T(), expected, obtained, fmt.Sprintf("%s\texpected: %s\tobtained: %s\tstatus: %s\n", filename, expected, obtained, status))
|
||||
}
|
||||
|
||||
return nil
|
||||
|
@ -1,4 +1,4 @@
|
||||
package slinguist
|
||||
package enry
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
|
@ -1,4 +1,4 @@
|
||||
package slinguist
|
||||
package enry
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
|
@ -1,4 +1,4 @@
|
||||
package slinguist
|
||||
package enry
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
|
@ -1,4 +1,4 @@
|
||||
package slinguist
|
||||
package enry
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
|
@ -1,4 +1,4 @@
|
||||
package slinguist
|
||||
package enry
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
|
@ -16,8 +16,8 @@ const (
|
||||
|
||||
// extension.go generation
|
||||
extensionsFile = "extension.go"
|
||||
extensionsTmplPath = "internal/code-generator/assets/extensions.go.tmpl"
|
||||
extensionsTmpl = "extensions.go.tmpl"
|
||||
extensionsTmplPath = "internal/code-generator/assets/extension.go.tmpl"
|
||||
extensionsTmpl = "extension.go.tmpl"
|
||||
|
||||
// content.go generation
|
||||
heuristicsRuby = ".linguist/lib/linguist/heuristics.rb"
|
||||
@ -39,23 +39,23 @@ const (
|
||||
|
||||
// type.go generation
|
||||
typeFile = "type.go"
|
||||
typeTmplPath = "internal/code-generator/assets/types.go.tmpl"
|
||||
typeTmpl = "types.go.tmpl"
|
||||
typeTmplPath = "internal/code-generator/assets/type.go.tmpl"
|
||||
typeTmpl = "type.go.tmpl"
|
||||
|
||||
// interpreter.go generation
|
||||
interpretersFile = "interpreter.go"
|
||||
interpretersTmplPath = "internal/code-generator/assets/interpreters.go.tmpl"
|
||||
interpretersTmpl = "interpreters.go.tmpl"
|
||||
interpretersTmplPath = "internal/code-generator/assets/interpreter.go.tmpl"
|
||||
interpretersTmpl = "interpreter.go.tmpl"
|
||||
|
||||
// filename.go generation
|
||||
filenamesFile = "filename.go"
|
||||
filenamesTmplPath = "internal/code-generator/assets/filenames.go.tmpl"
|
||||
filenamesTmpl = "filenames.go.tmpl"
|
||||
filenamesTmplPath = "internal/code-generator/assets/filename.go.tmpl"
|
||||
filenamesTmpl = "filename.go.tmpl"
|
||||
|
||||
// alias.go generation
|
||||
aliasesFile = "alias.go"
|
||||
aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
|
||||
aliasesTmpl = "aliases.go.tmpl"
|
||||
aliasesTmplPath = "internal/code-generator/assets/alias.go.tmpl"
|
||||
aliasesTmpl = "alias.go.tmpl"
|
||||
|
||||
// frequencies.go generation
|
||||
frequenciesFile = "frequencies.go"
|
||||
|
153
modeline.go
153
modeline.go
@ -1,153 +0,0 @@
|
||||
package enry
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
)
|
||||
|
||||
// searchScope is the number of lines inspected at each end of the content when looking for modelines.
const searchScope = 5
|
||||
|
||||
// GetLanguagesByModeline returns a slice of possible languages for the given content, filename will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByModeline(filename string, content []byte) []string {
|
||||
headFoot := getHeaderAndFooter(content)
|
||||
var languages []string
|
||||
for _, getLang := range modelinesFunc {
|
||||
languages = getLang("", headFoot)
|
||||
if len(languages) > 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return languages
|
||||
}
|
||||
|
||||
func getHeaderAndFooter(content []byte) []byte {
|
||||
if bytes.Count(content, []byte("\n")) < 2*searchScope {
|
||||
return content
|
||||
}
|
||||
|
||||
header := headScope(content, searchScope)
|
||||
footer := footScope(content, searchScope)
|
||||
headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
|
||||
headerAndFooter = append(headerAndFooter, content[:header]...)
|
||||
headerAndFooter = append(headerAndFooter, content[footer:]...)
|
||||
return headerAndFooter
|
||||
}
|
||||
|
||||
// headScope walks over the first scope lines of content and returns the offset, within content, of
// the newline that terminates the last of them. The caller is expected to pass content that holds
// at least scope newlines.
func headScope(content []byte, scope int) (index int) {
	remaining := content
	for line := 0; line < scope; line++ {
		width := bytes.IndexByte(remaining, '\n')
		index += width
		remaining = remaining[width+1:]
	}

	// index only accumulated line widths; add back the scope-1 newlines that were skipped over.
	return index + scope - 1
}
|
||||
|
||||
// footScope walks backwards over the last scope newlines of content and returns the offset of the
// byte that follows the earliest of them, i.e. where the footer begins. The caller is expected to
// pass content that holds at least scope newlines.
func footScope(content []byte, scope int) (index int) {
	remaining := content
	for line := 0; line < scope; line++ {
		index = bytes.LastIndexByte(remaining, '\n')
		remaining = remaining[:index]
	}

	return index + 1
}
|
||||
|
||||
var modelinesFunc = []func(filename string, content []byte) []string{
|
||||
GetLanguagesByEmacsModeline,
|
||||
GetLanguagesByVimModeline,
|
||||
}
|
||||
|
||||
// reEmacsModeline captures whatever sits between the "-*-" markers of an Emacs modeline.
var reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)

// reEmacsLang captures the value of the "mode:" variable inside an Emacs modeline.
var reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)

// reVimModeline captures the settings that follow a "vi:", "vim:" or "ex:" modeline prefix.
var reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)

// reVimLang captures the value assigned to filetype, ft or syntax inside a Vim modeline.
var reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
|
||||
|
||||
// GetLanguageByEmacsModeline detecs if the content has a emacs modeline and try to get a
|
||||
// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage and false.
|
||||
func GetLanguageByEmacsModeline(content []byte) (string, bool) {
|
||||
languages := GetLanguagesByEmacsModeline("", content)
|
||||
if len(languages) == 0 {
|
||||
return OtherLanguage, false
|
||||
}
|
||||
|
||||
return languages[0], true
|
||||
}
|
||||
|
||||
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content, filename will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByEmacsModeline(filename string, content []byte) []string {
|
||||
matched := reEmacsModeline.FindAllSubmatch(content, -1)
|
||||
if matched == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// only take the last matched line, discard previous lines
|
||||
lastLineMatched := matched[len(matched)-1][1]
|
||||
matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
|
||||
var alias string
|
||||
if matchedAlias != nil {
|
||||
alias = string(matchedAlias[1])
|
||||
} else {
|
||||
alias = string(lastLineMatched)
|
||||
}
|
||||
|
||||
language, ok := GetLanguageByAlias(alias)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
return []string{language}
|
||||
}
|
||||
|
||||
// GetLanguageByVimModeline detecs if the content has a vim modeline and try to get a
|
||||
// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage and false.
|
||||
func GetLanguageByVimModeline(content []byte) (string, bool) {
|
||||
languages := GetLanguagesByVimModeline("", content)
|
||||
if len(languages) == 0 {
|
||||
return OtherLanguage, false
|
||||
}
|
||||
|
||||
return languages[0], true
|
||||
}
|
||||
|
||||
// GetLanguagesByVimModeline returns a slice of possible languages for the given content, filename will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByVimModeline(filename string, content []byte) []string {
|
||||
matched := reVimModeline.FindAllSubmatch(content, -1)
|
||||
if matched == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// only take the last matched line, discard previous lines
|
||||
lastLineMatched := matched[len(matched)-1][1]
|
||||
matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
|
||||
if matchedAlias == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
alias := string(matchedAlias[0][1])
|
||||
if len(matchedAlias) > 1 {
|
||||
// cases:
|
||||
// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
|
||||
// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
|
||||
for _, match := range matchedAlias {
|
||||
otherAlias := string(match[1])
|
||||
if otherAlias != alias {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
language, ok := GetLanguageByAlias(alias)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
return []string{language}
|
||||
}
|
89
shebang.go
89
shebang.go
@ -1,89 +0,0 @@
|
||||
package enry
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// shebang is the prefix a first line must carry to be treated as an interpreter line.
const shebang = `#!`
|
||||
|
||||
// shebangExecHack captures the command that an sh script re-executes itself with ("exec <cmd> ... $0 ... $@").
var shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)

// pythonVersion matches python interpreter names that carry a dotted version, such as "python2.6".
var pythonVersion = regexp.MustCompile(`python\d\.\d+`)
|
||||
|
||||
// GetLanguagesByShebang returns a slice of possible languages for the given content, filename will be ignored.
|
||||
// It accomplish the signature to be a Strategy type.
|
||||
func GetLanguagesByShebang(filename string, content []byte) (languages []string) {
|
||||
interpreter := getInterpreter(content)
|
||||
return languagesByInterpreter[interpreter]
|
||||
}
|
||||
|
||||
func getInterpreter(data []byte) (interpreter string) {
|
||||
line := getFirstLine(data)
|
||||
if !hasShebang(line) {
|
||||
return ""
|
||||
}
|
||||
|
||||
// skip shebang
|
||||
line = bytes.TrimSpace(line[2:])
|
||||
|
||||
splitted := bytes.Fields(line)
|
||||
if bytes.Contains(splitted[0], []byte("env")) {
|
||||
if len(splitted) > 1 {
|
||||
interpreter = string(splitted[1])
|
||||
}
|
||||
} else {
|
||||
|
||||
splittedPath := bytes.Split(splitted[0], []byte{'/'})
|
||||
interpreter = string(splittedPath[len(splittedPath)-1])
|
||||
}
|
||||
|
||||
if interpreter == "sh" {
|
||||
interpreter = lookForMultilineExec(data)
|
||||
}
|
||||
|
||||
if pythonVersion.MatchString(interpreter) {
|
||||
interpreter = interpreter[:strings.Index(interpreter, `.`)]
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// getFirstLine returns the first line of data without its line terminator ("\n" or "\r\n").
// When data holds no newline the whole of data is the first line. The result is a sub-slice of
// data, capped so that appending to it cannot overwrite the bytes that follow.
//
// The line is located with a direct byte search rather than a bufio.Scanner, so a first line
// longer than the scanner's token limit is still returned instead of being dropped.
func getFirstLine(data []byte) []byte {
	end := bytes.IndexByte(data, '\n')
	if end < 0 {
		end = len(data)
	}

	// Drop the carriage return of a CRLF terminator, as bufio.ScanLines does.
	if end > 0 && data[end-1] == '\r' {
		end--
	}

	return data[:end:end]
}
|
||||
|
||||
func hasShebang(line []byte) bool {
|
||||
shebang := []byte(shebang)
|
||||
return bytes.HasPrefix(line, shebang)
|
||||
}
|
||||
|
||||
func lookForMultilineExec(data []byte) string {
|
||||
const magicNumOfLines = 5
|
||||
interpreter := "sh"
|
||||
|
||||
buf := bufio.NewScanner(bytes.NewReader(data))
|
||||
for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
|
||||
line := buf.Bytes()
|
||||
if shebangExecHack.Match(line) {
|
||||
interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if err := buf.Err(); err != nil {
|
||||
return interpreter
|
||||
}
|
||||
|
||||
return interpreter
|
||||
}
|
5
utils.go
5
utils.go
@ -31,9 +31,8 @@ func IsAuxiliaryLanguage(lang string) bool {
|
||||
|
||||
// IsConfiguration returns whether or not path is using a configuration language.
|
||||
func IsConfiguration(path string) bool {
|
||||
lang, _ := GetLanguageByExtension(path)
|
||||
_, is := configurationLanguages[lang]
|
||||
|
||||
language, _ := GetLanguageByExtension(path)
|
||||
_, is := configurationLanguages[language]
|
||||
return is
|
||||
}
|
||||
|
||||
|
@ -55,7 +55,7 @@ func (s *SimpleLinguistTestSuite) TestIsConfiguration() {
|
||||
}{
|
||||
{name: "TestIsConfiguration_1", path: "foo", expected: false},
|
||||
{name: "TestIsConfiguration_2", path: "foo.ini", expected: true},
|
||||
{name: "TestIsConfiguration_3", path: "foo.json", expected: true},
|
||||
{name: "TestIsConfiguration_3", path: "/test/path/foo.json", expected: true},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
|
Loading…
x
Reference in New Issue
Block a user