changed signatures for strategies

This commit is contained in:
Manuel Carmona 2017-06-12 13:42:20 +02:00
parent 5f0e92b1a8
commit beda5b73e7
13 changed files with 501 additions and 465 deletions

View File

@ -3,14 +3,15 @@ package enry
import (
"math"
"sort"
"gopkg.in/src-d/enry.v1/internal/tokenizer"
)
// Classifier is the interface that contains the method Classify which is in charge to assign scores to the possibles candidates.
// The scores must order the candidates so as the highest score be the most probably language of the content. The candidates is
// a map which can be used to assign weights to languages dynamically.
// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
Classify(content []byte, candidates map[string]float64) map[string]float64
Classify(content []byte, candidates map[string]float64) (languages []string)
}
type classifier struct {
@ -19,7 +20,13 @@ type classifier struct {
tokensTotal float64
}
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
// scoredLanguage pairs a language name with the score computed for it, so that
// languages can be ordered by score (see sortLanguagesByScore).
type scoredLanguage struct {
// language is the name of the language being scored.
language string
// score is the value the language is ranked by; higher scores sort first.
score float64
}
// Classify returns a slice of possible languages sorted by decreasing probability.
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
if len(content) == 0 {
return nil
}
@ -39,12 +46,27 @@ func (c *classifier) Classify(content []byte, candidates map[string]float64) map
}
tokens := tokenizer.Tokenize(content)
scores := make(map[string]float64, len(languages))
scoredLangs := make([]*scoredLanguage, 0, len(languages))
for language := range languages {
scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
scoredLang := &scoredLanguage{
language: language,
score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language],
}
return scores
scoredLangs = append(scoredLangs, scoredLang)
}
return sortLanguagesByScore(scoredLangs)
}
// sortLanguagesByScore orders scoredLangs in place from the highest score to the
// lowest, keeping the relative order of equal scores, and returns the language
// names in that order. The result is never nil.
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
	byDescendingScore := func(a, b int) bool {
		return scoredLangs[a].score > scoredLangs[b].score
	}
	sort.SliceStable(scoredLangs, byDescendingScore)

	names := make([]string, len(scoredLangs))
	for idx := range scoredLangs {
		names[idx] = scoredLangs[idx].language
	}
	return names
}
func (c *classifier) knownLangs() map[string]float64 {

385
common.go
View File

@ -1,83 +1,333 @@
package enry
import (
"math"
"bufio"
"bytes"
"path/filepath"
"regexp"
"strings"
)
// OtherLanguage is used as a zero value when a function can not return a specific language.
const OtherLanguage = "Other"
const OtherLanguage = ""
// Strategy type fix the signature for the functions that can be used as a strategy.
type Strategy func(filename string, content []byte) (languages []string)
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
var strategies = []Strategy{
// DefaultStrategies is the sequence of strategies GetLanguage uses to detect languages.
var DefaultStrategies = []Strategy{
GetLanguagesByModeline,
GetLanguagesByFilename,
GetLanguagesByShebang,
GetLanguagesByExtension,
GetLanguagesByContent,
GetLanguagesByClassifier,
}
// GetLanguage applies a sequence of strategies based on the given filename and content
// to find out the most probably language to return.
func GetLanguage(filename string, content []byte) string {
candidates := map[string]float64{}
for _, strategy := range strategies {
languages := strategy(filename, content)
var languages []string
candidates := []string{}
for _, strategy := range DefaultStrategies {
languages = strategy(filename, content, candidates)
if len(languages) == 1 {
return languages[0]
}
if len(languages) > 0 {
for _, language := range languages {
candidates[language]++
}
candidates = append(candidates, languages...)
}
}
if len(candidates) == 0 {
return firstLanguage(languages)
}
func firstLanguage(languages []string) string {
if len(languages) == 0 {
return OtherLanguage
}
lang := GetLanguageByClassifier(content, candidates, nil)
return lang
return languages[0]
}
// GetLanguageByModeline returns the language of the given content looking for the modeline,
// and safe to indicate the sureness of returned language.
func GetLanguageByModeline(content []byte) (lang string, safe bool) {
return getLangAndSafe("", content, GetLanguagesByModeline)
// getLanguageByStrategy runs strategy with the given arguments and reduces the
// languages it reports to a single language plus a safe flag, as decided by
// getFirstLanguageAndSafe.
func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
	return getFirstLanguageAndSafe(strategy(filename, content, candidates))
}
// GetLanguageByFilename returns a language based on the given filename, and safe to indicate
// the sureness of returned language.
func GetLanguageByFilename(filename string) (lang string, safe bool) {
return getLangAndSafe(filename, nil, GetLanguagesByFilename)
// getFirstLanguageAndSafe returns the language chosen by firstLanguage for the
// given slice; safe is true only when the slice holds exactly one language.
func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
	return firstLanguage(languages), len(languages) == 1
}
// GetLanguagesByFilename returns a slice of possible languages for the given filename, content will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByFilename(filename string, content []byte) []string {
// GetLanguageByModeline detects the language of content from its modeline. It
// returns the first language reported by GetLanguagesByModeline, or
// OtherLanguage when none is reported; safe is true only when exactly one
// language was reported.
func GetLanguageByModeline(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
	return language, safe
}
// GetLanguageByEmacsModeline detects the language of content from an Emacs
// modeline. It returns the first language reported by
// GetLanguagesByEmacsModeline, or OtherLanguage when none is reported; safe is
// true only when exactly one language was reported.
func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
	return language, safe
}
// GetLanguageByVimModeline detects the language of content from a Vim modeline.
// It returns the first language reported by GetLanguagesByVimModeline, or
// OtherLanguage when none is reported; safe is true only when exactly one
// language was reported.
func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
	return language, safe
}
// GetLanguageByFilename detects the language from filename alone. It returns
// the first language reported by GetLanguagesByFilename, or OtherLanguage when
// none is reported; safe is true only when exactly one language was reported.
func GetLanguageByFilename(filename string) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
	return language, safe
}
// GetLanguageByShebang detects the language of content from its shebang line.
// It returns the first language reported by GetLanguagesByShebang, or
// OtherLanguage when none is reported; safe is true only when exactly one
// language was reported.
func GetLanguageByShebang(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
	return language, safe
}
// GetLanguageByExtension detects the language from the extension of filename.
// It returns the first language reported by GetLanguagesByExtension, or
// OtherLanguage when none is reported; safe is true only when exactly one
// language was reported.
func GetLanguageByExtension(filename string) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
	return language, safe
}
// GetLanguageByContent detects the language by applying GetLanguagesByContent to
// content. It returns the first language reported, or OtherLanguage when none
// is reported; safe is true only when exactly one language was reported.
//
// NOTE(review): the strategy is invoked with an empty filename, while
// GetLanguagesByContent selects its content matcher from the filename's
// extension — confirm that a matcher can be found for an empty extension,
// otherwise this wrapper can never detect anything.
func GetLanguageByContent(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByContent, "", content, nil)
	return language, safe
}
// GetLanguageByClassifier returns the most probable language for content among
// candidates, as ranked by GetLanguagesByClassifier (which uses
// DefaultClassifier). When no candidates are provided it returns OtherLanguage.
// safe is true only when exactly one language was reported.
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
	return language, safe
}
// GetLanguageBySpecificClassifier returns the most probable language for content
// among candidates, using the given classifier instead of the default one. It
// returns the first language reported by GetLanguagesBySpecificClassifier, or
// OtherLanguage when none is reported; safe is true only when exactly one
// language was reported.
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
	return getFirstLanguageAndSafe(GetLanguagesBySpecificClassifier(content, candidates, classifier))
}
// GetLanguagesByModeline returns a slice of possible languages for the given
// content by trying each strategy in modelinesFunc, in order, on the header and
// footer of content, and keeping the first non-empty answer. filename is
// ignored; candidates are handed to the modeline strategies unchanged. It
// complies with the signature of the Strategy type.
func GetLanguagesByModeline(filename string, content []byte, candidates []string) []string {
	var found []string
	scope := getHeaderAndFooter(content)
	for i := 0; i < len(modelinesFunc) && len(found) == 0; i++ {
		found = modelinesFunc[i]("", scope, candidates)
	}
	return found
}
// modelinesFunc lists the modeline strategies in the order
// GetLanguagesByModeline tries them; the first one reporting any language wins.
var modelinesFunc = []Strategy{
GetLanguagesByEmacsModeline,
GetLanguagesByVimModeline,
}
// getHeaderAndFooter returns the part of content in which modelines are
// searched: the first searchScope lines followed by everything after the
// searchScope-th newline counted from the end. Content with fewer than
// 2*searchScope newlines is returned unchanged (the same slice, not a copy).
//
// The header keeps its terminating newline so that its last line stays a line
// of its own instead of being glued to the first footer line, which would
// corrupt a modeline sitting on either of them.
func getHeaderAndFooter(content []byte) []byte {
	const searchScope = 5

	if bytes.Count(content, []byte("\n")) < 2*searchScope {
		return content
	}

	// headScope points at the newline closing the header; +1 keeps that newline.
	// With at least 2*searchScope newlines the header always ends before the
	// footer starts, so the two slices never overlap.
	headerEnd := headScope(content, searchScope) + 1
	footerStart := footScope(content, searchScope)

	headerAndFooter := make([]byte, 0, headerEnd+len(content)-footerStart)
	headerAndFooter = append(headerAndFooter, content[:headerEnd]...)
	headerAndFooter = append(headerAndFooter, content[footerStart:]...)
	return headerAndFooter
}

// headScope returns the index of the scope-th newline of content, i.e. the end
// of its first scope lines. content must hold at least scope newlines.
func headScope(content []byte, scope int) (index int) {
	for i := 0; i < scope; i++ {
		eol := bytes.IndexByte(content, '\n')
		content = content[eol+1:]
		index += eol
	}

	// index summed the line lengths only; add the scope-1 newlines between them.
	return index + scope - 1
}

// footScope returns the index just past the scope-th newline counted from the
// end of content. content must hold at least scope newlines.
func footScope(content []byte, scope int) (index int) {
	for i := 0; i < scope; i++ {
		index = bytes.LastIndexByte(content, '\n')
		content = content[:index]
	}

	return index + 1
}
// Regular expressions used to locate modelines and the language they declare.
var (
// reEmacsModeline captures the text enclosed by a pair of "-*-" markers on a line.
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
// reEmacsLang captures the value that follows a case-insensitive "mode:" key.
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
// reVimModeline captures what follows a "vi:", "vim:", "vim<digits>:" (optionally
// with <, = or > before the digits) or "ex:" prefix, up to the end of the line.
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
// reVimLang captures the word assigned to filetype, ft or syntax (keys are
// case-insensitive); the value must follow the "=" immediately.
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
// GetLanguagesByEmacsModeline returns a slice of possible languages for the
// given content based on its Emacs modeline; filename and candidates are
// ignored. It returns nil when there is no modeline or when the alias it names
// is unknown to GetLanguageByAlias. It complies with the signature of the
// Strategy type.
func GetLanguagesByEmacsModeline(filename string, content []byte, candidates []string) []string {
	modelines := reEmacsModeline.FindAllSubmatch(content, -1)
	if len(modelines) == 0 {
		return nil
	}

	// Only the last modeline counts; earlier ones are discarded.
	body := modelines[len(modelines)-1][1]

	// Without an explicit "mode:" key the whole modeline body is the alias.
	alias := string(body)
	if mode := reEmacsLang.FindSubmatch(body); mode != nil {
		alias = string(mode[1])
	}

	if language, ok := GetLanguageByAlias(alias); ok {
		return []string{language}
	}
	return nil
}
// GetLanguagesByVimModeline returns a slice of possible languages for the given
// content based on its Vim modeline; filename and candidates are ignored. It
// returns nil when there is no modeline, when the modeline sets no language,
// when it sets conflicting languages, or when the alias is unknown to
// GetLanguageByAlias. It complies with the signature of the Strategy type.
func GetLanguagesByVimModeline(filename string, content []byte, candidates []string) []string {
	modelines := reVimModeline.FindAllSubmatch(content, -1)
	if len(modelines) == 0 {
		return nil
	}

	// Only the last modeline counts; earlier ones are discarded.
	settings := modelines[len(modelines)-1][1]
	aliases := reVimLang.FindAllSubmatch(settings, -1)
	if len(aliases) == 0 {
		return nil
	}

	// Every language setting of the modeline must agree:
	//   "syntax=ruby ft=python filetype=perl"     -> nil
	//   "syntax=python ft=python filetype=python" -> "Python"
	alias := string(aliases[0][1])
	for _, other := range aliases[1:] {
		if string(other[1]) != alias {
			return nil
		}
	}

	if language, ok := GetLanguageByAlias(alias); ok {
		return []string{language}
	}
	return nil
}
// GetLanguagesByFilename returns the languages registered in languagesByFilename
// under the exact given filename; content and candidates are ignored. It
// complies with the signature of the Strategy type.
func GetLanguagesByFilename(filename string, content []byte, candidates []string) []string {
	languages := languagesByFilename[filename]
	return languages
}
// GetLanguageByShebang returns the language of the given content looking for the shebang line,
// and safe to indicate the sureness of returned language.
func GetLanguageByShebang(content []byte) (lang string, safe bool) {
return getLangAndSafe("", content, GetLanguagesByShebang)
// GetLanguagesByShebang returns a slice of possible languages for the given content, filename and candidates
// will be ignored. It is comply with the signature to be a Strategy type.
func GetLanguagesByShebang(filename string, content []byte, candidates []string) (languages []string) {
interpreter := getInterpreter(content)
return languagesByInterpreter[interpreter]
}
// GetLanguageByExtension returns a language based on the given filename, and safe to indicate
// the sureness of returned language.
func GetLanguageByExtension(filename string) (lang string, safe bool) {
return getLangAndSafe(filename, nil, GetLanguagesByExtension)
var (
	// shebangExecHack captures the command that a shell script re-executes
	// itself with (an "exec <cmd> ... $0 ... $@" line).
	shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
	// pythonVersion matches interpreter names such as "python2.7".
	pythonVersion = regexp.MustCompile(`python\d\.\d+`)
)

// getInterpreter returns the interpreter named by the shebang on the first line
// of data. It returns "" when the first line is not a shebang, when nothing
// follows the "#!", or when the command contains "env" but has no argument.
//
// For a command containing "env" the interpreter is its first argument;
// otherwise it is the last path element of the command. An "sh" interpreter is
// re-resolved through lookForMultilineExec, and a name matching pythonVersion
// is cut at its first dot (python2.6 -> python2).
func getInterpreter(data []byte) (interpreter string) {
	line := getFirstLine(data)
	if !hasShebang(line) {
		return ""
	}

	// Skip the "#!" and split the rest into the command and its arguments.
	fields := bytes.Fields(line[2:])
	if len(fields) == 0 {
		// A bare "#!" names no interpreter; without this guard fields[0] panics.
		return ""
	}

	if bytes.Contains(fields[0], []byte("env")) {
		if len(fields) > 1 {
			interpreter = string(fields[1])
		}
	} else {
		command := fields[0]
		interpreter = string(command[bytes.LastIndexByte(command, '/')+1:])
	}

	if interpreter == "sh" {
		interpreter = lookForMultilineExec(data)
	}

	if pythonVersion.MatchString(interpreter) {
		interpreter = interpreter[:strings.Index(interpreter, `.`)]
	}

	return interpreter
}
// GetLanguagesByExtension returns a slice of possible languages for the given filename, content will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByExtension(filename string, content []byte) []string {
// getFirstLine returns the first line of data without its line terminator
// ("\n" or "\r\n"), or nil when data is empty. The result aliases data.
//
// The line is located directly instead of through a bufio.Scanner: a scanner
// gives up on lines longer than bufio.MaxScanTokenSize, which made a very long
// first line look like no line at all.
func getFirstLine(data []byte) []byte {
	if len(data) == 0 {
		return nil
	}

	line := data
	if eol := bytes.IndexByte(data, '\n'); eol >= 0 {
		line = data[:eol]
	}
	return bytes.TrimSuffix(line, []byte("\r"))
}
// hasShebang reports whether line starts with the "#!" shebang marker.
func hasShebang(line []byte) bool {
	return bytes.HasPrefix(line, []byte(`#!`))
}
// lookForMultilineExec inspects the first magicNumOfLines lines of data for a
// line on which the script re-executes itself with another command (see
// shebangExecHack) and returns that command. It returns "sh" when no such line
// is found, including when scanning stops early on a scanner error.
func lookForMultilineExec(data []byte) string {
	const magicNumOfLines = 5

	buf := bufio.NewScanner(bytes.NewReader(data))
	for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
		// One regexp pass both detects the hack and extracts the command.
		if match := shebangExecHack.FindSubmatch(buf.Bytes()); match != nil {
			return string(match[1])
		}
	}

	// A scanner error is deliberately not reported: it only means that no
	// exec line was seen, so the plain shell interpreter stands.
	return "sh"
}
// GetLanguagesByExtension returns a slice of possible languages for the given filename; content and candidates
// will be ignored. It complies with the signature to be a Strategy type.
func GetLanguagesByExtension(filename string, content []byte, candidates []string) []string {
if !strings.Contains(filename, ".") {
return nil
}
@ -106,15 +356,9 @@ func getDotIndexes(filename string) []int {
return dots
}
// GetLanguageByContent returns a language based on the filename and heuristics applies to the content,
// and safe to indicate the sureness of returned language.
func GetLanguageByContent(filename string, content []byte) (lang string, safe bool) {
return getLangAndSafe(filename, content, GetLanguagesByContent)
}
// GetLanguagesByContent returns a slice of possible languages for the given content, filename will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByContent(filename string, content []byte) []string {
// GetLanguagesByContent returns a slice of possible languages for the given content. The extension of filename selects
// the content matcher to apply; candidates will be ignored. It complies with the signature to be a Strategy type.
func GetLanguagesByContent(filename string, content []byte, candidates []string) []string {
ext := strings.ToLower(filepath.Ext(filename))
fnMatcher, ok := contentMatchers[ext]
if !ok {
@ -124,51 +368,24 @@ func GetLanguagesByContent(filename string, content []byte) []string {
return fnMatcher(content)
}
func getLangAndSafe(filename string, content []byte, getLanguageByStrategy Strategy) (lang string, safe bool) {
languages := getLanguageByStrategy(filename, content)
if len(languages) == 0 {
lang = OtherLanguage
return
// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a slice of possible languages ordered by
// decreasing probability. If there are no candidates it returns nil. It complies with the signature to be a Strategy type.
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
if len(candidates) == 0 {
return nil
}
lang = languages[0]
safe = len(languages) == 1
return
return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
}
// GetLanguageByClassifier takes in a content and a list of candidates, and apply the classifier's Classify method to
// get the most probably language. If classifier is null then DefaultClassfier will be used. If there aren't candidates
// OtherLanguage is returned.
func GetLanguageByClassifier(content []byte, candidates map[string]float64, classifier Classifier) string {
scores := GetLanguagesByClassifier(content, candidates, classifier)
if len(scores) == 0 {
return OtherLanguage
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
mapCandidates := make(map[string]float64)
for _, candidate := range candidates {
mapCandidates[candidate]++
}
return getLangugeHigherScore(scores)
}
func getLangugeHigherScore(scores map[string]float64) string {
var language string
higher := -math.MaxFloat64
for lang, score := range scores {
if higher < score {
language = lang
higher = score
}
}
return language
}
// GetLanguagesByClassifier returns a map of possible languages as keys and a score as value based on content and candidates. The values can be ordered
// with the highest value as the most probably language. If classifier is null then DefaultClassfier will be used.
func GetLanguagesByClassifier(content []byte, candidates map[string]float64, classifier Classifier) map[string]float64 {
if classifier == nil {
classifier = DefaultClassifier
}
return classifier.Classify(content, candidates)
return classifier.Classify(content, mapCandidates)
}
// GetLanguageExtensions returns the different extensions being used by the language.
@ -188,7 +405,7 @@ const (
Prose
)
// GetLanguageType returns the given language's type.
// GetLanguageType returns the type of the given language.
func GetLanguageType(language string) (langType Type) {
langType, ok := languagesType[language]
if !ok {

View File

@ -37,7 +37,7 @@ func (s *SimpleLinguistTestSuite) TestGetLanguage() {
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguageByModelineLinguist() {
func (s *SimpleLinguistTestSuite) TestGetLanguagesByModelineLinguist() {
const (
modelinesDir = ".linguist/test/fixtures/Data/Modelines"
samplesDir = ".linguist/samples"
@ -46,56 +46,55 @@ func (s *SimpleLinguistTestSuite) TestGetLanguageByModelineLinguist() {
tests := []struct {
name string
filename string
expectedLang string
expectedSafe bool
candidates []string
expected []string
}{
// Emacs
{name: "TestGetLanguageByModelineLinguist_1", filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expectedLang: "Smalltalk", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_2", filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expectedLang: "Text", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_3", filename: filepath.Join(modelinesDir, "iamphp.inc"), expectedLang: "PHP", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_4", filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_5", filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_6", filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_7", filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_8", filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_9", filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_10", filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_11", filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_12", filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_13", filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_14", filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguagesByModelineLinguist_1", filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expected: []string{"Smalltalk"}},
{name: "TestGetLanguagesByModelineLinguist_2", filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expected: []string{"Text"}},
{name: "TestGetLanguagesByModelineLinguist_3", filename: filepath.Join(modelinesDir, "iamphp.inc"), expected: []string{"PHP"}},
{name: "TestGetLanguagesByModelineLinguist_4", filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_5", filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_6", filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_7", filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_8", filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_9", filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_10", filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_11", filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_12", filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_13", filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_14", filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expected: []string{"C++"}},
// Vim
{name: "TestGetLanguageByModelineLinguist_15", filename: filepath.Join(modelinesDir, "seeplusplus"), expectedLang: "C++", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_16", filename: filepath.Join(modelinesDir, "iamjs.pl"), expectedLang: "JavaScript", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_17", filename: filepath.Join(modelinesDir, "iamjs2.pl"), expectedLang: "JavaScript", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_18", filename: filepath.Join(modelinesDir, "not_perl.pl"), expectedLang: "Prolog", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_19", filename: filepath.Join(modelinesDir, "ruby"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_20", filename: filepath.Join(modelinesDir, "ruby2"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_21", filename: filepath.Join(modelinesDir, "ruby3"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_22", filename: filepath.Join(modelinesDir, "ruby4"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_23", filename: filepath.Join(modelinesDir, "ruby5"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_24", filename: filepath.Join(modelinesDir, "ruby6"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_25", filename: filepath.Join(modelinesDir, "ruby7"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_26", filename: filepath.Join(modelinesDir, "ruby8"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_27", filename: filepath.Join(modelinesDir, "ruby9"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_28", filename: filepath.Join(modelinesDir, "ruby10"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_29", filename: filepath.Join(modelinesDir, "ruby11"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_30", filename: filepath.Join(modelinesDir, "ruby12"), expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByModelineLinguist_31", filename: filepath.Join(samplesDir, "C/main.c"), expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguagesByModelineLinguist_15", filename: filepath.Join(modelinesDir, "seeplusplus"), expected: []string{"C++"}},
{name: "TestGetLanguagesByModelineLinguist_16", filename: filepath.Join(modelinesDir, "iamjs.pl"), expected: []string{"JavaScript"}},
{name: "TestGetLanguagesByModelineLinguist_17", filename: filepath.Join(modelinesDir, "iamjs2.pl"), expected: []string{"JavaScript"}},
{name: "TestGetLanguagesByModelineLinguist_18", filename: filepath.Join(modelinesDir, "not_perl.pl"), expected: []string{"Prolog"}},
{name: "TestGetLanguagesByModelineLinguist_19", filename: filepath.Join(modelinesDir, "ruby"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_20", filename: filepath.Join(modelinesDir, "ruby2"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_21", filename: filepath.Join(modelinesDir, "ruby3"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_22", filename: filepath.Join(modelinesDir, "ruby4"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_23", filename: filepath.Join(modelinesDir, "ruby5"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_24", filename: filepath.Join(modelinesDir, "ruby6"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_25", filename: filepath.Join(modelinesDir, "ruby7"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_26", filename: filepath.Join(modelinesDir, "ruby8"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_27", filename: filepath.Join(modelinesDir, "ruby9"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_28", filename: filepath.Join(modelinesDir, "ruby10"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_29", filename: filepath.Join(modelinesDir, "ruby11"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_30", filename: filepath.Join(modelinesDir, "ruby12"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByModelineLinguist_31", filename: filepath.Join(samplesDir, "C/main.c"), expected: nil},
}
for _, test := range tests {
content, err := ioutil.ReadFile(test.filename)
assert.NoError(s.T(), err)
lang, safe := GetLanguageByModeline(content)
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
languages := GetLanguagesByModeline(test.filename, content, test.candidates)
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguageByModeline() {
func (s *SimpleLinguistTestSuite) TestGetLanguagesByModeline() {
const (
wrongVim = `# vim: set syntax=ruby ft =python filetype=perl :`
rightVim = `/* vim: set syntax=python ft =python filetype=python */`
@ -104,47 +103,47 @@ func (s *SimpleLinguistTestSuite) TestGetLanguageByModeline() {
tests := []struct {
name string
filename string
content []byte
expectedLang string
expectedSafe bool
candidates []string
expected []string
}{
{name: "TestGetLanguageByModeline_1", content: []byte(wrongVim), expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguageByModeline_2", content: []byte(rightVim), expectedLang: "Python", expectedSafe: true},
{name: "TestGetLanguageByModeline_3", content: []byte(noLangVim), expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguagesByModeline_1", content: []byte(wrongVim), expected: nil},
{name: "TestGetLanguagesByModeline_2", content: []byte(rightVim), expected: []string{"Python"}},
{name: "TestGetLanguagesByModeline_3", content: []byte(noLangVim), expected: nil},
}
for _, test := range tests {
lang, safe := GetLanguageByModeline(test.content)
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
languages := GetLanguagesByModeline(test.filename, test.content, test.candidates)
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguageByFilename() {
func (s *SimpleLinguistTestSuite) TestGetLanguagesByFilename() {
tests := []struct {
name string
filename string
expectedLang string
expectedSafe bool
content []byte
candidates []string
expected []string
}{
{name: "TestGetLanguageByFilename_1", filename: "unknown.interpreter", expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguageByFilename_2", filename: ".bashrc", expectedLang: "Shell", expectedSafe: true},
{name: "TestGetLanguageByFilename_3", filename: "Dockerfile", expectedLang: "Dockerfile", expectedSafe: true},
{name: "TestGetLanguageByFilename_4", filename: "Makefile.frag", expectedLang: "Makefile", expectedSafe: true},
{name: "TestGetLanguageByFilename_5", filename: "makefile", expectedLang: "Makefile", expectedSafe: true},
{name: "TestGetLanguageByFilename_6", filename: "Vagrantfile", expectedLang: "Ruby", expectedSafe: true},
{name: "TestGetLanguageByFilename_7", filename: "_vimrc", expectedLang: "Vim script", expectedSafe: true},
{name: "TestGetLanguageByFilename_8", filename: "pom.xml", expectedLang: "Maven POM", expectedSafe: true},
{name: "TestGetLanguagesByFilename_1", filename: "unknown.interpreter", expected: nil},
{name: "TestGetLanguagesByFilename_2", filename: ".bashrc", expected: []string{"Shell"}},
{name: "TestGetLanguagesByFilename_3", filename: "Dockerfile", expected: []string{"Dockerfile"}},
{name: "TestGetLanguagesByFilename_4", filename: "Makefile.frag", expected: []string{"Makefile"}},
{name: "TestGetLanguagesByFilename_5", filename: "makefile", expected: []string{"Makefile"}},
{name: "TestGetLanguagesByFilename_6", filename: "Vagrantfile", expected: []string{"Ruby"}},
{name: "TestGetLanguagesByFilename_7", filename: "_vimrc", expected: []string{"Vim script"}},
{name: "TestGetLanguagesByFilename_8", filename: "pom.xml", expected: []string{"Maven POM"}},
}
for _, test := range tests {
lang, safe := GetLanguageByFilename(test.filename)
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
languages := GetLanguagesByFilename(test.filename, test.content, test.candidates)
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguageByShebang() {
func (s *SimpleLinguistTestSuite) TestGetLanguagesByShebang() {
const (
multilineExecHack = `#!/bin/sh
# Next line is comment in Tcl, but not in sh... \
@ -162,71 +161,111 @@ println("The shell script says ",vm.arglist.concat(" "));`
tests := []struct {
name string
filename string
content []byte
expectedLang string
expectedSafe bool
candidates []string
expected []string
}{
{name: "TestGetLanguageByShebang_1", content: []byte(`#!/unknown/interpreter`), expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguageByShebang_2", content: []byte(`no shebang`), expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguageByShebang_3", content: []byte(`#!/usr/bin/env`), expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguageByShebang_4", content: []byte(`#!/usr/bin/python -tt`), expectedLang: "Python", expectedSafe: true},
{name: "TestGetLanguageByShebang_5", content: []byte(`#!/usr/bin/env python2.6`), expectedLang: "Python", expectedSafe: true},
{name: "TestGetLanguageByShebang_6", content: []byte(`#!/usr/bin/env perl`), expectedLang: "Perl", expectedSafe: true},
{name: "TestGetLanguageByShebang_7", content: []byte(`#! /bin/sh`), expectedLang: "Shell", expectedSafe: true},
{name: "TestGetLanguageByShebang_8", content: []byte(`#!bash`), expectedLang: "Shell", expectedSafe: true},
{name: "TestGetLanguageByShebang_9", content: []byte(multilineExecHack), expectedLang: "Tcl", expectedSafe: true},
{name: "TestGetLanguageByShebang_10", content: []byte(multilineNoExecHack), expectedLang: "Shell", expectedSafe: true},
{name: "TestGetLanguagesByShebang_1", content: []byte(`#!/unknown/interpreter`), expected: nil},
{name: "TestGetLanguagesByShebang_2", content: []byte(`no shebang`), expected: nil},
{name: "TestGetLanguagesByShebang_3", content: []byte(`#!/usr/bin/env`), expected: nil},
{name: "TestGetLanguagesByShebang_4", content: []byte(`#!/usr/bin/python -tt`), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_5", content: []byte(`#!/usr/bin/env python2.6`), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_6", content: []byte(`#!/usr/bin/env perl`), expected: []string{"Perl"}},
{name: "TestGetLanguagesByShebang_7", content: []byte(`#! /bin/sh`), expected: []string{"Shell"}},
{name: "TestGetLanguagesByShebang_8", content: []byte(`#!bash`), expected: []string{"Shell"}},
{name: "TestGetLanguagesByShebang_9", content: []byte(multilineExecHack), expected: []string{"Tcl"}},
{name: "TestGetLanguagesByShebang_10", content: []byte(multilineNoExecHack), expected: []string{"Shell"}},
}
for _, test := range tests {
lang, safe := GetLanguageByShebang(test.content)
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
languages := GetLanguagesByShebang(test.filename, test.content, test.candidates)
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguageByExtension() {
func (s *SimpleLinguistTestSuite) TestGetLanguagesByExtension() {
tests := []struct {
name string
filename string
expectedLang string
expectedSafe bool
content []byte
candidates []string
expected []string
}{
{name: "TestGetLanguageByExtension_1", filename: "foo.foo", expectedLang: OtherLanguage, expectedSafe: false},
{name: "TestGetLanguageByExtension_2", filename: "foo.go", expectedLang: "Go", expectedSafe: true},
{name: "TestGetLanguageByExtension_3", filename: "foo.go.php", expectedLang: "Hack", expectedSafe: false},
{name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil},
{name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}},
{name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}},
}
for _, test := range tests {
lang, safe := GetLanguageByExtension(test.filename)
assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang))
assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe))
languages := GetLanguagesByExtension(test.filename, test.content, test.candidates)
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected))
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguageByClassifier() {
func (s *SimpleLinguistTestSuite) TestGetLanguagesByClassifier() {
const samples = `.linguist/samples/`
test := []struct {
name string
filename string
candidates map[string]float64
candidates []string
expected string
}{
{name: "TestGetLanguageByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c": 1.00, "c++": 1.00}, expected: "C"},
{name: "TestGetLanguageByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expected: "C"},
{name: "TestGetLanguageByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: nil, expected: "C"},
{name: "TestGetLanguageByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c++": 1.00}, expected: "C++"},
{name: "TestGetLanguageByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"ruby": 1.00}, expected: "Ruby"},
{name: "TestGetLanguageByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c": 1.00, "c++": 1.00}, expected: "Python"},
{name: "TestGetLanguageByClassifier_7", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: nil, expected: "Python"},
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "C"},
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expected: OtherLanguage},
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: []string{}, expected: OtherLanguage},
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"},
}
for _, test := range test {
content, err := ioutil.ReadFile(test.filename)
assert.NoError(s.T(), err)
lang := GetLanguageByClassifier(content, test.candidates, nil)
assert.Equal(s.T(), test.expected, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expected))
languages := GetLanguagesByClassifier(test.filename, content, test.candidates)
var language string
if len(languages) == 0 {
language = OtherLanguage
} else {
language = languages[0]
}
assert.Equal(s.T(), test.expected, language, fmt.Sprintf("%v: language = %v, expected: %v", test.name, language, test.expected))
}
}
func (s *SimpleLinguistTestSuite) TestGetLanguagesBySpecificClassifier() {
const samples = `.linguist/samples/`
test := []struct {
name string
filename string
candidates []string
classifier Classifier
expected string
}{
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, classifier: DefaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: []string{}, classifier: DefaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
{name: "TestGetLanguagesByClassifier_6", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: OtherLanguage},
}
for _, test := range test {
content, err := ioutil.ReadFile(test.filename)
assert.NoError(s.T(), err)
languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
var language string
if len(languages) == 0 {
language = OtherLanguage
} else {
language = languages[0]
}
assert.Equal(s.T(), test.expected, language, fmt.Sprintf("%v: language = %v, expected: %v", test.name, language, test.expected))
}
}
@ -323,6 +362,7 @@ func (s *SimpleLinguistTestSuite) TestLinguistCorpus() {
total++
obtained := GetLanguage(filename, content)
if obtained == OtherLanguage {
obtained = "Other"
other++
}
@ -337,9 +377,9 @@ func (s *SimpleLinguistTestSuite) TestLinguistCorpus() {
}
if _, ok := cornerCases[filename]; ok {
fmt.Printf("\t\t[condidered corner case] %s\t%s\t%s\t%s\n", filename, expected, obtained, status)
fmt.Printf("\t\t[considered corner case] %s\texpected: %s\tobtained: %s\tstatus: %s\n", filename, expected, obtained, status)
} else {
assert.Equal(s.T(), expected, obtained, fmt.Sprintf("%s\t%s\t%s\t%s\n", filename, expected, obtained, status))
assert.Equal(s.T(), expected, obtained, fmt.Sprintf("%s\texpected: %s\tobtained: %s\tstatus: %s\n", filename, expected, obtained, status))
}
return nil

View File

@ -1,4 +1,4 @@
package slinguist
package enry
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
// THIS FILE SHOULD NOT BE EDITED BY HAND

View File

@ -1,4 +1,4 @@
package slinguist
package enry
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
// THIS FILE SHOULD NOT BE EDITED BY HAND

View File

@ -1,4 +1,4 @@
package slinguist
package enry
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
// THIS FILE SHOULD NOT BE EDITED BY HAND

View File

@ -1,4 +1,4 @@
package slinguist
package enry
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
// THIS FILE SHOULD NOT BE EDITED BY HAND

View File

@ -1,4 +1,4 @@
package slinguist
package enry
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator
// THIS FILE SHOULD NOT BE EDITED BY HAND

View File

@ -16,8 +16,8 @@ const (
// extension.go generation
extensionsFile = "extension.go"
extensionsTmplPath = "internal/code-generator/assets/extensions.go.tmpl"
extensionsTmpl = "extensions.go.tmpl"
extensionsTmplPath = "internal/code-generator/assets/extension.go.tmpl"
extensionsTmpl = "extension.go.tmpl"
// content.go generation
heuristicsRuby = ".linguist/lib/linguist/heuristics.rb"
@ -39,23 +39,23 @@ const (
// type.go generation
typeFile = "type.go"
typeTmplPath = "internal/code-generator/assets/types.go.tmpl"
typeTmpl = "types.go.tmpl"
typeTmplPath = "internal/code-generator/assets/type.go.tmpl"
typeTmpl = "type.go.tmpl"
// interpreter.go generation
interpretersFile = "interpreter.go"
interpretersTmplPath = "internal/code-generator/assets/interpreters.go.tmpl"
interpretersTmpl = "interpreters.go.tmpl"
interpretersTmplPath = "internal/code-generator/assets/interpreter.go.tmpl"
interpretersTmpl = "interpreter.go.tmpl"
// filename.go generation
filenamesFile = "filename.go"
filenamesTmplPath = "internal/code-generator/assets/filenames.go.tmpl"
filenamesTmpl = "filenames.go.tmpl"
filenamesTmplPath = "internal/code-generator/assets/filename.go.tmpl"
filenamesTmpl = "filename.go.tmpl"
// alias.go generation
aliasesFile = "alias.go"
aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
aliasesTmpl = "aliases.go.tmpl"
aliasesTmplPath = "internal/code-generator/assets/alias.go.tmpl"
aliasesTmpl = "alias.go.tmpl"
// frequencies.go generation
frequenciesFile = "frequencies.go"

View File

@ -1,153 +0,0 @@
package enry
import (
"bytes"
"regexp"
)
const (
// searchScope is the scope value getHeaderAndFooter passes to headScope and
// footScope, i.e. how many line terminators are counted from each end of
// the content. Content with fewer than 2*searchScope newlines is used whole.
searchScope = 5
)
// GetLanguagesByModeline returns the possible languages announced by an editor
// modeline in content. Only the head and the tail of the content are inspected
// (see getHeaderAndFooter). The detectors in modelinesFunc are tried in order
// and the first non-empty answer is returned; filename is ignored and exists
// only so the function satisfies the Strategy signature.
func GetLanguagesByModeline(filename string, content []byte) []string {
	var found []string
	scanned := getHeaderAndFooter(content)

	// Stop at the first detector that yields at least one language.
	for i := 0; i < len(modelinesFunc) && len(found) == 0; i++ {
		found = modelinesFunc[i]("", scanned)
	}

	return found
}
// getHeaderAndFooter returns the part of content in which modelines are
// searched: the leading lines delimited by headScope followed by the trailing
// lines delimited by footScope, both using searchScope. Content with fewer than
// 2*searchScope newlines is returned unchanged (same backing array); otherwise
// a freshly allocated slice is returned.
func getHeaderAndFooter(content []byte) []byte {
	if bytes.Count(content, []byte("\n")) < 2*searchScope {
		return content
	}

	// headScope yields the index of the newline that terminates the header.
	// Keep that newline: without it the last header line and the first footer
	// line are fused into one line, which can hide a modeline that must start
	// a line or be preceded by whitespace.
	headerEnd := headScope(content, searchScope) + 1
	footerStart := footScope(content, searchScope)

	headerAndFooter := make([]byte, 0, headerEnd+len(content)-footerStart)
	headerAndFooter = append(headerAndFooter, content[:headerEnd]...)
	headerAndFooter = append(headerAndFooter, content[footerStart:]...)
	return headerAndFooter
}
// headScope returns the index in content of the scope-th newline counted from
// the start, so content[:index] holds the first scope lines without the final
// terminator.
//
// If content has fewer than scope newlines, len(content) is returned (the whole
// content is the header); a non-positive scope yields 0 (empty header). Both
// cases previously produced a negative, unusable index.
func headScope(content []byte, scope int) (index int) {
	if scope <= 0 {
		return 0
	}

	// index tracks the absolute offset just past the last newline consumed.
	for i := 0; i < scope; i++ {
		eol := bytes.IndexByte(content[index:], '\n')
		if eol < 0 {
			return len(content)
		}
		index += eol + 1
	}

	// Step back onto the newline itself.
	return index - 1
}
// footScope returns the index in content just past the scope-th newline counted
// from the end, so content[index:] is the footer. A newline that terminates the
// content counts as one of the scope newlines, so for newline-terminated
// content the footer holds scope-1 complete lines.
//
// If content has fewer than scope newlines, 0 is returned (the whole content is
// the footer) instead of panicking on an out-of-range slice; a non-positive
// scope yields len(content) (empty footer).
func footScope(content []byte, scope int) (index int) {
	index = len(content)
	if scope <= 0 {
		return index
	}

	// Walk backwards one newline at a time, narrowing the searched prefix.
	for i := 0; i < scope; i++ {
		index = bytes.LastIndexByte(content[:index], '\n')
		if index < 0 {
			return 0
		}
	}

	return index + 1
}
// modelinesFunc lists the modeline detectors in the order GetLanguagesByModeline
// tries them; the first detector that returns a non-empty slice wins, so Emacs
// modelines take precedence over Vim ones.
var modelinesFunc = []func(filename string, content []byte) []string{
GetLanguagesByEmacsModeline,
GetLanguagesByVimModeline,
}
var (
// reEmacsModeline matches a line containing an Emacs `-*- ... -*-` marker pair
// and captures the text between the markers, surrounding whitespace excluded.
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
// reEmacsLang captures the value of a case-insensitive `mode:` setting, up to
// the next whitespace or semicolon.
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
// reVimModeline matches a `vi:`, `vim:`, `vim<op>NNN:` or `ex:` prefix and
// captures the remainder of that line (the option list).
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
// reVimLang captures the word assigned to `filetype`, `ft` or `syntax`
// (case-insensitive) when followed by whitespace, a colon or end of text.
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
// GetLanguageByEmacsModeline detects whether content has an Emacs modeline and
// resolves the language it names through the alias table. It returns the first
// language reported by GetLanguagesByEmacsModeline and true, or OtherLanguage
// and false when no valid language could be retrieved.
func GetLanguageByEmacsModeline(content []byte) (string, bool) {
	if candidates := GetLanguagesByEmacsModeline("", content); len(candidates) > 0 {
		return candidates[0], true
	}

	return OtherLanguage, false
}
// GetLanguagesByEmacsModeline returns a single-element slice holding the
// language named by the last Emacs modeline found in content, or nil when there
// is no modeline or its alias is unknown. filename is ignored; the parameter
// exists so the function satisfies the Strategy signature.
func GetLanguagesByEmacsModeline(filename string, content []byte) []string {
	modelines := reEmacsModeline.FindAllSubmatch(content, -1)
	if len(modelines) == 0 {
		return nil
	}

	// Only the last modeline counts; earlier ones are discarded.
	body := modelines[len(modelines)-1][1]

	// Prefer an explicit "mode: xxx" setting; otherwise the whole modeline
	// body is taken as the alias.
	alias := string(body)
	if mode := reEmacsLang.FindSubmatch(body); mode != nil {
		alias = string(mode[1])
	}

	if language, ok := GetLanguageByAlias(alias); ok {
		return []string{language}
	}

	return nil
}
// GetLanguageByVimModeline detects whether content has a Vim modeline and
// resolves the language it names through the alias table. It returns the first
// language reported by GetLanguagesByVimModeline and true, or OtherLanguage and
// false when no valid language could be retrieved.
func GetLanguageByVimModeline(content []byte) (string, bool) {
	if candidates := GetLanguagesByVimModeline("", content); len(candidates) > 0 {
		return candidates[0], true
	}

	return OtherLanguage, false
}
// GetLanguagesByVimModeline returns a single-element slice holding the language
// named by the last Vim modeline found in content, or nil when there is no
// modeline, no filetype/ft/syntax setting, conflicting settings, or an unknown
// alias. filename is ignored; the parameter exists so the function satisfies
// the Strategy signature.
func GetLanguagesByVimModeline(filename string, content []byte) []string {
	modelines := reVimModeline.FindAllSubmatch(content, -1)
	if len(modelines) == 0 {
		return nil
	}

	// Only the last modeline counts; earlier ones are discarded.
	settings := modelines[len(modelines)-1][1]
	assignments := reVimLang.FindAllSubmatch(settings, -1)
	if len(assignments) == 0 {
		return nil
	}

	alias := string(assignments[0][1])

	// Every assignment in the modeline must agree on the alias:
	//   syntax=ruby ft=python filetype=perl     -> nil
	//   syntax=python ft=python filetype=python -> "Python"
	for _, assignment := range assignments[1:] {
		if string(assignment[1]) != alias {
			return nil
		}
	}

	if language, ok := GetLanguageByAlias(alias); ok {
		return []string{language}
	}

	return nil
}

View File

@ -1,89 +0,0 @@
package enry
import (
"bufio"
"bytes"
"regexp"
"strings"
)
// shebang is the prefix hasShebang looks for at the start of the first line.
const shebang = `#!`
var (
// shebangExecHack matches a line of the form `exec <word> ... $0 ... $@` and
// captures the word after exec; lookForMultilineExec uses it to find the real
// interpreter of an sh script.
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
// pythonVersion matches "python" followed by a digit, a dot and more digits
// (e.g. python2.6); getInterpreter cuts such names at the dot.
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
)
// GetLanguagesByShebang returns a slice of possible languages for the given content, filename will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByShebang(filename string, content []byte) (languages []string) {
interpreter := getInterpreter(content)
return languagesByInterpreter[interpreter]
}
// getInterpreter returns the interpreter named on the shebang line of data, or
// "" when the first line is not a shebang or names no interpreter.
//
// The command's directory part is dropped. When the command itself is `env`,
// the following argument is the interpreter. An `sh` interpreter is further
// resolved through lookForMultilineExec, and a versioned python name such as
// "python2.6" is cut at the dot ("python2").
func getInterpreter(data []byte) (interpreter string) {
	line := getFirstLine(data)
	if !hasShebang(line) {
		return ""
	}

	// Skip the shebang marker. A bare "#!" (optionally followed by blanks)
	// names nothing, and indexing the empty field list would panic.
	fields := bytes.Fields(line[2:])
	if len(fields) == 0 {
		return ""
	}

	// Reduce the command to its base name.
	command := fields[0]
	if slash := bytes.LastIndexByte(command, '/'); slash >= 0 {
		command = command[slash+1:]
	}

	// Only a command that IS env delegates to its first argument; a path that
	// merely contains "env" (e.g. /opt/venv/bin/python) is a plain interpreter.
	if string(command) == "env" {
		if len(fields) > 1 {
			interpreter = string(fields[1])
		}
	} else {
		interpreter = string(command)
	}

	if interpreter == "sh" {
		interpreter = lookForMultilineExec(data)
	}

	if pythonVersion.MatchString(interpreter) {
		interpreter = interpreter[:strings.Index(interpreter, `.`)]
	}

	return
}
// getFirstLine returns the first line of data without its line terminator, as
// produced by a bufio.Scanner with the default line splitter. It returns nil if
// the scanner reports an error.
func getFirstLine(data []byte) []byte {
	scanner := bufio.NewScanner(bytes.NewReader(data))
	scanner.Scan()

	if scanner.Err() != nil {
		return nil
	}

	return scanner.Bytes()
}
// hasShebang reports whether line starts with the shebang marker.
func hasShebang(line []byte) bool {
	return bytes.HasPrefix(line, []byte(shebang))
}
// lookForMultilineExec scans at most the first magicNumOfLines lines of data
// for an `exec <interpreter> ... $0 ... $@` line (shebangExecHack) and returns
// the interpreter it names. It returns "sh" when no such line is found.
func lookForMultilineExec(data []byte) string {
	const magicNumOfLines = 5

	scanner := bufio.NewScanner(bytes.NewReader(data))
	for remaining := magicNumOfLines; remaining > 0 && scanner.Scan(); remaining-- {
		if hack := shebangExecHack.FindSubmatch(scanner.Bytes()); hack != nil {
			return string(hack[1])
		}
	}

	// Scanner errors are deliberately not surfaced: the default applies either way.
	return "sh"
}

View File

@ -31,9 +31,8 @@ func IsAuxiliaryLanguage(lang string) bool {
// IsConfiguration returns whether or not path is using a configuration language.
func IsConfiguration(path string) bool {
lang, _ := GetLanguageByExtension(path)
_, is := configurationLanguages[lang]
language, _ := GetLanguageByExtension(path)
_, is := configurationLanguages[language]
return is
}

View File

@ -55,7 +55,7 @@ func (s *SimpleLinguistTestSuite) TestIsConfiguration() {
}{
{name: "TestIsConfiguration_1", path: "foo", expected: false},
{name: "TestIsConfiguration_2", path: "foo.ini", expected: true},
{name: "TestIsConfiguration_3", path: "foo.json", expected: true},
{name: "TestIsConfiguration_3", path: "/test/path/foo.json", expected: true},
}
for _, test := range tests {