From beda5b73e7857f27dd6eb5e9f85fc81357b88342 Mon Sep 17 00:00:00 2001 From: Manuel Carmona Date: Mon, 12 Jun 2017 13:42:20 +0200 Subject: [PATCH] changed signatures for strategies --- classifier.go | 38 +- common.go | 393 ++++++++++++++---- common_test.go | 256 +++++++----- .../generator/test_files/alias.gold | 2 +- .../generator/test_files/extension.gold | 2 +- .../generator/test_files/filename.gold | 2 +- .../generator/test_files/interpreter.gold | 2 +- .../generator/test_files/type.gold | 2 +- internal/code-generator/main.go | 20 +- modeline.go | 153 ------- shebang.go | 89 ---- utils.go | 5 +- utils_test.go | 2 +- 13 files changed, 501 insertions(+), 465 deletions(-) delete mode 100644 modeline.go delete mode 100644 shebang.go diff --git a/classifier.go b/classifier.go index abd2f37..447f42a 100644 --- a/classifier.go +++ b/classifier.go @@ -3,14 +3,15 @@ package enry import ( "math" + "sort" + "gopkg.in/src-d/enry.v1/internal/tokenizer" ) -// Classifier is the interface that contains the method Classify which is in charge to assign scores to the possibles candidates. -// The scores must order the candidates so as the highest score be the most probably language of the content. The candidates is -// a map which can be used to assign weights to languages dynamically. +// Classifier is the interface in charge to detect the possible languages of the given content based on a set of +// candidates. Candidates is a map which can be used to assign weights to languages dynamically. type Classifier interface { - Classify(content []byte, candidates map[string]float64) map[string]float64 + Classify(content []byte, candidates map[string]float64) (languages []string) } type classifier struct { @@ -19,7 +20,13 @@ type classifier struct { tokensTotal float64 } -func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 { +type scoredLanguage struct { + language string + score float64 +} + +// Classify returns a sorted slice of possible languages sorted by decreasing language's probability +func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { if len(content) == 0 { return nil } @@ -39,12 +46,27 @@ func (c *classifier) Classify(content []byte, candidates map[string]float64) map } tokens := tokenizer.Tokenize(content) - scores := make(map[string]float64, len(languages)) + scoredLangs := make([]*scoredLanguage, 0, len(languages)) for language := range languages { - scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language] + scoredLang := &scoredLanguage{ + language: language, + score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language], + } + + scoredLangs = append(scoredLangs, scoredLang) } - return scores + return sortLanguagesByScore(scoredLangs) +} + +func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string { + sort.SliceStable(scoredLangs, func(i, j int) bool { return scoredLangs[j].score < scoredLangs[i].score }) + sortedLanguages := make([]string, 0, len(scoredLangs)) + for _, scoredLang := range scoredLangs { + sortedLanguages = append(sortedLanguages, scoredLang.language) + } + + return sortedLanguages } func (c *classifier) knownLangs() map[string]float64 { diff --git a/common.go b/common.go index f3f4847..11bf0c7 100644 --- a/common.go +++ b/common.go @@ -1,83 +1,333 @@ package enry import ( - "math" + "bufio" + "bytes" "path/filepath" + "regexp" "strings" ) // OtherLanguage is used as a zero value when a function can not return a specific language. -const OtherLanguage = "Other" +const OtherLanguage = "" // Strategy type fix the signature for the functions that can be used as a strategy. -type Strategy func(filename string, content []byte) (languages []string) +type Strategy func(filename string, content []byte, candidates []string) (languages []string) -var strategies = []Strategy{ +// DefaultStrategies is the strategies' sequence GetLanguage uses to detect languages. +var DefaultStrategies = []Strategy{ GetLanguagesByModeline, GetLanguagesByFilename, GetLanguagesByShebang, GetLanguagesByExtension, GetLanguagesByContent, + GetLanguagesByClassifier, } // GetLanguage applies a sequence of strategies based on the given filename and content // to find out the most probably language to return. func GetLanguage(filename string, content []byte) string { - candidates := map[string]float64{} - for _, strategy := range strategies { - languages := strategy(filename, content) + var languages []string + candidates := []string{} + for _, strategy := range DefaultStrategies { + languages = strategy(filename, content, candidates) if len(languages) == 1 { return languages[0] } if len(languages) > 0 { - for _, language := range languages { - candidates[language]++ + candidates = append(candidates, languages...) + } + } + + return firstLanguage(languages) +} + +func firstLanguage(languages []string) string { + if len(languages) == 0 { + return OtherLanguage + } + + return languages[0] +} + +func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) { + languages := strategy(filename, content, candidates) + return getFirstLanguageAndSafe(languages) +} + +func getFirstLanguageAndSafe(languages []string) (language string, safe bool) { + language = firstLanguage(languages) + safe = len(languages) == 1 + return +} + +// GetLanguageByModeline returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByModeline(content []byte) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil) +} + +// GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil) +} + +// GetLanguageByVimModeline returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByVimModeline(content []byte) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil) +} + +// GetLanguageByFilename returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByFilename(filename string) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil) +} + +// GetLanguageByShebang returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByShebang(content []byte) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil) +} + +// GetLanguageByExtension returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByExtension(filename string) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil) +} + +// GetLanguageByContent returns detected language. If there are more than one possibles languages +// it returns the first language by alphabetically order and safe to false. +func GetLanguageByContent(content []byte) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByContent, "", content, nil) +} + +// GetLanguageByClassifier returns the most probably language detected for the given content. It uses +// DefaultClassifier, if no candidates are provided it returns OtherLanguage. +func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) { + return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates) +} + +// GetLanguageBySpecificClassifier returns the most probably language for the given content using +// classifier to detect language. +func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) { + languages := GetLanguagesBySpecificClassifier(content, candidates, classifier) + return getFirstLanguageAndSafe(languages) +} + +// GetLanguagesByModeline returns a slice of possible languages for the given content, filename will be ignored. +// It is comply with the signature to be a Strategy type. +func GetLanguagesByModeline(filename string, content []byte, candidates []string) []string { + headFoot := getHeaderAndFooter(content) + var languages []string + for _, getLang := range modelinesFunc { + languages = getLang("", headFoot, candidates) + if len(languages) > 0 { + break + } + } + + return languages +} + +var modelinesFunc = []Strategy{ + GetLanguagesByEmacsModeline, + GetLanguagesByVimModeline, +} + +func getHeaderAndFooter(content []byte) []byte { + const searchScope = 5 + if bytes.Count(content, []byte("\n")) < 2*searchScope { + return content + } + + header := headScope(content, searchScope) + footer := footScope(content, searchScope) + headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:])) + headerAndFooter = append(headerAndFooter, content[:header]...) + headerAndFooter = append(headerAndFooter, content[footer:]...) + return headerAndFooter +} + +func headScope(content []byte, scope int) (index int) { + for i := 0; i < scope; i++ { + eol := bytes.IndexAny(content, "\n") + content = content[eol+1:] + index += eol + } + + return index + scope - 1 +} + +func footScope(content []byte, scope int) (index int) { + for i := 0; i < scope; i++ { + index = bytes.LastIndexAny(content, "\n") + content = content[:index] + } + + return index + 1 +} + +var ( + reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) + reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) + reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) + reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) +) + +// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content, filename and candidates +// will be ignored. It is comply with the signature to be a Strategy type. +func GetLanguagesByEmacsModeline(filename string, content []byte, candidates []string) []string { + matched := reEmacsModeline.FindAllSubmatch(content, -1) + if matched == nil { + return nil + } + + // only take the last matched line, discard previous lines + lastLineMatched := matched[len(matched)-1][1] + matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched) + var alias string + if matchedAlias != nil { + alias = string(matchedAlias[1]) + } else { + alias = string(lastLineMatched) + } + + language, ok := GetLanguageByAlias(alias) + if !ok { + return nil + } + + return []string{language} +} + +// GetLanguagesByVimModeline returns a slice of possible languages for the given content, filename and candidates +// will be ignored. It is comply with the signature to be a Strategy type. +func GetLanguagesByVimModeline(filename string, content []byte, candidates []string) []string { + matched := reVimModeline.FindAllSubmatch(content, -1) + if matched == nil { + return nil + } + + // only take the last matched line, discard previous lines + lastLineMatched := matched[len(matched)-1][1] + matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1) + if matchedAlias == nil { + return nil + } + + alias := string(matchedAlias[0][1]) + if len(matchedAlias) > 1 { + // cases: + // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage; + // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python"; + for _, match := range matchedAlias { + otherAlias := string(match[1]) + if otherAlias != alias { + return nil } } } - if len(candidates) == 0 { - return OtherLanguage + language, ok := GetLanguageByAlias(alias) + if !ok { + return nil } - lang := GetLanguageByClassifier(content, candidates, nil) - return lang + return []string{language} } -// GetLanguageByModeline returns the language of the given content looking for the modeline, -// and safe to indicate the sureness of returned language. -func GetLanguageByModeline(content []byte) (lang string, safe bool) { - return getLangAndSafe("", content, GetLanguagesByModeline) -} - -// GetLanguageByFilename returns a language based on the given filename, and safe to indicate -// the sureness of returned language. -func GetLanguageByFilename(filename string) (lang string, safe bool) { - return getLangAndSafe(filename, nil, GetLanguagesByFilename) -} - -// GetLanguagesByFilename returns a slice of possible languages for the given filename, content will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByFilename(filename string, content []byte) []string { +// GetLanguagesByFilename returns a slice of possible languages for the given filename, content and candidates +// will be ignored. It is comply with the signature to be a Strategy type. +func GetLanguagesByFilename(filename string, content []byte, candidates []string) []string { return languagesByFilename[filename] } -// GetLanguageByShebang returns the language of the given content looking for the shebang line, -// and safe to indicate the sureness of returned language. -func GetLanguageByShebang(content []byte) (lang string, safe bool) { - return getLangAndSafe("", content, GetLanguagesByShebang) +// GetLanguagesByShebang returns a slice of possible languages for the given content, filename and candidates +// will be ignored. It is comply with the signature to be a Strategy type. +func GetLanguagesByShebang(filename string, content []byte, candidates []string) (languages []string) { + interpreter := getInterpreter(content) + return languagesByInterpreter[interpreter] } -// GetLanguageByExtension returns a language based on the given filename, and safe to indicate -// the sureness of returned language. -func GetLanguageByExtension(filename string) (lang string, safe bool) { - return getLangAndSafe(filename, nil, GetLanguagesByExtension) +var ( + shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`) + pythonVersion = regexp.MustCompile(`python\d\.\d+`) +) + +func getInterpreter(data []byte) (interpreter string) { + line := getFirstLine(data) + if !hasShebang(line) { + return "" + } + + // skip shebang + line = bytes.TrimSpace(line[2:]) + + splitted := bytes.Fields(line) + if bytes.Contains(splitted[0], []byte("env")) { + if len(splitted) > 1 { + interpreter = string(splitted[1]) + } + } else { + + splittedPath := bytes.Split(splitted[0], []byte{'/'}) + interpreter = string(splittedPath[len(splittedPath)-1]) + } + + if interpreter == "sh" { + interpreter = lookForMultilineExec(data) + } + + if pythonVersion.MatchString(interpreter) { + interpreter = interpreter[:strings.Index(interpreter, `.`)] + } + + return } -// GetLanguagesByExtension returns a slice of possible languages for the given filename, content will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByExtension(filename string, content []byte) []string { +func getFirstLine(data []byte) []byte { + buf := bufio.NewScanner(bytes.NewReader(data)) + buf.Scan() + line := buf.Bytes() + if err := buf.Err(); err != nil { + return nil + } + + return line +} + +func hasShebang(line []byte) bool { + const shebang = `#!` + prefix := []byte(shebang) + return bytes.HasPrefix(line, prefix) +} + +func lookForMultilineExec(data []byte) string { + const magicNumOfLines = 5 + interpreter := "sh" + + buf := bufio.NewScanner(bytes.NewReader(data)) + for i := 0; i < magicNumOfLines && buf.Scan(); i++ { + line := buf.Bytes() + if shebangExecHack.Match(line) { + interpreter = shebangExecHack.FindStringSubmatch(string(line))[1] + break + } + } + + if err := buf.Err(); err != nil { + return interpreter + } + + return interpreter +} + +// GetLanguagesByExtension returns a slice of possible languages for the given filename, content and candidates +// will be ignored. It is comply with the signature to be a Strategy type. +func GetLanguagesByExtension(filename string, content []byte, candidates []string) []string { if !strings.Contains(filename, ".") { return nil } @@ -106,15 +356,9 @@ func getDotIndexes(filename string) []int { return dots } -// GetLanguageByContent returns a language based on the filename and heuristics applies to the content, -// and safe to indicate the sureness of returned language. -func GetLanguageByContent(filename string, content []byte) (lang string, safe bool) { - return getLangAndSafe(filename, content, GetLanguagesByContent) -} - -// GetLanguagesByContent returns a slice of possible languages for the given content, filename will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByContent(filename string, content []byte) []string { +// GetLanguagesByContent returns a slice of possible languages for the given content, filename and candidates +// will be ignored. It is comply with the signature to be a Strategy type. +func GetLanguagesByContent(filename string, content []byte, candidates []string) []string { ext := strings.ToLower(filepath.Ext(filename)) fnMatcher, ok := contentMatchers[ext] if !ok { @@ -124,51 +368,24 @@ func GetLanguagesByContent(filename string, content []byte) []string { return fnMatcher(content) } -func getLangAndSafe(filename string, content []byte, getLanguageByStrategy Strategy) (lang string, safe bool) { - languages := getLanguageByStrategy(filename, content) - if len(languages) == 0 { - lang = OtherLanguage - return +// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by +// decreasing language's probability. If there are not candidates it returns nil. It is comply with the signature to be a Strategy type. +func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) { + if len(candidates) == 0 { + return nil } - lang = languages[0] - safe = len(languages) == 1 - return + return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier) } -// GetLanguageByClassifier takes in a content and a list of candidates, and apply the classifier's Classify method to -// get the most probably language. If classifier is null then DefaultClassfier will be used. If there aren't candidates -// OtherLanguage is returned. -func GetLanguageByClassifier(content []byte, candidates map[string]float64, classifier Classifier) string { - scores := GetLanguagesByClassifier(content, candidates, classifier) - if len(scores) == 0 { - return OtherLanguage +// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used. +func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) { + mapCandidates := make(map[string]float64) + for _, candidate := range candidates { + mapCandidates[candidate]++ } - return getLangugeHigherScore(scores) -} - -func getLangugeHigherScore(scores map[string]float64) string { - var language string - higher := -math.MaxFloat64 - for lang, score := range scores { - if higher < score { - language = lang - higher = score - } - } - - return language -} - -// GetLanguagesByClassifier returns a map of possible languages as keys and a score as value based on content and candidates. The values can be ordered -// with the highest value as the most probably language. If classifier is null then DefaultClassfier will be used. -func GetLanguagesByClassifier(content []byte, candidates map[string]float64, classifier Classifier) map[string]float64 { - if classifier == nil { - classifier = DefaultClassifier - } - - return classifier.Classify(content, candidates) + return classifier.Classify(content, mapCandidates) } // GetLanguageExtensions returns the different extensions being used by the language. @@ -188,7 +405,7 @@ const ( Prose ) -// GetLanguageType returns the given language's type. +// GetLanguageType returns the type of the given language. func GetLanguageType(language string) (langType Type) { langType, ok := languagesType[language] if !ok { diff --git a/common_test.go b/common_test.go index 4b13d96..3f5aa78 100644 --- a/common_test.go +++ b/common_test.go @@ -37,65 +37,64 @@ func (s *SimpleLinguistTestSuite) TestGetLanguage() { } } -func (s *SimpleLinguistTestSuite) TestGetLanguageByModelineLinguist() { +func (s *SimpleLinguistTestSuite) TestGetLanguagesByModelineLinguist() { const ( modelinesDir = ".linguist/test/fixtures/Data/Modelines" samplesDir = ".linguist/samples" ) tests := []struct { - name string - filename string - expectedLang string - expectedSafe bool + name string + filename string + candidates []string + expected []string }{ // Emacs - {name: "TestGetLanguageByModelineLinguist_1", filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expectedLang: "Smalltalk", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_2", filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expectedLang: "Text", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_3", filename: filepath.Join(modelinesDir, "iamphp.inc"), expectedLang: "PHP", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_4", filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_5", filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_6", filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_7", filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_8", filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_9", filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_10", filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_11", filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_12", filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_13", filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_14", filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expectedLang: "C++", expectedSafe: true}, + {name: "TestGetLanguagesByModelineLinguist_1", filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expected: []string{"Smalltalk"}}, + {name: "TestGetLanguagesByModelineLinguist_2", filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expected: []string{"Text"}}, + {name: "TestGetLanguagesByModelineLinguist_3", filename: filepath.Join(modelinesDir, "iamphp.inc"), expected: []string{"PHP"}}, + {name: "TestGetLanguagesByModelineLinguist_4", filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_5", filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_6", filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_7", filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_8", filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_9", filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_10", filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_11", filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_12", filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_13", filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_14", filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expected: []string{"C++"}}, // Vim - {name: "TestGetLanguageByModelineLinguist_15", filename: filepath.Join(modelinesDir, "seeplusplus"), expectedLang: "C++", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_16", filename: filepath.Join(modelinesDir, "iamjs.pl"), expectedLang: "JavaScript", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_17", filename: filepath.Join(modelinesDir, "iamjs2.pl"), expectedLang: "JavaScript", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_18", filename: filepath.Join(modelinesDir, "not_perl.pl"), expectedLang: "Prolog", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_19", filename: filepath.Join(modelinesDir, "ruby"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_20", filename: filepath.Join(modelinesDir, "ruby2"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_21", filename: filepath.Join(modelinesDir, "ruby3"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_22", filename: filepath.Join(modelinesDir, "ruby4"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_23", filename: filepath.Join(modelinesDir, "ruby5"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_24", filename: filepath.Join(modelinesDir, "ruby6"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_25", filename: filepath.Join(modelinesDir, "ruby7"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_26", filename: filepath.Join(modelinesDir, "ruby8"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_27", filename: filepath.Join(modelinesDir, "ruby9"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_28", filename: filepath.Join(modelinesDir, "ruby10"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_29", filename: filepath.Join(modelinesDir, "ruby11"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_30", filename: filepath.Join(modelinesDir, "ruby12"), expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByModelineLinguist_31", filename: filepath.Join(samplesDir, "C/main.c"), expectedLang: OtherLanguage, expectedSafe: false}, + {name: "TestGetLanguagesByModelineLinguist_15", filename: filepath.Join(modelinesDir, "seeplusplus"), expected: []string{"C++"}}, + {name: "TestGetLanguagesByModelineLinguist_16", filename: filepath.Join(modelinesDir, "iamjs.pl"), expected: []string{"JavaScript"}}, + {name: "TestGetLanguagesByModelineLinguist_17", filename: filepath.Join(modelinesDir, "iamjs2.pl"), expected: []string{"JavaScript"}}, + {name: "TestGetLanguagesByModelineLinguist_18", filename: filepath.Join(modelinesDir, "not_perl.pl"), expected: []string{"Prolog"}}, + {name: "TestGetLanguagesByModelineLinguist_19", filename: filepath.Join(modelinesDir, "ruby"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_20", filename: filepath.Join(modelinesDir, "ruby2"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_21", filename: filepath.Join(modelinesDir, "ruby3"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_22", filename: filepath.Join(modelinesDir, "ruby4"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_23", filename: filepath.Join(modelinesDir, "ruby5"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_24", filename: filepath.Join(modelinesDir, "ruby6"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_25", filename: filepath.Join(modelinesDir, "ruby7"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_26", filename: filepath.Join(modelinesDir, "ruby8"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_27", filename: filepath.Join(modelinesDir, "ruby9"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_28", filename: filepath.Join(modelinesDir, "ruby10"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_29", filename: filepath.Join(modelinesDir, "ruby11"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_30", filename: filepath.Join(modelinesDir, "ruby12"), expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByModelineLinguist_31", filename: filepath.Join(samplesDir, "C/main.c"), expected: nil}, } for _, test := range tests { content, err := ioutil.ReadFile(test.filename) assert.NoError(s.T(), err) - lang, safe := GetLanguageByModeline(content) - assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang)) - assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe)) + languages := GetLanguagesByModeline(test.filename, content, test.candidates) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected)) } } -func (s *SimpleLinguistTestSuite) TestGetLanguageByModeline() { +func (s *SimpleLinguistTestSuite) TestGetLanguagesByModeline() { const ( wrongVim = `# vim: set syntax=ruby ft =python filetype=perl :` rightVim = `/* vim: set syntax=python ft =python filetype=python */` @@ -103,48 +102,48 @@ func (s *SimpleLinguistTestSuite) TestGetLanguageByModeline() { ) tests := []struct { - name string - content []byte - expectedLang string - expectedSafe bool + name string + filename string + content []byte + candidates []string + expected []string }{ - {name: "TestGetLanguageByModeline_1", content: []byte(wrongVim), expectedLang: OtherLanguage, expectedSafe: false}, - {name: "TestGetLanguageByModeline_2", content: []byte(rightVim), expectedLang: "Python", expectedSafe: true}, - {name: "TestGetLanguageByModeline_3", content: []byte(noLangVim), expectedLang: OtherLanguage, expectedSafe: false}, + {name: "TestGetLanguagesByModeline_1", content: []byte(wrongVim), expected: nil}, + {name: "TestGetLanguagesByModeline_2", content: []byte(rightVim), expected: []string{"Python"}}, + {name: "TestGetLanguagesByModeline_3", content: []byte(noLangVim), expected: nil}, } for _, test := range tests { - lang, safe := GetLanguageByModeline(test.content) - assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang)) - assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe)) + languages := GetLanguagesByModeline(test.filename, test.content, test.candidates) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected)) } } -func (s *SimpleLinguistTestSuite) TestGetLanguageByFilename() { +func (s *SimpleLinguistTestSuite) TestGetLanguagesByFilename() { tests := []struct { - name string - filename string - expectedLang string - expectedSafe bool + name string + filename string + content []byte + candidates []string + expected []string }{ - {name: "TestGetLanguageByFilename_1", filename: "unknown.interpreter", expectedLang: OtherLanguage, expectedSafe: false}, - {name: "TestGetLanguageByFilename_2", filename: ".bashrc", expectedLang: "Shell", expectedSafe: true}, - {name: "TestGetLanguageByFilename_3", filename: "Dockerfile", expectedLang: "Dockerfile", expectedSafe: true}, - {name: "TestGetLanguageByFilename_4", filename: "Makefile.frag", expectedLang: "Makefile", expectedSafe: true}, - {name: "TestGetLanguageByFilename_5", filename: "makefile", expectedLang: "Makefile", expectedSafe: true}, - {name: "TestGetLanguageByFilename_6", filename: "Vagrantfile", expectedLang: "Ruby", expectedSafe: true}, - {name: "TestGetLanguageByFilename_7", filename: "_vimrc", expectedLang: "Vim script", expectedSafe: true}, - {name: "TestGetLanguageByFilename_8", filename: "pom.xml", expectedLang: "Maven POM", expectedSafe: true}, + {name: "TestGetLanguagesByFilename_1", filename: "unknown.interpreter", expected: nil}, + {name: "TestGetLanguagesByFilename_2", filename: ".bashrc", expected: []string{"Shell"}}, + {name: "TestGetLanguagesByFilename_3", filename: "Dockerfile", expected: []string{"Dockerfile"}}, + {name: "TestGetLanguagesByFilename_4", filename: "Makefile.frag", expected: []string{"Makefile"}}, + {name: "TestGetLanguagesByFilename_5", filename: "makefile", expected: []string{"Makefile"}}, + {name: "TestGetLanguagesByFilename_6", filename: "Vagrantfile", expected: []string{"Ruby"}}, + {name: "TestGetLanguagesByFilename_7", filename: "_vimrc", expected: []string{"Vim script"}}, + {name: "TestGetLanguagesByFilename_8", filename: "pom.xml", expected: []string{"Maven POM"}}, } for _, test := range tests { - lang, safe := GetLanguageByFilename(test.filename) - assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang)) - assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe)) + languages := GetLanguagesByFilename(test.filename, test.content, test.candidates) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected)) } } -func (s *SimpleLinguistTestSuite) TestGetLanguageByShebang() { +func (s *SimpleLinguistTestSuite) TestGetLanguagesByShebang() { const ( multilineExecHack = `#!/bin/sh # Next line is comment in Tcl, but not in sh... \ @@ -161,72 +160,112 @@ println("The shell script says ",vm.arglist.concat(" "));` ) tests := []struct { - name string - content []byte - expectedLang string - expectedSafe bool + name string + filename string + content []byte + candidates []string + expected []string }{ - {name: "TestGetLanguageByShebang_1", content: []byte(`#!/unknown/interpreter`), expectedLang: OtherLanguage, expectedSafe: false}, - {name: "TestGetLanguageByShebang_2", content: []byte(`no shebang`), expectedLang: OtherLanguage, expectedSafe: false}, - {name: "TestGetLanguageByShebang_3", content: []byte(`#!/usr/bin/env`), expectedLang: OtherLanguage, expectedSafe: false}, - {name: "TestGetLanguageByShebang_4", content: []byte(`#!/usr/bin/python -tt`), expectedLang: "Python", expectedSafe: true}, - {name: "TestGetLanguageByShebang_5", content: []byte(`#!/usr/bin/env python2.6`), expectedLang: "Python", expectedSafe: true}, - {name: "TestGetLanguageByShebang_6", content: []byte(`#!/usr/bin/env perl`), expectedLang: "Perl", expectedSafe: true}, - {name: "TestGetLanguageByShebang_7", content: []byte(`#! /bin/sh`), expectedLang: "Shell", expectedSafe: true}, - {name: "TestGetLanguageByShebang_8", content: []byte(`#!bash`), expectedLang: "Shell", expectedSafe: true}, - {name: "TestGetLanguageByShebang_9", content: []byte(multilineExecHack), expectedLang: "Tcl", expectedSafe: true}, - {name: "TestGetLanguageByShebang_10", content: []byte(multilineNoExecHack), expectedLang: "Shell", expectedSafe: true}, + {name: "TestGetLanguagesByShebang_1", content: []byte(`#!/unknown/interpreter`), expected: nil}, + {name: "TestGetLanguagesByShebang_2", content: []byte(`no shebang`), expected: nil}, + {name: "TestGetLanguagesByShebang_3", content: []byte(`#!/usr/bin/env`), expected: nil}, + {name: "TestGetLanguagesByShebang_4", content: []byte(`#!/usr/bin/python -tt`), expected: []string{"Python"}}, + {name: "TestGetLanguagesByShebang_5", content: []byte(`#!/usr/bin/env python2.6`), expected: []string{"Python"}}, + {name: "TestGetLanguagesByShebang_6", content: []byte(`#!/usr/bin/env perl`), expected: []string{"Perl"}}, + {name: "TestGetLanguagesByShebang_7", content: []byte(`#! /bin/sh`), expected: []string{"Shell"}}, + {name: "TestGetLanguagesByShebang_8", content: []byte(`#!bash`), expected: []string{"Shell"}}, + {name: "TestGetLanguagesByShebang_9", content: []byte(multilineExecHack), expected: []string{"Tcl"}}, + {name: "TestGetLanguagesByShebang_10", content: []byte(multilineNoExecHack), expected: []string{"Shell"}}, } for _, test := range tests { - lang, safe := GetLanguageByShebang(test.content) - assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang)) - assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe)) + languages := GetLanguagesByShebang(test.filename, test.content, test.candidates) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected)) } } -func (s *SimpleLinguistTestSuite) TestGetLanguageByExtension() { +func (s *SimpleLinguistTestSuite) TestGetLanguagesByExtension() { tests := []struct { - name string - filename string - expectedLang string - expectedSafe bool + name string + filename string + content []byte + candidates []string + expected []string }{ - {name: "TestGetLanguageByExtension_1", filename: "foo.foo", expectedLang: OtherLanguage, expectedSafe: false}, - {name: "TestGetLanguageByExtension_2", filename: "foo.go", expectedLang: "Go", expectedSafe: true}, - {name: "TestGetLanguageByExtension_3", filename: "foo.go.php", expectedLang: "Hack", expectedSafe: false}, + {name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil}, + {name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}}, + {name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}}, } for _, test := range tests { - lang, safe := GetLanguageByExtension(test.filename) - assert.Equal(s.T(), test.expectedLang, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expectedLang)) - assert.Equal(s.T(), test.expectedSafe, safe, fmt.Sprintf("%v: safe = %v, expected: %v", test.name, safe, test.expectedSafe)) + languages := GetLanguagesByExtension(test.filename, test.content, test.candidates) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected)) } } -func (s *SimpleLinguistTestSuite) TestGetLanguageByClassifier() { +func (s *SimpleLinguistTestSuite) TestGetLanguagesByClassifier() { const samples = `.linguist/samples/` test := []struct { name string filename string - candidates map[string]float64 + candidates []string expected string }{ - {name: "TestGetLanguageByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c": 1.00, "c++": 1.00}, expected: "C"}, - {name: "TestGetLanguageByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expected: "C"}, - {name: "TestGetLanguageByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: nil, expected: "C"}, - {name: "TestGetLanguageByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c++": 1.00}, expected: "C++"}, - {name: "TestGetLanguageByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: map[string]float64{"ruby": 1.00}, expected: "Ruby"}, - {name: "TestGetLanguageByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: map[string]float64{"python": 1.00, "ruby": 1.00, "c": 1.00, "c++": 1.00}, expected: "Python"}, - {name: "TestGetLanguageByClassifier_7", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: nil, expected: "Python"}, + {name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "C"}, + {name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expected: OtherLanguage}, + {name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: []string{}, expected: OtherLanguage}, + {name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"}, + {name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"}, + {name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"}, } for _, test := range test { content, err := ioutil.ReadFile(test.filename) assert.NoError(s.T(), err) - lang := GetLanguageByClassifier(content, test.candidates, nil) - assert.Equal(s.T(), test.expected, lang, fmt.Sprintf("%v: lang = %v, expected: %v", test.name, lang, test.expected)) + languages := GetLanguagesByClassifier(test.filename, content, test.candidates) + var language string + if len(languages) == 0 { + language = OtherLanguage + } else { + language = languages[0] + } + + assert.Equal(s.T(), test.expected, language, fmt.Sprintf("%v: language = %v, expected: %v", test.name, language, test.expected)) + } +} + +func (s *SimpleLinguistTestSuite) TestGetLanguagesBySpecificClassifier() { + const samples = `.linguist/samples/` + test := []struct { + name string + filename string + candidates []string + classifier Classifier + expected string + }{ + {name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "C"}, + {name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(samples, "C/blob.c"), candidates: nil, classifier: DefaultClassifier, expected: "C"}, + {name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(samples, "C/main.c"), candidates: []string{}, classifier: DefaultClassifier, expected: "C"}, + {name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"}, + {name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"}, + {name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"}, + {name: "TestGetLanguagesByClassifier_6", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: OtherLanguage}, + } + + for _, test := range test { + content, err := ioutil.ReadFile(test.filename) + assert.NoError(s.T(), err) + + languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier) + var language string + if len(languages) == 0 { + language = OtherLanguage + } else { + language = languages[0] + } + + assert.Equal(s.T(), test.expected, language, fmt.Sprintf("%v: language = %v, expected: %v", test.name, language, test.expected)) } } @@ -323,6 +362,7 @@ func (s *SimpleLinguistTestSuite) TestLinguistCorpus() { total++ obtained := GetLanguage(filename, content) if obtained == OtherLanguage { + obtained = "Other" other++ } @@ -337,9 +377,9 @@ func (s *SimpleLinguistTestSuite) TestLinguistCorpus() { } if _, ok := cornerCases[filename]; ok { - fmt.Printf("\t\t[condidered corner case] %s\t%s\t%s\t%s\n", filename, expected, obtained, status) + fmt.Printf("\t\t[considered corner case] %s\texpected: %s\tobtained: %s\tstatus: %s\n", filename, expected, obtained, status) } else { - assert.Equal(s.T(), expected, obtained, fmt.Sprintf("%s\t%s\t%s\t%s\n", filename, expected, obtained, status)) + assert.Equal(s.T(), expected, obtained, fmt.Sprintf("%s\texpected: %s\tobtained: %s\tstatus: %s\n", filename, expected, obtained, status)) } return nil diff --git a/internal/code-generator/generator/test_files/alias.gold b/internal/code-generator/generator/test_files/alias.gold index 96f23ad..78c75e5 100644 --- a/internal/code-generator/generator/test_files/alias.gold +++ b/internal/code-generator/generator/test_files/alias.gold @@ -1,4 +1,4 @@ -package slinguist +package enry // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator // THIS FILE SHOULD NOT BE EDITED BY HAND diff --git a/internal/code-generator/generator/test_files/extension.gold b/internal/code-generator/generator/test_files/extension.gold index f453667..247ce12 100644 --- a/internal/code-generator/generator/test_files/extension.gold +++ b/internal/code-generator/generator/test_files/extension.gold @@ -1,4 +1,4 @@ -package slinguist +package enry // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator // THIS FILE SHOULD NOT BE EDITED BY HAND diff --git a/internal/code-generator/generator/test_files/filename.gold b/internal/code-generator/generator/test_files/filename.gold index 723d14c..b4f19af 100644 --- a/internal/code-generator/generator/test_files/filename.gold +++ b/internal/code-generator/generator/test_files/filename.gold @@ -1,4 +1,4 @@ -package slinguist +package enry // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator // THIS FILE SHOULD NOT BE EDITED BY HAND diff --git a/internal/code-generator/generator/test_files/interpreter.gold b/internal/code-generator/generator/test_files/interpreter.gold index 651b475..717f711 100644 --- a/internal/code-generator/generator/test_files/interpreter.gold +++ b/internal/code-generator/generator/test_files/interpreter.gold @@ -1,4 +1,4 @@ -package slinguist +package enry // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator // THIS FILE SHOULD NOT BE EDITED BY HAND diff --git a/internal/code-generator/generator/test_files/type.gold b/internal/code-generator/generator/test_files/type.gold index 2b7805c..0820146 100644 --- a/internal/code-generator/generator/test_files/type.gold +++ b/internal/code-generator/generator/test_files/type.gold @@ -1,4 +1,4 @@ -package slinguist +package enry // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/enry.v1/internal/code-generator // THIS FILE SHOULD NOT BE EDITED BY HAND diff --git a/internal/code-generator/main.go b/internal/code-generator/main.go index 81a5794..d1c9f04 100644 --- a/internal/code-generator/main.go +++ b/internal/code-generator/main.go @@ -16,8 +16,8 @@ const ( // extension.go generation extensionsFile = "extension.go" - extensionsTmplPath = "internal/code-generator/assets/extensions.go.tmpl" - extensionsTmpl = "extensions.go.tmpl" + extensionsTmplPath = "internal/code-generator/assets/extension.go.tmpl" + extensionsTmpl = "extension.go.tmpl" // content.go generation heuristicsRuby = ".linguist/lib/linguist/heuristics.rb" @@ -39,23 +39,23 @@ const ( // type.go generation typeFile = "type.go" - typeTmplPath = "internal/code-generator/assets/types.go.tmpl" - typeTmpl = "types.go.tmpl" + typeTmplPath = "internal/code-generator/assets/type.go.tmpl" + typeTmpl = "type.go.tmpl" // interpreter.go generation interpretersFile = "interpreter.go" - interpretersTmplPath = "internal/code-generator/assets/interpreters.go.tmpl" - interpretersTmpl = "interpreters.go.tmpl" + interpretersTmplPath = "internal/code-generator/assets/interpreter.go.tmpl" + interpretersTmpl = "interpreter.go.tmpl" // filename.go generation filenamesFile = "filename.go" - filenamesTmplPath = "internal/code-generator/assets/filenames.go.tmpl" - filenamesTmpl = "filenames.go.tmpl" + filenamesTmplPath = "internal/code-generator/assets/filename.go.tmpl" + filenamesTmpl = "filename.go.tmpl" // alias.go generation aliasesFile = "alias.go" - aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl" - aliasesTmpl = "aliases.go.tmpl" + aliasesTmplPath = "internal/code-generator/assets/alias.go.tmpl" + aliasesTmpl = "alias.go.tmpl" // frequencies.go generation frequenciesFile = "frequencies.go" diff --git a/modeline.go b/modeline.go deleted file mode 100644 index a7a0845..0000000 --- a/modeline.go +++ /dev/null @@ -1,153 +0,0 @@ -package enry - -import ( - "bytes" - "regexp" -) - -const ( - searchScope = 5 -) - -// GetLanguagesByModeline returns a slice of possible languages for the given content, filename will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByModeline(filename string, content []byte) []string { - headFoot := getHeaderAndFooter(content) - var languages []string - for _, getLang := range modelinesFunc { - languages = getLang("", headFoot) - if len(languages) > 0 { - break - } - } - - return languages -} - -func getHeaderAndFooter(content []byte) []byte { - if bytes.Count(content, []byte("\n")) < 2*searchScope { - return content - } - - header := headScope(content, searchScope) - footer := footScope(content, searchScope) - headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:])) - headerAndFooter = append(headerAndFooter, content[:header]...) - headerAndFooter = append(headerAndFooter, content[footer:]...) - return headerAndFooter -} - -func headScope(content []byte, scope int) (index int) { - for i := 0; i < scope; i++ { - eol := bytes.IndexAny(content, "\n") - content = content[eol+1:] - index += eol - } - - return index + scope - 1 -} - -func footScope(content []byte, scope int) (index int) { - for i := 0; i < scope; i++ { - index = bytes.LastIndexAny(content, "\n") - content = content[:index] - } - - return index + 1 -} - -var modelinesFunc = []func(filename string, content []byte) []string{ - GetLanguagesByEmacsModeline, - GetLanguagesByVimModeline, -} - -var ( - reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) - reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) - reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) - reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) -) - -// GetLanguageByEmacsModeline detecs if the content has a emacs modeline and try to get a -// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage and false. -func GetLanguageByEmacsModeline(content []byte) (string, bool) { - languages := GetLanguagesByEmacsModeline("", content) - if len(languages) == 0 { - return OtherLanguage, false - } - - return languages[0], true -} - -// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content, filename will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByEmacsModeline(filename string, content []byte) []string { - matched := reEmacsModeline.FindAllSubmatch(content, -1) - if matched == nil { - return nil - } - - // only take the last matched line, discard previous lines - lastLineMatched := matched[len(matched)-1][1] - matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched) - var alias string - if matchedAlias != nil { - alias = string(matchedAlias[1]) - } else { - alias = string(lastLineMatched) - } - - language, ok := GetLanguageByAlias(alias) - if !ok { - return nil - } - - return []string{language} -} - -// GetLanguageByVimModeline detecs if the content has a vim modeline and try to get a -// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage and false. -func GetLanguageByVimModeline(content []byte) (string, bool) { - languages := GetLanguagesByVimModeline("", content) - if len(languages) == 0 { - return OtherLanguage, false - } - - return languages[0], true -} - -// GetLanguagesByVimModeline returns a slice of possible languages for the given content, filename will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByVimModeline(filename string, content []byte) []string { - matched := reVimModeline.FindAllSubmatch(content, -1) - if matched == nil { - return nil - } - - // only take the last matched line, discard previous lines - lastLineMatched := matched[len(matched)-1][1] - matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1) - if matchedAlias == nil { - return nil - } - - alias := string(matchedAlias[0][1]) - if len(matchedAlias) > 1 { - // cases: - // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage; - // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python"; - for _, match := range matchedAlias { - otherAlias := string(match[1]) - if otherAlias != alias { - return nil - } - } - } - - language, ok := GetLanguageByAlias(alias) - if !ok { - return nil - } - - return []string{language} -} diff --git a/shebang.go b/shebang.go deleted file mode 100644 index cf335a1..0000000 --- a/shebang.go +++ /dev/null @@ -1,89 +0,0 @@ -package enry - -import ( - "bufio" - "bytes" - "regexp" - "strings" -) - -const shebang = `#!` - -var ( - shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`) - pythonVersion = regexp.MustCompile(`python\d\.\d+`) -) - -// GetLanguagesByShebang returns a slice of possible languages for the given content, filename will be ignored. -// It accomplish the signature to be a Strategy type. -func GetLanguagesByShebang(filename string, content []byte) (languages []string) { - interpreter := getInterpreter(content) - return languagesByInterpreter[interpreter] -} - -func getInterpreter(data []byte) (interpreter string) { - line := getFirstLine(data) - if !hasShebang(line) { - return "" - } - - // skip shebang - line = bytes.TrimSpace(line[2:]) - - splitted := bytes.Fields(line) - if bytes.Contains(splitted[0], []byte("env")) { - if len(splitted) > 1 { - interpreter = string(splitted[1]) - } - } else { - - splittedPath := bytes.Split(splitted[0], []byte{'/'}) - interpreter = string(splittedPath[len(splittedPath)-1]) - } - - if interpreter == "sh" { - interpreter = lookForMultilineExec(data) - } - - if pythonVersion.MatchString(interpreter) { - interpreter = interpreter[:strings.Index(interpreter, `.`)] - } - - return -} - -func getFirstLine(data []byte) []byte { - buf := bufio.NewScanner(bytes.NewReader(data)) - buf.Scan() - line := buf.Bytes() - if err := buf.Err(); err != nil { - return nil - } - - return line -} - -func hasShebang(line []byte) bool { - shebang := []byte(shebang) - return bytes.HasPrefix(line, shebang) -} - -func lookForMultilineExec(data []byte) string { - const magicNumOfLines = 5 - interpreter := "sh" - - buf := bufio.NewScanner(bytes.NewReader(data)) - for i := 0; i < magicNumOfLines && buf.Scan(); i++ { - line := buf.Bytes() - if shebangExecHack.Match(line) { - interpreter = shebangExecHack.FindStringSubmatch(string(line))[1] - break - } - } - - if err := buf.Err(); err != nil { - return interpreter - } - - return interpreter -} diff --git a/utils.go b/utils.go index a82d795..1434c9b 100644 --- a/utils.go +++ b/utils.go @@ -31,9 +31,8 @@ func IsAuxiliaryLanguage(lang string) bool { // IsConfiguration returns whether or not path is using a configuration language. func IsConfiguration(path string) bool { - lang, _ := GetLanguageByExtension(path) - _, is := configurationLanguages[lang] - + language, _ := GetLanguageByExtension(path) + _, is := configurationLanguages[language] return is } diff --git a/utils_test.go b/utils_test.go index 4e72b8f..32fa2e6 100644 --- a/utils_test.go +++ b/utils_test.go @@ -55,7 +55,7 @@ func (s *SimpleLinguistTestSuite) TestIsConfiguration() { }{ {name: "TestIsConfiguration_1", path: "foo", expected: false}, {name: "TestIsConfiguration_2", path: "foo.ini", expected: true}, - {name: "TestIsConfiguration_3", path: "foo.json", expected: true}, + {name: "TestIsConfiguration_3", path: "/test/path/foo.json", expected: true}, } for _, test := range tests {