mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-24 08:18:52 -03:00
Merge pull request #47 from look/look/mimic-linguist-detect
Rewrite GetLanguages to work like Linguist.detect
This commit is contained in:
commit
b60e5c6f5a
24
common.go
24
common.go
@ -118,7 +118,13 @@ func getLanguageBySpecificClassifier(content []byte, candidates []string, classi
|
|||||||
}
|
}
|
||||||
|
|
||||||
// GetLanguages applies a sequence of strategies based on the given filename and content
|
// GetLanguages applies a sequence of strategies based on the given filename and content
|
||||||
// to find out the most probably languages to return.
|
// to find out the most probable languages to return.
|
||||||
|
//
|
||||||
|
// If it finds a strategy that produces a single result, it will be returned;
|
||||||
|
// otherwise the last strategy that returned multiple results will be returned.
|
||||||
|
// If the content is binary, no results will be returned. This matches the
|
||||||
|
// behavior of Linguist.detect: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49
|
||||||
|
//
|
||||||
// At least one of arguments should be set. If content is missing, language detection will be based on the filename.
|
// At least one of arguments should be set. If content is missing, language detection will be based on the filename.
|
||||||
// The function won't read the file, given an empty content.
|
// The function won't read the file, given an empty content.
|
||||||
func GetLanguages(filename string, content []byte) []string {
|
func GetLanguages(filename string, content []byte) []string {
|
||||||
@ -127,16 +133,20 @@ func GetLanguages(filename string, content []byte) []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var languages []string
|
var languages []string
|
||||||
candidates := []string{}
|
|
||||||
for _, strategy := range DefaultStrategies {
|
for _, strategy := range DefaultStrategies {
|
||||||
languages = strategy(filename, content, candidates)
|
candidates := strategy(filename, content, languages)
|
||||||
if len(languages) == 1 {
|
// No candidates, continue to next strategy without updating languages
|
||||||
return languages
|
if len(candidates) == 0 {
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(languages) > 0 {
|
// Only one candidate match, return it
|
||||||
candidates = append(candidates, languages...)
|
if len(candidates) == 1 {
|
||||||
|
return candidates
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Save the candidates from this strategy to pass on to the next strategy, like Linguist
|
||||||
|
languages = candidates
|
||||||
}
|
}
|
||||||
|
|
||||||
return languages
|
return languages
|
||||||
|
@ -119,6 +119,38 @@ func (s *EnryTestSuite) TestGetLanguage() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *EnryTestSuite) TestGetLanguages() {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
filename string
|
||||||
|
content []byte
|
||||||
|
expected []string
|
||||||
|
}{
|
||||||
|
// With no content or filename, no language can be detected
|
||||||
|
{name: "TestGetLanguages_0", filename: "", content: []byte{}, expected: nil},
|
||||||
|
// The strategy that will match is GetLanguagesByExtension. Lacking content, it will return those results.
|
||||||
|
{name: "TestGetLanguages_1", filename: "foo.h", content: []byte{}, expected: []string{"C", "C++", "Objective-C"}},
|
||||||
|
// GetLanguagesByExtension will return an unambiguous match when there is a single result.
|
||||||
|
{name: "TestGetLanguages_2", filename: "foo.groovy", content: []byte{}, expected: []string{"Groovy"}},
|
||||||
|
// GetLanguagesByExtension will return "Rust", "RenderScript" for .rs,
|
||||||
|
// then GetLanguagesByContent will take the first rule that matches (in this case Rust)
|
||||||
|
{name: "TestGetLanguages_3", filename: "foo.rs", content: []byte("use \n#include"), expected: []string{"Rust"}},
|
||||||
|
// .. and in this case, RenderScript (no content that matches a Rust regex can be included, because it runs first.)
|
||||||
|
{name: "TestGetLanguages_4", filename: "foo.rs", content: []byte("#include"), expected: []string{"RenderScript"}},
|
||||||
|
// GetLanguagesByExtension will return "AMPL", "Linux Kernel Module", "Modula-2", "XML",
|
||||||
|
// then GetLanguagesByContent will ALWAYS return Linux Kernel Module and AMPL when there is no content,
|
||||||
|
// and no further classifier can do anything without content
|
||||||
|
{name: "TestGetLanguages_5", filename: "foo.mod", content: []byte{}, expected: []string{"Linux Kernel Module", "AMPL"}},
|
||||||
|
// ...with some AMPL tokens, the DefaultClassifier will pick AMPL as the most likely language.
|
||||||
|
{name: "TestGetLanguages_6", filename: "foo.mod", content: []byte("BEAMS ROWS - TotalWeight"), expected: []string{"AMPL", "Linux Kernel Module"}},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
languages := GetLanguages(test.filename, test.content)
|
||||||
|
assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: %v, expected: %v", test.name, languages, test.expected))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *EnryTestSuite) TestGetLanguagesByModelineLinguist() {
|
func (s *EnryTestSuite) TestGetLanguagesByModelineLinguist() {
|
||||||
var modelinesDir = filepath.Join(s.tmpLinguist, "test", "fixtures", "Data", "Modelines")
|
var modelinesDir = filepath.Join(s.tmpLinguist, "test", "fixtures", "Data", "Modelines")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user