diff --git a/common.go b/common.go index 4764cf7..6659911 100644 --- a/common.go +++ b/common.go @@ -118,7 +118,13 @@ func getLanguageBySpecificClassifier(content []byte, candidates []string, classi } // GetLanguages applies a sequence of strategies based on the given filename and content -// to find out the most probably languages to return. +// to find out the most probable languages to return. +// +// If it finds a strategy that produces a single result, it will be returned; +// otherwise the last strategy that returned multiple results will be returned. +// If the content is binary, no results will be returned. This matches the +// behavior of Linguist.detect: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49 +// // At least one of arguments should be set. If content is missing, language detection will be based on the filename. // The function won't read the file, given an empty content. func GetLanguages(filename string, content []byte) []string { @@ -127,16 +133,20 @@ func GetLanguages(filename string, content []byte) []string { } var languages []string - candidates := []string{} for _, strategy := range DefaultStrategies { - languages = strategy(filename, content, candidates) - if len(languages) == 1 { - return languages + candidates := strategy(filename, content, languages) + // No candidates, continue to next strategy without updating languages + if len(candidates) == 0 { + continue } - if len(languages) > 0 { - candidates = append(candidates, languages...) 
+ // Only one candidate match, return it + if len(candidates) == 1 { + return candidates } + + // Save the candidates from this strategy to pass on to the next strategy, like Linguist + languages = candidates } return languages diff --git a/common_test.go b/common_test.go index 46c2ee8..27b324f 100644 --- a/common_test.go +++ b/common_test.go @@ -119,6 +119,38 @@ func (s *EnryTestSuite) TestGetLanguage() { } } +func (s *EnryTestSuite) TestGetLanguages() { + tests := []struct { + name string + filename string + content []byte + expected []string + }{ + // With no content or filename, no language can be detected + {name: "TestGetLanguages_0", filename: "", content: []byte{}, expected: nil}, + // The strategy that will match is GetLanguagesByExtension. Lacking content, it will return those results. + {name: "TestGetLanguages_1", filename: "foo.h", content: []byte{}, expected: []string{"C", "C++", "Objective-C"}}, + // GetLanguagesByExtension will return an unambiguous match when there is a single result. + {name: "TestGetLanguages_2", filename: "foo.groovy", content: []byte{}, expected: []string{"Groovy"}}, + // GetLanguagesByExtension will return "Rust", "RenderScript" for .rs, + // then GetLanguagesByContent will take the first rule that matches (in this case Rust) + {name: "TestGetLanguages_3", filename: "foo.rs", content: []byte("use \n#include"), expected: []string{"Rust"}}, + // .. and in this case, RenderScript (no content that matches a Rust regex can be included, because it runs first.) 
+ {name: "TestGetLanguages_4", filename: "foo.rs", content: []byte("#include"), expected: []string{"RenderScript"}}, + // GetLanguagesByExtension will return "AMPL", "Linux Kernel Module", "Modula-2", "XML", + // then GetLanguagesByContent will ALWAYS return Linux Kernel Module and AMPL when there is no content, + // and no further classifier can do anything without content + {name: "TestGetLanguages_5", filename: "foo.mod", content: []byte{}, expected: []string{"Linux Kernel Module", "AMPL"}}, + // ...with some AMPL tokens, the DefaultClassifier will pick AMPL as the most likely language. + {name: "TestGetLanguages_6", filename: "foo.mod", content: []byte("BEAMS ROWS - TotalWeight"), expected: []string{"AMPL", "Linux Kernel Module"}}, + } + + for _, test := range tests { + languages := GetLanguages(test.filename, test.content) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: %v, expected: %v", test.name, languages, test.expected)) + } +} + func (s *EnryTestSuite) TestGetLanguagesByModelineLinguist() { var modelinesDir = filepath.Join(s.tmpLinguist, "test", "fixtures", "Data", "Modelines")