mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-18 22:23:07 -03:00
Rewrite GetLanguages to work like Linguist.detect
Prior to this change, GetLanguages collected all candidate languages from each
strategy to pass to the next strategy (without de-duplicating them). Linguist
only uses the previous strategy's candidates for the next strategy. Also, GetLanguages
would overwrite languages with nil if a strategy returned nil, so you could get
into a situation where you go from multiple languages to no language.
See the Ruby code for details: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49
This addresses https://github.com/src-d/enry/issues/207 because GetLanguages
should not return all candidates detected, otherwise it would work differently
than Linguist.
This commit is contained in:
24
common.go
24
common.go
@ -118,7 +118,13 @@ func getLanguageBySpecificClassifier(content []byte, candidates []string, classi
|
||||
}
|
||||
|
||||
// GetLanguages applies a sequence of strategies based on the given filename and content
|
||||
// to find out the most probably languages to return.
|
||||
// to find out the most probable languages to return.
|
||||
//
|
||||
// If it finds a strategy that produces a single result, it will be returned;
|
||||
// otherwise the last strategy that returned multiple results will be returned.
|
||||
// If the content is binary, no results will be returned. This matches the
|
||||
// behavior of Linguist.detect: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49
|
||||
//
|
||||
// At least one of arguments should be set. If content is missing, language detection will be based on the filename.
|
||||
// The function won't read the file, given an empty content.
|
||||
func GetLanguages(filename string, content []byte) []string {
|
||||
@ -127,16 +133,20 @@ func GetLanguages(filename string, content []byte) []string {
|
||||
}
|
||||
|
||||
var languages []string
|
||||
candidates := []string{}
|
||||
for _, strategy := range DefaultStrategies {
|
||||
languages = strategy(filename, content, candidates)
|
||||
if len(languages) == 1 {
|
||||
return languages
|
||||
candidates := strategy(filename, content, languages)
|
||||
// No candidates, continue to next strategy without updating languages
|
||||
if len(candidates) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(languages) > 0 {
|
||||
candidates = append(candidates, languages...)
|
||||
// Only one candidate match, return it
|
||||
if len(candidates) == 1 {
|
||||
return candidates
|
||||
}
|
||||
|
||||
// Save the candidates from this strategy to pass on to the next strategy, like Linguist
|
||||
languages = candidates
|
||||
}
|
||||
|
||||
return languages
|
||||
|
Reference in New Issue
Block a user