tartrazine/classifier.go

package enry

import (
	"math"
	"sort"

	"github.com/src-d/enry/v2/internal/tokenizer"
)

// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
	// Classify returns the names of the languages considered possible for content, given the weighted candidates.
	Classify(content []byte, candidates map[string]float64) (languages []string)
}

// classifier holds the probability tables that Classify reads when scoring languages.
// NOTE(review): how these tables are produced is not shown in this file.
type classifier struct {
	// languagesLogProbabilities is keyed by language name; Classify uses the value as the starting score of that
	// language, and knownLangs uses its keys as the set of known languages.
	languagesLogProbabilities map[string]float64
	// tokensLogProbabilities is keyed by language name and then by token; tokenProbability returns the stored
	// value when the (language, token) pair is present.
	tokensLogProbabilities    map[string]map[string]float64
	// tokensTotal is the divisor of the fallback value math.Log(1 / tokensTotal) that tokenProbability returns
	// for a token with no stored entry.
	tokensTotal               float64
}

// scoredLanguage pairs a language name with the score Classify computed for it, so that the languages can be
// sorted by score.
type scoredLanguage struct {
	// language is the language name that ends up in the slice returned by sortLanguagesByScore.
	language string
	// score is the value compared by byScore.Less; a higher score sorts first.
	score    float64
}

// Classify scores every language under consideration against content and returns the language names ordered from
// the highest score to the lowest.
//
// When candidates is empty, every language known to the classifier is considered. Otherwise each candidate key is
// considered, replaced by the name GetLanguageByAlias returns for it when that lookup succeeds. The candidate
// weights are copied alongside the names but are not read when the score is computed.
//
// A language's score starts from its entry in languagesLogProbabilities (zero when it has no entry) and, for
// non-empty content, adds the summed log probabilities of the tokens extracted from content.
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
	var pool map[string]float64
	switch len(candidates) {
	case 0:
		pool = c.knownLangs()
	default:
		pool = make(map[string]float64, len(candidates))
		for name, weight := range candidates {
			canonical, found := GetLanguageByAlias(name)
			if !found {
				// Keep the caller's spelling when it is not a registered alias.
				canonical = name
			}
			pool[canonical] = weight
		}
	}

	// Tokenize once up front; empty content is ranked on the language priors alone.
	hasContent := len(content) != 0
	var tokens []string
	if hasContent {
		tokens = tokenizer.Tokenize(content)
	}

	ranked := make([]*scoredLanguage, 0, len(pool))
	for name := range pool {
		entry := &scoredLanguage{
			language: name,
			score:    c.languagesLogProbabilities[name],
		}
		if hasContent {
			entry.score += c.tokensLogProbability(tokens, name)
		}
		ranked = append(ranked, entry)
	}

	return sortLanguagesByScore(ranked)
}

// sortLanguagesByScore stably sorts scoredLangs in place using the byScore ordering (highest score first) and
// returns the language names in the resulting order. The returned slice is never nil.
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
	sort.Stable(byScore(scoredLangs))

	names := make([]string, len(scoredLangs))
	for i := range scoredLangs {
		names[i] = scoredLangs[i].language
	}
	return names
}

// knownLangs returns a fresh map holding every language that has an entry in languagesLogProbabilities, each
// mapped to the value 1.
func (c *classifier) knownLangs() map[string]float64 {
	known := make(map[string]float64, len(c.languagesLogProbabilities))
	for name := range c.languagesLogProbabilities {
		known[name] = 1
	}
	return known
}

// tokensLogProbability adds up, in order, the value tokenProbability returns for each token under the given
// language. It returns 0 when tokens is empty.
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
	total := 0.0
	for i := 0; i < len(tokens); i++ {
		total += c.tokenProbability(tokens[i], language)
	}
	return total
}

// tokenProbability returns the value stored in tokensLogProbabilities for the given language and token. When no
// value is stored for that pair (including when the language itself has no table), it returns
// math.Log(1 / tokensTotal) instead.
func (c *classifier) tokenProbability(token, language string) float64 {
	if logProb, known := c.tokensLogProbabilities[language][token]; known {
		return logProb
	}

	return math.Log(1.000000 / c.tokensTotal)
}

// byScore implements sort.Interface over scored languages, ordering them from the highest score to the lowest.
// Entries whose scores are equal are ordered by language name, so the sorted result does not depend on the order
// in which the slice was filled (Classify fills it from a map range, whose iteration order Go leaves unspecified).
type byScore []*scoredLanguage

// Len reports the number of scored languages.
func (b byScore) Len() int { return len(b) }

// Swap exchanges the entries at positions i and j.
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// Less reports whether the entry at i must sort before the entry at j: it has the strictly higher score, or the
// scores are equal and its language name is lexically smaller.
func (b byScore) Less(i, j int) bool {
	if b[i].score != b[j].score {
		return b[j].score < b[i].score
	}

	// Equal scores: fall back to the name to keep the ranking deterministic.
	return b[i].language < b[j].language
}
renamed package and cli to enry 2017-06-13 11:56:07 +00:00			`package enry`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00
			`import (`
			`"math"`
changed signatures for strategies 2017-06-12 11:42:20 +00:00			`"sort"`

modules: prepare for v2 release - update go.mod \w v2 - update all import paths Signed-off-by: Alexander Bezzubov <bzz@apache.org> 2019-04-14 19:28:12 +00:00			`"github.com/src-d/enry/v2/internal/tokenizer"`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`)`

changed signatures for strategies 2017-06-12 11:42:20 +00:00			`// Classifier is the interface in charge to detect the possible languages of the given content based on a set of`
			`// candidates. Candidates is a map which can be used to assign weights to languages dynamically.`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`type Classifier interface {`
changed signatures for strategies 2017-06-12 11:42:20 +00:00			`Classify(content []byte, candidates map[string]float64) (languages []string)`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`

			`type classifier struct {`
			`languagesLogProbabilities map[string]float64`
			`tokensLogProbabilities map[string]map[string]float64`
			`tokensTotal float64`
			`}`

changed signatures for strategies 2017-06-12 11:42:20 +00:00			`type scoredLanguage struct {`
			`language string`
			`score float64`
			`}`

			`// Classify returns a sorted slice of possible languages sorted by decreasing language's probability`
			`func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`var languages map[string]float64`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`if len(candidates) == 0 {`
			`languages = c.knownLangs()`
			`} else {`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`languages = make(map[string]float64, len(candidates))`
			`for candidate, weight := range candidates {`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`if lang, ok := GetLanguageByAlias(candidate); ok {`
changes to improve detection accuracy 2017-06-13 11:56:07 +00:00			`candidate = lang`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`
changes to improve detection accuracy 2017-06-13 11:56:07 +00:00
			`languages[candidate] = weight`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`
			`}`

Do not return empty lang. It's better to return any potential candidate than nothing. Signed-off-by: kuba-- <kuba@sourced.tech> 2019-03-14 12:26:00 +00:00			`empty := len(content) == 0`
changed signatures for strategies 2017-06-12 11:42:20 +00:00			`scoredLangs := make([]*scoredLanguage, 0, len(languages))`
Do not return empty lang. It's better to return any potential candidate than nothing. Signed-off-by: kuba-- <kuba@sourced.tech> 2019-03-14 12:26:00 +00:00
			`var tokens []string`
			`if !empty {`
			`tokens = tokenizer.Tokenize(content)`
			`}`

changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`for language := range languages {`
Do not return empty lang. It's better to return any potential candidate than nothing. Signed-off-by: kuba-- <kuba@sourced.tech> 2019-03-14 12:26:00 +00:00			`score := c.languagesLogProbabilities[language]`
			`if !empty {`
			`score += c.tokensLogProbability(tokens, language)`
changed signatures for strategies 2017-06-12 11:42:20 +00:00			`}`
Do not return empty lang. It's better to return any potential candidate than nothing. Signed-off-by: kuba-- <kuba@sourced.tech> 2019-03-14 12:26:00 +00:00			`scoredLangs = append(scoredLangs, &scoredLanguage{`
			`language: language,`
			`score: score,`
			`})`
changed signatures for strategies 2017-06-12 11:42:20 +00:00			`}`

			`return sortLanguagesByScore(scoredLangs)`
			`}`

			`func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {`
remove reflection-based slice sort Signed-off-by: Miguel Molina <miguel@erizocosmi.co> 2017-06-26 13:12:57 +00:00			`sort.Stable(byScore(scoredLangs))`
changed signatures for strategies 2017-06-12 11:42:20 +00:00			`sortedLanguages := make([]string, 0, len(scoredLangs))`
			`for _, scoredLang := range scoredLangs {`
			`sortedLanguages = append(sortedLanguages, scoredLang.language)`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`

changed signatures for strategies 2017-06-12 11:42:20 +00:00			`return sortedLanguages`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`

changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`func (c *classifier) knownLangs() map[string]float64 {`
			`langs := make(map[string]float64, len(c.languagesLogProbabilities))`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`for lang := range c.languagesLogProbabilities {`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`langs[lang]++`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`

			`return langs`
			`}`

			`func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {`
			`var sum float64`
			`for _, token := range tokens {`
			`sum += c.tokenProbability(token, language)`
			`}`

			`return sum`
			`}`

			`func (c *classifier) tokenProbability(token, language string) float64 {`
			`tokenProb, ok := c.tokensLogProbabilities[language][token]`
			`if !ok {`
			`tokenProb = math.Log(1.000000 / c.tokensTotal)`
			`}`

			`return tokenProb`
			`}`
remove reflection-based slice sort Signed-off-by: Miguel Molina <miguel@erizocosmi.co> 2017-06-26 13:12:57 +00:00
			`type byScore []*scoredLanguage`

			`func (b byScore) Len() int { return len(b) }`
			`func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }`
			`func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }`