tartrazine/classifier.go
2017-06-13 14:18:23 +02:00

74 lines
1.9 KiB
Go

package enry
import (
"math"
"gopkg.in/src-d/enry.v1/internal/tokenizer"
)
// Classifier is the interface wrapping the Classify method, which is in charge of assigning
// scores to the possible candidate languages for a piece of content.
//
// The scores must order the candidates so that the highest score belongs to the most probable
// language of the content. The candidates argument is a map from language to weight, which can
// be used to assign weights to languages dynamically. The returned map is keyed by language.
type Classifier interface {
Classify(content []byte, candidates map[string]float64) map[string]float64
}
// classifier holds the precomputed tables used by the Classify method defined on it.
// Its values are treated as log probabilities: they are summed, never multiplied.
type classifier struct {
// languagesLogProbabilities holds one value per language. Classify adds it to the language's
// summed token score, and its keys are the set of languages scored when no candidates are given.
languagesLogProbabilities map[string]float64
// tokensLogProbabilities is indexed first by language and then by token.
tokensLogProbabilities map[string]map[string]float64
// tokensTotal is the denominator of the fallback value log(1/tokensTotal) that
// tokenProbability returns for a token with no entry under the requested language.
// NOTE(review): presumably the total token count of the training data — confirm where it is populated.
tokensTotal float64
}
// Classify scores languages for content and returns a map from language name to score,
// where a higher score means a more probable language.
//
// It returns nil when content is empty. When candidates is empty, every language present in
// languagesLogProbabilities is scored. Otherwise each candidate key is resolved through
// GetLanguageByAlias and only the resolved languages are scored; keys that do not resolve
// are dropped. The candidate weights are carried along but do not take part in the score.
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
	if len(content) == 0 {
		return nil
	}

	// Decide which languages get a score.
	var targets map[string]float64
	if len(candidates) > 0 {
		targets = make(map[string]float64, len(candidates))
		for alias, weight := range candidates {
			resolved, known := GetLanguageByAlias(alias)
			if !known {
				continue
			}
			targets[resolved] = weight
		}
	} else {
		targets = c.knownLangs()
	}

	// Tokenize once and reuse the tokens for every language.
	tokens := tokenizer.Tokenize(content)

	result := make(map[string]float64, len(targets))
	for lang := range targets {
		// Score = summed token log probabilities + the language's own log probability.
		tokenScore := c.tokensLogProbability(tokens, lang)
		result[lang] = tokenScore + c.languagesLogProbabilities[lang]
	}

	return result
}
// knownLangs returns a fresh map containing every language that has an entry in
// languagesLogProbabilities, each mapped to a weight of 1.
func (c *classifier) knownLangs() map[string]float64 {
	known := make(map[string]float64, len(c.languagesLogProbabilities))
	for name := range c.languagesLogProbabilities {
		known[name] = 1
	}

	return known
}
// tokensLogProbability adds up, in order, the value tokenProbability returns for each
// token under the given language. It returns 0 when tokens is empty.
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
	total := 0.0
	for i := range tokens {
		total += c.tokenProbability(tokens[i], language)
	}

	return total
}
// tokenProbability returns the value stored for token under language in
// tokensLogProbabilities. When there is no such entry (unknown token or unknown
// language), it falls back to log(1/tokensTotal).
func (c *classifier) tokenProbability(token, language string) float64 {
	// Indexing a missing (nil) inner map is safe and simply reports not found.
	if logProb, found := c.tokensLogProbabilities[language][token]; found {
		return logProb
	}

	return math.Log(1.000000 / c.tokensTotal)
}