2017-05-25 10:34:32 +00:00
|
|
|
package slinguist
|
|
|
|
|
|
|
|
import (
|
|
|
|
"math"
|
|
|
|
|
2017-06-08 07:27:27 +00:00
|
|
|
"gopkg.in/src-d/enry.v1/internal/tokenizer"
|
2017-05-25 10:34:32 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Classifier is implemented by types that can score candidate languages for
// a piece of content.
//
// Classify returns a score per language. Scores must rank the languages so
// that the highest score belongs to the most probable language of the
// content. The candidates map can be used to assign weights to languages
// dynamically.
type Classifier interface {
	Classify(content []byte, candidates map[string]float64) map[string]float64
}
|
|
|
|
|
|
|
|
// classifier holds the precomputed values that Classify and its helpers read
// when scoring content.
type classifier struct {
	// languagesLogProbabilities maps a language name to the value Classify
	// adds to that language's token score. Its keys are also the languages
	// scored when no candidates are given.
	languagesLogProbabilities map[string]float64

	// tokensLogProbabilities is indexed by language name and then by token.
	tokensLogProbabilities map[string]map[string]float64

	// tokensTotal is the divisor in the fallback value
	// math.Log(1 / tokensTotal) used for tokens that have no entry in
	// tokensLogProbabilities.
	tokensTotal float64
}
|
|
|
|
|
2017-05-31 10:07:46 +00:00
|
|
|
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
|
2017-05-25 10:34:32 +00:00
|
|
|
if len(content) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-05-31 10:07:46 +00:00
|
|
|
var languages map[string]float64
|
2017-05-25 10:34:32 +00:00
|
|
|
if len(candidates) == 0 {
|
|
|
|
languages = c.knownLangs()
|
|
|
|
} else {
|
2017-05-31 10:07:46 +00:00
|
|
|
languages = make(map[string]float64, len(candidates))
|
|
|
|
for candidate, weight := range candidates {
|
2017-05-25 10:34:32 +00:00
|
|
|
if lang, ok := GetLanguageByAlias(candidate); ok {
|
2017-05-31 10:07:46 +00:00
|
|
|
languages[lang] = weight
|
2017-05-25 10:34:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
tokens := tokenizer.Tokenize(content)
|
|
|
|
scores := make(map[string]float64, len(languages))
|
2017-05-31 10:07:46 +00:00
|
|
|
for language := range languages {
|
2017-05-25 10:34:32 +00:00
|
|
|
scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
|
|
|
|
}
|
|
|
|
|
|
|
|
return scores
|
|
|
|
}
|
|
|
|
|
2017-05-31 10:07:46 +00:00
|
|
|
func (c *classifier) knownLangs() map[string]float64 {
|
|
|
|
langs := make(map[string]float64, len(c.languagesLogProbabilities))
|
2017-05-25 10:34:32 +00:00
|
|
|
for lang := range c.languagesLogProbabilities {
|
2017-05-31 10:07:46 +00:00
|
|
|
langs[lang]++
|
2017-05-25 10:34:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return langs
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
|
|
|
|
var sum float64
|
|
|
|
for _, token := range tokens {
|
|
|
|
sum += c.tokenProbability(token, language)
|
|
|
|
}
|
|
|
|
|
|
|
|
return sum
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *classifier) tokenProbability(token, language string) float64 {
|
|
|
|
tokenProb, ok := c.tokensLogProbabilities[language][token]
|
|
|
|
if !ok {
|
|
|
|
tokenProb = math.Log(1.000000 / c.tokensTotal)
|
|
|
|
}
|
|
|
|
|
|
|
|
return tokenProb
|
|
|
|
}
|