mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-12 22:42:23 +00:00
101 lines
2.6 KiB
Go
101 lines
2.6 KiB
Go
package slinguist
|
|
|
|
import (
|
|
"math"
|
|
|
|
"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
|
|
)
|
|
|
|
// GetLanguageByClassifier takes in a content and a list of candidates, and apply the classifier's Classify method to
|
|
// get the most probably language. If classifier is null then DefaultClassfier will be used.
|
|
func GetLanguageByClassifier(content []byte, candidates []string, classifier Classifier) string {
|
|
if classifier == nil {
|
|
classifier = DefaultClassifier
|
|
}
|
|
|
|
scores := classifier.Classify(content, candidates)
|
|
if len(scores) == 0 {
|
|
return OtherLanguage
|
|
}
|
|
|
|
return getLangugeHigherScore(scores)
|
|
}
|
|
|
|
func getLangugeHigherScore(scores map[string]float64) string {
|
|
var language string
|
|
higher := -math.MaxFloat64
|
|
for lang, score := range scores {
|
|
if higher < score {
|
|
language = lang
|
|
higher = score
|
|
}
|
|
}
|
|
|
|
return language
|
|
}
|
|
|
|
// Classifier is the interface that contains the method Classify which is in charge to assign scores to the possibles candidates.
|
|
// The scores must order the candidates so as the highest score be the most probably language of the content.
|
|
type Classifier interface {
|
|
Classify(content []byte, candidates []string) map[string]float64
|
|
}
|
|
|
|
type classifier struct {
|
|
languagesLogProbabilities map[string]float64
|
|
tokensLogProbabilities map[string]map[string]float64
|
|
tokensTotal float64
|
|
}
|
|
|
|
func (c *classifier) Classify(content []byte, candidates []string) map[string]float64 {
|
|
if len(content) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var languages []string
|
|
if len(candidates) == 0 {
|
|
languages = c.knownLangs()
|
|
} else {
|
|
languages = make([]string, 0, len(candidates))
|
|
for _, candidate := range candidates {
|
|
if lang, ok := GetLanguageByAlias(candidate); ok {
|
|
languages = append(languages, lang)
|
|
}
|
|
}
|
|
}
|
|
|
|
tokens := tokenizer.Tokenize(content)
|
|
scores := make(map[string]float64, len(languages))
|
|
for _, language := range languages {
|
|
scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
|
|
}
|
|
|
|
return scores
|
|
}
|
|
|
|
func (c *classifier) knownLangs() []string {
|
|
langs := make([]string, 0, len(c.languagesLogProbabilities))
|
|
for lang := range c.languagesLogProbabilities {
|
|
langs = append(langs, lang)
|
|
}
|
|
|
|
return langs
|
|
}
|
|
|
|
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
|
|
var sum float64
|
|
for _, token := range tokens {
|
|
sum += c.tokenProbability(token, language)
|
|
}
|
|
|
|
return sum
|
|
}
|
|
|
|
func (c *classifier) tokenProbability(token, language string) float64 {
|
|
tokenProb, ok := c.tokensLogProbabilities[language][token]
|
|
if !ok {
|
|
tokenProb = math.Log(1.000000 / c.tokensTotal)
|
|
}
|
|
|
|
return tokenProb
|
|
}
|