mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-18 22:23:07 -03:00
Changes in the API, ready for version 2
This commit is contained in:
@ -6,36 +6,11 @@ import (
|
||||
"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
|
||||
)
|
||||
|
||||
func getLanguageByClassifier(content []byte, candidates []string, classifier Classifier) string {
|
||||
if classifier == nil {
|
||||
classifier = DefaultClassifier
|
||||
}
|
||||
|
||||
scores := classifier.Classify(content, candidates)
|
||||
if len(scores) == 0 {
|
||||
return OtherLanguage
|
||||
}
|
||||
|
||||
return getLangugeHigherScore(scores)
|
||||
}
|
||||
|
||||
// getLangugeHigherScore returns the language with the highest score in scores.
//
// It returns the empty string when scores is empty or holds only NaN values;
// NaN scores are skipped because they cannot be ordered. Ties are resolved in
// favour of the lexicographically smallest language name, so the result does
// not depend on Go's randomized map iteration order.
func getLangugeHigherScore(scores map[string]float64) string {
	var (
		language string
		higher   float64
		found    bool
	)

	for lang, score := range scores {
		if math.IsNaN(score) {
			continue
		}

		// Tracking found separately, instead of seeding higher with
		// -math.MaxFloat64, lets a score of -Inf win when nothing better exists.
		if !found || score > higher || (score == higher && lang < language) {
			language, higher, found = lang, score, true
		}
	}

	return language
}
|
||||
|
||||
// Classifier is the interface that wraps the Classify method, which assigns scores to the possible candidates.
|
||||
// The scores must order the candidates so that the highest score belongs to the most probable language of the content.
|
||||
// The scores must order the candidates so that the highest score belongs to the most probable language of the content. The candidates
|
||||
// argument is a map which can be used to assign weights to languages dynamically.
|
||||
type Classifier interface {
|
||||
Classify(content []byte, candidates []string) map[string]float64
|
||||
Classify(content []byte, candidates map[string]float64) map[string]float64
|
||||
}
|
||||
|
||||
type classifier struct {
|
||||
@ -44,36 +19,36 @@ type classifier struct {
|
||||
tokensTotal float64
|
||||
}
|
||||
|
||||
// Classify scores content against a set of languages and returns a map from
// language name to score. It returns nil when content is empty.
//
// NOTE(review): this span is a rendered diff, so each changed line appears
// twice: first the removed form (candidates as []string), then the added form
// (candidates as map[string]float64).
func (c *classifier) Classify(content []byte, candidates []string) map[string]float64 {
|
||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
|
||||
if len(content) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var languages []string
|
||||
var languages map[string]float64
|
||||
// With no candidates, fall back to every language returned by c.knownLangs().
if len(candidates) == 0 {
|
||||
languages = c.knownLangs()
|
||||
} else {
|
||||
languages = make([]string, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
languages = make(map[string]float64, len(candidates))
|
||||
for candidate, weight := range candidates {
|
||||
// Keep only candidates that GetLanguageByAlias resolves, stored under the resolved name.
if lang, ok := GetLanguageByAlias(candidate); ok {
|
||||
languages = append(languages, lang)
|
||||
languages[lang] = weight
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tokens := tokenizer.Tokenize(content)
|
||||
scores := make(map[string]float64, len(languages))
|
||||
// Each score is c.tokensLogProbability for the language plus its entry in c.languagesLogProbabilities.
// NOTE(review): in the map form only the keys of languages are ranged over; the candidate weights are not read here.
for _, language := range languages {
|
||||
for language := range languages {
|
||||
scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
|
||||
}
|
||||
|
||||
return scores
|
||||
}
|
||||
|
||||
func (c *classifier) knownLangs() []string {
|
||||
langs := make([]string, 0, len(c.languagesLogProbabilities))
|
||||
func (c *classifier) knownLangs() map[string]float64 {
|
||||
langs := make(map[string]float64, len(c.languagesLogProbabilities))
|
||||
for lang := range c.languagesLogProbabilities {
|
||||
langs = append(langs, lang)
|
||||
langs[lang]++
|
||||
}
|
||||
|
||||
return langs
|
||||
|
Reference in New Issue
Block a user