Changes in the API, ready for version 2

This commit is contained in:
Manuel Carmona
2017-05-31 12:07:46 +02:00
parent 5b304524d1
commit 0d5dff1979
23 changed files with 1772 additions and 1448 deletions

View File

@ -6,36 +6,11 @@ import (
"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
)
// getLanguageByClassifier asks classifier to score candidates against content and
// returns the best-scoring language. A nil classifier is replaced by DefaultClassifier,
// and OtherLanguage is returned when the classifier yields no scores.
func getLanguageByClassifier(content []byte, candidates []string, classifier Classifier) string {
	c := classifier
	if c == nil {
		c = DefaultClassifier
	}

	if scores := c.Classify(content, candidates); len(scores) > 0 {
		return getLangugeHigherScore(scores)
	}

	return OtherLanguage
}
// getLangugeHigherScore returns the language with the highest score in scores.
//
// It returns the empty string when scores is empty or holds only NaN scores.
// Any non-NaN score is eligible, including negative infinity, so a non-empty
// map of comparable scores always yields one of its keys. When several
// languages share the highest score, the lexicographically smallest name wins,
// which keeps the result independent of map iteration order.
func getLangugeHigherScore(scores map[string]float64) string {
	var (
		language string
		higher   float64
		found    bool
	)

	for lang, score := range scores {
		// NaN compares false against everything; skipping it keeps it from
		// becoming an unbeatable "best" value.
		if math.IsNaN(score) {
			continue
		}

		switch {
		case !found, score > higher, score == higher && lang < language:
			language, higher, found = lang, score, true
		}
	}

	return language
}
// Classifier is the interface that contains the method Classify, which is in charge of assigning scores to the possible candidates.
// The scores must order the candidates so that the highest score is the most probable language of the content.
// The scores must order the candidates so that the highest score is the most probable language of the content. The candidates argument is
// a map which can be used to assign weights to languages dynamically.
type Classifier interface {
Classify(content []byte, candidates []string) map[string]float64
Classify(content []byte, candidates map[string]float64) map[string]float64
}
type classifier struct {
@ -44,36 +19,36 @@ type classifier struct {
tokensTotal float64
}
// Classify returns a score per language for content, keyed by language name.
// It returns nil when content is empty. With no candidates, every language reported by
// c.knownLangs() is scored; otherwise each candidate is resolved through GetLanguageByAlias
// and candidates that do not resolve are dropped.
// NOTE(review): this span appears to interleave the pre-change ([]string) and post-change
// (map[string]float64) lines of the commit, so adjacent near-duplicate lines are the old
// and new form of the same statement.
func (c *classifier) Classify(content []byte, candidates []string) map[string]float64 {
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
if len(content) == 0 {
return nil
}
var languages []string
var languages map[string]float64
if len(candidates) == 0 {
// No candidates supplied: fall back to the classifier's own language set.
languages = c.knownLangs()
} else {
languages = make([]string, 0, len(candidates))
for _, candidate := range candidates {
languages = make(map[string]float64, len(candidates))
for candidate, weight := range candidates {
// Only candidates that GetLanguageByAlias recognises are kept, under the name it returns.
if lang, ok := GetLanguageByAlias(candidate); ok {
languages = append(languages, lang)
languages[lang] = weight
}
}
}
tokens := tokenizer.Tokenize(content)
scores := make(map[string]float64, len(languages))
// Each score is the token log-probability for the language plus that language's entry in
// c.languagesLogProbabilities.
// NOTE(review): the weights stored in languages are not read in this loop — confirm
// whether they are meant to influence the score.
for _, language := range languages {
for language := range languages {
scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
}
return scores
}
func (c *classifier) knownLangs() []string {
langs := make([]string, 0, len(c.languagesLogProbabilities))
func (c *classifier) knownLangs() map[string]float64 {
langs := make(map[string]float64, len(c.languagesLogProbabilities))
for lang := range c.languagesLogProbabilities {
langs = append(langs, lang)
langs[lang]++
}
return langs