tartrazine/classifier.go

package slinguist

import (
	"math"

	"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
)

// Classifier is the interface that wraps the Classify method, which is in charge of assigning scores to the possible
// candidate languages of a piece of content. The scores must order the candidates so that the highest score belongs to
// the most probable language of the content. The candidates argument is a map which can be used to assign weights to
// languages dynamically.
type Classifier interface {
	// Classify returns a score for each language considered for content, keyed by language name.
	Classify(content []byte, candidates map[string]float64) map[string]float64
}

// classifier holds the log-probability tables that its Classify method reads to score languages.
//
// languagesLogProbabilities is looked up by language name and its value is added to that language's score.
// tokensLogProbabilities is indexed first by language name and then by token. tokensTotal feeds the fallback
// value math.Log(1/tokensTotal) that is used for a token with no entry under the requested language.
type classifier struct {
	languagesLogProbabilities map[string]float64
	tokensLogProbabilities    map[string]map[string]float64
	tokensTotal               float64
}

// Classify tokenizes content and returns a score for every language under consideration, keyed by language name.
// A language's score is the sum of the log probabilities of all tokens for that language plus the language's own
// entry in languagesLogProbabilities. It returns nil when content is empty.
//
// When candidates is empty, every language present in languagesLogProbabilities is scored. Otherwise each key of
// candidates is passed through GetLanguageByAlias: keys it does not resolve are dropped and the remaining ones are
// scored under the name it returns. The candidate weights are copied along with the names but are not read when the
// score is computed.
func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {
	if len(content) == 0 {
		return nil
	}

	var targets map[string]float64
	if len(candidates) > 0 {
		targets = make(map[string]float64, len(candidates))
		for alias, weight := range candidates {
			lang, found := GetLanguageByAlias(alias)
			if !found {
				continue
			}

			targets[lang] = weight
		}
	} else {
		targets = c.knownLangs()
	}

	tokens := tokenizer.Tokenize(content)
	scores := make(map[string]float64, len(targets))
	for lang := range targets {
		tokensScore := c.tokensLogProbability(tokens, lang)
		scores[lang] = tokensScore + c.languagesLogProbabilities[lang]
	}

	return scores
}

// knownLangs returns a new map containing every language that has an entry in languagesLogProbabilities,
// each associated with the value 1.
func (c *classifier) knownLangs() map[string]float64 {
	known := make(map[string]float64, len(c.languagesLogProbabilities))
	for name := range c.languagesLogProbabilities {
		// Map keys are unique, so each name is visited exactly once.
		known[name] = 1
	}

	return known
}

// tokensLogProbability adds up, in order, the value tokenProbability returns for each token under the given
// language. It returns 0 when tokens is empty.
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
	total := 0.0
	for i := range tokens {
		total += c.tokenProbability(tokens[i], language)
	}

	return total
}

// tokenProbability returns the value stored for token under language in tokensLogProbabilities. When there is no
// such entry (including when the language itself has no table) it falls back to math.Log(1 / tokensTotal).
func (c *classifier) tokenProbability(token, language string) float64 {
	// Indexing a missing inner map yields a nil map, and reading from a nil map simply reports "not found".
	if logProb, known := c.tokensLogProbabilities[language][token]; known {
		return logProb
	}

	return math.Log(1.000000 / c.tokensTotal)
}
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`package slinguist`

			`import (`
			`"math"`

			`"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"`
			`)`

			`// Classifier is the interface that contains the method Classify which is in charge to assign scores to the possibles candidates.`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`// The scores must order the candidates so as the highest score be the most probably language of the content. The candidates is`
			`// a map which can be used to assign weights to languages dynamically.`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`type Classifier interface {`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`Classify(content []byte, candidates map[string]float64) map[string]float64`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`

			`type classifier struct {`
			`languagesLogProbabilities map[string]float64`
			`tokensLogProbabilities map[string]map[string]float64`
			`tokensTotal float64`
			`}`

changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`func (c *classifier) Classify(content []byte, candidates map[string]float64) map[string]float64 {`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`if len(content) == 0 {`
			`return nil`
			`}`

changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`var languages map[string]float64`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`if len(candidates) == 0 {`
			`languages = c.knownLangs()`
			`} else {`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`languages = make(map[string]float64, len(candidates))`
			`for candidate, weight := range candidates {`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`if lang, ok := GetLanguageByAlias(candidate); ok {`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`languages[lang] = weight`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`
			`}`
			`}`

			`tokens := tokenizer.Tokenize(content)`
			`scores := make(map[string]float64, len(languages))`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`for language := range languages {`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]`
			`}`

			`return scores`
			`}`

changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`func (c *classifier) knownLangs() map[string]float64 {`
			`langs := make(map[string]float64, len(c.languagesLogProbabilities))`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`for lang := range c.languagesLogProbabilities {`
changes in the API, ready to version 2 2017-05-31 10:07:46 +00:00			`langs[lang]++`
Added classifier to the sequence of strategies 2017-05-25 10:34:32 +00:00			`}`

			`return langs`
			`}`

			`func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {`
			`var sum float64`
			`for _, token := range tokens {`
			`sum += c.tokenProbability(token, language)`
			`}`

			`return sum`
			`}`

			`func (c *classifier) tokenProbability(token, language string) float64 {`
			`tokenProb, ok := c.tokensLogProbabilities[language][token]`
			`if !ok {`
			`tokenProb = math.Log(1.000000 / c.tokensTotal)`
			`}`

			`return tokenProb`
			`}`