mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-24 16:21:14 -03:00
go: remove Classifier from API
Further reduces the public API surface by hiding the unused Classifier API for providing pre-trained classifier weights. Signed-off-by: Alexander Bezzubov <bzz@apache.org>
This commit is contained in:
parent
3f0c4e182b
commit
fa097f4ed4
@ -140,7 +140,7 @@ func BenchmarkClassifyTotal(b *testing.B) {
|
|||||||
b.Run("Classify()_TOTAL", func(b *testing.B) {
|
b.Run("Classify()_TOTAL", func(b *testing.B) {
|
||||||
for n := 0; n < b.N; n++ {
|
for n := 0; n < b.N; n++ {
|
||||||
for _, sample := range samples {
|
for _, sample := range samples {
|
||||||
o = defaultClassifier.Classify(sample.content, nil)
|
o = defaultClassifier.classify(sample.content, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
overcomeLanguages = o
|
overcomeLanguages = o
|
||||||
@ -195,7 +195,7 @@ func BenchmarkClassifyPerSample(b *testing.B) {
|
|||||||
for _, sample := range samples {
|
for _, sample := range samples {
|
||||||
b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
|
b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
|
||||||
for n := 0; n < b.N; n++ {
|
for n := 0; n < b.N; n++ {
|
||||||
o = defaultClassifier.Classify(sample.content, nil)
|
o = defaultClassifier.classify(sample.content, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
overcomeLanguages = o
|
overcomeLanguages = o
|
||||||
|
@ -7,13 +7,13 @@ import (
|
|||||||
"github.com/src-d/enry/v2/internal/tokenizer"
|
"github.com/src-d/enry/v2/internal/tokenizer"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Classifier is the interface in charge to detect the possible languages of the given content based on a set of
|
// classifier is the interface in charge of detecting the possible languages of the given content based on a set of
|
||||||
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
|
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
|
||||||
type Classifier interface {
|
type classifier interface {
|
||||||
Classify(content []byte, candidates map[string]float64) (languages []string)
|
classify(content []byte, candidates map[string]float64) (languages []string)
|
||||||
}
|
}
|
||||||
|
|
||||||
type classifier struct {
|
type naiveBayes struct {
|
||||||
languagesLogProbabilities map[string]float64
|
languagesLogProbabilities map[string]float64
|
||||||
tokensLogProbabilities map[string]map[string]float64
|
tokensLogProbabilities map[string]map[string]float64
|
||||||
tokensTotal float64
|
tokensTotal float64
|
||||||
@ -24,8 +24,8 @@ type scoredLanguage struct {
|
|||||||
score float64
|
score float64
|
||||||
}
|
}
|
||||||
|
|
||||||
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
|
// classify returns a slice of possible languages sorted by decreasing probability
|
||||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
|
func (c *naiveBayes) classify(content []byte, candidates map[string]float64) []string {
|
||||||
|
|
||||||
var languages map[string]float64
|
var languages map[string]float64
|
||||||
if len(candidates) == 0 {
|
if len(candidates) == 0 {
|
||||||
@ -73,7 +73,7 @@ func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
|
|||||||
return sortedLanguages
|
return sortedLanguages
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *classifier) knownLangs() map[string]float64 {
|
func (c *naiveBayes) knownLangs() map[string]float64 {
|
||||||
langs := make(map[string]float64, len(c.languagesLogProbabilities))
|
langs := make(map[string]float64, len(c.languagesLogProbabilities))
|
||||||
for lang := range c.languagesLogProbabilities {
|
for lang := range c.languagesLogProbabilities {
|
||||||
langs[lang]++
|
langs[lang]++
|
||||||
@ -82,7 +82,7 @@ func (c *classifier) knownLangs() map[string]float64 {
|
|||||||
return langs
|
return langs
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
|
func (c *naiveBayes) tokensLogProbability(tokens []string, language string) float64 {
|
||||||
var sum float64
|
var sum float64
|
||||||
for _, token := range tokens {
|
for _, token := range tokens {
|
||||||
sum += c.tokenProbability(token, language)
|
sum += c.tokenProbability(token, language)
|
||||||
@ -91,7 +91,7 @@ func (c *classifier) tokensLogProbability(tokens []string, language string) floa
|
|||||||
return sum
|
return sum
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *classifier) tokenProbability(token, language string) float64 {
|
func (c *naiveBayes) tokenProbability(token, language string) float64 {
|
||||||
tokenProb, ok := c.tokensLogProbabilities[language][token]
|
tokenProb, ok := c.tokensLogProbabilities[language][token]
|
||||||
if !ok {
|
if !ok {
|
||||||
tokenProb = math.Log(1.000000 / c.tokensTotal)
|
tokenProb = math.Log(1.000000 / c.tokensTotal)
|
||||||
|
16
common.go
16
common.go
@ -27,7 +27,7 @@ var DefaultStrategies = []Strategy{
|
|||||||
}
|
}
|
||||||
|
|
||||||
// defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
|
// defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
|
||||||
var defaultClassifier Classifier = &classifier{
|
var defaultClassifier classifier = &naiveBayes{
|
||||||
languagesLogProbabilities: data.LanguagesLogProbabilities,
|
languagesLogProbabilities: data.LanguagesLogProbabilities,
|
||||||
tokensLogProbabilities: data.TokensLogProbabilities,
|
tokensLogProbabilities: data.TokensLogProbabilities,
|
||||||
tokensTotal: data.TokensTotal,
|
tokensTotal: data.TokensTotal,
|
||||||
@ -108,10 +108,10 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetLanguageBySpecificClassifier returns the most probably language for the given content using
|
// getLanguageBySpecificClassifier returns the most probable language for the given content using
|
||||||
// classifier to detect language.
|
// classifier to detect language.
|
||||||
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
|
func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
|
||||||
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
|
languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
|
||||||
return getFirstLanguageAndSafe(languages)
|
return getFirstLanguageAndSafe(languages)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -420,17 +420,17 @@ func GetLanguagesByClassifier(filename string, content []byte, candidates []stri
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return GetLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
|
return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
|
// getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in the classifier to be used.
|
||||||
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
|
func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
|
||||||
mapCandidates := make(map[string]float64)
|
mapCandidates := make(map[string]float64)
|
||||||
for _, candidate := range candidates {
|
for _, candidate := range candidates {
|
||||||
mapCandidates[candidate]++
|
mapCandidates[candidate]++
|
||||||
}
|
}
|
||||||
|
|
||||||
return classifier.Classify(content, mapCandidates)
|
return classifier.classify(content, mapCandidates)
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetLanguageExtensions returns the different extensions being used by the language.
|
// GetLanguageExtensions returns the different extensions being used by the language.
|
||||||
|
@ -332,7 +332,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
|
|||||||
name string
|
name string
|
||||||
filename string
|
filename string
|
||||||
candidates []string
|
candidates []string
|
||||||
classifier Classifier
|
classifier classifier
|
||||||
expected string
|
expected string
|
||||||
}{
|
}{
|
||||||
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"},
|
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"},
|
||||||
@ -348,7 +348,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
|
|||||||
content, err := ioutil.ReadFile(test.filename)
|
content, err := ioutil.ReadFile(test.filename)
|
||||||
assert.NoError(s.T(), err)
|
assert.NoError(s.T(), err)
|
||||||
|
|
||||||
languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
|
languages := getLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
|
||||||
var language string
|
var language string
|
||||||
if len(languages) == 0 {
|
if len(languages) == 0 {
|
||||||
language = OtherLanguage
|
language = OtherLanguage
|
||||||
|
Loading…
x
Reference in New Issue
Block a user