From fa097f4ed428ef8bd32fcfaa4aa6ca9ffb4b90b3 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Tue, 29 Oct 2019 17:56:13 +0100 Subject: [PATCH] go: remove Classifier from API Further reduces the public API surface by hiding the unused Classifier API for providing pre-trained classifier weights. Signed-off-by: Alexander Bezzubov --- benchmark_test.go | 4 ++-- classifier.go | 18 +++++++++--------- common.go | 16 ++++++++-------- common_test.go | 4 ++-- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/benchmark_test.go b/benchmark_test.go index df7f6db..3b3288e 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -140,7 +140,7 @@ func BenchmarkClassifyTotal(b *testing.B) { b.Run("Classify()_TOTAL", func(b *testing.B) { for n := 0; n < b.N; n++ { for _, sample := range samples { - o = defaultClassifier.Classify(sample.content, nil) + o = defaultClassifier.classify(sample.content, nil) } overcomeLanguages = o @@ -195,7 +195,7 @@ func BenchmarkClassifyPerSample(b *testing.B) { for _, sample := range samples { b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) { for n := 0; n < b.N; n++ { - o = defaultClassifier.Classify(sample.content, nil) + o = defaultClassifier.classify(sample.content, nil) } overcomeLanguages = o diff --git a/classifier.go b/classifier.go index e70efc3..2bcf51a 100644 --- a/classifier.go +++ b/classifier.go @@ -7,13 +7,13 @@ import ( "github.com/src-d/enry/v2/internal/tokenizer" ) -// Classifier is the interface in charge to detect the possible languages of the given content based on a set of +// classifier is the interface in charge to detect the possible languages of the given content based on a set of // candidates. Candidates is a map which can be used to assign weights to languages dynamically. 
-type Classifier interface { - Classify(content []byte, candidates map[string]float64) (languages []string) +type classifier interface { + classify(content []byte, candidates map[string]float64) (languages []string) } -type classifier struct { +type naiveBayes struct { languagesLogProbabilities map[string]float64 tokensLogProbabilities map[string]map[string]float64 tokensTotal float64 @@ -24,8 +24,8 @@ type scoredLanguage struct { score float64 } -// Classify returns a sorted slice of possible languages sorted by decreasing language's probability -func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { +// classify returns a sorted slice of possible languages sorted by decreasing language's probability +func (c *naiveBayes) classify(content []byte, candidates map[string]float64) []string { var languages map[string]float64 if len(candidates) == 0 { @@ -73,7 +73,7 @@ func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string { return sortedLanguages } -func (c *classifier) knownLangs() map[string]float64 { +func (c *naiveBayes) knownLangs() map[string]float64 { langs := make(map[string]float64, len(c.languagesLogProbabilities)) for lang := range c.languagesLogProbabilities { langs[lang]++ @@ -82,7 +82,7 @@ func (c *classifier) knownLangs() map[string]float64 { return langs } -func (c *classifier) tokensLogProbability(tokens []string, language string) float64 { +func (c *naiveBayes) tokensLogProbability(tokens []string, language string) float64 { var sum float64 for _, token := range tokens { sum += c.tokenProbability(token, language) @@ -91,7 +91,7 @@ func (c *classifier) tokensLogProbability(tokens []string, language string) floa return sum } -func (c *classifier) tokenProbability(token, language string) float64 { +func (c *naiveBayes) tokenProbability(token, language string) float64 { tokenProb, ok := c.tokensLogProbabilities[language][token] if !ok { tokenProb = math.Log(1.000000 / c.tokensTotal) diff --git a/common.go 
b/common.go index 866a36f..677ee9c 100644 --- a/common.go +++ b/common.go @@ -27,7 +27,7 @@ var DefaultStrategies = []Strategy{ } // defaultClassifier is a Naive Bayes classifier trained on Linguist samples. -var defaultClassifier Classifier = &classifier{ +var defaultClassifier classifier = &naiveBayes{ languagesLogProbabilities: data.LanguagesLogProbabilities, tokensLogProbabilities: data.TokensLogProbabilities, tokensTotal: data.TokensTotal, @@ -108,10 +108,10 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) { return } -// GetLanguageBySpecificClassifier returns the most probably language for the given content using +// getLanguageBySpecificClassifier returns the most probable language for the given content using // classifier to detect language. -func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) { - languages := GetLanguagesBySpecificClassifier(content, candidates, classifier) +func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) { + languages := getLanguagesBySpecificClassifier(content, candidates, classifier) return getFirstLanguageAndSafe(languages) } @@ -420,17 +420,17 @@ func GetLanguagesByClassifier(filename string, content []byte, candidates []stri return nil } - return GetLanguagesBySpecificClassifier(content, candidates, defaultClassifier) + return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier) } -// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used. -func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) { +// getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a classifier to be used. 
+func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) { mapCandidates := make(map[string]float64) for _, candidate := range candidates { mapCandidates[candidate]++ } - return classifier.Classify(content, mapCandidates) + return classifier.classify(content, mapCandidates) } // GetLanguageExtensions returns the different extensions being used by the language. diff --git a/common_test.go b/common_test.go index 3b1c323..802e466 100644 --- a/common_test.go +++ b/common_test.go @@ -332,7 +332,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() { name string filename string candidates []string - classifier Classifier + classifier classifier expected string }{ {name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"}, @@ -348,7 +348,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() { content, err := ioutil.ReadFile(test.filename) assert.NoError(s.T(), err) - languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier) + languages := getLanguagesBySpecificClassifier(content, test.candidates, test.classifier) var language string if len(languages) == 0 { language = OtherLanguage