Merge pull request #248 from bzz/go-api-surface

go: reduce API surface
2025-09-18 03:08:13 +00:00 · 2019-10-29 19:13:48 +01:00
parent 4d5ca8b9a6 aa40f75657
commit 697929e149
4 changed files with 34 additions and 33 deletions
--- a/benchmark_test.go
+++ b/benchmark_test.go
@@ -140,7 +140,7 @@ func BenchmarkClassifyTotal(b *testing.B) {
 	b.Run("Classify()_TOTAL", func(b *testing.B) {
 		for n := 0; n < b.N; n++ {
 			for _, sample := range samples {
-				o = DefaultClassifier.Classify(sample.content, nil)
+				o = defaultClassifier.classify(sample.content, nil)
 			}

 			overcomeLanguages = o
@@ -195,7 +195,7 @@ func BenchmarkClassifyPerSample(b *testing.B) {
 	for _, sample := range samples {
 		b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
 			for n := 0; n < b.N; n++ {
-				o = DefaultClassifier.Classify(sample.content, nil)
+				o = defaultClassifier.classify(sample.content, nil)
 			}

 			overcomeLanguages = o
--- a/classifier.go
+++ b/classifier.go
@@ -7,13 +7,13 @@ import (
 	"github.com/src-d/enry/v2/internal/tokenizer"
 )

-// Classifier is the interface in charge to detect the possible languages of the given content based on a set of
+// classifier is the interface in charge of detecting the possible languages of the given content based on a set of
 // candidates. Candidates is a map which can be used to assign weights to languages dynamically.
-type Classifier interface {
-	Classify(content []byte, candidates map[string]float64) (languages []string)
+type classifier interface {
+	classify(content []byte, candidates map[string]float64) (languages []string)
 }

-type classifier struct {
+type naiveBayes struct {
 	languagesLogProbabilities map[string]float64
 	tokensLogProbabilities    map[string]map[string]float64
 	tokensTotal               float64
@@ -24,8 +24,8 @@ type scoredLanguage struct {
 	score    float64
 }

-// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
-func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
+// classify returns a slice of possible languages sorted by decreasing language probability.
+func (c *naiveBayes) classify(content []byte, candidates map[string]float64) []string {

 	var languages map[string]float64
 	if len(candidates) == 0 {
@@ -73,7 +73,7 @@ func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
 	return sortedLanguages
 }

-func (c *classifier) knownLangs() map[string]float64 {
+func (c *naiveBayes) knownLangs() map[string]float64 {
 	langs := make(map[string]float64, len(c.languagesLogProbabilities))
 	for lang := range c.languagesLogProbabilities {
 		langs[lang]++
@@ -82,7 +82,7 @@ func (c *classifier) knownLangs() map[string]float64 {
 	return langs
 }

-func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
+func (c *naiveBayes) tokensLogProbability(tokens []string, language string) float64 {
 	var sum float64
 	for _, token := range tokens {
 		sum += c.tokenProbability(token, language)
@@ -91,7 +91,7 @@ func (c *classifier) tokensLogProbability(tokens []string, language string) floa
 	return sum
 }

-func (c *classifier) tokenProbability(token, language string) float64 {
+func (c *naiveBayes) tokenProbability(token, language string) float64 {
 	tokenProb, ok := c.tokensLogProbabilities[language][token]
 	if !ok {
 		tokenProb = math.Log(1.000000 / c.tokensTotal)
--- a/common.go
+++ b/common.go
@@ -26,8 +26,8 @@ var DefaultStrategies = []Strategy{
 	GetLanguagesByClassifier,
 }

-// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
-var DefaultClassifier Classifier = &classifier{
+// defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
+var defaultClassifier classifier = &naiveBayes{
 	languagesLogProbabilities: data.LanguagesLogProbabilities,
 	tokensLogProbabilities:    data.TokensLogProbabilities,
 	tokensTotal:               data.TokensTotal,
@@ -92,7 +92,7 @@ func GetLanguageByContent(filename string, content []byte) (language string, saf
 }

 // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
-// DefaultClassifier, if no candidates are provided it returns OtherLanguage.
+// defaultClassifier; if no candidates are provided it returns OtherLanguage.
 func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
 	return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
 }
@@ -108,10 +108,10 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
 	return
 }

-// GetLanguageBySpecificClassifier returns the most probably language for the given content using
+// getLanguageBySpecificClassifier returns the most probable language for the given content using
 // classifier to detect language.
-func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
-	languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
+func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
+	languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
 	return getFirstLanguageAndSafe(languages)
 }

@@ -413,27 +413,28 @@ func GetLanguagesByContent(filename string, content []byte, _ []string) []string
 	return heuristic.Match(content)
 }

-// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
-// decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
+// GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
+// decreasing language's probability. If there are no candidates it returns nil.
+// It is a Strategy that uses a pre-trained defaultClassifier.
 func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
 	if len(candidates) == 0 {
 		return nil
 	}

-	return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
+	return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
 }

-// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
-func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
+// getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in the classifier to be used.
+func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
 	mapCandidates := make(map[string]float64)
 	for _, candidate := range candidates {
 		mapCandidates[candidate]++
 	}

-	return classifier.Classify(content, mapCandidates)
+	return classifier.classify(content, mapCandidates)
 }

-// GetLanguageExtensions returns the different extensions being used by the language.
+// GetLanguageExtensions returns all extensions associated with the given language.
 func GetLanguageExtensions(language string) []string {
 	return data.ExtensionsByLanguage[language]
 }
--- a/common_test.go
+++ b/common_test.go
@@ -332,23 +332,23 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
 		name       string
 		filename   string
 		candidates []string
-		classifier Classifier
+		classifier classifier
 		expected   string
 	}{
-		{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "C"},
-		{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: nil, classifier: DefaultClassifier, expected: "C"},
-		{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(s.samplesDir, "C/main.c"), candidates: []string{}, classifier: DefaultClassifier, expected: "C"},
-		{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
-		{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
-		{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
-		{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: "XML"},
+		{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"},
+		{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: nil, classifier: defaultClassifier, expected: "C"},
+		{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(s.samplesDir, "C/main.c"), candidates: []string{}, classifier: defaultClassifier, expected: "C"},
+		{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: defaultClassifier, expected: "C++"},
+		{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: defaultClassifier, expected: "Ruby"},
+		{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "Python"},
+		{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: defaultClassifier, expected: "XML"},
 	}

 	for _, test := range test {
 		content, err := ioutil.ReadFile(test.filename)
 		assert.NoError(s.T(), err)

-		languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
+		languages := getLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
 		var language string
 		if len(languages) == 0 {
 			language = OtherLanguage