Merge pull request #248 from bzz/go-api-surface

go: reduce API surface
This commit is contained in:
Alexander 2019-10-29 19:13:48 +01:00 committed by GitHub
commit 697929e149
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 34 additions and 33 deletions

View File

@ -140,7 +140,7 @@ func BenchmarkClassifyTotal(b *testing.B) {
b.Run("Classify()_TOTAL", func(b *testing.B) { b.Run("Classify()_TOTAL", func(b *testing.B) {
for n := 0; n < b.N; n++ { for n := 0; n < b.N; n++ {
for _, sample := range samples { for _, sample := range samples {
o = DefaultClassifier.Classify(sample.content, nil) o = defaultClassifier.classify(sample.content, nil)
} }
overcomeLanguages = o overcomeLanguages = o
@ -195,7 +195,7 @@ func BenchmarkClassifyPerSample(b *testing.B) {
for _, sample := range samples { for _, sample := range samples {
b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) { b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
for n := 0; n < b.N; n++ { for n := 0; n < b.N; n++ {
o = DefaultClassifier.Classify(sample.content, nil) o = defaultClassifier.classify(sample.content, nil)
} }
overcomeLanguages = o overcomeLanguages = o

View File

@ -7,13 +7,13 @@ import (
"github.com/src-d/enry/v2/internal/tokenizer" "github.com/src-d/enry/v2/internal/tokenizer"
) )
// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of // classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically. // candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface { type classifier interface {
Classify(content []byte, candidates map[string]float64) (languages []string) classify(content []byte, candidates map[string]float64) (languages []string)
} }
type classifier struct { type naiveBayes struct {
languagesLogProbabilities map[string]float64 languagesLogProbabilities map[string]float64
tokensLogProbabilities map[string]map[string]float64 tokensLogProbabilities map[string]map[string]float64
tokensTotal float64 tokensTotal float64
@ -24,8 +24,8 @@ type scoredLanguage struct {
score float64 score float64
} }
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability // classify returns a sorted slice of possible languages sorted by decreasing language's probability
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { func (c *naiveBayes) classify(content []byte, candidates map[string]float64) []string {
var languages map[string]float64 var languages map[string]float64
if len(candidates) == 0 { if len(candidates) == 0 {
@ -73,7 +73,7 @@ func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
return sortedLanguages return sortedLanguages
} }
func (c *classifier) knownLangs() map[string]float64 { func (c *naiveBayes) knownLangs() map[string]float64 {
langs := make(map[string]float64, len(c.languagesLogProbabilities)) langs := make(map[string]float64, len(c.languagesLogProbabilities))
for lang := range c.languagesLogProbabilities { for lang := range c.languagesLogProbabilities {
langs[lang]++ langs[lang]++
@ -82,7 +82,7 @@ func (c *classifier) knownLangs() map[string]float64 {
return langs return langs
} }
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 { func (c *naiveBayes) tokensLogProbability(tokens []string, language string) float64 {
var sum float64 var sum float64
for _, token := range tokens { for _, token := range tokens {
sum += c.tokenProbability(token, language) sum += c.tokenProbability(token, language)
@ -91,7 +91,7 @@ func (c *classifier) tokensLogProbability(tokens []string, language string) floa
return sum return sum
} }
func (c *classifier) tokenProbability(token, language string) float64 { func (c *naiveBayes) tokenProbability(token, language string) float64 {
tokenProb, ok := c.tokensLogProbabilities[language][token] tokenProb, ok := c.tokensLogProbabilities[language][token]
if !ok { if !ok {
tokenProb = math.Log(1.000000 / c.tokensTotal) tokenProb = math.Log(1.000000 / c.tokensTotal)

View File

@ -26,8 +26,8 @@ var DefaultStrategies = []Strategy{
GetLanguagesByClassifier, GetLanguagesByClassifier,
} }
// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples. // defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var DefaultClassifier Classifier = &classifier{ var defaultClassifier classifier = &naiveBayes{
languagesLogProbabilities: data.LanguagesLogProbabilities, languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities, tokensLogProbabilities: data.TokensLogProbabilities,
tokensTotal: data.TokensTotal, tokensTotal: data.TokensTotal,
@ -92,7 +92,7 @@ func GetLanguageByContent(filename string, content []byte) (language string, saf
} }
// GetLanguageByClassifier returns the most probable language detected for the given content. It uses // GetLanguageByClassifier returns the most probable language detected for the given content. It uses
// DefaultClassifier; if no candidates are provided it returns OtherLanguage. // defaultClassifier; if no candidates are provided it returns OtherLanguage.
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) { func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates) return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
} }
@ -108,10 +108,10 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
return return
} }
// GetLanguageBySpecificClassifier returns the most probable language for the given content using // getLanguageBySpecificClassifier returns the most probable language for the given content using
// classifier to detect language. // classifier to detect language.
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) { func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier) languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
return getFirstLanguageAndSafe(languages) return getFirstLanguageAndSafe(languages)
} }
@ -413,27 +413,28 @@ func GetLanguagesByContent(filename string, content []byte, _ []string) []string
return heuristic.Match(content) return heuristic.Match(content)
} }
// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by // GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
// decreasing language's probability. If there are no candidates it returns nil. It complies with the signature to be a Strategy type. // decreasing language's probability. If there are no candidates it returns nil.
// It is a Strategy that uses a pre-trained defaultClassifier.
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) { func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
if len(candidates) == 0 { if len(candidates) == 0 {
return nil return nil
} }
return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier) return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
} }
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used. // getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) { func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
mapCandidates := make(map[string]float64) mapCandidates := make(map[string]float64)
for _, candidate := range candidates { for _, candidate := range candidates {
mapCandidates[candidate]++ mapCandidates[candidate]++
} }
return classifier.Classify(content, mapCandidates) return classifier.classify(content, mapCandidates)
} }
// GetLanguageExtensions returns the different extensions being used by the language. // GetLanguageExtensions returns all extensions associated with the given language.
func GetLanguageExtensions(language string) []string { func GetLanguageExtensions(language string) []string {
return data.ExtensionsByLanguage[language] return data.ExtensionsByLanguage[language]
} }

View File

@ -332,23 +332,23 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
name string name string
filename string filename string
candidates []string candidates []string
classifier Classifier classifier classifier
expected string expected string
}{ }{
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "C"}, {name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: nil, classifier: DefaultClassifier, expected: "C"}, {name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: nil, classifier: defaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(s.samplesDir, "C/main.c"), candidates: []string{}, classifier: DefaultClassifier, expected: "C"}, {name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(s.samplesDir, "C/main.c"), candidates: []string{}, classifier: defaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"}, {name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: defaultClassifier, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"}, {name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: defaultClassifier, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"}, {name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "Python"},
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: "XML"}, {name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: defaultClassifier, expected: "XML"},
} }
for _, test := range test { for _, test := range test {
content, err := ioutil.ReadFile(test.filename) content, err := ioutil.ReadFile(test.filename)
assert.NoError(s.T(), err) assert.NoError(s.T(), err)
languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier) languages := getLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
var language string var language string
if len(languages) == 0 { if len(languages) == 0 {
language = OtherLanguage language = OtherLanguage