From b2b61c2a8c86b09dc9a1c9642e918b70a2c90af9 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 3 Apr 2019 15:40:23 +0200 Subject: [PATCH 1/9] gen: refactoring, renaming vars for readability This does not change the logic of the generator but only renames/moves some vars for readability Signed-off-by: Alexander Bezzubov --- .../code-generator/generator/samplesfreq.go | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/internal/code-generator/generator/samplesfreq.go b/internal/code-generator/generator/samplesfreq.go index 7b734b0..25fe431 100644 --- a/internal/code-generator/generator/samplesfreq.go +++ b/internal/code-generator/generator/samplesfreq.go @@ -7,7 +7,6 @@ import ( "io/ioutil" "log" "math" - "os" "path/filepath" "sort" "strconv" @@ -41,7 +40,7 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st } func getFrequencies(samplesDir string) (*samplesFrequencies, error) { - entries, err := ioutil.ReadDir(samplesDir) + langDirs, err := ioutil.ReadDir(samplesDir) if err != nil { return nil, err } @@ -52,13 +51,14 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { var tokens = make(map[string]map[string]int) var languageTokens = make(map[string]int) - for _, entry := range entries { - if !entry.IsDir() { + for _, langDir := range langDirs { + if !langDir.IsDir() { log.Println(err) continue } - samples, err := getSamples(samplesDir, entry) + lang := langDir.Name() + samples, err := getSamplesFrom(filepath.Join(samplesDir, lang)) if err != nil { log.Println(err) } @@ -73,7 +73,6 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { continue } - lang := entry.Name() languageTotal += len(samples) languages[lang] = len(samples) tokensTotal += len(samplesTokens) @@ -93,22 +92,23 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { }, nil } -func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) { - const 
samplesSubDir = "filenames" - samples := []string{} - path := filepath.Join(samplesDir, langDir.Name()) - entries, err := ioutil.ReadDir(path) +func getSamplesFrom(samplesLangDir string) ([]string, error) { + const samplesLangFilesDir = "filenames" + var samples []string + sampleFiles, err := ioutil.ReadDir(samplesLangDir) if err != nil { return nil, err } - for _, entry := range entries { - if entry.Mode().IsRegular() { - samples = append(samples, filepath.Join(path, entry.Name())) + for _, sampleFile := range sampleFiles { + filename := filepath.Join(samplesLangDir, sampleFile.Name()) + if sampleFile.Mode().IsRegular() { + samples = append(samples, filename) + continue } - if entry.IsDir() && entry.Name() == samplesSubDir { - subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry) + if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir { + subSamples, err := getSubSamplesFrom(filename) if err != nil { return nil, err } @@ -121,9 +121,8 @@ func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) { return samples, nil } -func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) { +func getSubSamplesFrom(path string) ([]string, error) { subSamples := []string{} - path := filepath.Join(samplesDir, langDir, subLangDir.Name()) entries, err := ioutil.ReadDir(path) if err != nil { return nil, err From df01124e1877e4f90f8938ad672bd4611d0ad9e0 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 3 Apr 2019 16:07:14 +0200 Subject: [PATCH 2/9] doc: better wording in API godoc Signed-off-by: Alexander Bezzubov --- common.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common.go b/common.go index 3486274..b88c339 100644 --- a/common.go +++ b/common.go @@ -26,7 +26,7 @@ var DefaultStrategies = []Strategy{ GetLanguagesByClassifier, } -// DefaultClassifier is a naive Bayes classifier based on Linguist samples. 
+// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples. var DefaultClassifier Classifier = &classifier{ languagesLogProbabilities: data.LanguagesLogProbabilities, tokensLogProbabilities: data.TokensLogProbabilities, @@ -390,8 +390,8 @@ func getDotIndexes(filename string) []int { return dots } -// GetLanguagesByContent returns a slice of possible languages for the given content. -// It complies with the signature to be a Strategy type. +// GetLanguagesByContent returns a slice of languages for the given content. +// It is a Strategy that uses a content-based regexp heuristics and a filename extension. func GetLanguagesByContent(filename string, content []byte, _ []string) []string { if filename == "" { return nil From 88810fed12a5264c82a147fa30c1eaa2e7bfa015 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 3 Apr 2019 16:21:10 +0200 Subject: [PATCH 3/9] cli: mimic linguist output by default This includes next main changes: - default: print only Programming and Markup types as Linguist does - `-prog` option replaced with `-all`, to allow for previous behavior - always use GetLanguage as main source of truth that fixes #204 and perf will be restored under #212 Signed-off-by: Alexander Bezzubov --- cmd/enry/main.go | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cmd/enry/main.go b/cmd/enry/main.go index b67d2aa..139a6c0 100644 --- a/cmd/enry/main.go +++ b/cmd/enry/main.go @@ -29,7 +29,7 @@ func main() { breakdownFlag := flag.Bool("breakdown", false, "") jsonFlag := flag.Bool("json", false, "") showVersion := flag.Bool("version", false, "Show the enry version information") - onlyProg := flag.Bool("prog", false, "Only show programming file types in output") + allLangs := flag.Bool("all", false, "Show not only the files with programming languages (default) but all languages instead") countMode := flag.String("mode", "file", "the method used to count file size. 
Available options are: file, line and byte") limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)") flag.Parse() @@ -96,24 +96,25 @@ func main() { return nil } - language, ok := enry.GetLanguageByExtension(path) - if !ok { - if language, ok = enry.GetLanguageByFilename(path); !ok { - content, err := readFile(path, limit) - if err != nil { - log.Println(err) - return nil - } - - language = enry.GetLanguage(filepath.Base(path), content) - if language == enry.OtherLanguage { - return nil - } - } + //TODO(bzz): provide API that mimics lingust CLI output for + // running ByExtension & ByFilename + // reading the file, if that did not work + // GetLanguage([]Strategy) + content, err := readFile(path, limit) + if err != nil { + log.Println(err) + return nil } - // If we are displaying only prog. and language is not prog. skip it. - if *onlyProg && enry.GetLanguageType(language) != enry.Programming { + language := enry.GetLanguage(filepath.Base(path), content) + if language == enry.OtherLanguage { + return nil + } + + // If we are displaying only prog, skip it + if !*allLangs && + enry.GetLanguageType(language) != enry.Programming && + enry.GetLanguageType(language) != enry.Markup { return nil } From c9f1793a78198109297b496fdd1d42d9699b1dfa Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 3 Apr 2019 17:35:03 +0200 Subject: [PATCH 4/9] doc: update godoc and README \w supported features Signed-off-by: Alexander Bezzubov --- README.md | 13 +++++++++++-- cmd/enry/main.go | 5 ++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3fb2c8d..3b93a86 100644 --- a/README.md +++ b/README.md @@ -188,9 +188,18 @@ as a set for the tests, the following issues were found: * [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 
regexp engine -* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. Tracked under https://github.com/src-d/enry/issues/193 +* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. [#193](https://github.com/src-d/enry/issues/193) -* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL". Tracked under https://github.com/src-d/enry/issues/194 +* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL. [#194](https://github.com/src-d/enry/issues/194) + +* Dection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. + (Thus they are not exclued from CLI output) [#213](https://github.com/src-d/enry/issues/213) + +* XML detection strategy is not implemented. [#192](https://github.com/src-d/enry/issues/192) + +* Overriding languaes and types though `.gitattributes` is not yet supported. [#18](https://github.com/src-d/enry/issues/18) + +* enry CLI output does NOT exclude `.gitignore`ed files and submodel dirs as linguist does `enry` [CLI tool](#cli) does not require a full Git repository to be present in filesystem in order to report languages. 
diff --git a/cmd/enry/main.go b/cmd/enry/main.go index 139a6c0..1b7e4aa 100644 --- a/cmd/enry/main.go +++ b/cmd/enry/main.go @@ -85,6 +85,7 @@ func main() { if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) || enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) { + //TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213 if f.IsDir() { return filepath.SkipDir } @@ -105,13 +106,15 @@ func main() { log.Println(err) return nil } + //TODO(bzz): skip enry.IsGeneratedContent() after https://github.com/src-d/enry/issues/213 language := enry.GetLanguage(filepath.Base(path), content) if language == enry.OtherLanguage { return nil } - // If we are displaying only prog, skip it + // If we are not asked to display all, do as + // https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/blob_helper.rb#L382 if !*allLangs && enry.GetLanguageType(language) != enry.Programming && enry.GetLanguageType(language) != enry.Markup { From 94e8598d3d05c08def60e7dea0a3404f797a5010 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Thu, 4 Apr 2019 15:27:12 +0200 Subject: [PATCH 5/9] doc: update TravisCI links Signed-off-by: Alexander Bezzubov --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b93a86..7c2f7bb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.org/src-d/enry.svg?branch=master)](https://travis-ci.org/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry) +# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.com/src-d/enry.svg?branch=master)](https://travis-ci.com/src-d/enry) 
[![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry) File programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*. From b6027d6d0c4f31f63f1527cc7270a838585ec2dc Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Thu, 4 Apr 2019 22:02:30 +0200 Subject: [PATCH 6/9] cli: mode=byte by default + fix file reading Signed-off-by: Alexander Bezzubov --- cmd/enry/main.go | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cmd/enry/main.go b/cmd/enry/main.go index 1b7e4aa..20095ec 100644 --- a/cmd/enry/main.go +++ b/cmd/enry/main.go @@ -30,7 +30,7 @@ func main() { jsonFlag := flag.Bool("json", false, "") showVersion := flag.Bool("version", false, "Show the enry version information") allLangs := flag.Bool("all", false, "Show not only the files with programming languages (default) but all languages instead") - countMode := flag.String("mode", "file", "the method used to count file size. Available options are: file, line and byte") + countMode := flag.String("mode", "byte", "the method used to count file size. 
Available options are: file, line and byte") limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)") flag.Parse() limit := (*limitKB) * 1024 @@ -136,11 +136,11 @@ func main() { case *jsonFlag && *breakdownFlag: printBreakDown(out, &buf) case *breakdownFlag: - printPercents(out, &buf, *countMode) + printPercents(root, out, &buf, *countMode) buf.WriteByte('\n') printBreakDown(out, &buf) default: - printPercents(out, &buf, *countMode) + printPercents(root, out, &buf, *countMode) } fmt.Print(buf.String()) @@ -182,9 +182,9 @@ func (e filelistError) Error() string { return fmt.Sprintf("Could not process the following files:\n%s", strings.Join(e, "\n")) } -func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string) { +func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer, mode string) { // Select the way we quantify 'amount' of code. - var reducer func([]string) (float64, filelistError) + var reducer func(string, []string) (float64, filelistError) switch mode { case "file": reducer = fileCountValues @@ -204,7 +204,8 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string fileValues = make(map[string]float64) ) for fType, files := range fSummary { - val, err := reducer(files) + //FIXME(bzz): all files here have relative paths + val, err := reducer(root, files) if err != nil { unreadableFiles = append(unreadableFiles, err...) 
} @@ -229,25 +230,25 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string } } -func fileCountValues(files []string) (float64, filelistError) { +func fileCountValues(_ string, files []string) (float64, filelistError) { return float64(len(files)), nil } -func lineCountValues(files []string) (float64, filelistError) { +func lineCountValues(root string, files []string) (float64, filelistError) { var filesErr filelistError var t float64 for _, fName := range files { - l, _ := getLines(fName, nil) + l, _ := getLines(filepath.Join(root, fName), nil) t += float64(l) } return t, filesErr } -func byteCountValues(files []string) (float64, filelistError) { +func byteCountValues(root string, files []string) (float64, filelistError) { var filesErr filelistError var t float64 for _, fName := range files { - f, err := os.Open(fName) + f, err := os.Open(filepath.Join(root, fName)) if err != nil { filesErr = append(filesErr, fName) continue From 416afb45fcdb8606cbfd0e9bc0f81ce889f8d8d3 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 8 Apr 2019 15:58:46 +0200 Subject: [PATCH 7/9] doc: better wording in 'divergences from linguist' section Signed-off-by: Alexander Bezzubov --- README.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7c2f7bb..b36d8b9 100644 --- a/README.md +++ b/README.md @@ -183,25 +183,28 @@ To run the tests, Divergences from linguist ------------ +`enry` [CLI tool](#cli) does *not* require a full Git repository to be present in the filesystem in order to report languages. 
+ Using [linguist/samples](https://github.com/github/linguist/tree/master/samples) as a set for the tests, the following issues were found: * [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine -* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. [#193](https://github.com/src-d/enry/issues/193) +* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). -* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL. [#194](https://github.com/src-d/enry/issues/194) +* Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194). -* Dection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. - (Thus they are not exclued from CLI output) [#213](https://github.com/src-d/enry/issues/213) +* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. + (Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213). -* XML detection strategy is not implemented. 
[#192](https://github.com/src-d/enry/issues/192) +* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192). -* Overriding languaes and types though `.gitattributes` is not yet supported. [#18](https://github.com/src-d/enry/issues/18) +* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18). -* enry CLI output does NOT exclude `.gitignore`ed files and submodel dirs as linguist does +* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as linguist does + +In all the cases above that have an issue number - we plan to update enry to match Linguist behaviour. -`enry` [CLI tool](#cli) does not require a full Git repository to be present in filesystem in order to report languages. Benchmarks ------------ From bdb5603f28eee0c96782d4e4520f9e9644a5fb3f Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 8 Apr 2019 16:07:10 +0200 Subject: [PATCH 8/9] Address code review feedback Signed-off-by: Alexander Bezzubov --- cmd/enry/main.go | 19 ++++++++----------- common.go | 2 +- .../code-generator/generator/samplesfreq.go | 11 +++++------ 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/cmd/enry/main.go b/cmd/enry/main.go index 20095ec..9ad255d 100644 --- a/cmd/enry/main.go +++ b/cmd/enry/main.go @@ -29,7 +29,7 @@ func main() { breakdownFlag := flag.Bool("breakdown", false, "") jsonFlag := flag.Bool("json", false, "") showVersion := flag.Bool("version", false, "Show the enry version information") - allLangs := flag.Bool("all", false, "Show not only the files with programming languages (default) but all languages instead") + allLangs := flag.Bool("all", false, "Show all files, including those identifed as non-programming languages") countMode := flag.String("mode", "byte", "the method used to count file size. 
Available options are: file, line and byte") limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)") flag.Parse() @@ -85,7 +85,7 @@ func main() { if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) || enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) { - //TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213 + // TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213 if f.IsDir() { return filepath.SkipDir } @@ -97,16 +97,16 @@ func main() { return nil } - //TODO(bzz): provide API that mimics lingust CLI output for - // running ByExtension & ByFilename - // reading the file, if that did not work - // GetLanguage([]Strategy) + // TODO(bzz): provide API that mimics lingust CLI output for + // - running ByExtension & ByFilename + // - reading the file, if that did not work + // - GetLanguage([]Strategy) content, err := readFile(path, limit) if err != nil { log.Println(err) return nil } - //TODO(bzz): skip enry.IsGeneratedContent() after https://github.com/src-d/enry/issues/213 + // TODO(bzz): skip enry.IsGeneratedContent() as well, after https://github.com/src-d/enry/issues/213 language := enry.GetLanguage(filepath.Base(path), content) if language == enry.OtherLanguage { @@ -184,7 +184,7 @@ func (e filelistError) Error() string { func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer, mode string) { // Select the way we quantify 'amount' of code. - var reducer func(string, []string) (float64, filelistError) + reducer := fileCountValues switch mode { case "file": reducer = fileCountValues @@ -192,8 +192,6 @@ func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer reducer = lineCountValues case "byte": reducer = byteCountValues - default: - reducer = fileCountValues } // Reduce the list of files to a quantity of file type. 
@@ -204,7 +202,6 @@ func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer fileValues = make(map[string]float64) ) for fType, files := range fSummary { - //FIXME(bzz): all files here have relative paths val, err := reducer(root, files) if err != nil { unreadableFiles = append(unreadableFiles, err...) diff --git a/common.go b/common.go index b88c339..567dd96 100644 --- a/common.go +++ b/common.go @@ -391,7 +391,7 @@ func getDotIndexes(filename string) []int { } // GetLanguagesByContent returns a slice of languages for the given content. -// It is a Strategy that uses a content-based regexp heuristics and a filename extension. +// It is a Strategy that uses content-based regexp heuristics and a filename extension. func GetLanguagesByContent(filename string, content []byte, _ []string) []string { if filename == "" { return nil diff --git a/internal/code-generator/generator/samplesfreq.go b/internal/code-generator/generator/samplesfreq.go index 25fe431..fc166d9 100644 --- a/internal/code-generator/generator/samplesfreq.go +++ b/internal/code-generator/generator/samplesfreq.go @@ -53,12 +53,11 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { for _, langDir := range langDirs { if !langDir.IsDir() { - log.Println(err) continue } lang := langDir.Name() - samples, err := getSamplesFrom(filepath.Join(samplesDir, lang)) + samples, err := readSamples(filepath.Join(samplesDir, lang)) if err != nil { log.Println(err) } @@ -92,14 +91,14 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { }, nil } -func getSamplesFrom(samplesLangDir string) ([]string, error) { +func readSamples(samplesLangDir string) ([]string, error) { const samplesLangFilesDir = "filenames" - var samples []string sampleFiles, err := ioutil.ReadDir(samplesLangDir) if err != nil { return nil, err } + var samples []string for _, sampleFile := range sampleFiles { filename := filepath.Join(samplesLangDir, sampleFile.Name()) if 
sampleFile.Mode().IsRegular() { @@ -108,7 +107,7 @@ func getSamplesFrom(samplesLangDir string) ([]string, error) { } if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir { - subSamples, err := getSubSamplesFrom(filename) + subSamples, err := readSubSamples(filename) if err != nil { return nil, err } @@ -121,7 +120,7 @@ func getSamplesFrom(samplesLangDir string) ([]string, error) { return samples, nil } -func getSubSamplesFrom(path string) ([]string, error) { +func readSubSamples(path string) ([]string, error) { subSamples := []string{} entries, err := ioutil.ReadDir(path) if err != nil { From bad147cb7286fe16cfab4f82c1ffb5791c4a722d Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Mon, 8 Apr 2019 18:00:43 +0200 Subject: [PATCH 9/9] Update README.md Co-Authored-By: bzz --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b36d8b9..66e2156 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ as a set for the tests, the following issues were found: * [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine -* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). +* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. 
See [#193](https://github.com/src-d/enry/issues/193). * Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194).