Merge pull request #214 from bzz/fix-cli-accuracy

CLI: sync report logic \w Linguist
2025-09-18 11:18:12 +00:00 · 2019-04-09 16:32:52 +02:00
parent db6073efa6 bad147cb72
commit 7a6e8ca783
4 changed files with 67 additions and 55 deletions
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.org/src-d/enry.svg?branch=master)](https://travis-ci.org/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry)
+# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.com/src-d/enry.svg?branch=master)](https://travis-ci.com/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry)

 File programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*.

@@ -183,16 +183,28 @@ To run the tests,
 Divergences from linguist
 ------------

+`enry` [CLI tool](#cli) does *not* require a full Git repository to be present in the filesystem in order to report languages.
+
 Using [linguist/samples](https://github.com/github/linguist/tree/master/samples)
 as a set for the tests, the following issues were found:

 * [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine

-* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. Tracked under https://github.com/src-d/enry/issues/193
+* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using a [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses the [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).

-* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL". Tracked under https://github.com/src-d/enry/issues/194
+* Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
+
+* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
+ (Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
+
+* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
+
+* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
+
+* Unlike Linguist, `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules.
+
+In all the cases above that have an issue number, we plan to update enry to match Linguist behaviour.

-`enry` [CLI tool](#cli) does not require a full Git repository to be present in filesystem in order to report languages.

 Benchmarks
 ------------
--- a/cmd/enry/main.go
+++ b/cmd/enry/main.go
@@ -29,8 +29,8 @@ func main() {
 	breakdownFlag := flag.Bool("breakdown", false, "")
 	jsonFlag := flag.Bool("json", false, "")
 	showVersion := flag.Bool("version", false, "Show the enry version information")
-	onlyProg := flag.Bool("prog", false, "Only show programming file types in output")
-	countMode := flag.String("mode", "file", "the method used to count file size. Available options are: file, line and byte")
+	allLangs := flag.Bool("all", false, "Show all files, including those identifed as non-programming languages")
+	countMode := flag.String("mode", "byte", "the method used to count file size. Available options are: file, line and byte")
 	limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)")
 	flag.Parse()
 	limit := (*limitKB) * 1024
@@ -85,6 +85,7 @@ func main() {

 		if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) ||
 			enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) {
+			// TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213
 			if f.IsDir() {
 				return filepath.SkipDir
 			}
@@ -96,24 +97,27 @@ func main() {
 			return nil
 		}

-		language, ok := enry.GetLanguageByExtension(path)
-		if !ok {
-			if language, ok = enry.GetLanguageByFilename(path); !ok {
-				content, err := readFile(path, limit)
-				if err != nil {
-					log.Println(err)
-					return nil
-				}
+		// TODO(bzz): provide API that mimics linguist CLI output for
+		// - running ByExtension & ByFilename
+		// - reading the file, if that did not work
+		// - GetLanguage([]Strategy)
+		content, err := readFile(path, limit)
+		if err != nil {
+			log.Println(err)
+			return nil
+		}
+		// TODO(bzz): skip enry.IsGeneratedContent() as well, after https://github.com/src-d/enry/issues/213

-				language = enry.GetLanguage(filepath.Base(path), content)
-				if language == enry.OtherLanguage {
-					return nil
-				}
-			}
+		language := enry.GetLanguage(filepath.Base(path), content)
+		if language == enry.OtherLanguage {
+			return nil
 		}

-		// If we are displaying only prog. and language is not prog. skip it.
-		if *onlyProg && enry.GetLanguageType(language) != enry.Programming {
+		// If we are not asked to display all, do as
+		// https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/blob_helper.rb#L382
+		if !*allLangs &&
+			enry.GetLanguageType(language) != enry.Programming &&
+			enry.GetLanguageType(language) != enry.Markup {
 			return nil
 		}

@@ -132,11 +136,11 @@ func main() {
 	case *jsonFlag && *breakdownFlag:
 		printBreakDown(out, &buf)
 	case *breakdownFlag:
-		printPercents(out, &buf, *countMode)
+		printPercents(root, out, &buf, *countMode)
 		buf.WriteByte('\n')
 		printBreakDown(out, &buf)
 	default:
-		printPercents(out, &buf, *countMode)
+		printPercents(root, out, &buf, *countMode)
 	}

 	fmt.Print(buf.String())
@@ -178,9 +182,9 @@ func (e filelistError) Error() string {
 	return fmt.Sprintf("Could not process the following files:\n%s", strings.Join(e, "\n"))
 }

-func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string) {
+func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer, mode string) {
 	// Select the way we quantify 'amount' of code.
-	var reducer func([]string) (float64, filelistError)
+	reducer := fileCountValues
 	switch mode {
 	case "file":
 		reducer = fileCountValues
@@ -188,8 +192,6 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
 		reducer = lineCountValues
 	case "byte":
 		reducer = byteCountValues
-	default:
-		reducer = fileCountValues
 	}

 	// Reduce the list of files to a quantity of file type.
@@ -200,7 +202,7 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
 		fileValues      = make(map[string]float64)
 	)
 	for fType, files := range fSummary {
-		val, err := reducer(files)
+		val, err := reducer(root, files)
 		if err != nil {
 			unreadableFiles = append(unreadableFiles, err...)
 		}
@@ -225,25 +227,25 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
 	}
 }

-func fileCountValues(files []string) (float64, filelistError) {
+func fileCountValues(_ string, files []string) (float64, filelistError) {
 	return float64(len(files)), nil
 }

-func lineCountValues(files []string) (float64, filelistError) {
+func lineCountValues(root string, files []string) (float64, filelistError) {
 	var filesErr filelistError
 	var t float64
 	for _, fName := range files {
-		l, _ := getLines(fName, nil)
+		l, _ := getLines(filepath.Join(root, fName), nil)
 		t += float64(l)
 	}
 	return t, filesErr
 }

-func byteCountValues(files []string) (float64, filelistError) {
+func byteCountValues(root string, files []string) (float64, filelistError) {
 	var filesErr filelistError
 	var t float64
 	for _, fName := range files {
-		f, err := os.Open(fName)
+		f, err := os.Open(filepath.Join(root, fName))
 		if err != nil {
 			filesErr = append(filesErr, fName)
 			continue
--- a/common.go
+++ b/common.go
@@ -26,7 +26,7 @@ var DefaultStrategies = []Strategy{
 	GetLanguagesByClassifier,
 }

-// DefaultClassifier is a naive Bayes classifier based on Linguist samples.
+// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
 var DefaultClassifier Classifier = &classifier{
 	languagesLogProbabilities: data.LanguagesLogProbabilities,
 	tokensLogProbabilities:    data.TokensLogProbabilities,
@@ -390,8 +390,8 @@ func getDotIndexes(filename string) []int {
 	return dots
 }

-// GetLanguagesByContent returns a slice of possible languages for the given content.
-// It complies with the signature to be a Strategy type.
+// GetLanguagesByContent returns a slice of languages for the given content.
+// It is a Strategy that uses content-based regexp heuristics and a filename extension.
 func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
 	if filename == "" {
 		return nil
--- a/internal/code-generator/generator/samplesfreq.go
+++ b/internal/code-generator/generator/samplesfreq.go
@@ -7,7 +7,6 @@ import (
 	"io/ioutil"
 	"log"
 	"math"
-	"os"
 	"path/filepath"
 	"sort"
 	"strconv"
@@ -41,7 +40,7 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st
 }

 func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
-	entries, err := ioutil.ReadDir(samplesDir)
+	langDirs, err := ioutil.ReadDir(samplesDir)
 	if err != nil {
 		return nil, err
 	}
@@ -52,13 +51,13 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
 	var tokens = make(map[string]map[string]int)
 	var languageTokens = make(map[string]int)

-	for _, entry := range entries {
-		if !entry.IsDir() {
-			log.Println(err)
+	for _, langDir := range langDirs {
+		if !langDir.IsDir() {
 			continue
 		}

-		samples, err := getSamples(samplesDir, entry)
+		lang := langDir.Name()
+		samples, err := readSamples(filepath.Join(samplesDir, lang))
 		if err != nil {
 			log.Println(err)
 		}
@@ -73,7 +72,6 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
 			continue
 		}

-		lang := entry.Name()
 		languageTotal += len(samples)
 		languages[lang] = len(samples)
 		tokensTotal += len(samplesTokens)
@@ -93,22 +91,23 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
 	}, nil
 }

-func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
-	const samplesSubDir = "filenames"
-	samples := []string{}
-	path := filepath.Join(samplesDir, langDir.Name())
-	entries, err := ioutil.ReadDir(path)
+func readSamples(samplesLangDir string) ([]string, error) {
+	const samplesLangFilesDir = "filenames"
+	sampleFiles, err := ioutil.ReadDir(samplesLangDir)
 	if err != nil {
 		return nil, err
 	}

-	for _, entry := range entries {
-		if entry.Mode().IsRegular() {
-			samples = append(samples, filepath.Join(path, entry.Name()))
+	var samples []string
+	for _, sampleFile := range sampleFiles {
+		filename := filepath.Join(samplesLangDir, sampleFile.Name())
+		if sampleFile.Mode().IsRegular() {
+			samples = append(samples, filename)
+			continue
 		}

-		if entry.IsDir() && entry.Name() == samplesSubDir {
-			subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
+		if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir {
+			subSamples, err := readSubSamples(filename)
 			if err != nil {
 				return nil, err
 			}
@@ -121,9 +120,8 @@ func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
 	return samples, nil
 }

-func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
+func readSubSamples(path string) ([]string, error) {
 	subSamples := []string{}
-	path := filepath.Join(samplesDir, langDir, subLangDir.Name())
 	entries, err := ioutil.ReadDir(path)
 	if err != nil {
 		return nil, err