Merge pull request #214 from bzz/fix-cli-accuracy

CLI: sync report logic \w Linguist
This commit is contained in:
Alexander 2019-04-09 16:32:52 +02:00 committed by GitHub
commit 7a6e8ca783
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 67 additions and 55 deletions

View File

@ -1,4 +1,4 @@
# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.org/src-d/enry.svg?branch=master)](https://travis-ci.org/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry) # enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.com/src-d/enry.svg?branch=master)](https://travis-ci.com/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry)
File programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*. File programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*.
@ -183,16 +183,28 @@ To run the tests,
Divergences from linguist Divergences from linguist
------------ ------------
`enry` [CLI tool](#cli) does *not* require a full Git repository to be present in the filesystem in order to report languages.
Using [linguist/samples](https://github.com/github/linguist/tree/master/samples) Using [linguist/samples](https://github.com/github/linguist/tree/master/samples)
as a set for the tests, the following issues were found: as a set for the tests, the following issues were found:
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine * [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine
* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. Tracked under https://github.com/src-d/enry/issues/193 * As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL". Tracked under https://github.com/src-d/enry/issues/194 * Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as linguist does
In all the cases above that have an issue number - we plan to update enry to match Linguist behaviour.
`enry` [CLI tool](#cli) does not require a full Git repository to be present in the filesystem in order to report languages.
Benchmarks Benchmarks
------------ ------------

View File

@ -29,8 +29,8 @@ func main() {
breakdownFlag := flag.Bool("breakdown", false, "") breakdownFlag := flag.Bool("breakdown", false, "")
jsonFlag := flag.Bool("json", false, "") jsonFlag := flag.Bool("json", false, "")
showVersion := flag.Bool("version", false, "Show the enry version information") showVersion := flag.Bool("version", false, "Show the enry version information")
onlyProg := flag.Bool("prog", false, "Only show programming file types in output") allLangs := flag.Bool("all", false, "Show all files, including those identifed as non-programming languages")
countMode := flag.String("mode", "file", "the method used to count file size. Available options are: file, line and byte") countMode := flag.String("mode", "byte", "the method used to count file size. Available options are: file, line and byte")
limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)") limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)")
flag.Parse() flag.Parse()
limit := (*limitKB) * 1024 limit := (*limitKB) * 1024
@ -85,6 +85,7 @@ func main() {
if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) || if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) ||
enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) { enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) {
// TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213
if f.IsDir() { if f.IsDir() {
return filepath.SkipDir return filepath.SkipDir
} }
@ -96,24 +97,27 @@ func main() {
return nil return nil
} }
language, ok := enry.GetLanguageByExtension(path) // TODO(bzz): provide API that mimics linguist CLI output for
if !ok { // - running ByExtension & ByFilename
if language, ok = enry.GetLanguageByFilename(path); !ok { // - reading the file, if that did not work
content, err := readFile(path, limit) // - GetLanguage([]Strategy)
if err != nil { content, err := readFile(path, limit)
log.Println(err) if err != nil {
return nil log.Println(err)
} return nil
}
// TODO(bzz): skip enry.IsGeneratedContent() as well, after https://github.com/src-d/enry/issues/213
language = enry.GetLanguage(filepath.Base(path), content) language := enry.GetLanguage(filepath.Base(path), content)
if language == enry.OtherLanguage { if language == enry.OtherLanguage {
return nil return nil
}
}
} }
// If we are displaying only prog. and language is not prog. skip it. // If we are not asked to display all, do as
if *onlyProg && enry.GetLanguageType(language) != enry.Programming { // https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/blob_helper.rb#L382
if !*allLangs &&
enry.GetLanguageType(language) != enry.Programming &&
enry.GetLanguageType(language) != enry.Markup {
return nil return nil
} }
@ -132,11 +136,11 @@ func main() {
case *jsonFlag && *breakdownFlag: case *jsonFlag && *breakdownFlag:
printBreakDown(out, &buf) printBreakDown(out, &buf)
case *breakdownFlag: case *breakdownFlag:
printPercents(out, &buf, *countMode) printPercents(root, out, &buf, *countMode)
buf.WriteByte('\n') buf.WriteByte('\n')
printBreakDown(out, &buf) printBreakDown(out, &buf)
default: default:
printPercents(out, &buf, *countMode) printPercents(root, out, &buf, *countMode)
} }
fmt.Print(buf.String()) fmt.Print(buf.String())
@ -178,9 +182,9 @@ func (e filelistError) Error() string {
return fmt.Sprintf("Could not process the following files:\n%s", strings.Join(e, "\n")) return fmt.Sprintf("Could not process the following files:\n%s", strings.Join(e, "\n"))
} }
func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string) { func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer, mode string) {
// Select the way we quantify 'amount' of code. // Select the way we quantify 'amount' of code.
var reducer func([]string) (float64, filelistError) reducer := fileCountValues
switch mode { switch mode {
case "file": case "file":
reducer = fileCountValues reducer = fileCountValues
@ -188,8 +192,6 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
reducer = lineCountValues reducer = lineCountValues
case "byte": case "byte":
reducer = byteCountValues reducer = byteCountValues
default:
reducer = fileCountValues
} }
// Reduce the list of files to a quantity of file type. // Reduce the list of files to a quantity of file type.
@ -200,7 +202,7 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
fileValues = make(map[string]float64) fileValues = make(map[string]float64)
) )
for fType, files := range fSummary { for fType, files := range fSummary {
val, err := reducer(files) val, err := reducer(root, files)
if err != nil { if err != nil {
unreadableFiles = append(unreadableFiles, err...) unreadableFiles = append(unreadableFiles, err...)
} }
@ -225,25 +227,25 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
} }
} }
func fileCountValues(files []string) (float64, filelistError) { func fileCountValues(_ string, files []string) (float64, filelistError) {
return float64(len(files)), nil return float64(len(files)), nil
} }
func lineCountValues(files []string) (float64, filelistError) { func lineCountValues(root string, files []string) (float64, filelistError) {
var filesErr filelistError var filesErr filelistError
var t float64 var t float64
for _, fName := range files { for _, fName := range files {
l, _ := getLines(fName, nil) l, _ := getLines(filepath.Join(root, fName), nil)
t += float64(l) t += float64(l)
} }
return t, filesErr return t, filesErr
} }
func byteCountValues(files []string) (float64, filelistError) { func byteCountValues(root string, files []string) (float64, filelistError) {
var filesErr filelistError var filesErr filelistError
var t float64 var t float64
for _, fName := range files { for _, fName := range files {
f, err := os.Open(fName) f, err := os.Open(filepath.Join(root, fName))
if err != nil { if err != nil {
filesErr = append(filesErr, fName) filesErr = append(filesErr, fName)
continue continue

View File

@ -26,7 +26,7 @@ var DefaultStrategies = []Strategy{
GetLanguagesByClassifier, GetLanguagesByClassifier,
} }
// DefaultClassifier is a naive Bayes classifier based on Linguist samples. // DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var DefaultClassifier Classifier = &classifier{ var DefaultClassifier Classifier = &classifier{
languagesLogProbabilities: data.LanguagesLogProbabilities, languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities, tokensLogProbabilities: data.TokensLogProbabilities,
@ -390,8 +390,8 @@ func getDotIndexes(filename string) []int {
return dots return dots
} }
// GetLanguagesByContent returns a slice of possible languages for the given content. // GetLanguagesByContent returns a slice of languages for the given content.
// It complies with the signature to be a Strategy type. // It is a Strategy that uses content-based regexp heuristics and a filename extension.
func GetLanguagesByContent(filename string, content []byte, _ []string) []string { func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
if filename == "" { if filename == "" {
return nil return nil

View File

@ -7,7 +7,6 @@ import (
"io/ioutil" "io/ioutil"
"log" "log"
"math" "math"
"os"
"path/filepath" "path/filepath"
"sort" "sort"
"strconv" "strconv"
@ -41,7 +40,7 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st
} }
func getFrequencies(samplesDir string) (*samplesFrequencies, error) { func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
entries, err := ioutil.ReadDir(samplesDir) langDirs, err := ioutil.ReadDir(samplesDir)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -52,13 +51,13 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
var tokens = make(map[string]map[string]int) var tokens = make(map[string]map[string]int)
var languageTokens = make(map[string]int) var languageTokens = make(map[string]int)
for _, entry := range entries { for _, langDir := range langDirs {
if !entry.IsDir() { if !langDir.IsDir() {
log.Println(err)
continue continue
} }
samples, err := getSamples(samplesDir, entry) lang := langDir.Name()
samples, err := readSamples(filepath.Join(samplesDir, lang))
if err != nil { if err != nil {
log.Println(err) log.Println(err)
} }
@ -73,7 +72,6 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
continue continue
} }
lang := entry.Name()
languageTotal += len(samples) languageTotal += len(samples)
languages[lang] = len(samples) languages[lang] = len(samples)
tokensTotal += len(samplesTokens) tokensTotal += len(samplesTokens)
@ -93,22 +91,23 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
}, nil }, nil
} }
func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) { func readSamples(samplesLangDir string) ([]string, error) {
const samplesSubDir = "filenames" const samplesLangFilesDir = "filenames"
samples := []string{} sampleFiles, err := ioutil.ReadDir(samplesLangDir)
path := filepath.Join(samplesDir, langDir.Name())
entries, err := ioutil.ReadDir(path)
if err != nil { if err != nil {
return nil, err return nil, err
} }
for _, entry := range entries { var samples []string
if entry.Mode().IsRegular() { for _, sampleFile := range sampleFiles {
samples = append(samples, filepath.Join(path, entry.Name())) filename := filepath.Join(samplesLangDir, sampleFile.Name())
if sampleFile.Mode().IsRegular() {
samples = append(samples, filename)
continue
} }
if entry.IsDir() && entry.Name() == samplesSubDir { if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir {
subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry) subSamples, err := readSubSamples(filename)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -121,9 +120,8 @@ func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
return samples, nil return samples, nil
} }
func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) { func readSubSamples(path string) ([]string, error) {
subSamples := []string{} subSamples := []string{}
path := filepath.Join(samplesDir, langDir, subLangDir.Name())
entries, err := ioutil.ReadDir(path) entries, err := ioutil.ReadDir(path)
if err != nil { if err != nil {
return nil, err return nil, err