Merge pull request #214 from bzz/fix-cli-accuracy

CLI: sync report logic \w Linguist
This commit is contained in:
Alexander 2019-04-09 16:32:52 +02:00 committed by GitHub
commit 7a6e8ca783
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 67 additions and 55 deletions

View File

@ -1,4 +1,4 @@
# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.org/src-d/enry.svg?branch=master)](https://travis-ci.org/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry)
# enry [![GoDoc](https://godoc.org/gopkg.in/src-d/enry.v1?status.svg)](https://godoc.org/gopkg.in/src-d/enry.v1) [![Build Status](https://travis-ci.com/src-d/enry.svg?branch=master)](https://travis-ci.com/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry)
File programming language detector and toolbox to ignore binary or vendored files. *enry* started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, and has an improved *2x performance*.
@ -183,16 +183,28 @@ To run the tests,
Divergences from linguist
------------
`enry` [CLI tool](#cli) does *not* require a full Git repository to be present in the filesystem in order to report languages.
Using [linguist/samples](https://github.com/github/linguist/tree/master/samples)
as a set for the tests, the following issues were found:
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine
* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. Tracked under https://github.com/src-d/enry/issues/193
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL". Tracked under https://github.com/src-d/enry/issues/194
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does.
In all the cases above that have an issue number, we plan to update enry to match Linguist behaviour.
`enry` [CLI tool](#cli) does not require a full Git repository to be present in filesystem in order to report languages.
Benchmarks
------------

View File

@ -29,8 +29,8 @@ func main() {
breakdownFlag := flag.Bool("breakdown", false, "")
jsonFlag := flag.Bool("json", false, "")
showVersion := flag.Bool("version", false, "Show the enry version information")
onlyProg := flag.Bool("prog", false, "Only show programming file types in output")
countMode := flag.String("mode", "file", "the method used to count file size. Available options are: file, line and byte")
allLangs := flag.Bool("all", false, "Show all files, including those identifed as non-programming languages")
countMode := flag.String("mode", "byte", "the method used to count file size. Available options are: file, line and byte")
limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)")
flag.Parse()
limit := (*limitKB) * 1024
@ -85,6 +85,7 @@ func main() {
if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) ||
enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) {
// TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213
if f.IsDir() {
return filepath.SkipDir
}
@ -96,24 +97,27 @@ func main() {
return nil
}
language, ok := enry.GetLanguageByExtension(path)
if !ok {
if language, ok = enry.GetLanguageByFilename(path); !ok {
content, err := readFile(path, limit)
if err != nil {
log.Println(err)
return nil
}
// TODO(bzz): provide API that mimics linguist CLI output for
// - running ByExtension & ByFilename
// - reading the file, if that did not work
// - GetLanguage([]Strategy)
content, err := readFile(path, limit)
if err != nil {
log.Println(err)
return nil
}
// TODO(bzz): skip enry.IsGeneratedContent() as well, after https://github.com/src-d/enry/issues/213
language = enry.GetLanguage(filepath.Base(path), content)
if language == enry.OtherLanguage {
return nil
}
}
language := enry.GetLanguage(filepath.Base(path), content)
if language == enry.OtherLanguage {
return nil
}
// If we are displaying only prog. and language is not prog. skip it.
if *onlyProg && enry.GetLanguageType(language) != enry.Programming {
// If we are not asked to display all, do as
// https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/blob_helper.rb#L382
if !*allLangs &&
enry.GetLanguageType(language) != enry.Programming &&
enry.GetLanguageType(language) != enry.Markup {
return nil
}
@ -132,11 +136,11 @@ func main() {
case *jsonFlag && *breakdownFlag:
printBreakDown(out, &buf)
case *breakdownFlag:
printPercents(out, &buf, *countMode)
printPercents(root, out, &buf, *countMode)
buf.WriteByte('\n')
printBreakDown(out, &buf)
default:
printPercents(out, &buf, *countMode)
printPercents(root, out, &buf, *countMode)
}
fmt.Print(buf.String())
@ -178,9 +182,9 @@ func (e filelistError) Error() string {
return fmt.Sprintf("Could not process the following files:\n%s", strings.Join(e, "\n"))
}
func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string) {
func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer, mode string) {
// Select the way we quantify 'amount' of code.
var reducer func([]string) (float64, filelistError)
reducer := fileCountValues
switch mode {
case "file":
reducer = fileCountValues
@ -188,8 +192,6 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
reducer = lineCountValues
case "byte":
reducer = byteCountValues
default:
reducer = fileCountValues
}
// Reduce the list of files to a quantity of file type.
@ -200,7 +202,7 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
fileValues = make(map[string]float64)
)
for fType, files := range fSummary {
val, err := reducer(files)
val, err := reducer(root, files)
if err != nil {
unreadableFiles = append(unreadableFiles, err...)
}
@ -225,25 +227,25 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
}
}
func fileCountValues(files []string) (float64, filelistError) {
func fileCountValues(_ string, files []string) (float64, filelistError) {
return float64(len(files)), nil
}
func lineCountValues(files []string) (float64, filelistError) {
func lineCountValues(root string, files []string) (float64, filelistError) {
var filesErr filelistError
var t float64
for _, fName := range files {
l, _ := getLines(fName, nil)
l, _ := getLines(filepath.Join(root, fName), nil)
t += float64(l)
}
return t, filesErr
}
func byteCountValues(files []string) (float64, filelistError) {
func byteCountValues(root string, files []string) (float64, filelistError) {
var filesErr filelistError
var t float64
for _, fName := range files {
f, err := os.Open(fName)
f, err := os.Open(filepath.Join(root, fName))
if err != nil {
filesErr = append(filesErr, fName)
continue

View File

@ -26,7 +26,7 @@ var DefaultStrategies = []Strategy{
GetLanguagesByClassifier,
}
// DefaultClassifier is a naive Bayes classifier based on Linguist samples.
// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var DefaultClassifier Classifier = &classifier{
languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities,
@ -390,8 +390,8 @@ func getDotIndexes(filename string) []int {
return dots
}
// GetLanguagesByContent returns a slice of possible languages for the given content.
// It complies with the signature to be a Strategy type.
// GetLanguagesByContent returns a slice of languages for the given content.
// It is a Strategy that uses content-based regexp heuristics and a filename extension.
func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
if filename == "" {
return nil

View File

@ -7,7 +7,6 @@ import (
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"sort"
"strconv"
@ -41,7 +40,7 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st
}
func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
entries, err := ioutil.ReadDir(samplesDir)
langDirs, err := ioutil.ReadDir(samplesDir)
if err != nil {
return nil, err
}
@ -52,13 +51,13 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
var tokens = make(map[string]map[string]int)
var languageTokens = make(map[string]int)
for _, entry := range entries {
if !entry.IsDir() {
log.Println(err)
for _, langDir := range langDirs {
if !langDir.IsDir() {
continue
}
samples, err := getSamples(samplesDir, entry)
lang := langDir.Name()
samples, err := readSamples(filepath.Join(samplesDir, lang))
if err != nil {
log.Println(err)
}
@ -73,7 +72,6 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
continue
}
lang := entry.Name()
languageTotal += len(samples)
languages[lang] = len(samples)
tokensTotal += len(samplesTokens)
@ -93,22 +91,23 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
}, nil
}
func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
const samplesSubDir = "filenames"
samples := []string{}
path := filepath.Join(samplesDir, langDir.Name())
entries, err := ioutil.ReadDir(path)
func readSamples(samplesLangDir string) ([]string, error) {
const samplesLangFilesDir = "filenames"
sampleFiles, err := ioutil.ReadDir(samplesLangDir)
if err != nil {
return nil, err
}
for _, entry := range entries {
if entry.Mode().IsRegular() {
samples = append(samples, filepath.Join(path, entry.Name()))
var samples []string
for _, sampleFile := range sampleFiles {
filename := filepath.Join(samplesLangDir, sampleFile.Name())
if sampleFile.Mode().IsRegular() {
samples = append(samples, filename)
continue
}
if entry.IsDir() && entry.Name() == samplesSubDir {
subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir {
subSamples, err := readSubSamples(filename)
if err != nil {
return nil, err
}
@ -121,9 +120,8 @@ func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
return samples, nil
}
func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
func readSubSamples(path string) ([]string, error) {
subSamples := []string{}
path := filepath.Join(samplesDir, langDir, subLangDir.Name())
entries, err := ioutil.ReadDir(path)
if err != nil {
return nil, err