mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 08:30:07 -03:00
Merge pull request #214 from bzz/fix-cli-accuracy
CLI: sync report logic \w Linguist
This commit is contained in:
commit
7a6e8ca783
20
README.md
20
README.md
@ -1,4 +1,4 @@
|
||||
# enry [](https://godoc.org/gopkg.in/src-d/enry.v1) [](https://travis-ci.org/src-d/enry) [](https://codecov.io/gh/src-d/enry)
|
||||
# enry [](https://godoc.org/gopkg.in/src-d/enry.v1) [](https://travis-ci.com/src-d/enry) [](https://codecov.io/gh/src-d/enry)
|
||||
|
||||
File programming language detector and toolbox to ignore binary or vendored files. *enry* started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, and has an improved *2x performance*.
|
||||
|
||||
@ -183,16 +183,28 @@ To run the tests,
|
||||
Divergences from linguist
|
||||
------------
|
||||
|
||||
`enry` [CLI tool](#cli) does *not* require a full Git repository to be present in the filesystem in order to report languages.
|
||||
|
||||
Using [linguist/samples](https://github.com/github/linguist/tree/master/samples)
|
||||
as a set for the tests, the following issues were found:
|
||||
|
||||
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine
|
||||
|
||||
* As of (Linguist v5.3.2)[https://github.com/github/linguist/releases/tag/v5.3.2] it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry stil uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. Tracked under https://github.com/src-d/enry/issues/193
|
||||
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2), Linguist uses a [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses the [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
|
||||
|
||||
* Bayesian classifier cann't distinguish "SQL" vs "PLpgSQL". Tracked under https://github.com/src-d/enry/issues/194
|
||||
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
|
||||
|
||||
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
|
||||
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
|
||||
|
||||
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
|
||||
|
||||
* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
|
||||
|
||||
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as linguist does
|
||||
|
||||
For all the cases above that have an issue number, we plan to update enry to match Linguist's behaviour.
|
||||
|
||||
`enry` [CLI tool](#cli) does not require a full Git repository to be present in filesystem in order to report languages.
|
||||
|
||||
Benchmarks
|
||||
------------
|
||||
|
@ -29,8 +29,8 @@ func main() {
|
||||
breakdownFlag := flag.Bool("breakdown", false, "")
|
||||
jsonFlag := flag.Bool("json", false, "")
|
||||
showVersion := flag.Bool("version", false, "Show the enry version information")
|
||||
onlyProg := flag.Bool("prog", false, "Only show programming file types in output")
|
||||
countMode := flag.String("mode", "file", "the method used to count file size. Available options are: file, line and byte")
|
||||
allLangs := flag.Bool("all", false, "Show all files, including those identifed as non-programming languages")
|
||||
countMode := flag.String("mode", "byte", "the method used to count file size. Available options are: file, line and byte")
|
||||
limitKB := flag.Int64("limit", 16*1024, "Analyse first N KB of the file (-1 means no limit)")
|
||||
flag.Parse()
|
||||
limit := (*limitKB) * 1024
|
||||
@ -85,6 +85,7 @@ func main() {
|
||||
|
||||
if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) ||
|
||||
enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) {
|
||||
// TODO(bzz): skip enry.IsGeneratedPath() after https://github.com/src-d/enry/issues/213
|
||||
if f.IsDir() {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
@ -96,24 +97,27 @@ func main() {
|
||||
return nil
|
||||
}
|
||||
|
||||
language, ok := enry.GetLanguageByExtension(path)
|
||||
if !ok {
|
||||
if language, ok = enry.GetLanguageByFilename(path); !ok {
|
||||
content, err := readFile(path, limit)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return nil
|
||||
}
|
||||
// TODO(bzz): provide API that mimics lingust CLI output for
|
||||
// - running ByExtension & ByFilename
|
||||
// - reading the file, if that did not work
|
||||
// - GetLanguage([]Strategy)
|
||||
content, err := readFile(path, limit)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return nil
|
||||
}
|
||||
// TODO(bzz): skip enry.IsGeneratedContent() as well, after https://github.com/src-d/enry/issues/213
|
||||
|
||||
language = enry.GetLanguage(filepath.Base(path), content)
|
||||
if language == enry.OtherLanguage {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
language := enry.GetLanguage(filepath.Base(path), content)
|
||||
if language == enry.OtherLanguage {
|
||||
return nil
|
||||
}
|
||||
|
||||
// If we are displaying only prog. and language is not prog. skip it.
|
||||
if *onlyProg && enry.GetLanguageType(language) != enry.Programming {
|
||||
// If we are not asked to display all, do as
|
||||
// https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/blob_helper.rb#L382
|
||||
if !*allLangs &&
|
||||
enry.GetLanguageType(language) != enry.Programming &&
|
||||
enry.GetLanguageType(language) != enry.Markup {
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -132,11 +136,11 @@ func main() {
|
||||
case *jsonFlag && *breakdownFlag:
|
||||
printBreakDown(out, &buf)
|
||||
case *breakdownFlag:
|
||||
printPercents(out, &buf, *countMode)
|
||||
printPercents(root, out, &buf, *countMode)
|
||||
buf.WriteByte('\n')
|
||||
printBreakDown(out, &buf)
|
||||
default:
|
||||
printPercents(out, &buf, *countMode)
|
||||
printPercents(root, out, &buf, *countMode)
|
||||
}
|
||||
|
||||
fmt.Print(buf.String())
|
||||
@ -178,9 +182,9 @@ func (e filelistError) Error() string {
|
||||
return fmt.Sprintf("Could not process the following files:\n%s", strings.Join(e, "\n"))
|
||||
}
|
||||
|
||||
func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string) {
|
||||
func printPercents(root string, fSummary map[string][]string, buff *bytes.Buffer, mode string) {
|
||||
// Select the way we quantify 'amount' of code.
|
||||
var reducer func([]string) (float64, filelistError)
|
||||
reducer := fileCountValues
|
||||
switch mode {
|
||||
case "file":
|
||||
reducer = fileCountValues
|
||||
@ -188,8 +192,6 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
|
||||
reducer = lineCountValues
|
||||
case "byte":
|
||||
reducer = byteCountValues
|
||||
default:
|
||||
reducer = fileCountValues
|
||||
}
|
||||
|
||||
// Reduce the list of files to a quantity of file type.
|
||||
@ -200,7 +202,7 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
|
||||
fileValues = make(map[string]float64)
|
||||
)
|
||||
for fType, files := range fSummary {
|
||||
val, err := reducer(files)
|
||||
val, err := reducer(root, files)
|
||||
if err != nil {
|
||||
unreadableFiles = append(unreadableFiles, err...)
|
||||
}
|
||||
@ -225,25 +227,25 @@ func printPercents(fSummary map[string][]string, buff *bytes.Buffer, mode string
|
||||
}
|
||||
}
|
||||
|
||||
func fileCountValues(files []string) (float64, filelistError) {
|
||||
func fileCountValues(_ string, files []string) (float64, filelistError) {
|
||||
return float64(len(files)), nil
|
||||
}
|
||||
|
||||
func lineCountValues(files []string) (float64, filelistError) {
|
||||
func lineCountValues(root string, files []string) (float64, filelistError) {
|
||||
var filesErr filelistError
|
||||
var t float64
|
||||
for _, fName := range files {
|
||||
l, _ := getLines(fName, nil)
|
||||
l, _ := getLines(filepath.Join(root, fName), nil)
|
||||
t += float64(l)
|
||||
}
|
||||
return t, filesErr
|
||||
}
|
||||
|
||||
func byteCountValues(files []string) (float64, filelistError) {
|
||||
func byteCountValues(root string, files []string) (float64, filelistError) {
|
||||
var filesErr filelistError
|
||||
var t float64
|
||||
for _, fName := range files {
|
||||
f, err := os.Open(fName)
|
||||
f, err := os.Open(filepath.Join(root, fName))
|
||||
if err != nil {
|
||||
filesErr = append(filesErr, fName)
|
||||
continue
|
||||
|
@ -26,7 +26,7 @@ var DefaultStrategies = []Strategy{
|
||||
GetLanguagesByClassifier,
|
||||
}
|
||||
|
||||
// DefaultClassifier is a naive Bayes classifier based on Linguist samples.
|
||||
// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
|
||||
var DefaultClassifier Classifier = &classifier{
|
||||
languagesLogProbabilities: data.LanguagesLogProbabilities,
|
||||
tokensLogProbabilities: data.TokensLogProbabilities,
|
||||
@ -390,8 +390,8 @@ func getDotIndexes(filename string) []int {
|
||||
return dots
|
||||
}
|
||||
|
||||
// GetLanguagesByContent returns a slice of possible languages for the given content.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
// GetLanguagesByContent returns a slice of languages for the given content.
|
||||
// It is a Strategy that uses content-based regexp heuristics and a filename extension.
|
||||
func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
|
||||
if filename == "" {
|
||||
return nil
|
||||
|
@ -7,7 +7,6 @@ import (
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
@ -41,7 +40,7 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st
|
||||
}
|
||||
|
||||
func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
|
||||
entries, err := ioutil.ReadDir(samplesDir)
|
||||
langDirs, err := ioutil.ReadDir(samplesDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -52,13 +51,13 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
|
||||
var tokens = make(map[string]map[string]int)
|
||||
var languageTokens = make(map[string]int)
|
||||
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() {
|
||||
log.Println(err)
|
||||
for _, langDir := range langDirs {
|
||||
if !langDir.IsDir() {
|
||||
continue
|
||||
}
|
||||
|
||||
samples, err := getSamples(samplesDir, entry)
|
||||
lang := langDir.Name()
|
||||
samples, err := readSamples(filepath.Join(samplesDir, lang))
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
@ -73,7 +72,6 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
lang := entry.Name()
|
||||
languageTotal += len(samples)
|
||||
languages[lang] = len(samples)
|
||||
tokensTotal += len(samplesTokens)
|
||||
@ -93,22 +91,23 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
|
||||
const samplesSubDir = "filenames"
|
||||
samples := []string{}
|
||||
path := filepath.Join(samplesDir, langDir.Name())
|
||||
entries, err := ioutil.ReadDir(path)
|
||||
func readSamples(samplesLangDir string) ([]string, error) {
|
||||
const samplesLangFilesDir = "filenames"
|
||||
sampleFiles, err := ioutil.ReadDir(samplesLangDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if entry.Mode().IsRegular() {
|
||||
samples = append(samples, filepath.Join(path, entry.Name()))
|
||||
var samples []string
|
||||
for _, sampleFile := range sampleFiles {
|
||||
filename := filepath.Join(samplesLangDir, sampleFile.Name())
|
||||
if sampleFile.Mode().IsRegular() {
|
||||
samples = append(samples, filename)
|
||||
continue
|
||||
}
|
||||
|
||||
if entry.IsDir() && entry.Name() == samplesSubDir {
|
||||
subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
|
||||
if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir {
|
||||
subSamples, err := readSubSamples(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -121,9 +120,8 @@ func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
|
||||
func readSubSamples(path string) ([]string, error) {
|
||||
subSamples := []string{}
|
||||
path := filepath.Join(samplesDir, langDir, subLangDir.Name())
|
||||
entries, err := ioutil.ReadDir(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
Loading…
x
Reference in New Issue
Block a user