Merge commit 'f955c625aded244864e83a872b396868a490dbc5' as 'go-enry'

2025-09-02 03:37:31 +00:00 · 2024-09-04 16:33:41 -03:00
parent 6a38f2f5fb f955c625ad
commit e44f64a7df
192 changed files with 528500 additions and 0 deletions
--- a/go-enry/benchmarks/csv/enry-distribution.csv
+++ b/go-enry/benchmarks/csv/enry-distribution.csv
@@ -0,0 +1,6 @@
+timeInterval,enry,numberOfFiles
+1us-10us,enry,83
+10us-100us,enry,1341
+100us-1ms,enry,314
+1ms-10ms,enry,146
+10ms-100ms,enry,48
--- a/go-enry/benchmarks/csv/enry-samples.csv
+++ b/go-enry/benchmarks/csv/enry-samples.csv
--- a/go-enry/benchmarks/csv/enry-total.csv
+++ b/go-enry/benchmarks/csv/enry-total.csv
@@ -0,0 +1,8 @@
+function,tool,iterations,ns/op
+GetLanguage(),enry,100,2333748307
+Classify(),enry,3,53842505853
+GetLanguagesByModeline(),enry,1000,228234491
+GetLanguagesByFilename(),enry,1000000,124782
+GetLanguagesByShebang(),enry,100000,2339138
+GetLanguagesByExtension(),enry,200000,1110007
+GetLanguagesByContent(),enry,500,342358978
--- a/go-enry/benchmarks/csv/linguist-distribution.csv
+++ b/go-enry/benchmarks/csv/linguist-distribution.csv
@@ -0,0 +1,6 @@
+timeInterval,linguist,numberOfFiles
+1us-10us,linguist,0
+10us-100us,linguist,120
+100us-1ms,linguist,1070
+1ms-10ms,linguist,816
+10ms-100ms,linguist,71
--- a/go-enry/benchmarks/csv/linguist-samples.csv
+++ b/go-enry/benchmarks/csv/linguist-samples.csv
--- a/go-enry/benchmarks/csv/linguist-total.csv
+++ b/go-enry/benchmarks/csv/linguist-total.csv
@@ -0,0 +1,8 @@
+function,tool,iterations,ns/op
+GetLanguage(),linguist,5,3822076000
+Classify(),linguist,5,329660597600
+GetLanguagesByModeline(),linguist,5,2770912600
+GetLanguagesByFilename(),linguist,5,34159000
+GetLanguagesByShebang(),linguist,5,159317200
+GetLanguagesByExtension(),linguist,5,354929800
+GetLanguagesByContent(),linguist,5,3881611000
--- a/go-enry/benchmarks/histogram/distribution.png
+++ b/go-enry/benchmarks/histogram/distribution.png
--- a/go-enry/benchmarks/linguist-samples.rb
+++ b/go-enry/benchmarks/linguist-samples.rb
@@ -0,0 +1,126 @@
+#!/usr/bin/env ruby
+
+require 'benchmark'
+require 'linguist'
+
+iterations = (ARGV[0] || 1).to_i
+
+# BenchBlob wraps a FileBlob to keep data loaded and to clean attributes added by language detection.
+class BenchBlob < Linguist::FileBlob
+  attr_accessor :data
+
+  def initialize(path, base_path = nil)
+    super
+    @data = File.read(@fullpath)
+  end
+
+  def clean
+    @_mime_type = nil
+    @detect_encoding = nil
+    @lines = nil
+  end
+end
+
+def get_samples(root)
+  samples = Array.new
+  Dir.foreach(root) do |file|
+    path = File.join(root, file)
+    if file == "." or file == ".."
+      next
+    elsif File.directory?(path)
+      get_samples(path).each do |blob|
+        samples << blob
+      end
+    else
+      samples << BenchBlob.new(path)
+    end
+  end
+  return samples
+end
+
+samples = get_samples('.linguist/samples')
+languages = Linguist::Language.all
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('GetLanguage()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+      iterations.times do
+        Linguist::detect(blob)
+        blob.clean
+      end
+    end
+  end
+end
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('Classify()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+      iterations.times do
+        Linguist::Classifier.classify(Linguist::Samples.cache, blob.data)
+        blob.clean
+      end
+    end
+  end
+end
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('GetLanguagesByModeline()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+      iterations.times do
+        Linguist::Strategy::Modeline.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('GetLanguagesByFilename()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+    iterations.times do
+        Linguist::Strategy::Filename.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('GetLanguagesByShebang()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+      iterations.times do
+        Linguist::Shebang.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('GetLanguagesByExtension()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+      iterations.times do
+        Linguist::Strategy::Extension.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+samples.each do |blob|
+  sample_name = blob.path.gsub(/\s/, '_')
+  Benchmark.bmbm do |bm|
+    bm.report('GetLanguagesByContent()_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
+    iterations.times do
+        Linguist::Heuristics.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
--- a/go-enry/benchmarks/linguist-total.rb
+++ b/go-enry/benchmarks/linguist-total.rb
@@ -0,0 +1,120 @@
+#!/usr/bin/env ruby
+
+require 'benchmark'
+require 'linguist'
+
+iterations = (ARGV[0] || 1).to_i
+
+# BenchBlob wraps a FileBlob to keep data loaded and to clean attributes added by language detection.
+class BenchBlob < Linguist::FileBlob
+  attr_accessor :data
+  attr_accessor :fullpath
+
+  def initialize(path, base_path = nil)
+    super
+    @data = File.read(@fullpath)
+  end
+
+  def clean
+    @_mime_type = nil
+    @detect_encoding = nil
+    @lines = nil
+  end
+end
+
+def get_samples(root)
+  samples = Array.new
+  Dir.foreach(root) do |file|
+    path = File.join(root, file)
+    if file == "." or file == ".."
+      next
+    elsif File.directory?(path)
+      get_samples(path).each do |blob|
+        samples << blob
+      end
+    else
+      samples << BenchBlob.new(path)
+    end
+  end
+  return samples
+end
+
+samples = get_samples('.linguist/samples')
+languages = Linguist::Language.all
+
+Benchmark.bmbm do |bm|
+  time = bm.report('GetLanguage()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::detect(blob)
+        blob.clean
+      end
+    end
+  end
+end
+
+Benchmark.bmbm do |bm|
+  bm.report('Classify()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::Classifier.classify(Linguist::Samples.cache, blob.data)
+        blob.clean
+      end
+    end
+  end
+end
+
+Benchmark.bmbm do |bm|
+  bm.report('GetLanguagesByModeline()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::Strategy::Modeline.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+Benchmark.bmbm do |bm|
+  bm.report('GetLanguagesByFilename()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::Strategy::Filename.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+Benchmark.bmbm do |bm|
+  bm.report('GetLanguagesByShebang()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::Shebang.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+Benchmark.bmbm do |bm|
+  bm.report('GetLanguagesByExtension()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::Strategy::Extension.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
+
+Benchmark.bmbm do |bm|
+  bm.report('GetLanguagesByContent()_TOTAL ' + iterations.to_s) do
+    iterations.times do
+      samples.each do |blob|
+        Linguist::Heuristics.call(blob, languages)
+        blob.clean
+      end
+    end
+  end
+end
--- a/go-enry/benchmarks/parse.sh
+++ b/go-enry/benchmarks/parse.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -e
+
+cd benchmarks/output
+go run ../parser/main.go -outdir ../csv
+cd ../csv
+go run ../parser/main.go -distribution
--- a/go-enry/benchmarks/parser/main.go
+++ b/go-enry/benchmarks/parser/main.go
@@ -0,0 +1,386 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/csv"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// Benchmarked function names and the file names read and written by this tool.
+const (
+	// functions benchmarked; getRow looks for these names inside benchmark labels
+	getLanguageFunc = "GetLanguage()"
+	classifyFunc    = "Classify()"
+	modelineFunc    = "GetLanguagesByModeline()"
+	filenameFunc    = "GetLanguagesByFilename()"
+	shebangFunc     = "GetLanguagesByShebang()"
+	extensionFunc   = "GetLanguagesByExtension()"
+	contentFunc     = "GetLanguagesByContent()"
+
+	// benchmark output files read by generateCSV
+	enryTotalBench       = "enry_total.bench"
+	enrySamplesBench     = "enry_samples.bench"
+	linguistTotalBench   = "linguist_total.bench"
+	linguistSamplesBench = "linguist_samples.bench"
+
+	// CSV files written by generateCSV
+	enryTotalCSV       = "enry-total.csv"
+	enrySamplesCSV     = "enry-samples.csv"
+	linguistTotalCSV   = "linguist-total.csv"
+	linguistSamplesCSV = "linguist-samples.csv"
+
+	// CSV files written when the -distribution flag is set
+	enryDistributionCSV     = "enry-distribution.csv"
+	linguistDistributionCSV = "linguist-distribution.csv"
+)
+
+var (
+	// flags, populated in main
+	distribution bool
+	outDir       string
+
+	// enryFunctions fixes the order in which functions appear in the generated CSVs.
+	enryFunctions         = []string{getLanguageFunc, classifyFunc, modelineFunc, filenameFunc, shebangFunc, extensionFunc, contentFunc}
+	// distributionIntervals holds the bucket labels; arrangeByTime indexes into it by position.
+	distributionIntervals = []string{"1us-10us", "10us-100us", "100us-1ms", "1ms-10ms", "10ms-100ms"}
+)
+
+// main parses the command-line flags and then writes either the distribution
+// CSVs (when -distribution is set) or the per-benchmark CSVs (the default).
+func main() {
+	flag.StringVar(&outDir, "outdir", "", "path to leave csv files")
+	flag.BoolVar(&distribution, "distribution", false, "generate enry-distribution.csv and linguist-distribution.csv")
+	flag.Parse()
+
+	if !distribution {
+		generateCSV()
+		return
+	}
+
+	generateDistributionCSV()
+}
+
+// generateDistributionCSV reads the enry and linguist sample CSVs from the
+// current directory and writes one distribution CSV per tool into outDir.
+// A failure for one tool is logged and does not stop the other from being
+// processed.
+func generateDistributionCSV() {
+	CSVFiles := []struct {
+		in   string
+		out  string
+		tool string
+	}{
+		{in: enrySamplesCSV, out: enryDistributionCSV, tool: "enry"},
+		{in: linguistSamplesCSV, out: linguistDistributionCSV, tool: "linguist"},
+	}
+
+	for _, CSVFile := range CSVFiles {
+		outPath := filepath.Join(outDir, CSVFile.out)
+		if err := writeDistributionCSV(CSVFile.in, outPath, CSVFile.tool); err != nil {
+			log.Println(err)
+		}
+	}
+}
+
+// writeDistributionCSV converts the samples CSV at inPath into the
+// distribution CSV at outPath for the given tool.
+func writeDistributionCSV(inPath, outPath, tool string) error {
+	f, err := os.Open(inPath)
+	if err != nil {
+		return err
+	}
+	// Deferred inside this per-file helper so the handle is released before
+	// the next input file is opened.
+	defer f.Close()
+
+	CSVSamples, err := csv.NewReader(f).ReadAll()
+	if err != nil {
+		return err
+	}
+
+	// An empty file has no header row to skip; slicing it below would panic.
+	if len(CSVSamples) == 0 {
+		return fmt.Errorf("%s: no rows found", inPath)
+	}
+
+	CSVDistribution, err := buildDistribution(CSVSamples[1:], tool)
+	if err != nil {
+		return err
+	}
+
+	return writeCSV(CSVDistribution, outPath)
+}
+
+// buildDistribution counts, per time interval, the sample rows whose second
+// field is the GetLanguage() function, using the last field of each row as
+// the measured value. It returns a header row followed by one row per entry
+// of distributionIntervals, and prints a summary through
+// printDistributionInfo.
+//
+// An error is returned when a row has fewer than two fields or when its last
+// field is not a valid float.
+func buildDistribution(CSVSamples [][]string, tool string) ([][]string, error) {
+	count := make(map[string]int, len(distributionIntervals))
+	for i, row := range CSVSamples {
+		// Guard the row[1] access below instead of panicking on short rows.
+		if len(row) < 2 {
+			return nil, fmt.Errorf("sample row %d: expected at least 2 fields, got %d", i+1, len(row))
+		}
+
+		if row[1] != getLanguageFunc {
+			continue
+		}
+
+		num, err := strconv.ParseFloat(row[len(row)-1], 64)
+		if err != nil {
+			return nil, err
+		}
+
+		arrangeByTime(count, num)
+	}
+
+	// One row per interval plus the header, regardless of how many
+	// intervals actually received samples.
+	CSVDistribution := make([][]string, 0, len(distributionIntervals)+1)
+	CSVDistribution = append(CSVDistribution, []string{"timeInterval", tool, "numberOfFiles"})
+	for _, interval := range distributionIntervals {
+		row := []string{interval, tool, strconv.Itoa(count[interval])}
+		CSVDistribution = append(CSVDistribution, row)
+	}
+
+	printDistributionInfo(count, tool)
+	return CSVDistribution, nil
+}
+
+// printDistributionInfo writes a summary for tool to standard output: the
+// total number of counted files, then the absolute count and the percentage
+// for every entry of distributionIntervals.
+func printDistributionInfo(count map[string]int, tool string) {
+	total := 0
+	for _, v := range count {
+		total += v
+	}
+
+	fmt.Println(tool, "files", total)
+	fmt.Println("Distribution")
+	for _, interval := range distributionIntervals {
+		fmt.Println("\t", interval, count[interval])
+	}
+
+	fmt.Println("Percentage")
+	for _, interval := range distributionIntervals {
+		// With nothing counted, 0/0 would print NaN; report 0% instead.
+		p := 0.0
+		if total > 0 {
+			p = (float64(count[interval]) / float64(total)) * 100.00
+		}
+		fmt.Printf("\t %s %f%%\n", interval, p)
+	}
+
+	fmt.Printf("\n\n")
+}
+
+// arrangeByTime increments the counter of the interval that contains num.
+// Bucket i covers the half-open range (bounds[i], bounds[i+1]]; values at or
+// below the first bound or above the last one are not counted.
+// NOTE(review): the interval labels suggest num is in nanoseconds — confirm.
+func arrangeByTime(count map[string]int, num float64) {
+	bounds := [...]float64{1000.00, 10000.00, 100000.00, 1000000.00, 10000000.00, 100000000.00}
+	for i := 0; i+1 < len(bounds); i++ {
+		if num > bounds[i] && num <= bounds[i+1] {
+			count[distributionIntervals[i]]++
+			return
+		}
+	}
+}
+
+// writeCSV creates (or truncates) the file at outPath and writes CSVData to
+// it as CSV records. It returns the first error encountered while creating,
+// writing, flushing or closing the file.
+func writeCSV(CSVData [][]string, outPath string) (err error) {
+	out, err := os.Create(outPath)
+	if err != nil {
+		return err
+	}
+	// Always release the file handle; surface the close error only when
+	// nothing failed earlier, so the original cause is not masked.
+	defer func() {
+		if cerr := out.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	// WriteAll flushes after the last record and returns any write or
+	// flush error.
+	return csv.NewWriter(out).WriteAll(CSVData)
+}
+
+// parse converts the raw content of one benchmark output file into CSV rows
+// for the given tool.
+type parse func(data []byte, tool string) ([][]string, error)
+
+// generateCSV reads every known benchmark output file from the current
+// directory, parses it and writes the matching CSV file into outDir. Any
+// error is logged and the remaining files are still processed.
+func generateCSV() {
+	type conversion struct {
+		in    string
+		out   string
+		tool  string
+		parse parse
+	}
+
+	conversions := []conversion{
+		{enryTotalBench, enryTotalCSV, "enry", parseTotal},
+		{linguistTotalBench, linguistTotalCSV, "linguist", parseTotal},
+		{enrySamplesBench, enrySamplesCSV, "enry", parseSamples},
+		{linguistSamplesBench, linguistSamplesCSV, "linguist", parseSamples},
+	}
+
+	for _, c := range conversions {
+		raw, err := ioutil.ReadFile(c.in)
+		if err == nil {
+			var rows [][]string
+			if rows, err = c.parse(raw, c.tool); err == nil {
+				err = writeCSV(rows, filepath.Join(outDir, c.out))
+			}
+		}
+
+		if err != nil {
+			log.Println(err)
+		}
+	}
+}
+
+// parseTotal extracts the lines containing "_TOTAL" from data and turns each
+// one into a CSV row through getRow. Rows are keyed by their first element,
+// so a later line for the same key replaces an earlier one. The result starts
+// with a header row and is ordered by prepareInfoForCSV.
+func parseTotal(data []byte, tool string) ([][]string, error) {
+	const totalLine = "_TOTAL"
+	rowsByFunction := make(map[string][]string)
+	scanner := bufio.NewScanner(bytes.NewReader(data))
+	for scanner.Scan() {
+		text := scanner.Text()
+		if !strings.Contains(text, totalLine) {
+			continue
+		}
+
+		row, err := getRow(strings.Fields(text), tool)
+		if err != nil {
+			return nil, err
+		}
+
+		rowsByFunction[row[0]] = row
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	header := []string{"function", "tool", "iterations", "ns/op"}
+	return prepareInfoForCSV(rowsByFunction, header), nil
+}
+
+// getRow builds the CSV row {function, tool, iterations, ns/op} from the
+// whitespace-separated fields of one benchmark line. line[0] is the benchmark
+// label, line[1] the iteration count; the average comes from getAverage.
+//
+// When the label contains none of enryFunctions, the function element is
+// omitted and the row has only three elements (unchanged legacy behaviour).
+// An error is returned for lines with fewer than three fields or when the
+// average cannot be computed.
+func getRow(line []string, tool string) ([]string, error) {
+	// The label, the iteration count and at least one measurement are
+	// required; fail cleanly instead of panicking on a short line.
+	if len(line) < 3 {
+		return nil, fmt.Errorf("malformed benchmark line %q: expected at least 3 fields", strings.Join(line, " "))
+	}
+
+	row := make([]string, 0, 4)
+	for _, function := range enryFunctions {
+		if strings.Contains(line[0], function) {
+			row = append(row, function)
+			break
+		}
+	}
+
+	row = append(row, tool, line[1])
+
+	average, err := getAverage(line)
+	if err != nil {
+		return nil, err
+	}
+
+	return append(row, average), nil
+}
+
+// getAverage returns the average time per operation for one benchmark line,
+// as a decimal string.
+//
+// When the last field does not end in ")", line[2] is returned unchanged.
+// Otherwise the last field, stripped of parentheses and spaces, is parsed as
+// a number, multiplied by 1e9 and divided by the iteration count in line[1];
+// the result is rounded to the nearest integer.
+// NOTE(review): the parenthesised value is presumably the elapsed real time
+// in seconds printed by Ruby's Benchmark — confirm against the .bench files.
+//
+// An error is returned for lines with fewer than three fields, for values
+// that are not valid numbers, and for a non-positive iteration count.
+func getAverage(line []string) (string, error) {
+	if len(line) < 3 {
+		return "", fmt.Errorf("malformed benchmark line %q: expected at least 3 fields", strings.Join(line, " "))
+	}
+
+	last := line[len(line)-1]
+	if !strings.HasSuffix(last, ")") {
+		return line[2], nil
+	}
+
+	total, err := strconv.ParseFloat(strings.Trim(last, "() "), 64)
+	if err != nil {
+		return "", err
+	}
+
+	iterations, err := strconv.ParseFloat(line[1], 64)
+	if err != nil {
+		return "", err
+	}
+
+	// Dividing by zero would yield +Inf, whose integer conversion is not
+	// well defined.
+	if iterations <= 0 {
+		return "", fmt.Errorf("invalid iteration count %q", line[1])
+	}
+
+	// Round instead of truncating so floating-point error just below an
+	// integer does not lose a unit; int64 keeps large values intact on
+	// 32-bit platforms.
+	avg := math.Round((total * math.Pow10(9)) / iterations)
+	return strconv.FormatInt(int64(avg), 10), nil
+}
+
+// prepareInfoForCSV returns firstLine (when non-empty) followed by one row
+// per entry of enryFunctions, in that order. A function without an entry in
+// parsedInfo contributes a nil row.
+func prepareInfoForCSV(parsedInfo map[string][]string, firstLine []string) [][]string {
+	rows := createInfoWithFirstLine(firstLine, len(parsedInfo))
+	for i := range enryFunctions {
+		rows = append(rows, parsedInfo[enryFunctions[i]])
+	}
+
+	return rows
+}
+
+// createInfoWithFirstLine allocates the slice of CSV rows. With a non-empty
+// firstLine the result holds that line and has room for sliceLength more
+// rows; otherwise it is empty with capacity sliceLength.
+func createInfoWithFirstLine(firstLine []string, sliceLength int) (info [][]string) {
+	if len(firstLine) == 0 {
+		return make([][]string, 0, sliceLength)
+	}
+
+	return append(make([][]string, 0, sliceLength+1), firstLine)
+}
+
+// enryFuncs maps a benchmarked function name to its CSV row.
+type enryFuncs map[string][]string
+
+// newEnryFuncs returns an enryFuncs with a nil row registered for each of the
+// seven benchmarked functions.
+func newEnryFuncs() enryFuncs {
+	names := []string{
+		getLanguageFunc,
+		classifyFunc,
+		modelineFunc,
+		filenameFunc,
+		shebangFunc,
+		extensionFunc,
+		contentFunc,
+	}
+
+	funcs := make(enryFuncs, len(names))
+	for _, name := range names {
+		funcs[name] = nil
+	}
+
+	return funcs
+}
+
+// parseSamples extracts the lines containing "SAMPLE_" from data and groups
+// the resulting rows by sample name and then by function. Each row is the
+// sample name followed by the elements returned by getRow. The result starts
+// with a header row and is ordered by prepareSamplesInfoForCSV.
+func parseSamples(data []byte, tool string) ([][]string, error) {
+	const sampleLine = "SAMPLE_"
+	bySample := make(map[string]enryFuncs)
+	scanner := bufio.NewScanner(bytes.NewReader(data))
+	for scanner.Scan() {
+		text := scanner.Text()
+		if !strings.Contains(text, sampleLine) {
+			continue
+		}
+
+		fields := strings.Fields(text)
+		name := getSampleName(fields[0])
+		funcs, seen := bySample[name]
+		if !seen {
+			funcs = newEnryFuncs()
+			bySample[name] = funcs
+		}
+
+		parsed, err := getRow(fields, tool)
+		if err != nil {
+			return nil, err
+		}
+
+		row := append([]string{name}, parsed...)
+		// row[1] is the first element produced by getRow.
+		funcs[row[1]] = row
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	header := []string{"file", "function", "tool", "iterations", "ns/op"}
+	return prepareSamplesInfoForCSV(bySample, header), nil
+}
+
+func getSampleName(s string) string {
+	start := strings.Index(s, "SAMPLE_") + len("SAMPLE_")
+	suffix := fmt.Sprintf("-%d", runtime.GOMAXPROCS(-1))
+	name := strings.TrimSuffix(s[start:], suffix)
+	return name
+}
+
+// prepareSamplesInfoForCSV returns firstLine (when non-empty) followed by the
+// rows of every sample, with samples in sorted key order and each sample's
+// rows ordered by prepareInfoForCSV.
+func prepareSamplesInfoForCSV(parsedInfo map[string]enryFuncs, firstLine []string) [][]string {
+	rows := createInfoWithFirstLine(firstLine, len(parsedInfo)*len(enryFunctions))
+	for _, sample := range sortKeys(parsedInfo) {
+		rows = append(rows, prepareInfoForCSV(parsedInfo[sample], nil)...)
+	}
+
+	return rows
+}
+
+// sortKeys returns the keys of parsedInfo sorted in increasing order.
+func sortKeys(parsedInfo map[string]enryFuncs) []string {
+	keys := make([]string, len(parsedInfo))
+	next := 0
+	for key := range parsedInfo {
+		keys[next] = key
+		next++
+	}
+
+	sort.Strings(keys)
+	return keys
+}
--- a/go-enry/benchmarks/plot-histogram.gp
+++ b/go-enry/benchmarks/plot-histogram.gp
@@ -0,0 +1,21 @@
+#!/usr/bin/env gnuplot
+
+# Plots the enry and linguist distribution CSVs as a clustered histogram.
+# All paths are relative, so run this from the directory containing benchmarks/.
+
+# Render a 1920x1080 PNG into the histogram directory.
+set terminal png large font "arial,26" size 1920,1080
+set output 'benchmarks/histogram/distribution.png'
+
+# The input files are comma-separated; the legend goes below the plot.
+set datafile separator comma
+set key under
+
+# Histogram appearance.
+set style data histogram
+set style histogram clustered gap 1 title offset 1,1
+set style fill solid noborder
+set boxwidth 0.95
+set grid y
+set bmargin 12
+set autoscale
+set title "Number of files per processing time"
+
+# Column 3 supplies the bar heights for both tools; column 1 of the enry file
+# supplies the x-axis tick labels.
+plot newhistogram, 'benchmarks/csv/enry-distribution.csv' using 3:xtic(1) title "enry", 'benchmarks/csv/linguist-distribution.csv' using 3 title "linguist"
+
+# Close the output file.
+unset output
+
--- a/go-enry/benchmarks/run-benchmarks.sh
+++ b/go-enry/benchmarks/run-benchmarks.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+set -e
+
+mkdir -p benchmarks/output
+go test -run NONE -bench=. -benchtime=120s -timeout=100h > benchmarks/output/enry_total.bench
+benchmarks/linguist-total.rb 5 > benchmarks/output/linguist_total.bench
--- a/go-enry/benchmarks/run.sh
+++ b/go-enry/benchmarks/run.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -e
+
+benchmarks/run-benchmarks.sh
+make benchmarks-slow
+benchmarks/parse.sh
+benchmarks/plot-histogram.gp
--- a/go-enry/benchmarks/soft-hard-info.txt
+++ b/go-enry/benchmarks/soft-hard-info.txt
@@ -0,0 +1,9 @@
+# Hardware and software used to run benchmarks
+
+MacBookPro13,1
+Darwin Kernel Version 16.7.0: Tue Jan 30 11:27:06 PST 2018; root:xnu-3789.73.11~1/RELEASE_X86_64 x86_64 i386
+go version go1.10.3 darwin/amd64
+ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-darwin16]
+
+github/linguist v7.1.3 commit: e761f9b013e5b61161481fcb898b59721ee40e3d
+src-d/enry v1.6.7 commit: 3d356c70ae322f41048f74d01c5e8572f5898d34