renamed benchmark directory to benchmarks

updated .gitignore
This commit is contained in:
Manuel Carmona
2017-07-10 17:08:21 +02:00
parent 4ca7ffb769
commit 22cb6f602e
20 changed files with 20 additions and 20 deletions

View File

@ -0,0 +1,6 @@
timeInterval,enry,numberOfFiles
1us-10us,enry,96
10us-100us,enry,1244
100us-1ms,enry,321
1ms-10ms,enry,135
10ms-100ms,enry,43
1 timeInterval enry numberOfFiles
2 1us-10us enry 96
3 10us-100us enry 1244
4 100us-1ms enry 321
5 1ms-10ms enry 135
6 10ms-100ms enry 43

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,8 @@
function,tool,iterations,ns/op
GetLanguage(),enry,100,1915861259
Classify(),enry,5,39977943775
GetLanguagesByModeline(),enry,1000,196571071
GetLanguagesByFilename(),enry,2000000,89774
GetLanguagesByShebang(),enry,100000,1892569
GetLanguagesByExtension(),enry,200000,921160
GetLanguagesByContent(),enry,1000,286159159
1 function tool iterations ns/op
2 GetLanguage() enry 100 1915861259
3 Classify() enry 5 39977943775
4 GetLanguagesByModeline() enry 1000 196571071
5 GetLanguagesByFilename() enry 2000000 89774
6 GetLanguagesByShebang() enry 100000 1892569
7 GetLanguagesByExtension() enry 200000 921160
8 GetLanguagesByContent() enry 1000 286159159

View File

@ -0,0 +1,6 @@
timeInterval,linguist,numberOfFiles
1us-10us,linguist,0
10us-100us,linguist,74
100us-1ms,linguist,920
1ms-10ms,linguist,788
10ms-100ms,linguist,57
1 timeInterval linguist numberOfFiles
2 1us-10us linguist 0
3 10us-100us linguist 74
4 100us-1ms linguist 920
5 1ms-10ms linguist 788
6 10ms-100ms linguist 57

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,8 @@
function,tool,iterations,ns/op
GetLanguage(),linguist,5,3979096800
Classify(),linguist,5,178253431800
GetLanguagesByModeline(),linguist,5,2582204000
GetLanguagesByFilename(),linguist,5,2688800
GetLanguagesByShebang(),linguist,5,77155200
GetLanguagesByExtension(),linguist,5,6688800
GetLanguagesByContent(),linguist,5,161719000
1 function tool iterations ns/op
2 GetLanguage() linguist 5 3979096800
3 Classify() linguist 5 178253431800
4 GetLanguagesByModeline() linguist 5 2582204000
5 GetLanguagesByFilename() linguist 5 2688800
6 GetLanguagesByShebang() linguist 5 77155200
7 GetLanguagesByExtension() linguist 5 6688800
8 GetLanguagesByContent() linguist 5 161719000

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

126
benchmarks/linguist-samples.rb Executable file
View File

@ -0,0 +1,126 @@
#!/usr/bin/env ruby
require 'benchmark'
require 'linguist'
iterations = (ARGV[0] || 1).to_i
# BenchBlob wraps a FileBlob to keep data loaded and to clean attributes added by language detection.
# BenchBlob wraps a Linguist::FileBlob so the file contents stay loaded in
# memory between benchmark iterations, and so attributes memoized during
# language detection can be reset between runs.
class BenchBlob < Linguist::FileBlob
  attr_accessor :data
  # Expose @fullpath for consistency with the BenchBlob in
  # benchmarks/linguist-total.rb (initialize already reads it).
  attr_accessor :fullpath

  # path      - path of the sample file
  # base_path - optional base path (forwarded to Linguist::FileBlob)
  def initialize(path, base_path = nil)
    super
    @data = File.read(@fullpath)
  end

  # Reset memoized attributes so each iteration measures the full detection
  # work. NOTE(review): these ivar names mirror Linguist::FileBlob internals —
  # verify against the pinned linguist version.
  def clean
    @_mime_type = nil
    @detect_encoding = nil
    @lines = nil
  end
end
# Recursively collects every file under +root+ and returns a flat Array of
# BenchBlob instances, one per regular file.
def get_samples(root)
  Dir.entries(root).each_with_object([]) do |entry, samples|
    next if entry == '.' || entry == '..'
    path = File.join(root, entry)
    if File.directory?(path)
      samples.concat(get_samples(path))
    else
      samples << BenchBlob.new(path)
    end
  end
end
samples = get_samples('.linguist/samples')
languages = Linguist::Language.all

# Benchmarks one detection strategy per sample file. The original repeated
# this loop verbatim seven times; the block receives each blob and runs the
# strategy under test.
#
# label      - function name used in the report line (e.g. 'GetLanguage()')
# samples    - Array of BenchBlob
# iterations - number of times to run the strategy per sample
def bench_per_sample(label, samples, iterations)
  samples.each do |blob|
    sample_name = blob.path.gsub(/\s/, '_')
    Benchmark.bmbm do |bm|
      bm.report(label + '_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
        iterations.times do
          yield blob
          blob.clean
        end
      end
    end
  end
end

bench_per_sample('GetLanguage()', samples, iterations) { |blob| Linguist::detect(blob) }
bench_per_sample('Classify()', samples, iterations) { |blob| Linguist::Classifier.classify(Linguist::Samples.cache, blob.data) }
bench_per_sample('GetLanguagesByModeline()', samples, iterations) { |blob| Linguist::Strategy::Modeline.call(blob, languages) }
bench_per_sample('GetLanguagesByFilename()', samples, iterations) { |blob| Linguist::Strategy::Filename.call(blob, languages) }
bench_per_sample('GetLanguagesByShebang()', samples, iterations) { |blob| Linguist::Shebang.call(blob, languages) }
bench_per_sample('GetLanguagesByExtension()', samples, iterations) { |blob| Linguist::Strategy::Extension.call(blob, languages) }
bench_per_sample('GetLanguagesByContent()', samples, iterations) { |blob| Linguist::Heuristics.call(blob, languages) }

120
benchmarks/linguist-total.rb Executable file
View File

@ -0,0 +1,120 @@
#!/usr/bin/env ruby
require 'benchmark'
require 'linguist'
iterations = (ARGV[0] || 1).to_i
# BenchBlob wraps a FileBlob to keep data loaded and to clean attributes added by language detection.
# BenchBlob wraps a Linguist::FileBlob so the file contents stay loaded in
# memory between benchmark iterations, and so attributes memoized during
# language detection can be reset between runs.
class BenchBlob < Linguist::FileBlob
  attr_accessor :data
  attr_accessor :fullpath

  # path      - path of the sample file
  # base_path - optional base path (forwarded to Linguist::FileBlob)
  def initialize(path, base_path = nil)
    super
    @data = File.read(@fullpath)
  end

  # Reset memoized attributes so each iteration measures the full detection
  # work. NOTE(review): these ivar names mirror Linguist::FileBlob internals —
  # verify against the pinned linguist version.
  def clean
    @_mime_type = nil
    @detect_encoding = nil
    @lines = nil
  end
end
# Walks +root+ depth-first and wraps every regular file in a BenchBlob.
# Returns a flat Array of blobs.
def get_samples(root)
  blobs = []
  (Dir.entries(root) - %w[. ..]).each do |entry|
    path = File.join(root, entry)
    if File.directory?(path)
      blobs.concat(get_samples(path))
    else
      blobs << BenchBlob.new(path)
    end
  end
  blobs
end
samples = get_samples('.linguist/samples')
languages = Linguist::Language.all

# Benchmarks one detection strategy across the whole sample set. The original
# repeated this loop verbatim seven times (and left an unused `time` local in
# the first copy); the block receives each blob and runs the strategy.
#
# label      - function name used in the report line (e.g. 'GetLanguage()')
# samples    - Array of BenchBlob
# iterations - number of passes over the full sample set
def bench_total(label, samples, iterations)
  Benchmark.bmbm do |bm|
    bm.report(label + '_TOTAL ' + iterations.to_s) do
      iterations.times do
        samples.each do |blob|
          yield blob
          blob.clean
        end
      end
    end
  end
end

bench_total('GetLanguage()', samples, iterations) { |blob| Linguist::detect(blob) }
bench_total('Classify()', samples, iterations) { |blob| Linguist::Classifier.classify(Linguist::Samples.cache, blob.data) }
bench_total('GetLanguagesByModeline()', samples, iterations) { |blob| Linguist::Strategy::Modeline.call(blob, languages) }
bench_total('GetLanguagesByFilename()', samples, iterations) { |blob| Linguist::Strategy::Filename.call(blob, languages) }
bench_total('GetLanguagesByShebang()', samples, iterations) { |blob| Linguist::Shebang.call(blob, languages) }
bench_total('GetLanguagesByExtension()', samples, iterations) { |blob| Linguist::Strategy::Extension.call(blob, languages) }
bench_total('GetLanguagesByContent()', samples, iterations) { |blob| Linguist::Heuristics.call(blob, languages) }

5
benchmarks/parse.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/sh
# Convert the raw benchmark output under benchmarks/output into CSV files,
# then aggregate those CSVs into the distribution CSVs. Both steps use the
# Go parser in benchmarks/parser; run from the repository root.
cd benchmarks/output && go run ../parser/main.go -outdir ../csv && \
cd ../csv && go run ../parser/main.go -distribution

386
benchmarks/parser/main.go Normal file
View File

@ -0,0 +1,386 @@
package main
import (
"bufio"
"bytes"
"encoding/csv"
"flag"
"fmt"
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
)
const (
	// Names of the benchmarked detection functions; used to match benchmark
	// output lines and as CSV row keys.
	getLanguageFunc = "GetLanguage()"
	classifyFunc    = "Classify()"
	modelineFunc    = "GetLanguagesByModeline()"
	filenameFunc    = "GetLanguagesByFilename()"
	shebangFunc     = "GetLanguagesByShebang()"
	extensionFunc   = "GetLanguagesByExtension()"
	contentFunc     = "GetLanguagesByContent()"

	// Raw benchmark output files (inputs to this parser).
	enryTotalBench       = "enry_total.bench"
	enrySamplesBench     = "enry_samples.bench"
	linguistTotalBench   = "linguist_total.bench"
	linguistSamplesBench = "linguist_samples.bench"

	// CSV files generated by the default mode.
	enryTotalCSV       = "enry-total.csv"
	enrySamplesCSV     = "enry-samples.csv"
	linguistTotalCSV   = "linguist-total.csv"
	linguistSamplesCSV = "linguist-samples.csv"

	// CSV files generated when the -distribution flag is set.
	enryDistributionCSV     = "enry-distribution.csv"
	linguistDistributionCSV = "linguist-distribution.csv"
)

var (
	// Command-line flags (bound in main).
	distribution bool   // generate distribution CSVs instead of plain ones
	outDir       string // directory where generated CSV files are written

	// All benchmarked functions, in the order rows appear in the CSVs.
	enryFunctions = []string{getLanguageFunc, classifyFunc, modelineFunc, filenameFunc, shebangFunc, extensionFunc, contentFunc}
	// Histogram buckets for per-file processing time.
	distributionIntervals = []string{"1us-10us", "10us-100us", "100us-1ms", "1ms-10ms", "10ms-100ms"}
)
// main parses the command-line flags and dispatches to the requested CSV
// generation mode. Fixes a typo in the -distribution help text
// ("distribuition" -> "distribution").
func main() {
	flag.BoolVar(&distribution, "distribution", false, "generate enry-distribution.csv and linguist-distribution.csv")
	flag.StringVar(&outDir, "outdir", "", "path to leave csv files")
	flag.Parse()
	if distribution {
		generateDistributionCSV()
	} else {
		generateCSV()
	}
}
// generateDistributionCSV builds the *-distribution.csv files from the
// per-sample CSVs. A failure on one input is logged and the next is tried.
func generateDistributionCSV() {
	CSVFiles := []struct {
		in   string
		out  string
		tool string
	}{
		{in: enrySamplesCSV, out: enryDistributionCSV, tool: "enry"},
		{in: linguistSamplesCSV, out: linguistDistributionCSV, tool: "linguist"},
	}
	for _, CSVFile := range CSVFiles {
		// Each file is processed in a helper so its handle is closed per
		// iteration; the original deferred every Close to function exit.
		if err := distributionFor(CSVFile.in, CSVFile.out, CSVFile.tool); err != nil {
			log.Println(err)
		}
	}
}

// distributionFor reads one samples CSV, aggregates its GetLanguage() rows
// into time-interval buckets, and writes the distribution CSV under outDir.
func distributionFor(in, out, tool string) error {
	f, err := os.Open(in)
	if err != nil {
		return err
	}
	defer f.Close()
	CSVSamples, err := csv.NewReader(f).ReadAll()
	if err != nil {
		return err
	}
	// Skip the header row before aggregating.
	CSVDistribution, err := buildDistribution(CSVSamples[1:], tool)
	if err != nil {
		return err
	}
	return writeCSV(CSVDistribution, filepath.Join(outDir, out))
}
// buildDistribution counts how many GetLanguage() sample rows fall in each
// processing-time interval and returns the result as CSV records, header
// row first. It also prints a summary of the distribution to stdout.
func buildDistribution(CSVSamples [][]string, tool string) ([][]string, error) {
	count := make(map[string]int, len(distributionIntervals))
	for _, record := range CSVSamples {
		// Only the end-to-end GetLanguage() rows feed the histogram.
		if record[1] != getLanguageFunc {
			continue
		}
		nsPerOp, err := strconv.ParseFloat(record[len(record)-1], 64)
		if err != nil {
			return nil, err
		}
		arrangeByTime(count, nsPerOp)
	}
	rows := make([][]string, 0, len(count)+1)
	rows = append(rows, []string{"timeInterval", tool, "numberOfFiles"})
	for _, interval := range distributionIntervals {
		rows = append(rows, []string{interval, tool, strconv.Itoa(count[interval])})
	}
	printDistributionInfo(count, tool)
	return rows, nil
}
// printDistributionInfo prints, for one tool, the total number of counted
// files plus the absolute and percentage count per time interval.
func printDistributionInfo(count map[string]int, tool string) {
	var total int
	for _, n := range count {
		total += n
	}
	fmt.Println(tool, "files", total)
	fmt.Println("Distribution")
	for _, interval := range distributionIntervals {
		fmt.Println("\t", interval, count[interval])
	}
	fmt.Println("Percentage")
	for _, interval := range distributionIntervals {
		share := (float64(count[interval]) / float64(total)) * 100.00
		fmt.Printf("\t %s %f%%\n", interval, share)
	}
	fmt.Printf("\n\n")
}
// arrangeByTime increments the bucket of count whose (ns-based) half-open
// interval contains num. Values at or below 1us or above 100ms fall outside
// every bucket and are deliberately not counted, matching the original.
func arrangeByTime(count map[string]int, num float64) {
	// Bucket boundaries in nanoseconds: 1us, 10us, 100us, 1ms, 10ms, 100ms.
	bounds := []float64{1e3, 1e4, 1e5, 1e6, 1e7, 1e8}
	for i, interval := range distributionIntervals {
		if num > bounds[i] && num <= bounds[i+1] {
			count[interval]++
			return
		}
	}
}
func writeCSV(CSVData [][]string, outPath string) error {
out, err := os.Create(outPath)
if err != nil {
return err
}
w := csv.NewWriter(out)
w.WriteAll(CSVData)
if err := w.Error(); err != nil {
return err
}
return nil
}
// parse converts one raw benchmark output into CSV records for a tool.
type parse func(data []byte, tool string) ([][]string, error)

// generateCSV converts every raw benchmark output file into its CSV
// counterpart under outDir. A failure on one file is logged and the
// remaining files are still processed.
func generateCSV() {
	jobs := []struct {
		in    string
		out   string
		tool  string
		parse parse
	}{
		{in: enryTotalBench, out: enryTotalCSV, tool: "enry", parse: parseTotal},
		{in: linguistTotalBench, out: linguistTotalCSV, tool: "linguist", parse: parseTotal},
		{in: enrySamplesBench, out: enrySamplesCSV, tool: "enry", parse: parseSamples},
		{in: linguistSamplesBench, out: linguistSamplesCSV, tool: "linguist", parse: parseSamples},
	}
	for _, job := range jobs {
		raw, err := ioutil.ReadFile(job.in)
		if err != nil {
			log.Println(err)
			continue
		}
		rows, err := job.parse(raw, job.tool)
		if err != nil {
			log.Println(err)
			continue
		}
		if err := writeCSV(rows, filepath.Join(outDir, job.out)); err != nil {
			log.Println(err)
		}
	}
}
// parseTotal extracts the "_TOTAL" lines of a raw benchmark output and
// shapes them into CSV records, keyed and ordered by benchmarked function.
func parseTotal(data []byte, tool string) ([][]string, error) {
	const totalLine = "_TOTAL"
	parsedInfo := map[string][]string{}
	scanner := bufio.NewScanner(bytes.NewReader(data))
	for scanner.Scan() {
		text := scanner.Text()
		if !strings.Contains(text, totalLine) {
			continue
		}
		row, err := getRow(strings.Fields(text), tool)
		if err != nil {
			return nil, err
		}
		// Index by function name (first column of the row).
		parsedInfo[row[0]] = row
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	header := []string{"function", "tool", "iterations", "ns/op"}
	return prepareInfoForCSV(parsedInfo, header), nil
}
// getRow builds one CSV record from a whitespace-split benchmark line:
// [function, tool, iterations, ns-per-op]. Fix: capacity was 3 although
// four elements are always appended, forcing a reallocation per row.
func getRow(line []string, tool string) ([]string, error) {
	row := make([]string, 0, 4)
	// Match the benchmark name against the known function names.
	for _, function := range enryFunctions {
		if strings.Contains(line[0], function) {
			row = append(row, function)
			break
		}
	}
	row = append(row, tool)
	iterations := line[1] // second field is the iteration count
	row = append(row, iterations)
	average, err := getAverage(line)
	if err != nil {
		return nil, err
	}
	row = append(row, average)
	return row, nil
}
// getAverage returns the per-iteration time (ns/op) for one benchmark line.
// Go benchmark lines already end in ns/op (third field); ruby Benchmark
// lines end in a parenthesised total in seconds, which is converted to an
// average in nanoseconds across the iteration count.
func getAverage(line []string) (string, error) {
	last := line[len(line)-1]
	if !strings.HasSuffix(last, ")") {
		// Go format: field 2 is already ns/op.
		return line[2], nil
	}
	totalSeconds, err := strconv.ParseFloat(strings.Trim(last, "() "), 64)
	if err != nil {
		return "", err
	}
	iters, err := strconv.ParseFloat(line[1], 64)
	if err != nil {
		return "", err
	}
	avg := (totalSeconds * math.Pow10(9)) / iters
	return fmt.Sprintf("%d", int(avg)), nil
}
// prepareInfoForCSV flattens parsedInfo into CSV records ordered by
// enryFunctions, optionally prefixed by a header row.
func prepareInfoForCSV(parsedInfo map[string][]string, firstLine []string) [][]string {
	rows := createInfoWithFirstLine(firstLine, len(parsedInfo))
	for _, fn := range enryFunctions {
		rows = append(rows, parsedInfo[fn])
	}
	return rows
}
// createInfoWithFirstLine allocates the CSV record slice, seeding it with
// firstLine as a header row when one is given (empty/nil means no header).
func createInfoWithFirstLine(firstLine []string, sliceLength int) [][]string {
	if len(firstLine) == 0 {
		return make([][]string, 0, sliceLength)
	}
	info := make([][]string, 0, sliceLength+1)
	return append(info, firstLine)
}
type enryFuncs map[string][]string
func newEnryFuncs() enryFuncs {
return enryFuncs{
getLanguageFunc: nil,
classifyFunc: nil,
modelineFunc: nil,
filenameFunc: nil,
shebangFunc: nil,
extensionFunc: nil,
contentFunc: nil,
}
}
// parseSamples extracts the per-sample benchmark lines (those containing
// "SAMPLE_") from a raw benchmark output and groups them by sample file,
// one enryFuncs map per sample. The resulting CSV records are
// [file, function, tool, iterations, ns/op].
func parseSamples(data []byte, tool string) ([][]string, error) {
	const sampleLine = "SAMPLE_"
	parsedInfo := map[string]enryFuncs{}
	buf := bufio.NewScanner(bytes.NewReader(data))
	for buf.Scan() {
		line := buf.Text()
		if strings.Contains(line, sampleLine) {
			split := strings.Fields(line)
			// The first field embeds both the function name and the sample path.
			name := getSampleName(split[0])
			if _, ok := parsedInfo[name]; !ok {
				parsedInfo[name] = newEnryFuncs()
			}
			// Prepend the sample name to the generic row produced by getRow.
			row := make([]string, 0, 4)
			row = append(row, name)
			r, err := getRow(split, tool)
			if err != nil {
				return nil, err
			}
			row = append(row, r...)
			// row[1] is the function column (first element of getRow's result).
			function := row[1]
			parsedInfo[name][function] = row
		}
	}
	if err := buf.Err(); err != nil {
		return nil, err
	}
	firstLine := []string{"file", "function", "tool", "iterations", "ns/op"}
	return prepareSamplesInfoForCSV(parsedInfo, firstLine), nil
}
func getSampleName(s string) string {
start := strings.Index(s, "SAMPLE_") + len("SAMPLE_")
suffix := fmt.Sprintf("-%d", runtime.GOMAXPROCS(-1))
name := strings.TrimSuffix(s[start:], suffix)
return name
}
// prepareSamplesInfoForCSV flattens the per-sample maps into CSV records,
// samples sorted by path and functions in enryFunctions order within each.
func prepareSamplesInfoForCSV(parsedInfo map[string]enryFuncs, firstLine []string) [][]string {
	rows := createInfoWithFirstLine(firstLine, len(parsedInfo)*len(enryFunctions))
	for _, sample := range sortKeys(parsedInfo) {
		rows = append(rows, prepareInfoForCSV(parsedInfo[sample], nil)...)
	}
	return rows
}
// sortKeys returns the keys of parsedInfo in ascending string order.
func sortKeys(parsedInfo map[string]enryFuncs) []string {
	ordered := make([]string, 0, len(parsedInfo))
	for name := range parsedInfo {
		ordered = append(ordered, name)
	}
	sort.Slice(ordered, func(i, j int) bool { return ordered[i] < ordered[j] })
	return ordered
}

21
benchmarks/plot-histogram.gp Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env gnuplot
# Renders the enry vs linguist processing-time distribution as a clustered
# histogram PNG. Expects the distribution CSVs produced by benchmarks/parse.sh;
# run from the repository root.
set terminal png large font "arial,26" size 1920,1080
set output 'benchmarks/histogram/distribution.png'
# Input files are comma-separated.
set datafile separator comma
set key under
set style data histogram
set style histogram clustered gap 1 title offset 1,1
set style fill solid noborder
set boxwidth 0.95
set grid y
# Extra bottom margin so the key and axis labels fit under the plot.
set bmargin 12
set autoscale
set title "Number of files per processing time"
# Column 3 is numberOfFiles; column 1 (timeInterval) labels the x axis.
plot newhistogram, 'benchmarks/csv/enry-distribution.csv' using 3:xtic(1) title "enry", 'benchmarks/csv/linguist-distribution.csv' using 3 title "linguist"
unset output

4
benchmarks/run-benchmarks.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Run the Go (enry) benchmarks and the equivalent ruby linguist total
# benchmark, leaving raw output under benchmarks/output for later parsing.
# Run from the repository root.
mkdir -p benchmarks/output && go test -run NONE -bench=. -benchtime=120s -timeout=100h >benchmarks/output/enry_total.bench && \
benchmarks/linguist-total.rb 5 >benchmarks/output/linguist_total.bench

4
benchmarks/run.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Full benchmark pipeline: run all benchmarks, parse the results into CSV,
# and plot the distribution histogram. Run from the repository root.
benchmarks/run-benchmarks.sh && make benchmarks-slow && \
benchmarks/parse.sh && benchmarks/plot-histogram.gp

View File

@ -0,0 +1,9 @@
# Hardware and software used to run benchmarks
Dell XPS 9360
Linux 4.11.6-3-ARCH #1 SMP PREEMPT Thu Jun 22 12:21:46 CEST 2017 x86_64
go version go1.8.3 linux/amd64
ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-linux]
github/linguist/samples commit: d5c8db3fb91963c4b2762ca2ea2ff7cfac109f68