added benchmarks and scripts to run, parse and plot them

moved benchmark/run-slow-benchmarks.sh's content to Makefile
This commit is contained in:
Manuel Carmona 2017-06-28 13:01:36 +02:00
parent 2045abfa41
commit 8d91dc7be8
11 changed files with 872 additions and 18 deletions

View File

@@ -32,6 +32,16 @@ code-generate: $(LINGUIST_PATH)
mkdir -p data
go run internal/code-generator/main.go
benchmarks: $(LINGUIST_PATH)
go test -run=NONE -bench=. && benchmark/linguist-total.sh
benchmarks-samples: $(LINGUIST_PATH)
go test -run=NONE -bench=. -benchtime=5us && benchmark/linguist-samples.rb
benchmarks-slow: $(LINGUST_PATH)
go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmark/output/enry_samples.bench && \
benchmark/linguist-samples.rb 5 >benchmark/output/linguist_samples.bench
clean:
rm -rf $(LINGUIST_PATH)

126
benchmark/linguist-samples.rb Executable file
View File

@@ -0,0 +1,126 @@
#!/usr/bin/env ruby

require 'benchmark'
require 'linguist'

# Number of times each detection call is repeated per sample (default 1).
iterations = (ARGV[0] || 1).to_i

# BenchBlob wraps a FileBlob to keep data loaded and to clean attributes added by language detection.
class BenchBlob < Linguist::FileBlob
  attr_accessor :data

  def initialize(path, base_path = nil)
    super
    @data = File.read(@fullpath)
  end

  # Drop memoized detection state so each iteration measures a cold call.
  def clean
    @_mime_type = nil
    @detect_encoding = nil
    @lines = nil
  end
end

# Recursively collect one BenchBlob per file under root.
def get_samples(root)
  samples = Array.new
  Dir.foreach(root) do |file|
    path = File.join(root, file)
    if file == "." or file == ".."
      next
    elsif File.directory?(path)
      get_samples(path).each do |blob|
        samples << blob
      end
    else
      samples << BenchBlob.new(path)
    end
  end
  return samples
end

# Benchmark the given detection call (the block) once per sample, cleaning
# the blob between iterations so memoized state never skews the numbers.
# The report label format matches what benchmark/parser/main.go expects:
# "<function>_SAMPLE_<path> <iterations>".
def bench_per_sample(samples, function_name, iterations)
  samples.each do |blob|
    sample_name = blob.path.gsub(/\s/, '_')
    Benchmark.bmbm do |bm|
      bm.report(function_name + '_SAMPLE_' + sample_name + ' ' + iterations.to_s) do
        iterations.times do
          yield blob
          blob.clean
        end
      end
    end
  end
end

samples = get_samples('.linguist/samples')
languages = Linguist::Language.all

bench_per_sample(samples, 'GetLanguage()', iterations) { |blob| Linguist::detect(blob) }
bench_per_sample(samples, 'Classify()', iterations) { |blob| Linguist::Classifier.classify(Linguist::Samples.cache, blob.data) }
bench_per_sample(samples, 'GetLanguagesByModeline()', iterations) { |blob| Linguist::Strategy::Modeline.call(blob, languages) }
bench_per_sample(samples, 'GetLanguagesByFilename()', iterations) { |blob| Linguist::Strategy::Filename.call(blob, languages) }
bench_per_sample(samples, 'GetLanguagesByShebang()', iterations) { |blob| Linguist::Shebang.call(blob, languages) }
bench_per_sample(samples, 'GetLanguagesByExtension()', iterations) { |blob| Linguist::Strategy::Extension.call(blob, languages) }
bench_per_sample(samples, 'GetLanguagesByContent()', iterations) { |blob| Linguist::Heuristics.call(blob, languages) }

120
benchmark/linguist-total.rb Executable file
View File

@@ -0,0 +1,120 @@
#!/usr/bin/env ruby

require 'benchmark'
require 'linguist'

# Number of times the whole corpus is processed per report (default 1).
iterations = (ARGV[0] || 1).to_i

# BenchBlob wraps a FileBlob to keep data loaded and to clean attributes added by language detection.
class BenchBlob < Linguist::FileBlob
  attr_accessor :data
  attr_accessor :fullpath

  def initialize(path, base_path = nil)
    super
    @data = File.read(@fullpath)
  end

  # Drop memoized detection state so every pass measures a cold call.
  def clean
    @_mime_type = nil
    @detect_encoding = nil
    @lines = nil
  end
end

# Recursively collect one BenchBlob per file under root.
def get_samples(root)
  samples = Array.new
  Dir.foreach(root) do |file|
    path = File.join(root, file)
    if file == "." or file == ".."
      next
    elsif File.directory?(path)
      get_samples(path).each do |blob|
        samples << blob
      end
    else
      samples << BenchBlob.new(path)
    end
  end
  return samples
end

# Report the total time of running the given detection call (the block) over
# all samples, `iterations` times, cleaning each blob between calls.
# The report label format matches what benchmark/parser/main.go expects:
# "<function>_TOTAL <iterations>".
# (The original bound the first report to an unused local `time`; removed.)
def bench_total(samples, function_name, iterations)
  Benchmark.bmbm do |bm|
    bm.report(function_name + '_TOTAL ' + iterations.to_s) do
      iterations.times do
        samples.each do |blob|
          yield blob
          blob.clean
        end
      end
    end
  end
end

samples = get_samples('.linguist/samples')
languages = Linguist::Language.all

bench_total(samples, 'GetLanguage()', iterations) { |blob| Linguist::detect(blob) }
bench_total(samples, 'Classify()', iterations) { |blob| Linguist::Classifier.classify(Linguist::Samples.cache, blob.data) }
bench_total(samples, 'GetLanguagesByModeline()', iterations) { |blob| Linguist::Strategy::Modeline.call(blob, languages) }
bench_total(samples, 'GetLanguagesByFilename()', iterations) { |blob| Linguist::Strategy::Filename.call(blob, languages) }
bench_total(samples, 'GetLanguagesByShebang()', iterations) { |blob| Linguist::Shebang.call(blob, languages) }
bench_total(samples, 'GetLanguagesByExtension()', iterations) { |blob| Linguist::Strategy::Extension.call(blob, languages) }
bench_total(samples, 'GetLanguagesByContent()', iterations) { |blob| Linguist::Heuristics.call(blob, languages) }

5
benchmark/parse.sh Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/sh
# Convert the raw .bench outputs into CSVs, then build the distribution CSVs.
# Both steps run benchmark/parser/main.go; the second runs from the csv
# directory so the parser finds the per-sample CSVs the first step produced.
cd benchmark/output && go run ../parser/main.go -outdir ../csv && \
cd ../csv && go run ../parser/main.go -distribution

386
benchmark/parser/main.go Normal file
View File

@@ -0,0 +1,386 @@
package main
import (
"bufio"
"bytes"
"encoding/csv"
"flag"
"fmt"
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
)
const (
	// Names of the detection functions benchmarked; they appear verbatim
	// in the bench output lines and in the generated CSV rows.
	getLanguageFunc = "GetLanguage()"
	classifyFunc    = "Classify()"
	modelineFunc    = "GetLanguagesByModeline()"
	filenameFunc    = "GetLanguagesByFilename()"
	shebangFunc     = "GetLanguagesByShebang()"
	extensionFunc   = "GetLanguagesByExtension()"
	contentFunc     = "GetLanguagesByContent()"

	// Raw benchmark output files consumed by this parser.
	enryTotalBench       = "enry_total.bench"
	enrySamplesBench     = "enry_samples.bench"
	linguistTotalBench   = "linguist_total.bench"
	linguistSamplesBench = "linguist_samples.bench"

	// CSV files generated by default.
	enryTotalCSV       = "enry-total.csv"
	enrySamplesCSV     = "enry-samples.csv"
	linguistTotalCSV   = "linguist-total.csv"
	linguistSamplesCSV = "linguist-samples.csv"

	// CSV files generated when the -distribution flag is set.
	enryDistributionCSV     = "enry-distribution.csv"
	linguistDistributionCSV = "linguist-distribution.csv"
)

var (
	// Command-line flags, set in main.
	distribution bool
	outDir       string

	// Canonical ordering of function names in every generated CSV.
	enryFunctions = []string{getLanguageFunc, classifyFunc, modelineFunc, filenameFunc, shebangFunc, extensionFunc, contentFunc}
	// Histogram buckets for the distribution CSVs (see arrangeByTime).
	distributionIntervals = []string{"1us-10us", "10us-100us", "100us-1ms", "1ms-10ms", "10ms-100ms"}
)
// main parses the command-line flags, then generates either the
// distribution CSVs (-distribution) or the benchmark-result CSVs (default).
func main() {
	// Fixed typo in the help text: the generated file is
	// "enry-distribution.csv" (see enryDistributionCSV above).
	flag.BoolVar(&distribution, "distribution", false, "generate enry-distribution.csv and linguist-distribution.csv")
	flag.StringVar(&outDir, "outdir", "", "path to leave csv files")
	flag.Parse()

	if distribution {
		generateDistributionCSV()
		return
	}

	generateCSV()
}
// generateDistributionCSV turns the per-sample CSVs into per-interval
// distribution CSVs, one for enry and one for linguist. Errors are logged
// and the remaining inputs are still processed.
func generateDistributionCSV() {
	CSVFiles := []struct {
		in   string
		out  string
		tool string
	}{
		{in: enrySamplesCSV, out: enryDistributionCSV, tool: "enry"},
		{in: linguistSamplesCSV, out: linguistDistributionCSV, tool: "linguist"},
	}

	for _, CSVFile := range CSVFiles {
		// readCSV closes each file as soon as it is parsed; the original
		// used `defer f.Close()` inside this loop, which kept every file
		// open until the function returned.
		CSVSamples, err := readCSV(CSVFile.in)
		if err != nil {
			log.Println(err)
			continue
		}
		// Skip the header row; buildDistribution only wants data rows.
		CSVDistribution, err := buildDistribution(CSVSamples[1:], CSVFile.tool)
		if err != nil {
			log.Println(err)
			continue
		}
		if err := writeCSV(CSVDistribution, filepath.Join(outDir, CSVFile.out)); err != nil {
			log.Println(err)
			continue
		}
	}
}

// readCSV opens, fully parses, and closes a CSV file.
func readCSV(path string) ([][]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return csv.NewReader(f).ReadAll()
}
// buildDistribution counts how many GetLanguage() sample rows fall into each
// time interval and returns the counts as CSV rows (header first) in the
// fixed distributionIntervals order. It also prints a summary to stdout.
func buildDistribution(CSVSamples [][]string, tool string) ([][]string, error) {
	count := make(map[string]int, len(distributionIntervals))
	for _, row := range CSVSamples {
		// Only the top-level GetLanguage() rows enter the distribution.
		if row[1] != getLanguageFunc {
			continue
		}
		// The last column is the ns/op value produced by getAverage.
		num, err := strconv.ParseFloat(row[len(row)-1], 64)
		if err != nil {
			return nil, err
		}
		arrangeByTime(count, num)
	}

	CSVDistribution := make([][]string, 0, len(count)+1)
	firstLine := []string{"timeInterval", tool, "numberOfFiles"}
	CSVDistribution = append(CSVDistribution, firstLine)
	for _, interval := range distributionIntervals {
		number := strconv.FormatInt(int64(count[interval]), 10)
		row := []string{interval, tool, number}
		CSVDistribution = append(CSVDistribution, row)
	}

	printDistributionInfo(count, tool)
	return CSVDistribution, nil
}
// printDistributionInfo writes a human-readable summary of the interval
// counts for tool to stdout: total file count, absolute counts per
// interval, and the percentage each interval represents.
func printDistributionInfo(count map[string]int, tool string) {
	var total int
	for _, n := range count {
		total += n
	}
	fmt.Println(tool, "files", total)

	fmt.Println("Distribution")
	for _, bucket := range distributionIntervals {
		fmt.Println("\t", bucket, count[bucket])
	}

	fmt.Println("Percentage")
	for _, bucket := range distributionIntervals {
		share := (float64(count[bucket]) / float64(total)) * 100.00
		fmt.Printf("\t %s %f%%\n", bucket, share)
	}
	fmt.Printf("\n\n")
}
// arrangeByTime increments the histogram bucket whose (lower, upper] bound
// in nanoseconds contains num. Values at or below 1us or above 100ms fall
// outside every bucket and are not counted (same as the original switch).
func arrangeByTime(count map[string]int, num float64) {
	// Bucket i covers (bounds[i], bounds[i+1]] nanoseconds.
	bounds := []float64{1000.00, 10000.00, 100000.00, 1000000.00, 10000000.00, 100000000.00}
	for i, interval := range distributionIntervals {
		if num > bounds[i] && num <= bounds[i+1] {
			count[interval]++
			return
		}
	}
}
func writeCSV(CSVData [][]string, outPath string) error {
out, err := os.Create(outPath)
if err != nil {
return err
}
w := csv.NewWriter(out)
w.WriteAll(CSVData)
if err := w.Error(); err != nil {
return err
}
return nil
}
// parse is the signature shared by parseTotal and parseSamples:
// raw bench output in, CSV rows out.
type parse func(data []byte, tool string) ([][]string, error)

// generateCSV converts each known .bench file into its CSV counterpart in
// outDir. Missing or malformed inputs are logged and skipped so the other
// files are still generated.
func generateCSV() {
	bmFiles := []struct {
		in    string
		out   string
		tool  string
		parse parse
	}{
		{in: enryTotalBench, out: enryTotalCSV, tool: "enry", parse: parseTotal},
		{in: linguistTotalBench, out: linguistTotalCSV, tool: "linguist", parse: parseTotal},
		{in: enrySamplesBench, out: enrySamplesCSV, tool: "enry", parse: parseSamples},
		{in: linguistSamplesBench, out: linguistSamplesCSV, tool: "linguist", parse: parseSamples},
	}

	for _, bmFile := range bmFiles {
		buf, err := ioutil.ReadFile(bmFile.in)
		if err != nil {
			log.Println(err)
			continue
		}
		info, err := bmFile.parse(buf, bmFile.tool)
		if err != nil {
			log.Println(err)
			continue
		}
		if err := writeCSV(info, filepath.Join(outDir, bmFile.out)); err != nil {
			log.Println(err)
			continue
		}
	}
}
// parseTotal extracts the "_TOTAL" result lines from raw bench output and
// returns them as CSV rows headed by function/tool/iterations/ns-op.
func parseTotal(data []byte, tool string) ([][]string, error) {
	const totalLine = "_TOTAL"
	parsedInfo := map[string][]string{}
	buf := bufio.NewScanner(bytes.NewReader(data))
	for buf.Scan() {
		line := buf.Text()
		if strings.Contains(line, totalLine) {
			split := strings.Fields(line)
			row, err := getRow(split, tool)
			if err != nil {
				return nil, err
			}
			// Keyed by function name (row[0]); a later line for the same
			// function overwrites an earlier one.
			parsedInfo[row[0]] = row
		}
	}
	if err := buf.Err(); err != nil {
		return nil, err
	}

	firstLine := []string{"function", "tool", "iterations", "ns/op"}
	return prepareInfoForCSV(parsedInfo, firstLine), nil
}
// getRow builds one CSV row [function, tool, iterations, ns-per-op] from a
// whitespace-split bench output line.
// NOTE(review): if line[0] matches none of enryFunctions the function column
// is silently omitted and the remaining columns shift left — this assumes
// callers only pass lines containing a known function name; verify.
func getRow(line []string, tool string) ([]string, error) {
	row := make([]string, 0, 3)
	for _, function := range enryFunctions {
		if strings.Contains(line[0], function) {
			row = append(row, function)
			break
		}
	}
	row = append(row, tool)

	// Second field of a bench line is the iteration count.
	iterations := line[1]
	row = append(row, iterations)

	average, err := getAverage(line)
	if err != nil {
		return nil, err
	}
	row = append(row, average)

	return row, nil
}
// getAverage returns the per-iteration time for one bench line as a string.
// Go bench lines already carry ns/op in the third field; Ruby Benchmark
// lines end with the total wall time in parentheses, which is converted to
// nanoseconds and divided by the iteration count in the second field.
func getAverage(line []string) (string, error) {
	last := line[len(line)-1]
	if !strings.HasSuffix(last, ")") {
		// Go format: ns/op is already the third column.
		return line[2], nil
	}

	// Ruby format: "(<seconds>)" — strip the parentheses and convert.
	totalSeconds, err := strconv.ParseFloat(strings.Trim(last, "() "), 64)
	if err != nil {
		return "", err
	}
	iters, err := strconv.ParseFloat(line[1], 64)
	if err != nil {
		return "", err
	}

	perOpNs := (totalSeconds * math.Pow10(9)) / iters
	return fmt.Sprintf("%d", int(perOpNs)), nil
}
// prepareInfoForCSV flattens parsedInfo into CSV rows in the canonical
// enryFunctions order, prefixed by firstLine when it is non-empty.
// NOTE(review): a function absent from parsedInfo contributes a nil row.
func prepareInfoForCSV(parsedInfo map[string][]string, firstLine []string) [][]string {
	info := createInfoWithFirstLine(firstLine, len(parsedInfo))
	for _, function := range enryFunctions {
		info = append(info, parsedInfo[function])
	}
	return info
}
// createInfoWithFirstLine allocates a row slice with capacity for
// sliceLength data rows; when firstLine is non-empty it is placed first
// and the capacity is extended by one to hold it.
func createInfoWithFirstLine(firstLine []string, sliceLength int) [][]string {
	if len(firstLine) == 0 {
		return make([][]string, 0, sliceLength)
	}
	info := make([][]string, 0, sliceLength+1)
	return append(info, firstLine)
}
// enryFuncs maps each benchmarked function name to its CSV row
// (nil until a bench line for that function has been parsed).
type enryFuncs map[string][]string

// newEnryFuncs returns an enryFuncs with every known function present
// and its row still nil, so iteration order helpers see all keys.
func newEnryFuncs() enryFuncs {
	return enryFuncs{
		getLanguageFunc: nil,
		classifyFunc:    nil,
		modelineFunc:    nil,
		filenameFunc:    nil,
		shebangFunc:     nil,
		extensionFunc:   nil,
		contentFunc:     nil,
	}
}
// parseSamples extracts the per-sample ("SAMPLE_") result lines from raw
// bench output, groups them per sample file, and returns them as CSV rows
// headed by file/function/tool/iterations/ns-op.
func parseSamples(data []byte, tool string) ([][]string, error) {
	const sampleLine = "SAMPLE_"
	parsedInfo := map[string]enryFuncs{}
	buf := bufio.NewScanner(bytes.NewReader(data))
	for buf.Scan() {
		line := buf.Text()
		if strings.Contains(line, sampleLine) {
			split := strings.Fields(line)
			name := getSampleName(split[0])
			if _, ok := parsedInfo[name]; !ok {
				parsedInfo[name] = newEnryFuncs()
			}

			row := make([]string, 0, 4)
			row = append(row, name)
			r, err := getRow(split, tool)
			if err != nil {
				return nil, err
			}
			row = append(row, r...)

			// row[1] is the function name appended by getRow.
			function := row[1]
			parsedInfo[name][function] = row
		}
	}
	if err := buf.Err(); err != nil {
		return nil, err
	}

	firstLine := []string{"file", "function", "tool", "iterations", "ns/op"}
	return prepareSamplesInfoForCSV(parsedInfo, firstLine), nil
}
// getSampleName extracts the sample path from a bench name: everything
// after the "SAMPLE_" marker, minus the "-<GOMAXPROCS>" suffix that the Go
// benchmark runner appends to sub-benchmark names.
func getSampleName(s string) string {
	const marker = "SAMPLE_"
	name := s[strings.Index(s, marker)+len(marker):]
	procSuffix := fmt.Sprintf("-%d", runtime.GOMAXPROCS(-1))
	return strings.TrimSuffix(name, procSuffix)
}
// prepareSamplesInfoForCSV flattens the per-sample map into CSV rows:
// samples sorted by path, functions in canonical order within each sample,
// prefixed by firstLine when non-empty.
func prepareSamplesInfoForCSV(parsedInfo map[string]enryFuncs, firstLine []string) [][]string {
	info := createInfoWithFirstLine(firstLine, len(parsedInfo)*len(enryFunctions))
	orderedKeys := sortKeys(parsedInfo)
	for _, path := range orderedKeys {
		// nil firstLine: only the header of the whole file carries one.
		sampleInfo := prepareInfoForCSV(parsedInfo[path], nil)
		info = append(info, sampleInfo...)
	}
	return info
}
// sortKeys returns the sample paths of parsedInfo in ascending order, so
// CSV output is deterministic despite map iteration order.
func sortKeys(parsedInfo map[string]enryFuncs) []string {
	names := make([]string, 0, len(parsedInfo))
	for name := range parsedInfo {
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

View File

@@ -0,0 +1,19 @@
# Render a clustered histogram of the processing-time distribution as a
# large JPEG under benchmark/histogram/.
set terminal jpeg large font arial size 1920,1080
set output 'benchmark/histogram/distribution.jpg'

# Input CSVs (produced by benchmark/parser) are comma separated.
set datafile separator comma
set key under
set style data histogram
set style histogram clustered gap 1 title offset 1,1
set style fill solid noborder
set boxwidth 0.95
set grid y
set bmargin 12
set autoscale
set title "Number of files whose processed time was inside time interval"

# Column 3 holds the file count; column 1 labels the time interval bucket.
plot newhistogram, 'benchmark/csv/enry-distribution.csv' using 3:xtic(1) title "enry", 'benchmark/csv/linguist-distribution.csv' using 3 title "linguist"
unset output

4
benchmark/plot-histogram.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Render the benchmark distribution histogram (see plot-histogram.gp for
# the gnuplot script and its input/output paths).
gnuplot benchmark/plot-histogram.gp

4
benchmark/run-benchmark.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Run the whole-corpus benchmarks: the Go (enry) benchmarks first, then the
# Ruby linguist totals script, writing raw outputs for benchmark/parse.sh.
go test -run NONE -bench=. -benchtime=120s -timeout=100h >benchmark/output/enry_total.bench && \
benchmark/linguist-total.rb 5 >benchmark/output/linguist_total.bench

4
benchmark/run.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Full pipeline: total benchmarks, slow per-sample benchmarks, CSV parsing,
# then histogram plotting. Stops at the first failing stage.
benchmark/run-benchmark.sh && make benchmarks-slow && \
benchmark/parse.sh && benchmark/plot-histogram.sh

194
benchmark_test.go Normal file
View File

@@ -0,0 +1,194 @@
package enry
import (
"flag"
"io/ioutil"
"log"
"os"
"path/filepath"
"testing"
)
// samplesDir is the linguist checkout's sample corpus used by all benchmarks.
const samplesDir = ".linguist/samples"

// sample is one corpus file loaded fully into memory, so disk I/O never
// shows up in the benchmark timings.
type sample struct {
	filename string
	content  []byte
}

var (
	// slow enables the per-sample benchmarks (registered as -slow in TestMain).
	slow bool
	// overcomeLanguage / overcomeLanguages are package-level sinks that
	// benchmark results are written to — presumably to keep the measured
	// calls from being optimized away; TODO confirm.
	overcomeLanguage  string
	overcomeLanguages []string
	// samples holds the whole corpus, loaded once in TestMain.
	samples []*sample
)
// TestMain registers the -slow flag and preloads the sample corpus before
// any benchmark runs; it aborts the whole run if loading fails.
func TestMain(m *testing.M) {
	flag.BoolVar(&slow, "slow", false, "run benchmarks per sample for strategies too")
	flag.Parse()

	var err error
	samples, err = getSamples(samplesDir)
	if err != nil {
		log.Fatal(err)
	}

	os.Exit(m.Run())
}
// getSamples walks dir and loads every regular file fully into memory,
// returning one sample per file. Walk errors and read errors abort the walk
// and are returned alongside whatever was collected so far.
func getSamples(dir string) ([]*sample, error) {
	collected := make([]*sample, 0, 2000)
	walkFn := func(path string, info os.FileInfo, err error) error {
		switch {
		case err != nil:
			return err
		case info.IsDir():
			return nil
		}
		data, err := ioutil.ReadFile(path)
		if err != nil {
			return err
		}
		collected = append(collected, &sample{
			filename: path,
			content:  data,
		})
		return nil
	}
	err := filepath.Walk(dir, walkFn)
	return collected, err
}
// BenchmarkGetLanguageTotal times GetLanguage over the whole corpus per
// iteration. Skipped in -slow mode, where the per-sample variants run instead.
func BenchmarkGetLanguageTotal(b *testing.B) {
	if slow {
		b.SkipNow()
	}

	var o string
	b.Run("GetLanguage()_TOTAL", func(b *testing.B) {
		for n := 0; n < b.N; n++ {
			for _, sample := range samples {
				o = GetLanguage(sample.filename, sample.content)
			}
		}
		// Store the last result in the package sink so the call stays
		// observable — presumably to defeat dead-code elimination; confirm.
		overcomeLanguage = o
	})
}
// BenchmarkClassifyTotal times the classifier alone over the whole corpus
// per iteration. Skipped in -slow mode.
func BenchmarkClassifyTotal(b *testing.B) {
	if slow {
		b.SkipNow()
	}

	var o []string
	b.Run("Classify()_TOTAL", func(b *testing.B) {
		for n := 0; n < b.N; n++ {
			for _, sample := range samples {
				o = DefaultClassifier.Classify(sample.content, nil)
			}
			// Sink assignment; sits inside the b.N loop here, while
			// BenchmarkGetLanguageTotal assigns after it — same effect.
			overcomeLanguages = o
		}
	})
}
// BenchmarkStrategiesTotal times each individual detection strategy over the
// whole corpus per iteration. Skipped in -slow mode.
func BenchmarkStrategiesTotal(b *testing.B) {
	if slow {
		b.SkipNow()
	}

	benchmarks := []struct {
		name       string
		strategy   Strategy
		candidates []string // never populated: every strategy runs with nil candidates
	}{
		{name: "GetLanguagesByModeline()_TOTAL", strategy: GetLanguagesByModeline},
		{name: "GetLanguagesByFilename()_TOTAL", strategy: GetLanguagesByFilename},
		{name: "GetLanguagesByShebang()_TOTAL", strategy: GetLanguagesByShebang},
		{name: "GetLanguagesByExtension()_TOTAL", strategy: GetLanguagesByExtension},
		{name: "GetLanguagesByContent()_TOTAL", strategy: GetLanguagesByContent},
	}

	var o []string
	for _, benchmark := range benchmarks {
		b.Run(benchmark.name, func(b *testing.B) {
			for n := 0; n < b.N; n++ {
				for _, sample := range samples {
					o = benchmark.strategy(sample.filename, sample.content, benchmark.candidates)
				}
				// Sink assignment to keep the result observable.
				overcomeLanguages = o
			}
		})
	}
}
// BenchmarkGetLanguagePerSample times GetLanguage once per corpus file as a
// separate sub-benchmark ("…_SAMPLE_<path>", the names benchmark/parser
// expects). Only runs in -slow mode.
func BenchmarkGetLanguagePerSample(b *testing.B) {
	if !slow {
		b.SkipNow()
	}

	var o string
	for _, sample := range samples {
		b.Run("GetLanguage()_SAMPLE_"+sample.filename, func(b *testing.B) {
			for n := 0; n < b.N; n++ {
				o = GetLanguage(sample.filename, sample.content)
			}
			// Sink assignment to keep the result observable.
			overcomeLanguage = o
		})
	}
}
// BenchmarkClassifyPerSample times the classifier once per corpus file as a
// separate sub-benchmark. Only runs in -slow mode.
func BenchmarkClassifyPerSample(b *testing.B) {
	if !slow {
		b.SkipNow()
	}

	var o []string
	for _, sample := range samples {
		b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
			for n := 0; n < b.N; n++ {
				o = DefaultClassifier.Classify(sample.content, nil)
			}
			// Sink assignment to keep the result observable.
			overcomeLanguages = o
		})
	}
}
// BenchmarkStrategiesPerSample times each detection strategy once per corpus
// file as separate sub-benchmarks. Only runs in -slow mode.
func BenchmarkStrategiesPerSample(b *testing.B) {
	if !slow {
		b.SkipNow()
	}

	benchmarks := []struct {
		name       string
		strategy   Strategy
		candidates []string // never populated: every strategy runs with nil candidates
	}{
		{name: "GetLanguagesByModeline()_SAMPLE_", strategy: GetLanguagesByModeline},
		{name: "GetLanguagesByFilename()_SAMPLE_", strategy: GetLanguagesByFilename},
		{name: "GetLanguagesByShebang()_SAMPLE_", strategy: GetLanguagesByShebang},
		{name: "GetLanguagesByExtension()_SAMPLE_", strategy: GetLanguagesByExtension},
		{name: "GetLanguagesByContent()_SAMPLE_", strategy: GetLanguagesByContent},
	}

	var o []string
	for _, benchmark := range benchmarks {
		for _, sample := range samples {
			b.Run(benchmark.name+sample.filename, func(b *testing.B) {
				for n := 0; n < b.N; n++ {
					o = benchmark.strategy(sample.filename, sample.content, benchmark.candidates)
				}
				// Sink assignment to keep the result observable.
				overcomeLanguages = o
			})
		}
	}
}

View File

@@ -3,7 +3,6 @@ package enry
import (
"bytes"
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
@@ -80,20 +79,3 @@ func (s *EnryTestSuite) TestIsBinary() {
assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
}
}
const (
htmlPath = "some/random/dir/file.html"
jsPath = "some/random/dir/file.js"
)
func BenchmarkVendor(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = IsVendor(htmlPath)
}
}
func BenchmarkVendorJS(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = IsVendor(jsPath)
}
}