Merge pull request #184 from bzz/maintenance/update-benchmark

Update benchmarks to latest Enry and Github-Linguist
2025-07-12 04:09:48 +00:00 · 2018-12-28 12:03:17 +01:00
parent ef50154395 890afc400f
commit f28fc12300
10 changed files with 28117 additions and 25770 deletions
--- a/README.md
+++ b/README.md
@ -217,13 +217,27 @@ Golang's regexp engine being slower than Ruby's, which uses the [oniguruma](http
 You can find scripts and additional information (like software and hardware used
 and benchmarks' results per sample file) in [*benchmarks*](https://github.com/src-d/enry/blob/master/benchmarks) directory.
 If you want to reproduce the same benchmarks you can run:
-    benchmarks/run.sh
+### Benchmark Dependencies
 As the benchmarks depend on Ruby and the Github-Linguist gem, make sure you have:
 - Ruby (e.g. using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
 - Docker
 - [native dependencies](https://github.com/github/linguist/#dependencies) installed
 - Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
 - Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
 from the project's root directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have [gnuplot](http://gnuplot.info) installed on your system to get the histogram).
-This can take some time, so to run local benchmarks for a quick check you can either:
+### How to reproduce current results
 If you want to reproduce the same benchmarks as reported above:
 - Make sure all [dependencies](#benchmark-dependencies) are installed
 - Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
 - Run `ENRY_TEST_REPO=.linguist benchmarks/run.sh` (takes ~15h)
 It will run the benchmarks for enry and linguist, parse the output, create csv files and plot the histogram. This takes some time.
 ### Quick
 To run quicker benchmarks you can either:
    make benchmarks
@ -231,7 +245,7 @@ to get average times for the main detection function and strategies for the whol
    make benchmarks-samples
-if you want to see measures by sample file.
+if you want to see measures per sample file.
 Why Enry?
--- a/benchmark_test.go
+++ b/benchmark_test.go
@ -2,6 +2,7 @@ package enry
 import (
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"log"
 	"os"
@ -110,11 +111,9 @@ func getSamples(dir string) ([]*sample, error) {
 			filename: path,
 			content:  content,
 		}
 		samples = append(samples, s)
 		return nil
 	})
 	return samples, err
 }
@ -157,17 +156,7 @@ func BenchmarkStrategiesTotal(b *testing.B) {
 		b.SkipNow()
 	}
-	benchmarks := []struct {
+	benchmarks := benchmarkForAllStrategies("TOTAL")
 		name       string
 		strategy   Strategy
 		candidates []string
 	}{
 		{name: "GetLanguagesByModeline()_TOTAL", strategy: GetLanguagesByModeline},
 		{name: "GetLanguagesByFilename()_TOTAL", strategy: GetLanguagesByFilename},
 		{name: "GetLanguagesByShebang()_TOTAL", strategy: GetLanguagesByShebang},
 		{name: "GetLanguagesByExtension()_TOTAL", strategy: GetLanguagesByExtension},
 		{name: "GetLanguagesByContent()_TOTAL", strategy: GetLanguagesByContent},
 	}
 	var o []string
 	for _, benchmark := range benchmarks {
@ -222,17 +211,7 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
 		b.SkipNow()
 	}
-	benchmarks := []struct {
+	benchmarks := benchmarkForAllStrategies("SAMPLE")
 		name       string
 		strategy   Strategy
 		candidates []string
 	}{
 		{name: "GetLanguagesByModeline()_SAMPLE_", strategy: GetLanguagesByModeline},
 		{name: "GetLanguagesByFilename()_SAMPLE_", strategy: GetLanguagesByFilename},
 		{name: "GetLanguagesByShebang()_SAMPLE_", strategy: GetLanguagesByShebang},
 		{name: "GetLanguagesByExtension()_SAMPLE_", strategy: GetLanguagesByExtension},
 		{name: "GetLanguagesByContent()_SAMPLE_", strategy: GetLanguagesByContent},
 	}
 	var o []string
 	for _, benchmark := range benchmarks {
@ -247,3 +226,19 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
 		}
 	}
 }
 // strategyName pairs a benchmark label with the detection strategy it
 // exercises, so the strategy benchmarks can share one table of cases.
 type strategyName struct {
 	// name is the label of the benchmark case (built by
 	// benchmarkForAllStrategies as "<Func>()_<class>_").
 	name       string
 	// strategy is the language-detection function under benchmark.
 	strategy   Strategy
 	// candidates is left unset by benchmarkForAllStrategies; presumably it
 	// is the candidate list handed to strategy — TODO confirm against the
 	// benchmark bodies.
 	candidates []string
 }
 // benchmarkForAllStrategies returns one benchmark case per detection
 // strategy, in a fixed order (modeline, filename, shebang, extension,
 // content). Each case is named "<StrategyFunc>()_<class>_", where class is
 // the caller-supplied benchmark class; candidates is left at its zero value.
 func benchmarkForAllStrategies(class string) []strategyName {
 	// Name templates and their strategies; order defines the result order.
 	table := []struct {
 		format   string
 		strategy Strategy
 	}{
 		{format: "GetLanguagesByModeline()_%s_", strategy: GetLanguagesByModeline},
 		{format: "GetLanguagesByFilename()_%s_", strategy: GetLanguagesByFilename},
 		{format: "GetLanguagesByShebang()_%s_", strategy: GetLanguagesByShebang},
 		{format: "GetLanguagesByExtension()_%s_", strategy: GetLanguagesByExtension},
 		{format: "GetLanguagesByContent()_%s_", strategy: GetLanguagesByContent},
 	}
 	cases := make([]strategyName, len(table))
 	for i, row := range table {
 		cases[i] = strategyName{
 			name:     fmt.Sprintf(row.format, class),
 			strategy: row.strategy,
 		}
 	}
 	return cases
 }
--- a/benchmarks/csv/enry-distribution.csv
+++ b/benchmarks/csv/enry-distribution.csv
@ -1,6 +1,6 @@
 timeInterval,enry,numberOfFiles
-1us-10us,enry,96
+1us-10us,enry,83
-10us-100us,enry,1244
+10us-100us,enry,1341
-100us-1ms,enry,321
+100us-1ms,enry,314
-1ms-10ms,enry,135
+1ms-10ms,enry,146
-10ms-100ms,enry,43
+10ms-100ms,enry,48
--- a/benchmarks/csv/enry-samples.csv
+++ b/benchmarks/csv/enry-samples.csv
--- a/benchmarks/csv/enry-total.csv
+++ b/benchmarks/csv/enry-total.csv
@ -1,8 +1,8 @@
 function,tool,iterations,ns/op
-GetLanguage(),enry,100,1915861259
+GetLanguage(),enry,100,2333748307
-Classify(),enry,5,39977943775
+Classify(),enry,3,53842505853
-GetLanguagesByModeline(),enry,1000,196571071
+GetLanguagesByModeline(),enry,1000,228234491
-GetLanguagesByFilename(),enry,2000000,89774
+GetLanguagesByFilename(),enry,1000000,124782
-GetLanguagesByShebang(),enry,100000,1892569
+GetLanguagesByShebang(),enry,100000,2339138
-GetLanguagesByExtension(),enry,200000,921160
+GetLanguagesByExtension(),enry,200000,1110007
-GetLanguagesByContent(),enry,1000,286159159
+GetLanguagesByContent(),enry,500,342358978
--- a/benchmarks/csv/linguist-distribution.csv
+++ b/benchmarks/csv/linguist-distribution.csv
@ -1,6 +1,6 @@
 timeInterval,linguist,numberOfFiles
 1us-10us,linguist,0
-10us-100us,linguist,74
+10us-100us,linguist,120
-100us-1ms,linguist,920
+100us-1ms,linguist,1070
-1ms-10ms,linguist,788
+1ms-10ms,linguist,816
-10ms-100ms,linguist,57
+10ms-100ms,linguist,71
--- a/benchmarks/csv/linguist-samples.csv
+++ b/benchmarks/csv/linguist-samples.csv
--- a/benchmarks/csv/linguist-total.csv
+++ b/benchmarks/csv/linguist-total.csv
@ -1,8 +1,8 @@
 function,tool,iterations,ns/op
-GetLanguage(),linguist,5,3979096800
+GetLanguage(),linguist,5,3822076000
-Classify(),linguist,5,178253431800
+Classify(),linguist,5,329660597600
-GetLanguagesByModeline(),linguist,5,2582204000
+GetLanguagesByModeline(),linguist,5,2770912600
-GetLanguagesByFilename(),linguist,5,2688800
+GetLanguagesByFilename(),linguist,5,34159000
-GetLanguagesByShebang(),linguist,5,77155200
+GetLanguagesByShebang(),linguist,5,159317200
-GetLanguagesByExtension(),linguist,5,6688800
+GetLanguagesByExtension(),linguist,5,354929800
-GetLanguagesByContent(),linguist,5,161719000
+GetLanguagesByContent(),linguist,5,3881611000
--- a/benchmarks/histogram/distribution.png
+++ b/benchmarks/histogram/distribution.png
--- a/benchmarks/soft-hard-info.txt
+++ b/benchmarks/soft-hard-info.txt
@ -1,9 +1,9 @@
 # Hardware and software used to run benchmarks
-Dell XPS 9360
+MacBookPro13,1
-Linux 4.11.6-3-ARCH #1 SMP PREEMPT Thu Jun 22 12:21:46 CEST 2017 x86_64
+Darwin Kernel Version 16.7.0: Tue Jan 30 11:27:06 PST 2018; root:xnu-3789.73.11~1/RELEASE_X86_64 x86_64 i386
-go version go1.8.3 linux/amd64
+go version go1.10.3 darwin/amd64
-ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-linux]
+ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-darwin16]
 github/linguist/samples commit: d5c8db3fb91963c4b2762ca2ea2ff7cfac109f68
 github/linguist v7.1.3 commit: e761f9b013e5b61161481fcb898b59721ee40e3d
 src-d/enry v1.6.7 commit: 3d356c70ae322f41048f74d01c5e8572f5898d34