Merge pull request #184 from bzz/maintenance/update-benchmark

Update benchmarks to latest Enry and Github-Linguist
2025-08-02 13:59:51 +00:00 · 2018-12-28 12:03:17 +01:00
parent ef50154395 890afc400f
commit f28fc12300
10 changed files with 28117 additions and 25770 deletions
--- a/README.md
+++ b/README.md
@@ -217,13 +217,27 @@ Golang's regexp engine being slower than Ruby's, which uses the [oniguruma](http
 You can find scripts and additional information (like software and hardware used
 and benchmarks' results per sample file) in [*benchmarks*](https://github.com/src-d/enry/blob/master/benchmarks) directory.

-If you want to reproduce the same benchmarks you can run:

-    benchmarks/run.sh
+### Benchmark Dependencies
+As the benchmarks depend on Ruby and the Github-Linguist gem, make sure you have:
+ - Ruby (e.g. using [`rbenv`](https://github.com/rbenv/rbenv)) and [`bundler`](https://bundler.io/) installed
+ - Docker
+ - [native dependencies](https://github.com/github/linguist/#dependencies) installed
+ - Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
+ - Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`

-from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).

-This can take some time, so to run local benchmarks for a quick check you can either:
+### How to reproduce current results
+
+If you want to reproduce the same benchmarks as reported above:
+ - Make sure all [dependencies](#benchmark-dependencies) are installed
+ - Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
+ - Run `ENRY_TEST_REPO=.linguist benchmarks/run.sh` (takes ~15h)
+
+It will run the benchmarks for enry and linguist, parse the output, create csv files and plot the histogram. This takes some time.
+
+### Quick benchmarks
+To run quicker benchmarks you can either:

    make benchmarks

@@ -231,7 +245,7 @@ to get average times for the main detection function and strategies for the whol

    make benchmarks-samples

-if you want to see measures by sample file.
+if you want to see measures per sample file.


 Why Enry?
--- a/benchmark_test.go
+++ b/benchmark_test.go
@@ -2,6 +2,7 @@ package enry

 import (
 	"flag"
+	"fmt"
 	"io/ioutil"
 	"log"
 	"os"
@@ -110,11 +111,9 @@ func getSamples(dir string) ([]*sample, error) {
 			filename: path,
 			content:  content,
 		}
-
 		samples = append(samples, s)
 		return nil
 	})
-
 	return samples, err
 }

@@ -157,17 +156,7 @@ func BenchmarkStrategiesTotal(b *testing.B) {
 		b.SkipNow()
 	}

-	benchmarks := []struct {
-		name       string
-		strategy   Strategy
-		candidates []string
-	}{
-		{name: "GetLanguagesByModeline()_TOTAL", strategy: GetLanguagesByModeline},
-		{name: "GetLanguagesByFilename()_TOTAL", strategy: GetLanguagesByFilename},
-		{name: "GetLanguagesByShebang()_TOTAL", strategy: GetLanguagesByShebang},
-		{name: "GetLanguagesByExtension()_TOTAL", strategy: GetLanguagesByExtension},
-		{name: "GetLanguagesByContent()_TOTAL", strategy: GetLanguagesByContent},
-	}
+	benchmarks := benchmarkForAllStrategies("TOTAL")

 	var o []string
 	for _, benchmark := range benchmarks {
@@ -222,17 +211,7 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
 		b.SkipNow()
 	}

-	benchmarks := []struct {
-		name       string
-		strategy   Strategy
-		candidates []string
-	}{
-		{name: "GetLanguagesByModeline()_SAMPLE_", strategy: GetLanguagesByModeline},
-		{name: "GetLanguagesByFilename()_SAMPLE_", strategy: GetLanguagesByFilename},
-		{name: "GetLanguagesByShebang()_SAMPLE_", strategy: GetLanguagesByShebang},
-		{name: "GetLanguagesByExtension()_SAMPLE_", strategy: GetLanguagesByExtension},
-		{name: "GetLanguagesByContent()_SAMPLE_", strategy: GetLanguagesByContent},
-	}
+	benchmarks := benchmarkForAllStrategies("SAMPLE")

 	var o []string
 	for _, benchmark := range benchmarks {
@@ -247,3 +226,19 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
 		}
 	}
 }
+
+type strategyName struct {
+	name       string
+	strategy   Strategy
+	candidates []string
+}
+
+func benchmarkForAllStrategies(class string) []strategyName {
+	return []strategyName{
+		{name: fmt.Sprintf("GetLanguagesByModeline()_%s_", class), strategy: GetLanguagesByModeline},
+		{name: fmt.Sprintf("GetLanguagesByFilename()_%s_", class), strategy: GetLanguagesByFilename},
+		{name: fmt.Sprintf("GetLanguagesByShebang()_%s_", class), strategy: GetLanguagesByShebang},
+		{name: fmt.Sprintf("GetLanguagesByExtension()_%s_", class), strategy: GetLanguagesByExtension},
+		{name: fmt.Sprintf("GetLanguagesByContent()_%s_", class), strategy: GetLanguagesByContent},
+	}
+}
--- a/benchmarks/csv/enry-distribution.csv
+++ b/benchmarks/csv/enry-distribution.csv
@@ -1,6 +1,6 @@
 timeInterval,enry,numberOfFiles
-1us-10us,enry,96
-10us-100us,enry,1244
-100us-1ms,enry,321
-1ms-10ms,enry,135
-10ms-100ms,enry,43
+1us-10us,enry,83
+10us-100us,enry,1341
+100us-1ms,enry,314
+1ms-10ms,enry,146
+10ms-100ms,enry,48
--- a/benchmarks/csv/enry-samples.csv
+++ b/benchmarks/csv/enry-samples.csv
--- a/benchmarks/csv/enry-total.csv
+++ b/benchmarks/csv/enry-total.csv
@@ -1,8 +1,8 @@
 function,tool,iterations,ns/op
-GetLanguage(),enry,100,1915861259
-Classify(),enry,5,39977943775
-GetLanguagesByModeline(),enry,1000,196571071
-GetLanguagesByFilename(),enry,2000000,89774
-GetLanguagesByShebang(),enry,100000,1892569
-GetLanguagesByExtension(),enry,200000,921160
-GetLanguagesByContent(),enry,1000,286159159
+GetLanguage(),enry,100,2333748307
+Classify(),enry,3,53842505853
+GetLanguagesByModeline(),enry,1000,228234491
+GetLanguagesByFilename(),enry,1000000,124782
+GetLanguagesByShebang(),enry,100000,2339138
+GetLanguagesByExtension(),enry,200000,1110007
+GetLanguagesByContent(),enry,500,342358978
--- a/benchmarks/csv/linguist-distribution.csv
+++ b/benchmarks/csv/linguist-distribution.csv
@@ -1,6 +1,6 @@
 timeInterval,linguist,numberOfFiles
 1us-10us,linguist,0
-10us-100us,linguist,74
-100us-1ms,linguist,920
-1ms-10ms,linguist,788
-10ms-100ms,linguist,57
+10us-100us,linguist,120
+100us-1ms,linguist,1070
+1ms-10ms,linguist,816
+10ms-100ms,linguist,71
--- a/benchmarks/csv/linguist-samples.csv
+++ b/benchmarks/csv/linguist-samples.csv
--- a/benchmarks/csv/linguist-total.csv
+++ b/benchmarks/csv/linguist-total.csv
@@ -1,8 +1,8 @@
 function,tool,iterations,ns/op
-GetLanguage(),linguist,5,3979096800
-Classify(),linguist,5,178253431800
-GetLanguagesByModeline(),linguist,5,2582204000
-GetLanguagesByFilename(),linguist,5,2688800
-GetLanguagesByShebang(),linguist,5,77155200
-GetLanguagesByExtension(),linguist,5,6688800
-GetLanguagesByContent(),linguist,5,161719000
+GetLanguage(),linguist,5,3822076000
+Classify(),linguist,5,329660597600
+GetLanguagesByModeline(),linguist,5,2770912600
+GetLanguagesByFilename(),linguist,5,34159000
+GetLanguagesByShebang(),linguist,5,159317200
+GetLanguagesByExtension(),linguist,5,354929800
+GetLanguagesByContent(),linguist,5,3881611000
--- a/benchmarks/histogram/distribution.png
+++ b/benchmarks/histogram/distribution.png
--- a/benchmarks/soft-hard-info.txt
+++ b/benchmarks/soft-hard-info.txt
@@ -1,9 +1,9 @@
 # Hardware and software used to run benchmarks

-Dell XPS 9360
-Linux 4.11.6-3-ARCH #1 SMP PREEMPT Thu Jun 22 12:21:46 CEST 2017 x86_64
-go version go1.8.3 linux/amd64
-ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-linux]
-
-github/linguist/samples commit: d5c8db3fb91963c4b2762ca2ea2ff7cfac109f68
+MacBookPro13,1
+Darwin Kernel Version 16.7.0: Tue Jan 30 11:27:06 PST 2018; root:xnu-3789.73.11~1/RELEASE_X86_64 x86_64 i386
+go version go1.10.3 darwin/amd64
+ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-darwin16]

+github/linguist v7.1.3 commit: e761f9b013e5b61161481fcb898b59721ee40e3d
+src-d/enry v1.6.7 commit: 3d356c70ae322f41048f74d01c5e8572f5898d34