From a66154b7ebcb27949f26f3fc7a76b39907c206c4 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Thu, 28 Sep 2017 23:33:25 +0200 Subject: [PATCH 1/7] Make tokenizer regexps work under rubex Signed-off-by: Vadim Markovtsev --- internal/tokenizer/tokenize.go | 47 +++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go index 307ebb9..18f04d8 100644 --- a/internal/tokenizer/tokenize.go +++ b/internal/tokenizer/tokenize.go @@ -43,17 +43,46 @@ var ( extractRemainders, } - reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) - reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) - reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/||\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) + // Differences between golang regexp and onigumura: + // 1. no (?s) in onigumura - makes dot match \n + // 2. no (?U) in onigumura - ungreedy * + // 3. (?m) implies dot matches \n in onigumura + // 4. onigumura handles \w differently - impossible, but true + // + // Workarounds: + // 1. (.|\n) + // 2. replace * with *? + // 3. replace . with [^\n] + // 4. replace \w with [0-9A-Za-z_] + // + // Original golang regexps: + // + // reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) + // reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) + // reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/||\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) + // reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) + // reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) + // rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) + // reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) + // reSGMLComment = regexp.MustCompile(`(?sU)()`) + // reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) + // reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) + // reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) + // reOperators = regexp.MustCompile(`<|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) - reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) + reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) - reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) - reSGMLComment = regexp.MustCompile(`(?sU)()`) - reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) - reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) - reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) + reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) + reSGMLComment = regexp.MustCompile(`()`) + reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) + reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`) + reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) reOperators = regexp.MustCompile(`< Date: Fri, 29 Sep 2017 12:38:57 +0200 Subject: [PATCH 2/7] Add optional activation of rubex/onigumura Signed-off-by: Vadim Markovtsev --- Makefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile b/Makefile index 724a142..2c435c7 100644 --- a/Makefile +++ b/Makefile @@ -63,6 +63,16 @@ benchmarks-slow: $(LINGUST_PATH) mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \ benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench +onigumura: + sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' internal/code-generator/generator/heuristics.go + sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' internal/tokenizer/tokenize.go + sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' common.go + +revert-onigumura: + mv internal/code-generator/generator/heuristics.go.orig internal/code-generator/generator/heuristics.go + mv internal/tokenizer/tokenize.go.orig internal/tokenizer/tokenize.go + mv common.go.orig common.go + build-cli: go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cli/enry/main.go From b087f856970b6d739203f7e3b39faba3c688f9f5 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Fri, 29 Sep 2017 12:39:18 +0200 Subject: [PATCH 3/7] Add two-engine tests Signed-off-by: Vadim Markovtsev --- .travis.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7e9c164..94bd418 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,16 +4,26 @@ go: - 1.8 - tip +addons: + apt: + packages: + - libonig-dev + matrix: allow_failures: - go: tip fast_finish: true +env: + - ONIGUMURA=1 + - ONIGUMURA=0 + install: - rm -rf $GOPATH/src/gopkg.in/src-d - mkdir -p $GOPATH/src/gopkg.in/src-d - ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1 - cd $GOPATH/src/gopkg.in/src-d/enry.v1 + - if [ "$ONIGUMURA" == "1" ]; then make onigumura; fi - go get -v -t ./... script: From 52ab47de4dc62727ceecf4f75c4413e1f8fe31e0 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Fri, 29 Sep 2017 14:58:24 +0200 Subject: [PATCH 4/7] Add the readme Signed-off-by: Vadim Markovtsev --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index ef38305..8903731 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,31 @@ To build enry's CLI you must run this will generate a binary in the project's root directory called `enry`. You can then move this binary to anywhere in your `PATH`. +### Faster regexp engine + +[Onigumura](https://github.com/kkos/oniguruma) is CRuby's regular expression engine. +It is very fast and performs better than the one built into Go runtime. +The typical overall speedup is 1.5-2x. However, it requires CGo and the external shared library. +On macOS with brew, it is + +``` +brew install onigumura +``` + +On Ubuntu, it is + +``` +sudo apt install libonig-dev +``` + +To build enry with Onigumura regexps, patch the imports with + +``` +make onigumura +``` + +nnd then rebuild the project. + Examples ------------ From db0e1e0d71f94363b97d3933f89e66c57c4e6f7d Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 25 Oct 2017 11:11:14 +0200 Subject: [PATCH 5/7] Fix readme flaws Signed-off-by: Vadim Markovtsev --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8903731..da6b70d 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,12 @@ To build enry's CLI you must run this will generate a binary in the project's root directory called `enry`. You can then move this binary to anywhere in your `PATH`. -### Faster regexp engine +### Faster regexp engine (optional) [Onigumura](https://github.com/kkos/oniguruma) is CRuby's regular expression engine. -It is very fast and performs better than the one built into Go runtime. -The typical overall speedup is 1.5-2x. However, it requires CGo and the external shared library. +It is very fast and performs better than the one built into Go runtime. *enry* supports swapping +between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project. +The typical overall speedup from using Onigumura is 1.5-2x. However, it requires CGo and the external shared library. On macOS with brew, it is ``` @@ -42,7 +43,7 @@ To build enry with Onigumura regexps, patch the imports with make onigumura ``` -nnd then rebuild the project. +and then rebuild the project. Examples ------------ From 8eb17ebd111cc2f58650406c03c6329a2b83c97c Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 25 Oct 2017 12:30:25 +0200 Subject: [PATCH 6/7] Improve the Makefile Signed-off-by: Vadim Markovtsev --- Makefile | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 2c435c7..4d6eeb8 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,12 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib HEADER_FILE=libenry.h NATIVE_LIB=./shared/enry.go +# source files to be patched for using "rubex" instead of "regexp" +RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go +RUBEX_ORIG := $(RUBEX_PATCHED:=.orig) + +.PHONY: revert-onigumura + $(LINGUIST_PATH): git clone https://github.com/github/linguist.git $@ @@ -63,15 +69,14 @@ benchmarks-slow: $(LINGUST_PATH) mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \ benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench -onigumura: - sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' internal/code-generator/generator/heuristics.go - sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' internal/tokenizer/tokenize.go - sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' common.go +$(RUBEX_ORIG): %.orig : % + sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $< + @touch $@ + +onigumura: $(RUBEX_ORIG) revert-onigumura: - mv internal/code-generator/generator/heuristics.go.orig internal/code-generator/generator/heuristics.go - mv internal/tokenizer/tokenize.go.orig internal/tokenizer/tokenize.go - mv common.go.orig common.go + @for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done build-cli: go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cli/enry/main.go From b66d54eaec6e732fe0b6d988aa1b8f885462700c Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Thu, 26 Oct 2017 17:17:03 +0200 Subject: [PATCH 7/7] Do not run Travis jobs with ONIGUMURA by default Signed-off-by: Vadim Markovtsev --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 94bd418..0965377 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,8 +15,8 @@ matrix: fast_finish: true env: - - ONIGUMURA=1 - ONIGUMURA=0 + - ONIGUMURA=1 install: - rm -rf $GOPATH/src/gopkg.in/src-d @@ -46,6 +46,8 @@ deploy: tags: true jobs: + env: + - ONIGUMURA=0 include: - stage: test language: scala