diff --git a/.travis.yml b/.travis.yml index 7e9c164..0965377 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,16 +4,26 @@ go: - 1.8 - tip +addons: + apt: + packages: + - libonig-dev + matrix: allow_failures: - go: tip fast_finish: true +env: + - ONIGUMURA=0 + - ONIGUMURA=1 + install: - rm -rf $GOPATH/src/gopkg.in/src-d - mkdir -p $GOPATH/src/gopkg.in/src-d - ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1 - cd $GOPATH/src/gopkg.in/src-d/enry.v1 + - if [ "$ONIGUMURA" == "1" ]; then make onigumura; fi - go get -v -t ./... script: @@ -36,6 +46,8 @@ deploy: tags: true jobs: + env: + - ONIGUMURA=0 include: - stage: test language: scala diff --git a/Makefile b/Makefile index 724a142..4d6eeb8 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,12 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib HEADER_FILE=libenry.h NATIVE_LIB=./shared/enry.go +# source files to be patched for using "rubex" instead of "regexp" +RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go +RUBEX_ORIG := $(RUBEX_PATCHED:=.orig) + +.PHONY: revert-onigumura + $(LINGUIST_PATH): git clone https://github.com/github/linguist.git $@ @@ -63,6 +69,15 @@ benchmarks-slow: $(LINGUST_PATH) mkdir -p benchmarks/output && go test -run=NONE -bench=. 
-slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \ benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench +$(RUBEX_ORIG): %.orig : % + sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $< + @touch $@ + +onigumura: $(RUBEX_ORIG) + +revert-onigumura: + @for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done + build-cli: go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cli/enry/main.go diff --git a/README.md b/README.md index ef38305..da6b70d 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,32 @@ To build enry's CLI you must run this will generate a binary in the project's root directory called `enry`. You can then move this binary to anywhere in your `PATH`. +### Faster regexp engine (optional) + +[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine. +It is very fast and performs better than the one built into the Go runtime. *enry* supports swapping +between those two engines thanks to the [rubex](https://github.com/moovweb/rubex) project. +The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the Oniguruma shared library. +On macOS with brew, install it with + +``` +brew install oniguruma +``` + +On Ubuntu, install it with + +``` +sudo apt install libonig-dev +``` + +To build enry with Oniguruma regexps, patch the imports with + +``` +make onigumura +``` + +and then rebuild the project. 
+ Examples ------------ diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go index 307ebb9..18f04d8 100644 --- a/internal/tokenizer/tokenize.go +++ b/internal/tokenizer/tokenize.go @@ -43,17 +43,46 @@ var ( extractRemainders, } - reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) - reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) - reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/||\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) + // Differences between golang regexp and onigumura: + // 1. no (?s) in onigumura - makes dot match \n + // 2. no (?U) in onigumura - ungreedy * + // 3. (?m) implies dot matches \n in onigumura + // 4. onigumura handles \w differently - impossible, but true + // + // Workarounds: + // 1. (.|\n) + // 2. replace * with *? + // 3. replace . with [^\n] + // 4. replace \w with [0-9A-Za-z_] + // + // Original golang regexps: + // + // reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) + // reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) + // reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/||\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) + // reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) + // reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) + // rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) + // reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) + // reSGMLComment = regexp.MustCompile(`(?sU)()`) + // reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) + // reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) + // reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) + // reOperators = regexp.MustCompile(`<|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) - 
reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) + reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) - reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) - reSGMLComment = regexp.MustCompile(`(?sU)()`) - reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) - reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) - reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) + reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) + reSGMLComment = regexp.MustCompile(`()`) + reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) + reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`) + reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) reOperators = regexp.MustCompile(`<