mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 08:30:07 -03:00
Merge pull request #113 from vmarkovtsev/master
Use rubex for faster regular expressions
This commit is contained in:
commit
ce5adee8ab
12
.travis.yml
12
.travis.yml
@ -4,16 +4,26 @@ go:
|
|||||||
- 1.8
|
- 1.8
|
||||||
- tip
|
- tip
|
||||||
|
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
packages:
|
||||||
|
- libonig-dev
|
||||||
|
|
||||||
matrix:
|
matrix:
|
||||||
allow_failures:
|
allow_failures:
|
||||||
- go: tip
|
- go: tip
|
||||||
fast_finish: true
|
fast_finish: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
- ONIGUMURA=0
|
||||||
|
- ONIGUMURA=1
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- rm -rf $GOPATH/src/gopkg.in/src-d
|
- rm -rf $GOPATH/src/gopkg.in/src-d
|
||||||
- mkdir -p $GOPATH/src/gopkg.in/src-d
|
- mkdir -p $GOPATH/src/gopkg.in/src-d
|
||||||
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
|
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||||
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||||
|
- if [ "$ONIGUMURA" == "1" ]; then make onigumura; fi
|
||||||
- go get -v -t ./...
|
- go get -v -t ./...
|
||||||
|
|
||||||
script:
|
script:
|
||||||
@ -36,6 +46,8 @@ deploy:
|
|||||||
tags: true
|
tags: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
env:
|
||||||
|
- ONIGUMURA=0
|
||||||
include:
|
include:
|
||||||
- stage: test
|
- stage: test
|
||||||
language: scala
|
language: scala
|
||||||
|
15
Makefile
15
Makefile
@ -38,6 +38,12 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
|
|||||||
HEADER_FILE=libenry.h
|
HEADER_FILE=libenry.h
|
||||||
NATIVE_LIB=./shared/enry.go
|
NATIVE_LIB=./shared/enry.go
|
||||||
|
|
||||||
|
# source files to be patched for using "rubex" instead of "regexp"
|
||||||
|
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
|
||||||
|
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
|
||||||
|
|
||||||
|
.PHONY: revert-onigumura
|
||||||
|
|
||||||
$(LINGUIST_PATH):
|
$(LINGUIST_PATH):
|
||||||
git clone https://github.com/github/linguist.git $@
|
git clone https://github.com/github/linguist.git $@
|
||||||
|
|
||||||
@ -63,6 +69,15 @@ benchmarks-slow: $(LINGUST_PATH)
|
|||||||
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
|
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
|
||||||
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
|
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
|
||||||
|
|
||||||
|
$(RUBEX_ORIG): %.orig : %
|
||||||
|
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
|
||||||
|
@touch $@
|
||||||
|
|
||||||
|
onigumura: $(RUBEX_ORIG)
|
||||||
|
|
||||||
|
revert-onigumura:
|
||||||
|
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
|
||||||
|
|
||||||
build-cli:
|
build-cli:
|
||||||
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cli/enry/main.go
|
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cli/enry/main.go
|
||||||
|
|
||||||
|
26
README.md
26
README.md
@ -19,6 +19,32 @@ To build enry's CLI you must run
|
|||||||
this will generate a binary in the project's root directory called `enry`. You can then move this binary to anywhere in your `PATH`.
|
this will generate a binary in the project's root directory called `enry`. You can then move this binary to anywhere in your `PATH`.
|
||||||
|
|
||||||
|
|
||||||
|
### Faster regexp engine (optional)
|
||||||
|
|
||||||
|
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine.
|
||||||
|
It is very fast and performs better than the one built into the Go runtime. *enry* supports swapping
|
||||||
|
between those two engines thanks to the [rubex](https://github.com/moovweb/rubex) project.
|
||||||
|
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and an external shared library.
|
||||||
|
On macOS, install the library with Homebrew:
|
||||||
|
|
||||||
|
```
|
||||||
|
brew install oniguruma
|
||||||
|
```
|
||||||
|
|
||||||
|
On Ubuntu, install the library with apt:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo apt install libonig-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
To build enry with Oniguruma regexps, patch the imports with
|
||||||
|
|
||||||
|
```
|
||||||
|
make onigumura
|
||||||
|
```
|
||||||
|
|
||||||
|
and then rebuild the project.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
------------
|
------------
|
||||||
|
|
||||||
|
@ -43,17 +43,46 @@ var (
|
|||||||
extractRemainders,
|
extractRemainders,
|
||||||
}
|
}
|
||||||
|
|
||||||
reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
|
// Differences between golang regexp and oniguruma:
|
||||||
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
|
// 1. oniguruma has no (?s) flag (in golang it makes dot match \n)
|
||||||
reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
|
// 2. oniguruma has no (?U) flag (in golang it makes * ungreedy)
|
||||||
|
// 3. (?m) implies dot matches \n in oniguruma
|
||||||
|
// 4. oniguruma handles \w differently from golang - surprising, but true
|
||||||
|
//
|
||||||
|
// Workarounds:
|
||||||
|
// 1. (.|\n)
|
||||||
|
// 2. replace * with *?
|
||||||
|
// 3. replace . with [^\n]
|
||||||
|
// 4. replace \w with [0-9A-Za-z_]
|
||||||
|
//
|
||||||
|
// Original golang regexps:
|
||||||
|
//
|
||||||
|
// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
|
||||||
|
// reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
|
||||||
|
// reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
|
||||||
|
// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||||
|
// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
|
||||||
|
// rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||||
|
// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
|
||||||
|
// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
|
||||||
|
// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
|
||||||
|
// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
|
||||||
|
// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
|
||||||
|
// reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||||
|
//
|
||||||
|
// These regexps were converted to work in the same way for both engines:
|
||||||
|
//
|
||||||
|
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
|
||||||
|
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
|
||||||
|
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
|
||||||
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||||
reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
|
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
|
||||||
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||||
reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
|
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
|
||||||
reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
|
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
|
||||||
reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
|
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
|
||||||
reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
|
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
|
||||||
reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
|
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
|
||||||
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||||
|
|
||||||
regexToSkip = []*regexp.Regexp{
|
regexToSkip = []*regexp.Regexp{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user