Refactor Oniguruma integration

Instead of using a command to change imports before the build, use a build tag to generate the correct binary.

This will allow applications to compile enry using oniguruma with less trouble.

Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
This commit is contained in:
Antonio Jesus Navarro Perez 2018-08-28 17:27:18 +02:00 committed by Denys Smirnov
parent 8da8516ac1
commit 15bb13117f
8 changed files with 70 additions and 51 deletions

View File

@ -24,9 +24,7 @@ install:
- mkdir -p $GOPATH/src/gopkg.in/src-d
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
- go get -v -t ./...
- if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
script:
- make test-coverage
@ -100,7 +98,7 @@ jobs:
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
- cd ${HOME}
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
script:

View File

@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
HEADER_FILE=libenry.h
NATIVE_LIB=./shared/enry.go
# source files to be patched for using "rubex" instead of "regexp"
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
.PHONY: revert-oniguruma
$(LINGUIST_PATH):
git clone https://github.com/github/linguist.git $@
@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
$(RUBEX_ORIG): %.orig : %
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
@touch $@
oniguruma: $(RUBEX_ORIG)
revert-oniguruma:
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
build-cli:
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go

View File

@ -37,10 +37,10 @@ On Ubuntu, it is
sudo apt install libonig-dev
```
To build enry with Oniguruma regexps, patch the imports with
To build enry with Oniguruma regexps, use the `oniguruma` build tag:
```
make oniguruma
go get -v -t --tags oniguruma ./...
```
and then rebuild the project.
@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
So we update the generated code as needed, without any specific criteria.
If you want to update *enry* because of changes in linguist, you can run the *go
@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run:
benchmarks/run.sh
from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).
from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).
This can take some time, so to run local benchmarks for a quick check you can either:

View File

@ -4,10 +4,10 @@ import (
"bufio"
"bytes"
"path/filepath"
"regexp"
"strings"
"gopkg.in/src-d/enry.v1/data"
"gopkg.in/src-d/enry.v1/regex"
)
// OtherLanguage is used as a zero value when a function can not return a specific language.
@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
}
var (
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
}
var (
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regex.MustCompile(`python\d\.\d+`)
)
func getInterpreter(data []byte) (interpreter string) {

View File

@ -6,10 +6,11 @@ import (
"fmt"
"io"
"io/ioutil"
"regexp"
"strconv"
"strings"
"text/template"
"gopkg.in/src-d/enry.v1/regex"
)
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
@ -38,7 +39,7 @@ const (
)
var (
disambLine = regexp.MustCompile(`^(\s*)disambiguate`)
disambLine = regex.MustCompile(`^(\s*)disambiguate`)
definedRegs = make(map[string]string)
illegalCharacter = map[string]string{
"#": "Sharp",
@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
func includeToRegExp(include string) string {
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
content = strings.Trim(content, `"'`)
return regexp.QuoteMeta(content)
return regex.QuoteMeta(content)
}
func getLanguages(line string) []string {

View File

@ -2,7 +2,8 @@ package tokenizer
import (
"bytes"
"regexp"
"gopkg.in/src-d/enry.v1/regex"
)
const byteLimit = 100000
@ -72,20 +73,20 @@ var (
//
// These regexps were converted to work in the same way for both engines:
//
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
regexToSkip = []*regexp.Regexp{
regexToSkip = []regex.EnryRegexp{
// The order must be this
reLiteralStringQuotes,
reMultilineComment,
@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
return tokenShebang
}
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
tokens := re.FindAll(content, -1)
content = re.ReplaceAll(content, []byte(` `))
return content, tokens
}
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, rePunctuation)
return commonExtractAndReplace(content, rePunctuation)
}
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reRegularToken)
return commonExtractAndReplace(content, reRegularToken)
}
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reOperators)
return commonExtractAndReplace(content, reOperators)
}
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {

17
regex/oniguruma.go Normal file
View File

@ -0,0 +1,17 @@
// +build oniguruma
package regex
import (
"github.com/moovweb/rubex"
)
// EnryRegexp is the regular-expression type used by enry in builds with the
// oniguruma tag; it is an alias for *rubex.Regexp.
type EnryRegexp = *rubex.Regexp
// MustCompile compiles str by delegating to rubex.MustCompile and returns the
// result as an EnryRegexp.
// NOTE(review): presumably panics on an invalid pattern, like the standard
// library's regexp.MustCompile — confirm against rubex.
func MustCompile(str string) EnryRegexp {
return rubex.MustCompile(str)
}
// QuoteMeta returns the result of rubex.QuoteMeta for s.
// NOTE(review): presumably escapes regular-expression metacharacters in s,
// mirroring the standard library's regexp.QuoteMeta — confirm against rubex.
func QuoteMeta(s string) string {
return rubex.QuoteMeta(s)
}

17
regex/standard.go Normal file
View File

@ -0,0 +1,17 @@
// +build !oniguruma
package regex
import (
"regexp"
)
// EnryRegexp is the regular-expression type used by enry in builds without the
// oniguruma tag; it is an alias for the standard library's *regexp.Regexp.
type EnryRegexp = *regexp.Regexp
// MustCompile compiles str with the standard library's regexp.MustCompile and
// returns the result as an EnryRegexp. It panics if str is not a valid pattern.
func MustCompile(str string) EnryRegexp {
return regexp.MustCompile(str)
}
// QuoteMeta returns s with all regular-expression metacharacters escaped, as
// done by the standard library's regexp.QuoteMeta.
func QuoteMeta(s string) string {
return regexp.QuoteMeta(s)
}