Refactor Oniguruma integration

Instead of using a command to change imports before the build, use a build tag to generate the correct binary.

This will allow applications to compile enry with Oniguruma with less trouble.

Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
This commit is contained in:
Antonio Jesus Navarro Perez 2018-08-28 17:27:18 +02:00 committed by Denys Smirnov
parent 8da8516ac1
commit 15bb13117f
8 changed files with 70 additions and 51 deletions

View File

@ -24,9 +24,7 @@ install:
- mkdir -p $GOPATH/src/gopkg.in/src-d
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
- go get -v -t ./...
- if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
script:
- make test-coverage

View File

@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
HEADER_FILE=libenry.h
NATIVE_LIB=./shared/enry.go
# source files to be patched for using "rubex" instead of "regexp"
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
.PHONY: revert-oniguruma
$(LINGUIST_PATH):
git clone https://github.com/github/linguist.git $@
@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
$(RUBEX_ORIG): %.orig : %
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
@touch $@
oniguruma: $(RUBEX_ORIG)
revert-oniguruma:
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
build-cli:
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go

View File

@ -37,10 +37,10 @@ On Ubuntu, it is
sudo apt install libonig-dev
```
To build enry with Oniguruma regexps, patch the imports with
To build enry with Oniguruma regexps, use the `oniguruma` build tag
```
make oniguruma
go get -v -t --tags oniguruma ./...
```
and then rebuild the project.

View File

@ -4,10 +4,10 @@ import (
"bufio"
"bytes"
"path/filepath"
"regexp"
"strings"
"gopkg.in/src-d/enry.v1/data"
"gopkg.in/src-d/enry.v1/regex"
)
// OtherLanguage is used as a zero value when a function can not return a specific language.
@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
}
var (
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
}
var (
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regex.MustCompile(`python\d\.\d+`)
)
func getInterpreter(data []byte) (interpreter string) {

View File

@ -6,10 +6,11 @@ import (
"fmt"
"io"
"io/ioutil"
"regexp"
"strconv"
"strings"
"text/template"
"gopkg.in/src-d/enry.v1/regex"
)
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
@ -38,7 +39,7 @@ const (
)
var (
disambLine = regexp.MustCompile(`^(\s*)disambiguate`)
disambLine = regex.MustCompile(`^(\s*)disambiguate`)
definedRegs = make(map[string]string)
illegalCharacter = map[string]string{
"#": "Sharp",
@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
func includeToRegExp(include string) string {
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
content = strings.Trim(content, `"'`)
return regexp.QuoteMeta(content)
return regex.QuoteMeta(content)
}
func getLanguages(line string) []string {

View File

@ -2,7 +2,8 @@ package tokenizer
import (
"bytes"
"regexp"
"gopkg.in/src-d/enry.v1/regex"
)
const byteLimit = 100000
@ -72,20 +73,20 @@ var (
//
// These regexps were converted to work in the same way for both engines:
//
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
regexToSkip = []*regexp.Regexp{
regexToSkip = []regex.EnryRegexp{
// The order must be this
reLiteralStringQuotes,
reMultilineComment,
@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
return tokenShebang
}
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
tokens := re.FindAll(content, -1)
content = re.ReplaceAll(content, []byte(` `))
return content, tokens
}
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, rePunctuation)
return commonExtractAndReplace(content, rePunctuation)
}
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reRegularToken)
return commonExtractAndReplace(content, reRegularToken)
}
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reOperators)
return commonExtractAndReplace(content, reOperators)
}
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {

17
regex/oniguruma.go Normal file
View File

@ -0,0 +1,17 @@
// +build oniguruma
package regex
import (
"github.com/moovweb/rubex"
)
// EnryRegexp is the compiled regular-expression type exposed by this package.
// In this file, which is guarded by the oniguruma build tag, it is an alias
// for *rubex.Regexp.
type EnryRegexp = *rubex.Regexp

// MustCompile compiles the pattern str by delegating to rubex.MustCompile and
// returns the resulting expression.
func MustCompile(str string) EnryRegexp {
	compiled := rubex.MustCompile(str)
	return compiled
}

// QuoteMeta returns the result of passing s to rubex.QuoteMeta.
func QuoteMeta(s string) string {
	escaped := rubex.QuoteMeta(s)
	return escaped
}

17
regex/standard.go Normal file
View File

@ -0,0 +1,17 @@
// +build !oniguruma
package regex
import (
"regexp"
)
// EnryRegexp is the compiled regular-expression type exposed by this package.
// In this file, which is built when the oniguruma tag is NOT set, it is an
// alias for the standard library's *regexp.Regexp.
type EnryRegexp = *regexp.Regexp

// MustCompile compiles the pattern str by delegating to regexp.MustCompile and
// returns the resulting expression. Like the standard-library function it
// wraps, it panics when str is not a valid pattern.
func MustCompile(str string) EnryRegexp {
	compiled := regexp.MustCompile(str)
	return compiled
}

// QuoteMeta returns s with every regular-expression metacharacter escaped, as
// produced by regexp.QuoteMeta.
func QuoteMeta(s string) string {
	escaped := regexp.QuoteMeta(s)
	return escaped
}