Refactor Oniguruma integration

Instead of using a command to change imports before the build, use a build tag to generate the correct binary.

This will allow applications to compile enry with Oniguruma with less trouble.

Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
This commit is contained in:
Antonio Jesus Navarro Perez 2018-08-28 17:27:18 +02:00 committed by Denys Smirnov
parent 8da8516ac1
commit 15bb13117f
8 changed files with 70 additions and 51 deletions

View File

@ -24,9 +24,7 @@ install:
- mkdir -p $GOPATH/src/gopkg.in/src-d - mkdir -p $GOPATH/src/gopkg.in/src-d
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1 - ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
- cd $GOPATH/src/gopkg.in/src-d/enry.v1 - cd $GOPATH/src/gopkg.in/src-d/enry.v1
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi - if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
- go get -v -t ./...
script: script:
- make test-coverage - make test-coverage

View File

@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
HEADER_FILE=libenry.h HEADER_FILE=libenry.h
NATIVE_LIB=./shared/enry.go NATIVE_LIB=./shared/enry.go
# source files to be patched for using "rubex" instead of "regexp"
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
.PHONY: revert-oniguruma
$(LINGUIST_PATH): $(LINGUIST_PATH):
git clone https://github.com/github/linguist.git $@ git clone https://github.com/github/linguist.git $@
@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \ mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
$(RUBEX_ORIG): %.orig : %
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
@touch $@
oniguruma: $(RUBEX_ORIG)
revert-oniguruma:
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
build-cli: build-cli:
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go

View File

@ -37,10 +37,10 @@ On Ubuntu, it is
sudo apt install libonig-dev sudo apt install libonig-dev
``` ```
To build enry with Oniguruma regexps, patch the imports with To build enry with Oniguruma regexps use the `oniguruma` build tag
``` ```
make oniguruma go get -v -t --tags oniguruma ./...
``` ```
and then rebuild the project. and then rebuild the project.

View File

@ -4,10 +4,10 @@ import (
"bufio" "bufio"
"bytes" "bytes"
"path/filepath" "path/filepath"
"regexp"
"strings" "strings"
"gopkg.in/src-d/enry.v1/data" "gopkg.in/src-d/enry.v1/data"
"gopkg.in/src-d/enry.v1/regex"
) )
// OtherLanguage is used as a zero value when a function can not return a specific language. // OtherLanguage is used as a zero value when a function can not return a specific language.
@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
} }
var ( var (
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
) )
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content. // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
} }
var ( var (
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`) shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regexp.MustCompile(`python\d\.\d+`) pythonVersion = regex.MustCompile(`python\d\.\d+`)
) )
func getInterpreter(data []byte) (interpreter string) { func getInterpreter(data []byte) (interpreter string) {

View File

@ -6,10 +6,11 @@ import (
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"regexp"
"strconv" "strconv"
"strings" "strings"
"text/template" "text/template"
"gopkg.in/src-d/enry.v1/regex"
) )
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature. // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
@ -38,7 +39,7 @@ const (
) )
var ( var (
disambLine = regexp.MustCompile(`^(\s*)disambiguate`) disambLine = regex.MustCompile(`^(\s*)disambiguate`)
definedRegs = make(map[string]string) definedRegs = make(map[string]string)
illegalCharacter = map[string]string{ illegalCharacter = map[string]string{
"#": "Sharp", "#": "Sharp",
@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
func includeToRegExp(include string) string { func includeToRegExp(include string) string {
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)] content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
content = strings.Trim(content, `"'`) content = strings.Trim(content, `"'`)
return regexp.QuoteMeta(content) return regex.QuoteMeta(content)
} }
func getLanguages(line string) []string { func getLanguages(line string) []string {

View File

@ -2,7 +2,8 @@ package tokenizer
import ( import (
"bytes" "bytes"
"regexp"
"gopkg.in/src-d/enry.v1/regex"
) )
const byteLimit = 100000 const byteLimit = 100000
@ -72,20 +73,20 @@ var (
// //
// These regexps were converted to work in the same way for both engines: // These regexps were converted to work in the same way for both engines:
// //
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`) reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`) reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`) reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`) reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
regexToSkip = []*regexp.Regexp{ regexToSkip = []regex.EnryRegexp{
// The order must be this // The order must be this
reLiteralStringQuotes, reLiteralStringQuotes,
reMultilineComment, reMultilineComment,
@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
return tokenShebang return tokenShebang
} }
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) { func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
tokens := re.FindAll(content, -1) tokens := re.FindAll(content, -1)
content = re.ReplaceAll(content, []byte(` `)) content = re.ReplaceAll(content, []byte(` `))
return content, tokens return content, tokens
} }
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) { func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, rePunctuation) return commonExtractAndReplace(content, rePunctuation)
} }
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) { func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reRegularToken) return commonExtractAndReplace(content, reRegularToken)
} }
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) { func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reOperators) return commonExtractAndReplace(content, reOperators)
} }
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) { func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {

17
regex/oniguruma.go Normal file
View File

@ -0,0 +1,17 @@
// +build oniguruma
package regex
import (
"github.com/moovweb/rubex"
)
// EnryRegexp is the regular-expression type exposed by package regex.
// In this file, which is guarded by the oniguruma build tag, it is an alias
// for *rubex.Regexp.
type EnryRegexp = *rubex.Regexp
// MustCompile compiles the pattern str by delegating to rubex.MustCompile
// and returns the result as an EnryRegexp.
// NOTE(review): presumably panics on an invalid pattern, following the Must
// naming convention — confirm against the rubex documentation.
func MustCompile(str string) EnryRegexp {
return rubex.MustCompile(str)
}
// QuoteMeta returns the result of rubex.QuoteMeta(s).
// NOTE(review): presumably escapes regular-expression metacharacters in s so
// that it matches literally — confirm against the rubex documentation.
func QuoteMeta(s string) string {
return rubex.QuoteMeta(s)
}

17
regex/standard.go Normal file
View File

@ -0,0 +1,17 @@
// +build !oniguruma
package regex
import (
"regexp"
)
// EnryRegexp is the regular-expression type exposed by package regex.
// In this file, which is built when the oniguruma build tag is NOT set, it is
// an alias for the standard library's *regexp.Regexp.
type EnryRegexp = *regexp.Regexp
// MustCompile compiles the pattern str by delegating to the standard
// library's regexp.MustCompile and returns the result as an EnryRegexp.
// Like regexp.MustCompile, it panics if str cannot be compiled, so it is
// meant for package-level pattern variables rather than runtime input.
func MustCompile(str string) EnryRegexp {
return regexp.MustCompile(str)
}
// QuoteMeta returns the result of the standard library's regexp.QuoteMeta(s);
// see the regexp package documentation for the exact escaping rules.
func QuoteMeta(s string) string {
return regexp.QuoteMeta(s)
}