Refactor Oniguruma integration

Instead of using a command to rewrite imports before the build, use a build tag to generate the correct binary. This allows applications to compile enry with Oniguruma with less trouble. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
2025-09-16 18:37:33 +00:00 · 2018-08-28 17:27:18 +02:00
parent 8da8516ac1
commit 15bb13117f
8 changed files with 70 additions and 51 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,9 +24,7 @@ install:
  - mkdir -p $GOPATH/src/gopkg.in/src-d
  - ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
  - cd $GOPATH/src/gopkg.in/src-d/enry.v1
-  - if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
+  - if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
  - go get -v -t ./...
 script:
  - make test-coverage
--- a/Makefile
+++ b/Makefile
@@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
 HEADER_FILE=libenry.h
 NATIVE_LIB=./shared/enry.go
 # source files to be patched for using "rubex" instead of "regexp"
 RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
 RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
 .PHONY: revert-oniguruma
 $(LINGUIST_PATH):
 	git clone https://github.com/github/linguist.git $@
@@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
 	mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
 	benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
 $(RUBEX_ORIG): %.orig : %
 	sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
 	@touch $@
 oniguruma: $(RUBEX_ORIG)
 revert-oniguruma:
 	@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
 build-cli:
 	go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go
--- a/README.md
+++ b/README.md
@@ -37,10 +37,10 @@ On Ubuntu, it is
 sudo apt install libonig-dev
 ```
-To build enry with Oniguruma regexps, patch the imports with
+To build enry with Oniguruma regexps use the `oniguruma` build tag
 ```
-make oniguruma
+go get -v -t --tags oniguruma ./...
 ```
 and then rebuild the project.
--- a/common.go
+++ b/common.go
@@ -4,10 +4,10 @@ import (
 	"bufio"
 	"bytes"
 	"path/filepath"
 	"regexp"
 	"strings"
 	"gopkg.in/src-d/enry.v1/data"
 	"gopkg.in/src-d/enry.v1/regex"
 )
 // OtherLanguage is used as a zero value when a function can not return a specific language.
@@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
 }
 var (
-	reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
+	reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
-	reEmacsLang     = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
+	reEmacsLang     = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
-	reVimModeline   = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
+	reVimModeline   = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
-	reVimLang       = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
+	reVimLang       = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
 )
 // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
@@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
 }
 var (
-	shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
+	shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
-	pythonVersion   = regexp.MustCompile(`python\d\.\d+`)
+	pythonVersion   = regex.MustCompile(`python\d\.\d+`)
 )
 func getInterpreter(data []byte) (interpreter string) {
--- a/internal/code-generator/generator/heuristics.go
+++ b/internal/code-generator/generator/heuristics.go
@@ -6,10 +6,11 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
 	"regexp"
 	"strconv"
 	"strings"
 	"text/template"
 	"gopkg.in/src-d/enry.v1/regex"
 )
 // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
@@ -38,7 +39,7 @@ const (
 )
 var (
-	disambLine       = regexp.MustCompile(`^(\s*)disambiguate`)
+	disambLine       = regex.MustCompile(`^(\s*)disambiguate`)
 	definedRegs      = make(map[string]string)
 	illegalCharacter = map[string]string{
 		"#": "Sharp",
@@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
 func includeToRegExp(include string) string {
 	content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
 	content = strings.Trim(content, `"'`)
-	return regexp.QuoteMeta(content)
+	return regex.QuoteMeta(content)
 }
 func getLanguages(line string) []string {
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -2,7 +2,8 @@ package tokenizer
 import (
 	"bytes"
-	"regexp"
+
 	"gopkg.in/src-d/enry.v1/regex"
 )
 const byteLimit = 100000
@@ -72,20 +73,20 @@ var (
 	//
 	// These regexps were converted to work in the same way for both engines:
 	//
-	reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
+	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
-	reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
+	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
-	reMultilineComment    = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
+	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
-	reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
+	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
-	reShebang             = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
+	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
-	rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
+	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
-	reSGML                = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
+	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
-	reSGMLComment         = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
+	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
-	reSGMLAttributes      = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
+	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
-	reSGMLLoneAttribute   = regexp.MustCompile(`([0-9A-Za-z_]+)`)
+	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
-	reRegularToken        = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
+	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
-	reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
+	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
-	regexToSkip = []*regexp.Regexp{
+	regexToSkip = []regex.EnryRegexp{
 		// The order must be this
 		reLiteralStringQuotes,
 		reMultilineComment,
@@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
 	return tokenShebang
 }
-func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
+func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
 	tokens := re.FindAll(content, -1)
 	content = re.ReplaceAll(content, []byte(` `))
 	return content, tokens
 }
 func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
-	return commonExtracAndReplace(content, rePunctuation)
+	return commonExtractAndReplace(content, rePunctuation)
 }
 func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
-	return commonExtracAndReplace(content, reRegularToken)
+	return commonExtractAndReplace(content, reRegularToken)
 }
 func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
-	return commonExtracAndReplace(content, reOperators)
+	return commonExtractAndReplace(content, reOperators)
 }
 func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -0,0 +1,17 @@
 // +build oniguruma
 
 package regex
 import (
 	"github.com/moovweb/rubex"
 )
 // EnryRegexp is the compiled regular expression type of this package; in this
 // file it is an alias for *rubex.Regexp.
 type EnryRegexp = *rubex.Regexp
 
 // MustCompile compiles str by delegating to rubex.MustCompile and returns the
 // result as an EnryRegexp.
 func MustCompile(str string) EnryRegexp {
 	compiled := rubex.MustCompile(str)
 	return compiled
 }
 
 // QuoteMeta returns the result of rubex.QuoteMeta applied to s.
 func QuoteMeta(s string) string {
 	escaped := rubex.QuoteMeta(s)
 	return escaped
 }
--- a/regex/standard.go
+++ b/regex/standard.go
@@ -0,0 +1,17 @@
 // +build !oniguruma
 
 package regex
 import (
 	"regexp"
 )
 // EnryRegexp is the compiled regular expression type of this package; in this
 // file it is an alias for the standard library's *regexp.Regexp.
 type EnryRegexp = *regexp.Regexp
 
 // MustCompile compiles str by delegating to regexp.MustCompile, which panics
 // if the expression cannot be parsed, and returns the result as an EnryRegexp.
 func MustCompile(str string) EnryRegexp {
 	compiled := regexp.MustCompile(str)
 	return compiled
 }
 
 // QuoteMeta returns s with all regular expression metacharacters escaped, as
 // produced by regexp.QuoteMeta.
 func QuoteMeta(s string) string {
 	escaped := regexp.QuoteMeta(s)
 	return escaped
 }