Refactor Oniguruma integration

Instead of using a command to rewrite imports before the build, use a build tag to generate the correct binary. This allows applications to compile enry with Oniguruma with less trouble. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
2025-09-16 10:27:34 +00:00 · 2018-08-28 17:27:18 +02:00
parent 8da8516ac1
commit 15bb13117f
8 changed files with 70 additions and 51 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,9 +24,7 @@ install:
  - mkdir -p $GOPATH/src/gopkg.in/src-d
  - ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
  - cd $GOPATH/src/gopkg.in/src-d/enry.v1
-  - if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
-  - go get -v -t ./...
-
+  - if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
 script:
  - make test-coverage

@@ -100,7 +98,7 @@ jobs:
        - sudo apt-get update
        - sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
        - cd ${HOME}
-        - curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf - 
+        - curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
        - cd $GOPATH/src/gopkg.in/src-d/enry.v1

      script:
--- a/Makefile
+++ b/Makefile
@@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
 HEADER_FILE=libenry.h
 NATIVE_LIB=./shared/enry.go

-# source files to be patched for using "rubex" instead of "regexp"
-RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
-RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
-
-.PHONY: revert-oniguruma
-
 $(LINGUIST_PATH):
 	git clone https://github.com/github/linguist.git $@

@@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
 	mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
 	benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench

-$(RUBEX_ORIG): %.orig : %
-	sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
-	@touch $@
-
-oniguruma: $(RUBEX_ORIG)
-
-revert-oniguruma:
-	@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
-
 build-cli:
 	go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go

--- a/README.md
+++ b/README.md
@@ -37,10 +37,10 @@ On Ubuntu, it is
 sudo apt install libonig-dev
 ```

-To build enry with Oniguruma regexps, patch the imports with
+To build enry with Oniguruma regexps, use the `oniguruma` build tag

 ```
-make oniguruma
+go get -v -t --tags oniguruma ./...
 ```

 and then rebuild the project.
@@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin
 * [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
 * [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)

-Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code. 
+Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
 So we update the generated code as needed, without any specific criteria.

 If you want to update *enry* because of changes in linguist, you can run the *go
@@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run:

    benchmarks/run.sh

-from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram). 
+from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).

 This can take some time, so to run local benchmarks for a quick check you can either:

--- a/common.go
+++ b/common.go
@@ -4,10 +4,10 @@ import (
 	"bufio"
 	"bytes"
 	"path/filepath"
-	"regexp"
 	"strings"

 	"gopkg.in/src-d/enry.v1/data"
+	"gopkg.in/src-d/enry.v1/regex"
 )

 // OtherLanguage is used as a zero value when a function can not return a specific language.
@@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
 }

 var (
-	reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
-	reEmacsLang     = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
-	reVimModeline   = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
-	reVimLang       = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
+	reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
+	reEmacsLang     = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
+	reVimModeline   = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
+	reVimLang       = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
 )

 // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
@@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
 }

 var (
-	shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
-	pythonVersion   = regexp.MustCompile(`python\d\.\d+`)
+	shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
+	pythonVersion   = regex.MustCompile(`python\d\.\d+`)
 )

 func getInterpreter(data []byte) (interpreter string) {
--- a/internal/code-generator/generator/heuristics.go
+++ b/internal/code-generator/generator/heuristics.go
@@ -6,10 +6,11 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"regexp"
 	"strconv"
 	"strings"
 	"text/template"
+
+	"gopkg.in/src-d/enry.v1/regex"
 )

 // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
@@ -38,7 +39,7 @@ const (
 )

 var (
-	disambLine       = regexp.MustCompile(`^(\s*)disambiguate`)
+	disambLine       = regex.MustCompile(`^(\s*)disambiguate`)
 	definedRegs      = make(map[string]string)
 	illegalCharacter = map[string]string{
 		"#": "Sharp",
@@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
 func includeToRegExp(include string) string {
 	content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
 	content = strings.Trim(content, `"'`)
-	return regexp.QuoteMeta(content)
+	return regex.QuoteMeta(content)
 }

 func getLanguages(line string) []string {
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -2,7 +2,8 @@ package tokenizer

 import (
 	"bytes"
-	"regexp"
+
+	"gopkg.in/src-d/enry.v1/regex"
 )

 const byteLimit = 100000
@@ -72,20 +73,20 @@ var (
 	//
 	// These regexps were converted to work in the same way for both engines:
 	//
-	reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
-	reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
-	reMultilineComment    = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
-	reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
-	reShebang             = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
-	rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
-	reSGML                = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
-	reSGMLComment         = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
-	reSGMLAttributes      = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
-	reSGMLLoneAttribute   = regexp.MustCompile(`([0-9A-Za-z_]+)`)
-	reRegularToken        = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
-	reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
+	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
+	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
+	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
+	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
+	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
+	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
+	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
+	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
+	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
+	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
+	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
+	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

-	regexToSkip = []*regexp.Regexp{
+	regexToSkip = []regex.EnryRegexp{
 		// The order must be this
 		reLiteralStringQuotes,
 		reMultilineComment,
@@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
 	return tokenShebang
 }

-func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
+func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
 	tokens := re.FindAll(content, -1)
 	content = re.ReplaceAll(content, []byte(` `))
 	return content, tokens
 }

 func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
-	return commonExtracAndReplace(content, rePunctuation)
+	return commonExtractAndReplace(content, rePunctuation)
 }

 func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
-	return commonExtracAndReplace(content, reRegularToken)
+	return commonExtractAndReplace(content, reRegularToken)
 }

 func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
-	return commonExtracAndReplace(content, reOperators)
+	return commonExtractAndReplace(content, reOperators)
 }

 func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -0,0 +1,17 @@
+// +build oniguruma
+
+package regex
+
+import (
+	"github.com/moovweb/rubex"
+)
+
+type EnryRegexp = *rubex.Regexp
+
+func MustCompile(str string) EnryRegexp {
+	return rubex.MustCompile(str)
+}
+
+func QuoteMeta(s string) string {
+	return rubex.QuoteMeta(s)
+}
--- a/regex/standard.go
+++ b/regex/standard.go
@@ -0,0 +1,17 @@
+// +build !oniguruma
+
+package regex
+
+import (
+	"regexp"
+)
+
+type EnryRegexp = *regexp.Regexp
+
+func MustCompile(str string) EnryRegexp {
+	return regexp.MustCompile(str)
+}
+
+func QuoteMeta(s string) string {
+	return regexp.QuoteMeta(s)
+}