mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00
Refactor Oniguruma integration
Instead of using a command to change imports before the build, use a build tag to generate the correct binary. This will allow applications to compile enry with Oniguruma with less trouble. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
This commit is contained in:
parent
8da8516ac1
commit
15bb13117f
@ -24,9 +24,7 @@ install:
|
||||
- mkdir -p $GOPATH/src/gopkg.in/src-d
|
||||
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
|
||||
- go get -v -t ./...
|
||||
|
||||
- if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
|
||||
script:
|
||||
- make test-coverage
|
||||
|
||||
@ -100,7 +98,7 @@ jobs:
|
||||
- sudo apt-get update
|
||||
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
|
||||
- cd ${HOME}
|
||||
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
|
||||
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
|
||||
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||
|
||||
script:
|
||||
|
15
Makefile
15
Makefile
@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
|
||||
HEADER_FILE=libenry.h
|
||||
NATIVE_LIB=./shared/enry.go
|
||||
|
||||
# source files to be patched for using "rubex" instead of "regexp"
|
||||
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
|
||||
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
|
||||
|
||||
.PHONY: revert-oniguruma
|
||||
|
||||
$(LINGUIST_PATH):
|
||||
git clone https://github.com/github/linguist.git $@
|
||||
|
||||
@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
|
||||
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
|
||||
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
|
||||
|
||||
$(RUBEX_ORIG): %.orig : %
|
||||
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
|
||||
@touch $@
|
||||
|
||||
oniguruma: $(RUBEX_ORIG)
|
||||
|
||||
revert-oniguruma:
|
||||
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
|
||||
|
||||
build-cli:
|
||||
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go
|
||||
|
||||
|
@ -37,10 +37,10 @@ On Ubuntu, it is
|
||||
sudo apt install libonig-dev
|
||||
```
|
||||
|
||||
To build enry with Oniguruma regexps, patch the imports with
|
||||
To build enry with Oniguruma regexps, use the `oniguruma` build tag:
|
||||
|
||||
```
|
||||
make oniguruma
|
||||
go get -v -t --tags oniguruma ./...
|
||||
```
|
||||
|
||||
and then rebuild the project.
|
||||
@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin
|
||||
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
|
||||
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
|
||||
|
||||
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
|
||||
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
|
||||
So we update the generated code as needed, without any specific criteria.
|
||||
|
||||
If you want to update *enry* because of changes in linguist, you can run the *go
|
||||
@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run:
|
||||
|
||||
benchmarks/run.sh
|
||||
|
||||
from the project's root directory, and it'll run benchmarks for enry and linguist, parse the output, create CSV files, and create a histogram (you must have [gnuplot](http://gnuplot.info) installed on your system to get the histogram).
|
||||
from the project's root directory, and it'll run benchmarks for enry and linguist, parse the output, create CSV files, and create a histogram (you must have [gnuplot](http://gnuplot.info) installed on your system to get the histogram).
|
||||
|
||||
This can take some time, so to run local benchmarks for a quick check you can either:
|
||||
|
||||
|
14
common.go
14
common.go
@ -4,10 +4,10 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/src-d/enry.v1/data"
|
||||
"gopkg.in/src-d/enry.v1/regex"
|
||||
)
|
||||
|
||||
// OtherLanguage is used as a zero value when a function can not return a specific language.
|
||||
@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
|
||||
}
|
||||
|
||||
var (
|
||||
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
|
||||
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
|
||||
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
|
||||
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
|
||||
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
|
||||
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
|
||||
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
|
||||
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
|
||||
)
|
||||
|
||||
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
|
||||
@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
|
||||
}
|
||||
|
||||
var (
|
||||
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
|
||||
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
|
||||
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
|
||||
pythonVersion = regex.MustCompile(`python\d\.\d+`)
|
||||
)
|
||||
|
||||
func getInterpreter(data []byte) (interpreter string) {
|
||||
|
@ -6,10 +6,11 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"text/template"
|
||||
|
||||
"gopkg.in/src-d/enry.v1/regex"
|
||||
)
|
||||
|
||||
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
|
||||
@ -38,7 +39,7 @@ const (
|
||||
)
|
||||
|
||||
var (
|
||||
disambLine = regexp.MustCompile(`^(\s*)disambiguate`)
|
||||
disambLine = regex.MustCompile(`^(\s*)disambiguate`)
|
||||
definedRegs = make(map[string]string)
|
||||
illegalCharacter = map[string]string{
|
||||
"#": "Sharp",
|
||||
@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
|
||||
func includeToRegExp(include string) string {
|
||||
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
|
||||
content = strings.Trim(content, `"'`)
|
||||
return regexp.QuoteMeta(content)
|
||||
return regex.QuoteMeta(content)
|
||||
}
|
||||
|
||||
func getLanguages(line string) []string {
|
||||
|
@ -2,7 +2,8 @@ package tokenizer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
|
||||
"gopkg.in/src-d/enry.v1/regex"
|
||||
)
|
||||
|
||||
const byteLimit = 100000
|
||||
@ -72,20 +73,20 @@ var (
|
||||
//
|
||||
// These regexps were converted to work in the same way for both engines:
|
||||
//
|
||||
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
|
||||
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
|
||||
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
|
||||
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
|
||||
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
|
||||
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
|
||||
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
|
||||
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
|
||||
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
|
||||
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
|
||||
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
|
||||
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
|
||||
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
|
||||
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
|
||||
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
|
||||
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
|
||||
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
|
||||
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
|
||||
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||
|
||||
regexToSkip = []*regexp.Regexp{
|
||||
regexToSkip = []regex.EnryRegexp{
|
||||
// The order must be this
|
||||
reLiteralStringQuotes,
|
||||
reMultilineComment,
|
||||
@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
|
||||
return tokenShebang
|
||||
}
|
||||
|
||||
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
|
||||
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
|
||||
tokens := re.FindAll(content, -1)
|
||||
content = re.ReplaceAll(content, []byte(` `))
|
||||
return content, tokens
|
||||
}
|
||||
|
||||
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
|
||||
return commonExtracAndReplace(content, rePunctuation)
|
||||
return commonExtractAndReplace(content, rePunctuation)
|
||||
}
|
||||
|
||||
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
|
||||
return commonExtracAndReplace(content, reRegularToken)
|
||||
return commonExtractAndReplace(content, reRegularToken)
|
||||
}
|
||||
|
||||
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
|
||||
return commonExtracAndReplace(content, reOperators)
|
||||
return commonExtractAndReplace(content, reOperators)
|
||||
}
|
||||
|
||||
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
||||
|
17
regex/oniguruma.go
Normal file
17
regex/oniguruma.go
Normal file
@ -0,0 +1,17 @@
|
||||
// +build oniguruma
|
||||
|
||||
package regex
|
||||
|
||||
import (
|
||||
"github.com/moovweb/rubex"
|
||||
)
|
||||
|
||||
type EnryRegexp = *rubex.Regexp
|
||||
|
||||
func MustCompile(str string) EnryRegexp {
|
||||
return rubex.MustCompile(str)
|
||||
}
|
||||
|
||||
func QuoteMeta(s string) string {
|
||||
return rubex.QuoteMeta(s)
|
||||
}
|
17
regex/standard.go
Normal file
17
regex/standard.go
Normal file
@ -0,0 +1,17 @@
|
||||
// +build !oniguruma
|
||||
|
||||
package regex
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
)
|
||||
|
||||
type EnryRegexp = *regexp.Regexp
|
||||
|
||||
func MustCompile(str string) EnryRegexp {
|
||||
return regexp.MustCompile(str)
|
||||
}
|
||||
|
||||
func QuoteMeta(s string) string {
|
||||
return regexp.QuoteMeta(s)
|
||||
}
|
Loading…
Reference in New Issue
Block a user