mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 08:30:07 -03:00
Refactor Oniguruma integration
Instead of using a command to change imports before the build, use a build tag to generate the correct binary. This allows applications to compile enry with Oniguruma with less trouble. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com>
This commit is contained in:
parent
8da8516ac1
commit
15bb13117f
@ -24,9 +24,7 @@ install:
|
|||||||
- mkdir -p $GOPATH/src/gopkg.in/src-d
|
- mkdir -p $GOPATH/src/gopkg.in/src-d
|
||||||
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
|
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||||
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||||
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
|
- if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
|
||||||
- go get -v -t ./...
|
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- make test-coverage
|
- make test-coverage
|
||||||
|
|
||||||
@ -100,7 +98,7 @@ jobs:
|
|||||||
- sudo apt-get update
|
- sudo apt-get update
|
||||||
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
|
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
|
||||||
- cd ${HOME}
|
- cd ${HOME}
|
||||||
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
|
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
|
||||||
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
|
||||||
|
|
||||||
script:
|
script:
|
||||||
|
15
Makefile
15
Makefile
@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
|
|||||||
HEADER_FILE=libenry.h
|
HEADER_FILE=libenry.h
|
||||||
NATIVE_LIB=./shared/enry.go
|
NATIVE_LIB=./shared/enry.go
|
||||||
|
|
||||||
# source files to be patched for using "rubex" instead of "regexp"
|
|
||||||
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
|
|
||||||
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
|
|
||||||
|
|
||||||
.PHONY: revert-oniguruma
|
|
||||||
|
|
||||||
$(LINGUIST_PATH):
|
$(LINGUIST_PATH):
|
||||||
git clone https://github.com/github/linguist.git $@
|
git clone https://github.com/github/linguist.git $@
|
||||||
|
|
||||||
@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
|
|||||||
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
|
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
|
||||||
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
|
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
|
||||||
|
|
||||||
$(RUBEX_ORIG): %.orig : %
|
|
||||||
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
|
|
||||||
@touch $@
|
|
||||||
|
|
||||||
oniguruma: $(RUBEX_ORIG)
|
|
||||||
|
|
||||||
revert-oniguruma:
|
|
||||||
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
|
|
||||||
|
|
||||||
build-cli:
|
build-cli:
|
||||||
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go
|
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go
|
||||||
|
|
||||||
|
@ -37,10 +37,10 @@ On Ubuntu, it is
|
|||||||
sudo apt install libonig-dev
|
sudo apt install libonig-dev
|
||||||
```
|
```
|
||||||
|
|
||||||
To build enry with Oniguruma regexps, patch the imports with
|
To build enry with Oniguruma regexps use the `oniguruma` build tag
|
||||||
|
|
||||||
```
|
```
|
||||||
make oniguruma
|
go get -v -t --tags oniguruma ./...
|
||||||
```
|
```
|
||||||
|
|
||||||
and then rebuild the project.
|
and then rebuild the project.
|
||||||
@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin
|
|||||||
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
|
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
|
||||||
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
|
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
|
||||||
|
|
||||||
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
|
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
|
||||||
So we update the generated code as needed, without any specific criteria.
|
So we update the generated code as needed, without any specific criteria.
|
||||||
|
|
||||||
If you want to update *enry* because of changes in linguist, you can run the *go
|
If you want to update *enry* because of changes in linguist, you can run the *go
|
||||||
@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run:
|
|||||||
|
|
||||||
benchmarks/run.sh
|
benchmarks/run.sh
|
||||||
|
|
||||||
from the project's root directory, and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have [gnuplot](http://gnuplot.info) installed on your system to get the histogram).
|
from the project's root directory, and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have [gnuplot](http://gnuplot.info) installed on your system to get the histogram).
|
||||||
|
|
||||||
This can take some time, so to run local benchmarks for a quick check you can either:
|
This can take some time, so to run local benchmarks for a quick check you can either:
|
||||||
|
|
||||||
|
14
common.go
14
common.go
@ -4,10 +4,10 @@ import (
|
|||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"gopkg.in/src-d/enry.v1/data"
|
"gopkg.in/src-d/enry.v1/data"
|
||||||
|
"gopkg.in/src-d/enry.v1/regex"
|
||||||
)
|
)
|
||||||
|
|
||||||
// OtherLanguage is used as a zero value when a function cannot return a specific language.
|
// OtherLanguage is used as a zero value when a function cannot return a specific language.
|
||||||
@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
|
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
|
||||||
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
|
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
|
||||||
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
|
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
|
||||||
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
|
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
|
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
|
||||||
@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
|
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
|
||||||
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
|
pythonVersion = regex.MustCompile(`python\d\.\d+`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func getInterpreter(data []byte) (interpreter string) {
|
func getInterpreter(data []byte) (interpreter string) {
|
||||||
|
@ -6,10 +6,11 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"regexp"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"text/template"
|
"text/template"
|
||||||
|
|
||||||
|
"gopkg.in/src-d/enry.v1/regex"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
|
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
|
||||||
@ -38,7 +39,7 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
disambLine = regexp.MustCompile(`^(\s*)disambiguate`)
|
disambLine = regex.MustCompile(`^(\s*)disambiguate`)
|
||||||
definedRegs = make(map[string]string)
|
definedRegs = make(map[string]string)
|
||||||
illegalCharacter = map[string]string{
|
illegalCharacter = map[string]string{
|
||||||
"#": "Sharp",
|
"#": "Sharp",
|
||||||
@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
|
|||||||
func includeToRegExp(include string) string {
|
func includeToRegExp(include string) string {
|
||||||
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
|
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
|
||||||
content = strings.Trim(content, `"'`)
|
content = strings.Trim(content, `"'`)
|
||||||
return regexp.QuoteMeta(content)
|
return regex.QuoteMeta(content)
|
||||||
}
|
}
|
||||||
|
|
||||||
func getLanguages(line string) []string {
|
func getLanguages(line string) []string {
|
||||||
|
@ -2,7 +2,8 @@ package tokenizer
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"regexp"
|
|
||||||
|
"gopkg.in/src-d/enry.v1/regex"
|
||||||
)
|
)
|
||||||
|
|
||||||
const byteLimit = 100000
|
const byteLimit = 100000
|
||||||
@ -72,20 +73,20 @@ var (
|
|||||||
//
|
//
|
||||||
// These regexps were converted to work in the same way for both engines:
|
// These regexps were converted to work in the same way for both engines:
|
||||||
//
|
//
|
||||||
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
|
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
|
||||||
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
|
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
|
||||||
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
|
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
|
||||||
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||||
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
|
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
|
||||||
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||||
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
|
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
|
||||||
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
|
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
|
||||||
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
|
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
|
||||||
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
|
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
|
||||||
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
|
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
|
||||||
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||||
|
|
||||||
regexToSkip = []*regexp.Regexp{
|
regexToSkip = []regex.EnryRegexp{
|
||||||
// The order must be this
|
// The order must be this
|
||||||
reLiteralStringQuotes,
|
reLiteralStringQuotes,
|
||||||
reMultilineComment,
|
reMultilineComment,
|
||||||
@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
|
|||||||
return tokenShebang
|
return tokenShebang
|
||||||
}
|
}
|
||||||
|
|
||||||
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
|
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
|
||||||
tokens := re.FindAll(content, -1)
|
tokens := re.FindAll(content, -1)
|
||||||
content = re.ReplaceAll(content, []byte(` `))
|
content = re.ReplaceAll(content, []byte(` `))
|
||||||
return content, tokens
|
return content, tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
|
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
|
||||||
return commonExtracAndReplace(content, rePunctuation)
|
return commonExtractAndReplace(content, rePunctuation)
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
|
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
|
||||||
return commonExtracAndReplace(content, reRegularToken)
|
return commonExtractAndReplace(content, reRegularToken)
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
|
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
|
||||||
return commonExtracAndReplace(content, reOperators)
|
return commonExtractAndReplace(content, reOperators)
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
||||||
|
17
regex/oniguruma.go
Normal file
17
regex/oniguruma.go
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
// +build oniguruma
|
||||||
|
|
||||||
|
package regex
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/moovweb/rubex"
|
||||||
|
)
|
||||||
|
|
||||||
|
type EnryRegexp = *rubex.Regexp
|
||||||
|
|
||||||
|
func MustCompile(str string) EnryRegexp {
|
||||||
|
return rubex.MustCompile(str)
|
||||||
|
}
|
||||||
|
|
||||||
|
func QuoteMeta(s string) string {
|
||||||
|
return rubex.QuoteMeta(s)
|
||||||
|
}
|
17
regex/standard.go
Normal file
17
regex/standard.go
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
// +build !oniguruma
|
||||||
|
|
||||||
|
package regex
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
)
|
||||||
|
|
||||||
|
type EnryRegexp = *regexp.Regexp
|
||||||
|
|
||||||
|
func MustCompile(str string) EnryRegexp {
|
||||||
|
return regexp.MustCompile(str)
|
||||||
|
}
|
||||||
|
|
||||||
|
func QuoteMeta(s string) string {
|
||||||
|
return regexp.QuoteMeta(s)
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user