Mirror of https://github.com/ralsina/tartrazine.git (synced 2025-05-24 08:18:52 -03:00)

Commit 6ccf0b6bd1
.travis.yml (18 lines changed)

@@ -1,7 +1,5 @@
 dist: trusty
-
 language: go
-
 go:
   - '1.12.x'
   - '1.11.x'
@@ -10,17 +8,13 @@ env:
     - GO_VERSION_FOR_JVM='1.11.x'
     - CGO_ENABLED=0
     - GO111MODULE=on
+    - ONIGURUMA_VERSION='6.9.1'
   matrix:
     - ONIGURUMA=0
     - ONIGURUMA=1
 matrix:
   fast_finish: true
 
-addons:
-  apt:
-    packages:
-      - libonig-dev
-
 stages:
   - name: test
   - name: release
@@ -32,8 +26,14 @@ stage: test
     install:
       - >
         if [[ "${ONIGURUMA}" -gt 0 ]]; then
-          export CGO_ENABLED=1;
-          export GO_TAGS='oniguruma';
+          export CGO_ENABLED=1
+          export GO_TAGS='oniguruma'
+          # install oniguruma manually as trusty has only ancient 5.x
+          sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627
+          wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
+          sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
+          wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
+          sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
         fi;
     script:
       - make test-coverage
go.mod (2 lines changed)

@@ -3,7 +3,7 @@ module github.com/src-d/enry/v2
 go 1.12
 
 require (
-	github.com/src-d/go-oniguruma v1.0.0
+	github.com/src-d/go-oniguruma v1.1.0
 	github.com/stretchr/testify v1.3.0
 	github.com/toqueteos/trie v1.0.0 // indirect
 	gopkg.in/toqueteos/substring.v1 v1.0.2
go.sum (4 lines changed)

@@ -2,8 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/src-d/go-oniguruma v1.0.0 h1:JDk5PUAjreGsGAKLsoDLNmrsaryjJ5RqT3h+Si6aw/E=
-github.com/src-d/go-oniguruma v1.0.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
+github.com/src-d/go-oniguruma v1.1.0 h1:EG+Nm5n2JqWUaCjtM0NtutPxU7ZN5Tp50GWrrV8bTww=
+github.com/src-d/go-oniguruma v1.1.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
@@ -2,7 +2,7 @@
 
 package tokenizer
 
-import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
+import "github.com/src-d/enry/v2/internal/tokenizer/flex"
 
 // Tokenize returns lexical tokens from content. The tokens returned match what
 // the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
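
The hunk above only updates the flex-based tokenizer's import path to the enry/v2 module. For orientation, a minimal usage sketch of Tokenize from inside the same package (not part of the diff; it assumes the signature Tokenize(content []byte) []string, which is consistent with how the tests in the next hunk call it):

package tokenizer

import "fmt"

// exampleTokenize is an illustrative sketch only: it assumes Tokenize has the
// shape Tokenize(content []byte) []string, matching its use in the tests below.
func exampleTokenize() {
	content := []byte("func main() { return 42 }")
	tokens := Tokenize(content) // at most the first ByteLimit bytes are tokenized
	for i, token := range tokens {
		fmt.Printf("token %d: %q\n", i+1, token)
	}
}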
@@ -115,6 +115,45 @@ func TestTokenize(t *testing.T) {
 	}
 }
 
+func TestTokenizerLatin1AsUtf8(t *testing.T) {
+	content := []byte("th\xe5 filling") // `th� filling`
+	t.Logf("%v - %q", content, string(content))
+	tokens := Tokenize(content)
+	for i, token := range tokens {
+		t.Logf("token %d, %s", i+1, token)
+	}
+	require.Equal(t, 3, len(tokens))
+}
+
+func TestRegexpOnInvalidUtf8(t *testing.T) {
+	origContent := []struct {
+		text   string
+		tokens []string
+	}{
+		{"th\xe0 filling", []string{"th", "filling"}},   // `th� filling`
+		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
+		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
+	}
+	re := reRegularToken
+
+	for _, content := range origContent {
+		t.Run("", func(t *testing.T) {
+			t.Logf("%v - %q", content, content.text)
+			input := []byte(content.text)
+			tokens := re.FindAll(input, -1)
+			require.Equal(t, len(content.tokens), len(tokens))
+
+			newContent := re.ReplaceAll(input, []byte(` `))
+			t.Logf("content:%q, tokens:[", newContent)
+			for i, token := range tokens {
+				t.Logf("\t%q,", string(token))
+				require.Equal(t, content.tokens[i], string(token))
+			}
+			t.Logf(" ]\n")
+		})
+	}
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
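
The two new tests exercise input that is not valid UTF-8 (stray Latin-1 bytes) and non-ASCII text. The exact pattern behind reRegularToken is not shown in this diff, but the behaviour the tests assert can be reproduced with a plain ASCII character class; a standalone sketch using Go's standard regexp package:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// An ASCII-only word pattern, chosen purely for illustration;
	// reRegularToken's real pattern is not part of this diff.
	re := regexp.MustCompile(`[0-9A-Za-z_]+`)

	input := []byte("th\xe0 filling") // 0xE0 is a Latin-1 byte, i.e. invalid UTF-8
	for _, tok := range re.FindAll(input, -1) {
		fmt.Printf("%q\n", tok)
	}
	// Prints "th" and "filling": the invalid byte falls outside the ASCII
	// class, so matching simply skips it instead of breaking.
}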
@@ -9,7 +9,7 @@ import (
 type EnryRegexp = *rubex.Regexp
 
 func MustCompile(str string) EnryRegexp {
-	return rubex.MustCompile(str)
+	return rubex.MustCompileASCII(str)
 }
 
 func QuoteMeta(s string) string {
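
The switch from rubex.MustCompile to rubex.MustCompileASCII is the core of the change: together with the go-oniguruma v1.1.0 bump above, it presumably compiles the patterns with Oniguruma's ASCII encoding, so matching works byte-wise and does not choke on input that is not valid UTF-8 (the case covered by the new tokenizer tests). Given GO_TAGS='oniguruma' in the CI config, this file is presumably only built when that tag is set; a sketch of what the default, pure-Go counterpart likely looks like. The build-tag placement, package name, and function set here are assumptions, not shown in this diff:

// +build !oniguruma

// Hypothetical sketch of the non-oniguruma fallback: without the build tag,
// EnryRegexp would alias the standard library's *regexp.Regexp instead of
// *rubex.Regexp, keeping the same package-level API.
package regex

import "regexp"

type EnryRegexp = *regexp.Regexp

func MustCompile(str string) EnryRegexp {
	return regexp.MustCompile(str)
}

func QuoteMeta(s string) string {
	return regexp.QuoteMeta(s)
}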