address review feedback

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
Alexander Bezzubov 2019-04-14 22:15:18 +02:00
parent 7929933eb5
commit ada6f15c93
No known key found for this signature in database
GPG Key ID: 8039F5787EFCD05D
5 changed files with 22 additions and 19 deletions

View File

@@ -3,4 +3,5 @@
 // be imported by other packages.
 package tokenizer
-const byteLimit = 100000
+// ByteLimit defines the maximum prefix of an input text that will be tokenized.
+const ByteLimit = 100000
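
Exporting the limit lets other code in the module reason about the truncation up front (the package sits under internal/, so only enry itself can import it). A minimal sketch, not part of this commit; the package and helper name are hypothetical:

package sample

import (
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer"
)

// reportTruncation is a hypothetical helper: it tells the caller when Tokenize
// will only see a prefix of the input, then tokenizes as usual.
func reportTruncation(data []byte) []string {
	if len(data) > tokenizer.ByteLimit {
		fmt.Printf("only the first %d of %d bytes will be tokenized\n", tokenizer.ByteLimit, len(data))
	}
	return tokenizer.Tokenize(data)
}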

View File

@@ -9,8 +9,7 @@ package flex
 import "C"
 import "unsafe"
-const maxTokenLen = 32
+const maxTokenLen = 32 // bytes
 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
 // This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
@@ -28,25 +27,24 @@ func TokenizeFlex(content []byte) []string {
 	C.linguist_yylex_init_extra(&extra, &scanner)
 	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
 	ary := []string{}
 	for {
 		extra._type = C.NO_ACTION
 		extra.token = nil
 		r = C.linguist_yylex(scanner)
-		switch (extra._type) {
+		switch extra._type {
 		case C.NO_ACTION:
 			break
 		case C.REGULAR_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
 		case C.SHEBANG_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
 			}
@@ -54,7 +52,7 @@ func TokenizeFlex(content []byte) []string {
 			break
 		case C.SGML_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
 			}
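
For context, the loop above pulls one token per C.linguist_yylex call; C.GoStringN copies the token bytes into a Go string, so the C buffer can be freed right after the append. A minimal usage sketch, not part of this commit, assuming the cgo/flex-backed build is enabled (this code is guarded by a build tag in the repository):

package sample

import (
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
)

func sampleFlex() {
	// Tokens longer than maxTokenLen (32 bytes) are dropped by TokenizeFlex.
	tokens := flex.TokenizeFlex([]byte("#!/usr/bin/env bash\necho hello"))
	fmt.Println(tokens) // the exact token list depends on the generated Flex scanner
}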

View File

@@ -8,12 +8,12 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned should match what
+// the Linguist library returns (but they do not, until https://github.com/src-d/enry/issues/193 is resolved).
+// At most the first ByteLimit bytes of content are tokenized.
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 	// Copy the input so that changes wrought by the tokenization steps do not
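
The trailing context line refers to Tokenize working on its own copy of the input, so the caller's slice is left untouched. A small sketch (not part of this commit; the function name is hypothetical, and it must live inside the enry module since the package is internal) illustrating both that and the ByteLimit truncation:

package sample

import (
	"bytes"
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer"
)

func sampleTokenize() {
	src := bytes.Repeat([]byte("word "), 30000) // 150000 bytes, past ByteLimit
	before := string(src)

	tokens := tokenizer.Tokenize(src)

	fmt.Println(len(tokens) > 0)       // true: tokens come from the first ByteLimit bytes only
	fmt.Println(string(src) == before) // true: Tokenize does not mutate the caller's slice
}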

View File

@@ -4,12 +4,13 @@ package tokenizer
 import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+// Splitting at a byte offset means it might split the last multibyte Unicode character
+// in the middle of a token (but this should not affect results).
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 	return flex.TokenizeFlex(content)
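
The new doc comment notes that cutting at a byte offset can land inside a multi-byte UTF-8 sequence. A standalone sketch (not part of this commit) of that situation, and of how one could back up to a rune boundary if it ever mattered:

package sample

import (
	"fmt"
	"unicode/utf8"
)

func sampleSplitRune() {
	s := []byte("héllo") // 'é' occupies two bytes in UTF-8
	cut := s[:2]         // a hard byte cut through the middle of 'é'
	fmt.Println(utf8.Valid(cut)) // false: the last rune was split

	// Trim trailing bytes until what remains ends on a complete rune.
	for len(cut) > 0 {
		if r, size := utf8.DecodeLastRune(cut); r == utf8.RuneError && size <= 1 {
			cut = cut[:len(cut)-1]
			continue
		}
		break
	}
	fmt.Println(utf8.Valid(cut)) // true: only whole runes remain
}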

View File

@@ -119,7 +119,10 @@ func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for _, test := range Tests {
-			test.content = append([]byte(nil), test.content...)
+			if len(test.content) > ByteLimit {
+				test.content = test.content[:ByteLimit]
+			}
+			_ = append([]byte(nil), test.content...)
 		}
 	}
 }
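
With the ByteLimit guard added, the baseline benchmark mirrors the truncate-and-copy preamble that Tokenize itself performs, so its numbers can be subtracted from a full-tokenization benchmark to isolate the tokenizer's own cost. A hypothetical companion benchmark (name and placement assumed; it would sit in the same test package next to BenchmarkTokenizer_BaselineCopy) might look like:

func BenchmarkTokenizer_FullTokenize(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range Tests {
			_ = Tokenize(test.content)
		}
	}
}

Both can then be run with something like go test -bench=Tokenizer -benchmem on the tokenizer package.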