address review feedback

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
Author: Alexander Bezzubov
Date: 2019-04-14 22:15:18 +02:00
parent 7929933eb5
commit ada6f15c93
5 changed files with 22 additions and 19 deletions

@@ -3,4 +3,5 @@
 // be imported by other packages.
 package tokenizer
 
-const byteLimit = 100000
+// ByteLimit defines the maximum prefix of an input text that will be tokenized.
+const ByteLimit = 100000

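With the constant exported, other code inside the repository (for instance the benchmark updated at the end of this commit) can apply the same cap before handing content to the tokenizer. A minimal sketch of such a caller, assuming it lives inside the enry module (internal/ packages cannot be imported from outside it); the test name is made up:

package tokenizer_test

import (
	"testing"

	"gopkg.in/src-d/enry.v1/internal/tokenizer"
)

// TestByteLimitCap shows the intended use of the exported limit: truncate the
// input to at most ByteLimit bytes before (or instead of) tokenizing it.
func TestByteLimitCap(t *testing.T) {
	content := make([]byte, 2*tokenizer.ByteLimit) // pretend the input is larger than the limit
	if len(content) > tokenizer.ByteLimit {
		content = content[:tokenizer.ByteLimit]
	}
	if len(content) != tokenizer.ByteLimit {
		t.Fatalf("expected content to be capped at %d bytes", tokenizer.ByteLimit)
	}
}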
@@ -9,8 +9,7 @@ package flex
 import "C"
 import "unsafe"
 
-const maxTokenLen = 32
+const maxTokenLen = 32 // bytes
 
 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
 // This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
@@ -28,25 +27,24 @@ func TokenizeFlex(content []byte) []string {
 	C.linguist_yylex_init_extra(&extra, &scanner)
 	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
 	ary := []string{}
 	for {
 		extra._type = C.NO_ACTION
 		extra.token = nil
 		r = C.linguist_yylex(scanner)
-		switch (extra._type) {
+		switch extra._type {
 		case C.NO_ACTION:
 			break
 		case C.REGULAR_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
 		case C.SHEBANG_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
 			}
@@ -54,7 +52,7 @@ func TokenizeFlex(content []byte) []string {
 			break
 		case C.SGML_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
 			}

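For readers without the cgo/flex build, the Go side of the loop above boils down to three rules: tokens longer than maxTokenLen bytes are dropped rather than truncated, shebang tokens are prefixed with "SHEBANG#!", and SGML tokens get a trailing ">". A rough, dependency-free sketch of just that filtering logic (tokenKind and filterToken are illustrative names, not part of the package):

package main

import "fmt"

const maxTokenLen = 32 // mirrors the constant in the flex tokenizer above

// tokenKind stands in for the REGULAR_TOKEN / SHEBANG_TOKEN / SGML_TOKEN
// actions produced by the generated lexer; it is illustrative only.
type tokenKind int

const (
	regularToken tokenKind = iota
	shebangToken
	sgmlToken
)

// filterToken reproduces the per-token handling of the cgo loop: overlong
// tokens are skipped, the others are decorated according to their kind.
func filterToken(kind tokenKind, tok string) (string, bool) {
	if len(tok) > maxTokenLen {
		return "", false
	}
	switch kind {
	case shebangToken:
		return "SHEBANG#!" + tok, true
	case sgmlToken:
		return tok + ">", true
	default:
		return tok, true
	}
}

func main() {
	if s, ok := filterToken(shebangToken, "sh"); ok {
		fmt.Println(s) // SHEBANG#!sh, the same string the cgo loop appends to ary
	}
	if _, ok := filterToken(regularToken, string(make([]byte, 40))); !ok {
		fmt.Println("a token longer than 32 bytes is dropped")
	}
}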
@@ -8,12 +8,12 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
 
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned should match what
+// the Linguist library returns (but they do not yet; see https://github.com/src-d/enry/issues/193).
+// At most the first ByteLimit bytes of content are tokenized.
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 
 	// Copy the input so that changes wrought by the tokenization steps do not

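A short in-package usage sketch of Tokenize after the rename; it belongs in a _test.go file and works for either build of the package, since both implementations share the same signature (the example name and input are made up):

package tokenizer

import "fmt"

// ExampleTokenize feeds a trivial Go snippet through the tokenizer; the exact
// tokens depend on which implementation (pure Go or flex) was compiled in.
func ExampleTokenize() {
	tokens := Tokenize([]byte("package main\n\nfunc main() {}\n"))
	fmt.Println(len(tokens) > 0)
	// Output: true
}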
@@ -4,12 +4,13 @@ package tokenizer
 import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
 
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+// Splitting at a byte offset means the last multibyte Unicode character may be cut in the
+// middle of a token (but this should not affect the results).
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 
 	return flex.TokenizeFlex(content)

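The caveat in the new comment is easy to reproduce: slicing a []byte at a fixed offset such as ByteLimit can land inside a UTF-8 sequence, leaving an invalid final rune. A self-contained sketch, with the limit shrunk so the effect is visible:

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	const limit = 3            // stand-in for ByteLimit, made tiny for the example
	content := []byte("naïve") // 'ï' takes 2 bytes, so the slice is 6 bytes long
	if len(content) > limit {
		content = content[:limit] // the cut lands in the middle of 'ï'
	}
	fmt.Println(utf8.Valid(content)) // false: the last rune was split
}

The tokenizers still accept such input; at worst the partial token around the cut differs, which is the "should not affect the results" claim above.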
@@ -119,7 +119,10 @@ func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for _, test := range Tests {
-			test.content = append([]byte(nil), test.content...)
+			if len(test.content) > ByteLimit {
+				test.content = test.content[:ByteLimit]
+			}
+			_ = append([]byte(nil), test.content...)
 		}
 	}
 }
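
The reworked baseline now measures the copy of the already-capped content and throws the copy away instead of writing it back into the shared Tests slice. A standalone sketch of the same pattern, with a made-up fixture table and the limit inlined so it runs outside the repo:

package main

import (
	"fmt"
	"testing"
)

const byteLimit = 100000 // same value as tokenizer.ByteLimit above

// tests stands in for the package's Tests fixtures.
var tests = [][]byte{
	[]byte("package main"),
	make([]byte, 2*byteLimit),
}

// baselineCopy mirrors the updated benchmark body: cap each input at the byte
// limit, then copy it, discarding the result rather than storing it back.
func baselineCopy(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, content := range tests {
			if len(content) > byteLimit {
				content = content[:byteLimit]
			}
			_ = append([]byte(nil), content...)
		}
	}
}

func main() {
	fmt.Println(testing.Benchmark(baselineCopy))
}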