diff --git a/internal/tokenizer/common.go b/internal/tokenizer/common.go
index 6cae666..1c47ee3 100644
--- a/internal/tokenizer/common.go
+++ b/internal/tokenizer/common.go
@@ -3,4 +3,5 @@
 // be imported by other packages.
 package tokenizer
 
-const byteLimit = 100000
+// ByteLimit defines the maximum size, in bytes, of the input prefix that will be tokenized.
+const ByteLimit = 100000
diff --git a/internal/tokenizer/flex/tokenize_c.go b/internal/tokenizer/flex/tokenize_c.go
index 0c78ebe..411a9a5 100644
--- a/internal/tokenizer/flex/tokenize_c.go
+++ b/internal/tokenizer/flex/tokenize_c.go
@@ -9,8 +9,7 @@ package flex
 import "C"
 import "unsafe"
 
-const maxTokenLen = 32
-
+const maxTokenLen = 32 // bytes
 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
 // This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
 func TokenizeFlex(content []byte) []string {
@@ -28,25 +27,24 @@ func TokenizeFlex(content []byte) []string {
 
 	C.linguist_yylex_init_extra(&extra, &scanner)
 	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
-
 	ary := []string{}
 	for {
 		extra._type = C.NO_ACTION
 		extra.token = nil
 		r = C.linguist_yylex(scanner)
-		switch (extra._type) {
+		switch extra._type {
 		case C.NO_ACTION:
 			break
 		case C.REGULAR_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
 		case C.SHEBANG_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
 			}
@@ -54,7 +52,7 @@ func TokenizeFlex(content []byte) []string {
 			break
 		case C.SGML_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
 			}
diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go
index de30f36..f4d5575 100644
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -8,12 +8,12 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
 
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned should match what
+// the Linguist library returns (they do not yet; see https://github.com/src-d/enry/issues/193).
+// At most the first ByteLimit bytes of content are tokenized.
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 
 	// Copy the input so that changes wrought by the tokenization steps do not
diff --git a/internal/tokenizer/tokenize_c.go b/internal/tokenizer/tokenize_c.go
index be4d023..3ebf2a5 100644
--- a/internal/tokenizer/tokenize_c.go
+++ b/internal/tokenizer/tokenize_c.go
@@ -4,12 +4,13 @@ package tokenizer
 
 import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
 
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+// Splitting at a byte offset may cut the last multibyte Unicode character in the
+// middle of a token (but this should not affect the results).
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 
 	return flex.TokenizeFlex(content)
diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 9307cfd..ace9f79 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -119,7 +119,10 @@ func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for _, test := range Tests {
-			test.content = append([]byte(nil), test.content...)
+			if len(test.content) > ByteLimit {
+				test.content = test.content[:ByteLimit]
+			}
+			_ = append([]byte(nil), test.content...)
 		}
 	}
 }
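
Note on the truncation caveat documented above: a minimal standalone sketch (not part of the change set; the tiny limit and sample string are illustrative only) showing how slicing at a byte offset can split a trailing multibyte UTF-8 character:

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// A deliberately tiny limit so the effect is visible; the real tokenizer
// uses ByteLimit = 100000.
const byteLimit = 3

func main() {
	content := []byte("naïve") // "ï" is a two-byte UTF-8 sequence
	if len(content) > byteLimit {
		content = content[:byteLimit] // may cut the last rune in half
	}
	// The slice now ends mid-rune; per the doc comment above, only the token
	// containing the partial character is affected, not the overall results.
	fmt.Printf("truncated: %q, valid UTF-8: %v\n", content, utf8.Valid(content))
	// Output: truncated: "na\xc3", valid UTF-8: false
}
```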