Mirror of https://github.com/ralsina/tartrazine.git, synced 2025-05-23 08:30:07 -03:00

address review feedback

Signed-off-by: Alexander Bezzubov <bzz@apache.org>

parent 7929933eb5
commit ada6f15c93
@@ -3,4 +3,5 @@
 // be imported by other packages.
 package tokenizer
 
-const byteLimit = 100000
+// ByteLimit defines the maximum prefix of an input text that will be tokenized.
+const ByteLimit = 100000
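Exporting the constant lets code outside the tokenizer package apply the same cutoff that Tokenize uses internally. A minimal sketch, assuming a hypothetical helper named truncateToLimit living in the same package (it is not part of this commit):

// truncateToLimit applies the same cutoff Tokenize uses. From another package
// inside the enry module this would read tokenizer.ByteLimit; the package is
// internal, so external modules cannot import it at all.
func truncateToLimit(content []byte) []byte {
	if len(content) > ByteLimit {
		return content[:ByteLimit]
	}
	return content
}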
@@ -9,8 +9,7 @@ package flex
 import "C"
 import "unsafe"
 
-const maxTokenLen = 32
-
+const maxTokenLen = 32 // bytes
 
 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
 // This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
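The "// bytes" clarification matters because, in the hunks below, the cap is compared against C.strlen(extra.token), which counts bytes rather than runes. The same filter expressed in plain Go, using a hypothetical helper name not found in the diff:

// appendIfShort keeps a token only if it fits in maxTokenLen bytes, mirroring the
// C.strlen comparison in TokenizeFlex. len() on a Go string is also a byte count,
// so a token of multibyte characters can be dropped even if it has few runes.
func appendIfShort(ary []string, tok string) []string {
	if len(tok) <= maxTokenLen {
		return append(ary, tok)
	}
	return ary
}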
@@ -28,25 +27,24 @@ func TokenizeFlex(content []byte) []string {
 	C.linguist_yylex_init_extra(&extra, &scanner)
 	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
 
-
 	ary := []string{}
 	for {
 		extra._type = C.NO_ACTION
 		extra.token = nil
 		r = C.linguist_yylex(scanner)
-		switch (extra._type) {
+		switch extra._type {
 		case C.NO_ACTION:
 			break
 		case C.REGULAR_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
 		case C.SHEBANG_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
 			}
@@ -54,7 +52,7 @@ func TokenizeFlex(content []byte) []string {
 			break
 		case C.SGML_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
 			}
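Taken together, the switch collects regular tokens, shebang tokens prefixed with "SHEBANG#!", and SGML tokens suffixed with ">". A hedged usage sketch of the function touched above; it only compiles from inside the enry module (the flex package is internal) and needs the cgo and flex-generated sources in place:

package main

import (
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
)

func main() {
	// The exact tokens returned depend on the flex-generated lexer from linguist.
	tokens := flex.TokenizeFlex([]byte("#!/bin/sh\necho hello\n"))
	fmt.Println(tokens)
}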
@@ -8,12 +8,12 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
 
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned should match what
+// the Linguist library returns (but they are not, until https://github.com/src-d/enry/issues/193).
+// At most the first ByteLimit bytes of content are tokenized.
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 
 	// Copy the input so that changes wrought by the tokenization steps do not
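The hunk's last context line cuts off mid-comment; the copy it refers to is the standard Go byte-slice clone, the same pattern the baseline benchmark at the end of this diff exercises. A sketch of that pattern as a hypothetical helper (the hidden body of the function is an assumption, only the pattern is taken from the diff):

// cloneBytes shows the defensive-copy idiom: appending to a nil slice allocates a
// fresh backing array, so later tokenization steps cannot mutate the caller's slice.
func cloneBytes(content []byte) []byte {
	return append([]byte(nil), content...)
}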
@@ -4,12 +4,13 @@ package tokenizer
 
 import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
 
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+// Splitting at a byte offset means it might partition a last multibyte unicode character
+// in the middle of a token (but it should not affect results).
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 
 	return flex.TokenizeFlex(content)
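The new doc comment acknowledges that cutting at ByteLimit can split a multibyte UTF-8 character. The commit keeps the plain byte cut; purely to illustrate the alternative it rules out, a cut could be walked back to a rune boundary with the standard library (hypothetical helper, not part of the diff):

import "unicode/utf8"

// truncateAtRuneBoundary cuts content to at most limit bytes without ending in the
// middle of a UTF-8 sequence. utf8.RuneStart reports whether a byte can be the
// first byte of an encoded rune, so the loop steps back over continuation bytes.
func truncateAtRuneBoundary(content []byte, limit int) []byte {
	if len(content) <= limit {
		return content
	}
	cut := limit
	for cut > 0 && !utf8.RuneStart(content[cut]) {
		cut--
	}
	return content[:cut]
}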
@@ -119,7 +119,10 @@ func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for _, test := range Tests {
-			test.content = append([]byte(nil), test.content...)
+			if len(test.content) > ByteLimit {
+				test.content = test.content[:ByteLimit]
+			}
+			_ = append([]byte(nil), test.content...)
 		}
 	}
 }
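With this change the baseline benchmark measures, per input, a ByteLimit truncation plus a byte-slice copy and no tokenization, giving the tokenizer benchmarks a pure allocation cost to compare against. Distilled into a standalone function (a sketch; the name baselineCopy and the framing are assumptions, the body follows the diff):

// baselineCopy performs the per-input work the updated benchmark times: truncate
// to ByteLimit, then clone the bytes, but never tokenize.
func baselineCopy(content []byte) []byte {
	if len(content) > ByteLimit {
		content = content[:ByteLimit]
	}
	return append([]byte(nil), content...)
}

Running go test with -bench and -benchmem in the package directory reports this baseline alongside whatever other Benchmark functions the file defines.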