address review feedback

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
Alexander Bezzubov 2019-04-14 22:15:18 +02:00
parent 7929933eb5
commit ada6f15c93
No known key found for this signature in database
GPG Key ID: 8039F5787EFCD05D
5 changed files with 22 additions and 19 deletions

View File

@@ -3,4 +3,5 @@
 // be imported by other packages.
 package tokenizer
-const byteLimit = 100000
+// ByteLimit defines the maximum prefix of an input text that will be tokenized.
+const ByteLimit = 100000
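
Exporting the limit lets other code in the module reason about the truncation up front (the package sits under internal/, so only enry itself can import it). A minimal sketch, not part of this commit; the package and helper name are hypothetical:

package sample

import (
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer"
)

// reportTruncation is a hypothetical helper: it tells the caller when Tokenize
// will only see a prefix of the input, then tokenizes as usual.
func reportTruncation(data []byte) []string {
	if len(data) > tokenizer.ByteLimit {
		fmt.Printf("only the first %d of %d bytes will be tokenized\n", tokenizer.ByteLimit, len(data))
	}
	return tokenizer.Tokenize(data)
}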

View File

@@ -9,8 +9,7 @@ package flex
 import "C"
 import "unsafe"
-const maxTokenLen = 32
+const maxTokenLen = 32 // bytes
 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
 // This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
@@ -28,25 +27,24 @@ func TokenizeFlex(content []byte) []string {
 	C.linguist_yylex_init_extra(&extra, &scanner)
 	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
 	ary := []string{}
 	for {
 		extra._type = C.NO_ACTION
 		extra.token = nil
 		r = C.linguist_yylex(scanner)
-		switch (extra._type) {
+		switch extra._type {
 		case C.NO_ACTION:
 			break
 		case C.REGULAR_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
 		case C.SHEBANG_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
 			}
@@ -54,7 +52,7 @@ func TokenizeFlex(content []byte) []string {
 			break
 		case C.SGML_TOKEN:
 			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
+			if _len <= maxTokenLen {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
 			}
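
For context, the loop above pulls one token per C.linguist_yylex call; C.GoStringN copies the token bytes into a Go string, so the C buffer can be freed right after the append. A minimal usage sketch, not part of this commit, assuming the cgo/flex-backed build is enabled (this code is guarded by a build tag in the repository):

package sample

import (
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
)

func sampleFlex() {
	// Tokens longer than maxTokenLen (32 bytes) are dropped by TokenizeFlex.
	tokens := flex.TokenizeFlex([]byte("#!/usr/bin/env bash\necho hello"))
	fmt.Println(tokens) // the exact token list depends on the generated Flex scanner
}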

View File

@@ -8,12 +8,12 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned should match what
+// the Linguist library returns (but they do not, until https://github.com/src-d/enry/issues/193 is resolved).
+// At most the first ByteLimit bytes of content are tokenized.
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 	// Copy the input so that changes wrought by the tokenization steps do not
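
The trailing context line refers to Tokenize working on its own copy of the input, so the caller's slice is left untouched. A small sketch (not part of this commit; the function name is hypothetical, and it must live inside the enry module since the package is internal) illustrating both that and the ByteLimit truncation:

package sample

import (
	"bytes"
	"fmt"

	"gopkg.in/src-d/enry.v1/internal/tokenizer"
)

func sampleTokenize() {
	src := bytes.Repeat([]byte("word "), 30000) // 150000 bytes, past ByteLimit
	before := string(src)

	tokens := tokenizer.Tokenize(src)

	fmt.Println(len(tokens) > 0)       // true: tokens come from the first ByteLimit bytes only
	fmt.Println(string(src) == before) // true: Tokenize does not mutate the caller's slice
}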

View File

@@ -4,12 +4,13 @@ package tokenizer
 import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+// Splitting at a byte offset means it might split the last multibyte Unicode character
+// in the middle of a token (but this should not affect results).
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}
 	return flex.TokenizeFlex(content)
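
The new doc comment notes that cutting at a byte offset can land inside a multi-byte UTF-8 sequence. A standalone sketch (not part of this commit) of that situation, and of how one could back up to a rune boundary if it ever mattered:

package sample

import (
	"fmt"
	"unicode/utf8"
)

func sampleSplitRune() {
	s := []byte("héllo") // 'é' occupies two bytes in UTF-8
	cut := s[:2]         // a hard byte cut through the middle of 'é'
	fmt.Println(utf8.Valid(cut)) // false: the last rune was split

	// Trim trailing bytes until what remains ends on a complete rune.
	for len(cut) > 0 {
		if r, size := utf8.DecodeLastRune(cut); r == utf8.RuneError && size <= 1 {
			cut = cut[:len(cut)-1]
			continue
		}
		break
	}
	fmt.Println(utf8.Valid(cut)) // true: only whole runes remain
}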

View File

@@ -119,7 +119,10 @@ func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for _, test := range Tests {
-			test.content = append([]byte(nil), test.content...)
+			if len(test.content) > ByteLimit {
+				test.content = test.content[:ByteLimit]
+			}
+			_ = append([]byte(nil), test.content...)
 		}
 	}
 }
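
With the ByteLimit guard added, the baseline benchmark mirrors the truncate-and-copy preamble that Tokenize itself performs, so its numbers can be subtracted from a full-tokenization benchmark to isolate the tokenizer's own cost. A hypothetical companion benchmark (name and placement assumed; it would sit in the same test package next to BenchmarkTokenizer_BaselineCopy) might look like:

func BenchmarkTokenizer_FullTokenize(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range Tests {
			_ = Tokenize(test.content)
		}
	}
}

Both can then be run with something like go test -bench=Tokenizer -benchmem on the tokenizer package.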