token: new test case with Unicode replacement

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
This commit is contained in:
Alexander Bezzubov 2019-04-17 19:28:06 +02:00
parent 278eaf1c22
commit 8bdc830833
No known key found for this signature in database
GPG Key ID: 8039F5787EFCD05D

View File

@@ -115,6 +115,13 @@ func TestTokenize(t *testing.T) {
}
}
// TestTokenizerLatin1AsUtf8 passes Tokenize an input containing a Latin-1
// encoded byte that is not valid UTF-8 and requires that exactly three
// tokens come back.
func TestTokenizerLatin1AsUtf8(t *testing.T) {
	const wantTokens = 3

	// 0xe5 is 'å' in Latin-1; followed by a space it is not a valid UTF-8
	// sequence, so %q below prints it as an escaped byte.
	input := []byte("th\xe5 filling")
	t.Logf("%v - %q", input, string(input))

	got := Tokenize(input)
	require.Equal(t, wantTokens, len(got))
}
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {