From a724a2f8416e6edd6ab1e429fcf5ef452dae6c4d Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 7 May 2019 13:46:36 +0200
Subject: [PATCH] token: test case for regexp + non-valid UTF8

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 32 +++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 36d4585..4737c8e 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -119,9 +119,41 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 	content := []byte("th\xe5 filling") // `th� filling`
 	t.Logf("%v - %q", content, string(content))
 	tokens := Tokenize(content)
+	for i, token := range tokens {
+		t.Logf("token %d, %s", i+1, token)
+	}
 	require.Equal(t, 3, len(tokens))
 }
 
+func TestRegexpOnInvalidUtf8(t *testing.T) {
+	origContent := []struct {
+		bytes  []byte
+		tokens []string
+	}{
+		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `th� filling`
+		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
+		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+	}
+	re := reRegularToken
+
+	for _, content := range origContent {
+		t.Run("", func(t *testing.T) {
+			t.Logf("%v - %q", content, string(content.bytes))
+
+			tokens := re.FindAll(content.bytes, -1)
+			require.Equal(t, len(content.tokens), len(tokens))
+
+			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			t.Logf("content:%q, tokens:[", newContent)
+			for i, token := range tokens {
+				t.Logf("\t%q,", string(token))
+				require.Equal(t, content.tokens[i], string(token))
+			}
+			t.Logf(" ]\n")
+		})
+	}
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
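
Note for reviewers: a minimal, self-contained sketch of the behavior this
test pins down. Go's regexp package matches over raw bytes and does not
panic on invalid UTF-8; an invalid byte simply fails to match an ASCII
character class. The pattern below is a stand-in for illustration only --
the real reRegularToken in internal/tokenizer is unexported and may differ.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Stand-in pattern (assumption): ASCII word characters only.
	re := regexp.MustCompile(`[0-9A-Za-z_]+`)

	// \xe0 is not a valid UTF-8 sequence on its own.
	content := []byte("th\xe0 filling")

	// FindAll works on the byte slice directly; the invalid byte is
	// skipped over, so this prints "th" and "filling".
	for _, tok := range re.FindAll(content, -1) {
		fmt.Printf("%q\n", tok)
	}
}

The same reasoning covers the Cyrillic case in the test: none of its bytes
match the ASCII class, so FindAll returns no tokens at all.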