diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 4737c8e..36378ef 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -127,23 +127,23 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 
 func TestRegexpOnInvalidUtf8(t *testing.T) {
 	origContent := []struct {
-		bytes  []byte
+		text   string
 		tokens []string
 	}{
-		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `th� filling`
-		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
-		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+		{"th\xe0 filling", []string{"th", "filling"}},   // `th� filling`
+		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
+		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
	}
	re := reRegularToken
 
	for _, content := range origContent {
		t.Run("", func(t *testing.T) {
-			t.Logf("%v - %q", content, string(content.bytes))
-
-			tokens := re.FindAll(content.bytes, -1)
+			t.Logf("%v - %q", content, content.text)
+			input := []byte(content.text)
+			tokens := re.FindAll(input, -1)
			require.Equal(t, len(content.tokens), len(tokens))
 
-			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			newContent := re.ReplaceAll(input, []byte(` `))
			t.Logf("content:%q, tokens:[", newContent)
			for i, token := range tokens {
				t.Logf("\t%q,", string(token))
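
For context, here is a minimal standalone sketch of the behavior this test pins down: Go's `regexp` package scans raw bytes, so input that is not valid UTF-8 (such as the lone `\xe0` byte) does not cause an error; the invalid byte simply fails to match an ASCII token class and acts as a separator. The `reToken` pattern below is a hypothetical stand-in, not the package's actual `reRegularToken`, which lives in `internal/tokenizer` and may differ.

```go
package main

import (
	"fmt"
	"regexp"
)

// reToken is a hypothetical stand-in for reRegularToken;
// the real pattern may differ.
var reToken = regexp.MustCompile(`[0-9A-Za-z_]+`)

func main() {
	// "\xe0" on its own is not valid UTF-8. The regexp engine still scans
	// the raw bytes: the invalid byte just doesn't match the token class.
	input := []byte("th\xe0 filling")
	for _, tok := range reToken.FindAll(input, -1) {
		fmt.Printf("%q\n", tok) // prints "th", then "filling"
	}

	// ReplaceAll likewise tolerates invalid UTF-8 in its input.
	fmt.Printf("%q\n", reToken.ReplaceAll(input, []byte(` `))) // " \xe0  "
}
```

This is also why switching the fixture field from `[]byte` to `string` is safe here: the test converts back with `[]byte(content.text)` before matching, and Go string literals preserve the raw `\xe0` byte unchanged.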