token: refactor & simplify test fixtures

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
This commit is contained in:
Alexander Bezzubov 2019-05-08 22:17:32 +02:00
parent 9a7b370b17
commit f3ceaa6330
No known key found for this signature in database
GPG Key ID: 8039F5787EFCD05D

View File

@@ -127,23 +127,23 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
func TestRegexpOnInvalidUtf8(t *testing.T) {
origContent := []struct {
bytes []byte
text string
tokens []string
}{
{[]byte("th\xe0 filling"), []string{"th", "filling"}}, // `th\xe0 filling` (0xE0 is an invalid UTF-8 byte)
{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
{[]byte("привет, как дела?"), []string{}}, // empty, no ASCII tokens
{"th\xe0 filling", []string{"th", "filling"}}, // `th\xe0 filling` (0xE0 is an invalid UTF-8 byte)
{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
{"привет, как дела?", []string{}}, // empty, no ASCII tokens
}
re := reRegularToken
for _, content := range origContent {
t.Run("", func(t *testing.T) {
t.Logf("%v - %q", content, string(content.bytes))
tokens := re.FindAll(content.bytes, -1)
t.Logf("%v - %q", content, content.text)
input := []byte(content.text)
tokens := re.FindAll(input, -1)
require.Equal(t, len(content.tokens), len(tokens))
newContent := re.ReplaceAll(content.bytes, []byte(` `))
newContent := re.ReplaceAll(input, []byte(` `))
t.Logf("content:%q, tokens:[", newContent)
for i, token := range tokens {
t.Logf("\t%q,", string(token))