token: refactor & simplify test fixtures

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
Author: Alexander Bezzubov <bzz@apache.org>
Date: 2019-05-08 22:17:32 +02:00
parent: 9a7b370b17
commit: f3ceaa6330
GPG Key ID: 8039F5787EFCD05D (no known key found for this signature in database)


@@ -127,23 +127,23 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 func TestRegexpOnInvalidUtf8(t *testing.T) {
 	origContent := []struct {
-		bytes  []byte
+		text   string
 		tokens []string
 	}{
-		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `thà filling`
-		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
-		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+		{"th\xe0 filling", []string{"th", "filling"}},   // `thà filling`
+		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
+		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
 	}
 	re := reRegularToken
 	for _, content := range origContent {
 		t.Run("", func(t *testing.T) {
-			t.Logf("%v - %q", content, string(content.bytes))
-			tokens := re.FindAll(content.bytes, -1)
+			t.Logf("%v - %q", content, content.text)
+			input := []byte(content.text)
+			tokens := re.FindAll(input, -1)
 			require.Equal(t, len(content.tokens), len(tokens))
-			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			newContent := re.ReplaceAll(input, []byte(` `))
 			t.Logf("content:%q, tokens:[", newContent)
 			for i, token := range tokens {
 				t.Logf("\t%q,", string(token))