mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 13:32:24 +00:00
token: refactor & simplify test fixtures
Signed-off-by: Alexander Bezzubov <bzz@apache.org>
This commit is contained in:
parent
9a7b370b17
commit
f3ceaa6330
@ -127,23 +127,23 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
|
||||
|
||||
func TestRegexpOnInvalidUtf8(t *testing.T) {
|
||||
origContent := []struct {
|
||||
bytes []byte
|
||||
text string
|
||||
tokens []string
|
||||
}{
|
||||
{[]byte("th\xe0 filling"), []string{"th", "filling"}}, // `th\xe0 filling`: 0xE0 is `à` in Latin-1 but not valid UTF-8 here
|
||||
{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
|
||||
{[]byte("привет, как дела?"), []string{}}, // empty, no ASCII tokens
|
||||
{"th\xe0 filling", []string{"th", "filling"}}, // `th\xe0 filling`: 0xE0 is `à` in Latin-1 but not valid UTF-8 here
|
||||
{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
|
||||
{"привет, как дела?", []string{}}, // empty, no ASCII tokens
|
||||
}
|
||||
re := reRegularToken
|
||||
|
||||
for _, content := range origContent {
|
||||
t.Run("", func(t *testing.T) {
|
||||
t.Logf("%v - %q", content, string(content.bytes))
|
||||
|
||||
tokens := re.FindAll(content.bytes, -1)
|
||||
t.Logf("%v - %q", content, content.text)
|
||||
input := []byte(content.text)
|
||||
tokens := re.FindAll(input, -1)
|
||||
require.Equal(t, len(content.tokens), len(tokens))
|
||||
|
||||
newContent := re.ReplaceAll(content.bytes, []byte(` `))
|
||||
newContent := re.ReplaceAll(input, []byte(` `))
|
||||
t.Logf("content:%q, tokens:[", newContent)
|
||||
for i, token := range tokens {
|
||||
t.Logf("\t%q,", string(token))
|
||||
|
Loading…
Reference in New Issue
Block a user