Merge pull request #197 from creachadair/muckthebits

Prevent tokenization from modifying its input.
M. J. Fromberger 2019-01-29 11:18:06 -08:00 committed by GitHub
commit 260dcfe002
2 changed files with 7 additions and 0 deletions


@@ -13,6 +13,10 @@ func Tokenize(content []byte) []string {
         content = content[:byteLimit]
     }
+    // Copy the input so that changes wrought by the tokenization steps do not
+    // modify the caller's copy of the input. See #196.
+    content = append([]byte(nil), content...)
+
     tokens := make([][]byte, 0, 50)
     for _, extract := range extractTokens {
         var extractedTokens [][]byte
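The heart of the fix is the defensive copy: append([]byte(nil), content...) allocates a fresh backing array for content, so the in-place rewrites performed by the later tokenization steps can no longer reach the caller's buffer. A minimal standalone sketch of the idiom (not part of the commit; sanitize and its lowercasing loop are hypothetical stand-ins for the real tokenization passes):

    package main

    import "fmt"

    // sanitize copies its input before mutating it, mirroring the fix in Tokenize.
    // The ASCII-lowercasing loop is only an illustrative stand-in for the real steps.
    func sanitize(content []byte) []byte {
        content = append([]byte(nil), content...) // fresh backing array, detached from the caller
        for i, b := range content {
            if 'A' <= b && b <= 'Z' {
                content[i] = b + ('a' - 'A') // mutate the copy in place
            }
        }
        return content
    }

    func main() {
        input := []byte("Copyright MIT")
        fmt.Println(string(sanitize(input))) // "copyright mit"
        fmt.Println(string(input))           // still "Copyright MIT": caller's slice untouched
    }

Without the copy, the same in-place writes would show through input, which is exactly the behavior reported in #196.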


@@ -102,7 +102,10 @@ func TestTokenize(t *testing.T) {
     for _, test := range tests {
         t.Run(test.name, func(t *testing.T) {
+            before := string(test.content)
             tokens := Tokenize(test.content)
+            after := string(test.content)
+            assert.Equal(t, before, after, "the input slice was modified")
             assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
             for i, expectedToken := range test.expected {
                 assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
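The regression test works by snapshotting the input before and after the call: converting the byte slice to a string copies the bytes, so the "before" snapshot cannot share memory with the slice that Tokenize receives, and any step that still wrote through the input would make the two snapshots differ. A tiny self-contained illustration of that property (the names and data here are illustrative only, not from the commit):

    package main

    import "fmt"

    func main() {
        content := []byte("MIT License")
        before := string(content) // the conversion copies the bytes
        content[0] = 'X'          // simulate a step that writes through the input
        after := string(content)
        fmt.Println(before == after) // false: the mutation is detected
        fmt.Println(before)          // "MIT License", unaffected by the later write
    }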