From 169060e1cd54c8f0e2eccf0c6d9bcd52402f32f4 Mon Sep 17 00:00:00 2001
From: "M. J. Fromberger"
Date: Tue, 29 Jan 2019 10:03:09 -0800
Subject: [PATCH 1/2] Add a test that tokenization does not modify the input.

At present this test fails, since the tokenizer replaces text in shared
slices of the input. A subsequent commit will fix that.

Signed-off-by: M. J. Fromberger
---
 internal/tokenizer/tokenize_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 9736543..34f75db 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -102,7 +102,10 @@ func TestTokenize(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
+			before := string(test.content)
 			tokens := Tokenize(test.content)
+			after := string(test.content)
+			assert.Equal(t, before, after, "the input slice was modified")
 			assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
 			for i, expectedToken := range test.expected {
 				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))

From 7d277b11de2a38fa6a04900fcec3ecd00c49a1c0 Mon Sep 17 00:00:00 2001
From: "M. J. Fromberger"
Date: Tue, 29 Jan 2019 10:12:33 -0800
Subject: [PATCH 2/2] Copy the tokenizer input to avoid modifying the caller's copy.

Addresses #196. Several of the tokenizer's processing steps wind up editing
the source, and we don't want those changes to be observed by the caller,
which may use the source for other purposes afterward.

Signed-off-by: M. J. Fromberger
---
 internal/tokenizer/tokenize.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go
index a836d79..dadbccd 100644
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -13,6 +13,10 @@ func Tokenize(content []byte) []string {
 		content = content[:byteLimit]
 	}
 
+	// Copy the input so that changes wrought by the tokenization steps do not
+	// modify the caller's copy of the input. See #196.
+	content = append([]byte(nil), content...)
+
 	tokens := make([][]byte, 0, 50)
 	for _, extract := range extractTokens {
 		var extractedTokens [][]byte