From 7d277b11de2a38fa6a04900fcec3ecd00c49a1c0 Mon Sep 17 00:00:00 2001
From: "M. J. Fromberger"
Date: Tue, 29 Jan 2019 10:12:33 -0800
Subject: [PATCH] Copy the tokenizer input to avoid modifying the caller's copy.

Addresses #196. Several of the tokenizer's processing steps wind up editing
the source, and we don't want those changes to be observed by the caller,
which may use the source for other purposes afterward.

Signed-off-by: M. J. Fromberger
---
 internal/tokenizer/tokenize.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go
index a836d79..dadbccd 100644
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -13,6 +13,10 @@ func Tokenize(content []byte) []string {
 		content = content[:byteLimit]
 	}
 
+	// Copy the input so that changes wrought by the tokenization steps do not
+	// modify the caller's copy of the input. See #196.
+	content = append([]byte(nil), content...)
+
 	tokens := make([][]byte, 0, 50)
 	for _, extract := range extractTokens {
 		var extractedTokens [][]byte