package tokenizer

import (
	"fmt"
	"testing"

	"github.com/go-enry/go-enry/v2/regex"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
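
// testContent is a synthetic, multi-language fixture: shebang lines,
// pseudo-Go with block and line comments, string and numeric literals,
// and an XHTML document. It exercises the comment, literal, and markup
// handling in Tokenize.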
const (
	testContent = `#!/usr/bin/ruby

#!/usr/bin/env node

aaa

#!/usr/bin/env A=B foo=bar awk -f

#!python

func Tokenize(content []byte) []string {
	splitted := bytes.Fields(content)
	tokens := /* make([]string, 0, len(splitted))
	no comment -- comment
	for _, tokenByte := range splitted {
		token64 := base64.StdEncoding.EncodeToString(tokenByte)
		tokens = append(tokens, token64)
		notcatchasanumber3.5
	}*/
	othercode
	/* testing multiple

	multiline comments*/

	<!-- com
	ment -->
	<!-- comment 2-->
	ppp no comment # comment

	"literal1"

	abb (tokenByte, 0xAF02) | ,3.2L

	'literal2' notcatchasanumber3.5

	5 += number * anotherNumber
	if isTrue && isToo {
		0b00001000 >> 1
	}

	return tokens

	oneBool = 3 <= 2
	varBool = 3<=2>

	#ifndef
	#i'm not a comment if the single line comment symbol is not followed by a white

	PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <title id="hola" class="">This is a XHTML sample file</title>
        <style type="text/css"><![CDATA[
            #example {
                background-color: yellow;
            }
        ]]></style>
    </head>
    <body>
        <div id="example">
            Just a simple <strong>XHTML</strong> test page.
        </div>
    </body>
</html>`
)
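
// tokensFromTestContent is the exact token stream Tokenize is expected to
// produce for testContent: shebang markers first, then markup tokens,
// punctuation, identifiers, and operators, in order.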
var (
	tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "html", "PUBLIC",
		"W3C", "DTD", "XHTML", "1", "0", "Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1",
		"strict", "dtd", "<html>", "xmlns=", "<head>", "<title>", "id=", "class=", "</title>", "<style>", "type=",
		"<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "id=",
		"<strong>", "</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")",
		"{", "}", "(", ")", ";", "#", "/usr/bin/ruby", "#", "/usr/bin/env", "node", "aaa", "#", "/usr/bin/env", "A",
		"B", "foo", "bar", "awk", "f", "#", "python", "func", "Tokenize", "content", "byte", "string", "splitted",
		"bytes.Fields", "content", "tokens", "othercode", "ppp", "no", "comment", "abb", "tokenByte",
		"notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return", "tokens",
		"oneBool", "varBool", "#ifndef", "#i", "m", "not", "a", "comment", "if", "the", "single", "line", "comment",
		"symbol", "is", "not", "followed", "by", "a", "white", "PyErr_SetString", "PyExc_RuntimeError", "This", "is",
		"a", "XHTML", "sample", "file", "Just", "a", "simple", "XHTML", "test", "page.", "-", "|", "+", "&&", "<", "<",
		"!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", "'", ","}

	tests = []struct {
		name     string
		content  []byte
		expected []string
	}{
		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
	}
)
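
// TestTokenize checks that Tokenize does not mutate its input slice, returns
// the expected number of tokens, and yields every expected token in order.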
func TestTokenize(t *testing.T) {
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			before := string(test.content)
			tokens := Tokenize(test.content)
			after := string(test.content)
			require.Equal(t, before, after, "the input slice was modified")
			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token slice length = %v, want %v", len(tokens), len(test.expected)))

			for i, expectedToken := range test.expected {
				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
			}
		})
	}
}
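
// TestTokenizerLatin1AsUtf8 feeds Tokenize bytes that are valid Latin-1 but
// invalid UTF-8 and checks that tokenization still succeeds, yielding three
// tokens rather than failing on the stray 0xE5 byte.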
func TestTokenizerLatin1AsUtf8(t *testing.T) {
	content := []byte("th\xe5 filling") // `th<e5> filling`: 0xE5 is å in Latin-1, invalid UTF-8
	t.Logf("%v - %q", content, string(content))
	tokens := Tokenize(content)
	for i, token := range tokens {
		t.Logf("token %d, %s", i+1, token)
	}
	require.Equal(t, 3, len(tokens))
}
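
// TestRegexpOnInvalidUtf8 runs the token-matching regexp directly over
// invalid UTF-8 and non-ASCII input, checking that FindAll extracts only the
// expected ASCII tokens; the ReplaceAll output is logged for inspection.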
func TestRegexpOnInvalidUtf8(t *testing.T) {
	origContent := []struct {
		text   string
		tokens []string
	}{
		{"th\xe0 filling", []string{"th", "filling"}},   // `th<e0> filling`: invalid UTF-8
		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
	}
	re := regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) // the reRegularToken pattern from tokenizer.go

	for _, content := range origContent {
		t.Run("", func(t *testing.T) {
			t.Logf("%v - %q", content, content.text)
			input := []byte(content.text)
			tokens := re.FindAll(input, -1)
			require.Equal(t, len(content.tokens), len(tokens))

			newContent := re.ReplaceAll(input, []byte(` `))
			t.Logf("content:%q, tokens:[", newContent)
			for i, token := range tokens {
				t.Logf("\t%q,", string(token))
				require.Equal(t, content.tokens[i], string(token))
			}
			t.Logf(" ]\n")
		})
	}
}
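
// BenchmarkTokenizer_BaselineCopy measures only the ByteLimit truncation and
// a copy of each fixture, providing a baseline to compare against
// BenchmarkTokenizer.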
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range tests {
			if len(test.content) > ByteLimit {
				test.content = test.content[:ByteLimit]
			}
			_ = append([]byte(nil), test.content...)
		}
	}
}
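
// BenchmarkTokenizer measures Tokenize over the same fixtures; the delta
// against BenchmarkTokenizer_BaselineCopy isolates the tokenization cost.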
func BenchmarkTokenizer(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range tests {
			Tokenize(test.content)
		}
	}
}