tokenizer: port flex-based C impl from linguist

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
Author: Alexander Bezzubov
Date:   2019-03-24 02:06:19 +01:00
parent ab3c26b46d
commit 553399ed76
5 changed files with 2722 additions and 6 deletions
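The commit title says linguist's flex-based C tokenizer is being ported and called from Go; in the diff below the test starts exercising it through TokenizeFlex. As a rough illustration only, and not the code this commit adds, here is a self-contained sketch of what a cgo wrapper around such a scanner can look like. Everything in it is a hypothetical stand-in: the package layout, the scan_tokens entry point and its whitespace-splitting body, maxTokens, and the lowercase tokenizeFlex name; the real flex-generated scanner exposes a different interface and applies linguist's token rules.

package main

/*
#include <stdlib.h>
#include <string.h>

// Hypothetical stand-in for a flex-generated scanner: splits `buf` (of
// length `len`) on whitespace and writes up to `max` NUL-terminated,
// malloc'ed tokens into `out`, returning how many it found. The real
// lexer would apply linguist's token rules instead of this splitter.
static int scan_tokens(const char *buf, long len, char **out, int max) {
	int n = 0;
	long start = -1;
	for (long i = 0; i <= len && n < max; i++) {
		int boundary = i == len || buf[i] == ' ' || buf[i] == '\t' || buf[i] == '\n';
		if (!boundary && start < 0) {
			start = i;
		} else if (boundary && start >= 0) {
			long tl = i - start;
			out[n] = (char *)malloc(tl + 1);
			memcpy(out[n], buf + start, tl);
			out[n][tl] = '\0';
			n++;
			start = -1;
		}
	}
	return n;
}
*/
import "C"

import (
	"fmt"
	"unsafe"
)

// maxTokens caps how many tokens the C side may return per call.
const maxTokens = 1 << 16

// tokenizeFlex hands the content to the C scanner and copies every returned
// C string into a Go string, freeing the C allocations as it goes.
func tokenizeFlex(content []byte) []string {
	if len(content) == 0 {
		return nil
	}
	cBuf := C.CString(string(content))
	defer C.free(unsafe.Pointer(cBuf))

	out := make([]*C.char, maxTokens) // holds only C pointers, safe to pass to C
	n := int(C.scan_tokens(cBuf, C.long(len(content)), &out[0], C.int(maxTokens)))

	tokens := make([]string, 0, n)
	for i := 0; i < n; i++ {
		tokens = append(tokens, C.GoString(out[i]))
		C.free(unsafe.Pointer(out[i]))
	}
	return tokens
}

func main() {
	fmt.Println(tokenizeFlex([]byte("<html>\n\tid = 42\n</html>")))
}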


@@ -5,6 +5,7 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 const (
@@ -89,27 +90,69 @@ var (
 	"XHTML", "sample", "file", "type", "#example", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.",
 	"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
 	"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
-)
-
-func TestTokenize(t *testing.T) {
-	tests := []struct {
+
+	tests = []struct {
 		name     string
 		content  []byte
 		expected []string
 	}{
 		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
 	}
+)
+
+func TestTokenize(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			before := string(test.content)
-			tokens := Tokenize(test.content)
+			tokens := TokenizeFlex(test.content)
 			after := string(test.content)
-			assert.Equal(t, before, after, "the input slice was modified")
-			assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
+			require.Equal(t, before, after, "the input slice was modified")
+			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
 			for i, expectedToken := range test.expected {
 				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
 			}
 		})
 	}
 }
+
+func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			test.content = append([]byte(nil), test.content...)
+		}
+	}
+}
+
+func BenchmarkTokenizerGo(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			Tokenize(test.content)
+		}
+	}
+}
+
+func BenchmarkTokenizerC(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			TokenizeC(test.content)
+		}
+	}
+}
+
+func BenchmarkTokenizerFlex(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			TokenizeFlex(test.content)
+		}
+	}
+}
+
+// TODO(bzz): introduce a tokenizer benchmark suite:
+//  baseline - just read the files
+//  RE2
+//  oniguruma
+//  cgo to flex-based impl
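One way the suite named in that TODO could be organized is as table-driven sub-benchmarks over every implementation, instead of one Benchmark function per variant. The sketch below is a suggestion only, assuming it lives in the same test file and that all variants share the func([]byte) []string signature seen in the diff; the RE2 and oniguruma entries from the TODO are not part of this commit, so only the three existing implementations are listed.

func BenchmarkTokenizerSuite(b *testing.B) {
	impls := []struct {
		name     string
		tokenize func([]byte) []string
	}{
		{"go", Tokenize},       // pure Go implementation
		{"c", TokenizeC},       // cgo port
		{"flex", TokenizeFlex}, // flex-based scanner via cgo
	}
	for _, impl := range impls {
		b.Run(impl.name, func(b *testing.B) {
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				for _, test := range tests {
					impl.tokenize(test.content)
				}
			}
		})
	}
}

Run with `go test -bench=TokenizerSuite -benchmem ./...` to compare the variants side by side in one report.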