mirror of https://github.com/ralsina/tartrazine.git
synced 2025-06-19 14:43:05 -03:00
tokenizer: port flex-based C impl from linguist

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
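Context for the diff below: TokenizeC and TokenizeFlex reach the ported C scanner through cgo. What follows is a minimal, self-contained sketch of that bridge pattern only. A trivial whitespace scanner stands in for the real flex-generated one, and every name in it (next_token, tokenizeCGoSketch, the tokenizer package name) is illustrative, not taken from this commit.

// Illustrative only; not part of this commit.
package tokenizer

/*
#include <stdlib.h>
#include <string.h>

// Stand-in for the flex-generated scanner: returns the next
// whitespace-separated token as a malloc'd C string, advancing *cursor,
// or NULL when the input is exhausted.
static char *next_token(char **cursor) {
	char *s = *cursor;
	while (*s == ' ' || *s == '\t' || *s == '\n') s++;
	if (*s == '\0') { *cursor = s; return NULL; }
	char *start = s;
	while (*s != '\0' && *s != ' ' && *s != '\t' && *s != '\n') s++;
	size_t n = (size_t)(s - start);
	char *tok = (char *)malloc(n + 1);
	memcpy(tok, start, n);
	tok[n] = '\0';
	*cursor = s;
	return tok;
}
*/
import "C"

import "unsafe"

// tokenizeCGoSketch shows the Go side of the bridge: copy the input into
// C memory, pull out tokens one at a time, convert each back into a Go
// string, and free every C allocation before returning.
func tokenizeCGoSketch(content []byte) []string {
	cstr := C.CString(string(content))
	defer C.free(unsafe.Pointer(cstr))

	var tokens []string
	cursor := cstr
	for {
		tok := C.next_token(&cursor)
		if tok == nil {
			break
		}
		tokens = append(tokens, C.GoString(tok))
		C.free(unsafe.Pointer(tok))
	}
	return tokens
}

The points any such binding has to get right are visible here: the input is copied into C memory, each token is copied back into a Go string, and every C allocation is freed before returning, so the Go slice passed in is never modified, which is exactly what the test below asserts.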
@@ -5,6 +5,7 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 const (
@@ -89,27 +90,69 @@ var (
 	"XHTML", "sample", "file", "type", "#example", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.",
 	"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
 	"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
-)
-
-func TestTokenize(t *testing.T) {
-	tests := []struct {
+	tests = []struct {
 		name     string
 		content  []byte
 		expected []string
 	}{
 		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
 	}
+)
+
+func TestTokenize(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			before := string(test.content)
-			tokens := Tokenize(test.content)
+			tokens := TokenizeFlex(test.content)
 			after := string(test.content)
-			assert.Equal(t, before, after, "the input slice was modified")
-			assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
+			require.Equal(t, before, after, "the input slice was modified")
+			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
 			for i, expectedToken := range test.expected {
 				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
 			}
 		})
 	}
 }
 
+func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			test.content = append([]byte(nil), test.content...)
+		}
+	}
+}
+
+func BenchmarkTokenizerGo(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			Tokenize(test.content)
+		}
+	}
+}
+
+func BenchmarkTokenizerC(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			TokenizeC(test.content)
+		}
+	}
+}
+
+func BenchmarkTokenizerFlex(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, test := range tests {
+			TokenizeFlex(test.content)
+		}
+	}
+}
+
+//TODO(bzz): introduce tokenizer benchmark suit
+// baseline - just read the files
+// RE2
+// oniguruma
+// cgo to flex-based impl
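The four benchmarks added above are copy-paste variants, one per implementation, and the trailing TODO sketches a broader suite (baseline, RE2, oniguruma, cgo to flex). One possible consolidation, assuming only what the test implies, namely that Tokenize, TokenizeC, and TokenizeFlex share the signature func([]byte) []string, is a single table-driven benchmark with sub-benchmarks. This is a sketch, not part of the commit:

package tokenizer // assumed package name; not shown on this page

import "testing"

// BenchmarkTokenizers runs every implementation over the same shared
// `tests` table, so the per-op numbers are directly comparable.
func BenchmarkTokenizers(b *testing.B) {
	impls := []struct {
		name string
		fn   func([]byte) []string
	}{
		{"go", Tokenize},
		{"c", TokenizeC},
		{"flex", TokenizeFlex},
	}
	for _, impl := range impls {
		b.Run(impl.name, func(b *testing.B) {
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				for _, test := range tests {
					impl.fn(test.content)
				}
			}
		})
	}
}

Running go test -run='^$' -bench=BenchmarkTokenizers -benchmem then reports time and allocations for each implementation side by side.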