refactor to build tags

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
Alexander Bezzubov 2019-03-24 18:55:05 +01:00
parent 553399ed76
commit 8756fbdcb4
No known key found for this signature in database
GPG Key ID: 8039F5787EFCD05D
10 changed files with 141 additions and 114 deletions

.gitignore vendored
View File

@@ -8,3 +8,4 @@ Makefile.main
build/
vendor/
java/lib/
.vscode/

View File

@@ -0,0 +1,6 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
package tokenizer
const byteLimit = 100000
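
For orientation, a minimal caller of this package could look as follows. This is a sketch, not part of the commit: the package is internal, so it is only importable from inside the enry module, and it relies on the exported Tokenize function shown further down in this change set.

    package main

    import (
        "fmt"
        "io/ioutil"
        "log"

        "gopkg.in/src-d/enry.v1/internal/tokenizer"
    )

    func main() {
        // Read any source file; Tokenize itself truncates input to the
        // first byteLimit (100KB) bytes.
        content, err := ioutil.ReadFile("main.go")
        if err != nil {
            log.Fatal(err)
        }
        for _, tok := range tokenizer.Tokenize(content) {
            fmt.Println(tok)
        }
    }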

View File

@@ -0,0 +1,91 @@
package flex
// #include <stdlib.h>
// #include "linguist.h"
// #include "lex.linguist_yy.h"
// int linguist_yywrap(yyscan_t yyscanner) {
// return 1;
// }
import "C"
import "unsafe"
// TokenizeC is only calling a C-flex based tokenizer from linguist
func TokenizeC(content []byte) []string {
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
// C.tokenizer_extract_tokens((*C.char)(cs))
return nil
}
const maxTokenLen = 32
// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
func TokenizeFlex(content []byte) []string {
var buf C.YY_BUFFER_STATE
var scanner C.yyscan_t
var extra C.struct_tokenizer_extra
// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
var _len C.ulong
var r C.int
_len = C.ulong(len(content))
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
C.linguist_yylex_init_extra(&extra, &scanner)
buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
ary := []string{}
for {
extra._type = C.NO_ACTION
extra.token = nil
r = C.linguist_yylex(scanner)
switch (extra._type) {
case C.NO_ACTION:
break
case C.REGULAR_TOKEN:
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
//rb_ary_push(ary, rb_str_new(extra.token, len))
}
C.free(unsafe.Pointer(extra.token))
break
case C.SHEBANG_TOKEN:
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
ary = append(ary, s)
//s = rb_str_new2("SHEBANG#!");
//rb_str_cat(s, extra.token, len);
//rb_ary_push(ary, s);
}
C.free(unsafe.Pointer(extra.token))
break
case C.SGML_TOKEN:
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
ary = append(ary, s)
//s = rb_str_new(extra.token, len);
//rb_str_cat2(s, ">");
//rb_ary_push(ary, s);
}
C.free(unsafe.Pointer(extra.token))
break
}
if r == 0 {
break
}
}
C.linguist_yy_delete_buffer(buf, scanner)
C.linguist_yylex_destroy(scanner)
// C.free(unsafe.Pointer(extra))
// C.free(unsafe.Pointer(scanner))
return ary
}
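
The C.CBytes / defer C.free pairing above is the usual cgo idiom for handing a Go byte slice to C code. Note that C.CBytes does not append a NUL terminator, which is why TokenizeFlex passes the length separately to linguist_yy_scan_bytes. A standalone illustration of that memory handling (a hypothetical example, not part of this commit):

    package main

    /*
    #include <stdlib.h>
    */
    import "C"

    import (
        "fmt"
        "unsafe"
    )

    func main() {
        content := []byte("#!/bin/sh\necho hi\n")

        // Copy the Go slice into C-allocated memory; the Go side stays
        // responsible for releasing it, hence the deferred C.free.
        cs := C.CBytes(content)
        defer C.free(unsafe.Pointer(cs))

        // Round-trip back into Go memory to show the copy semantics.
        back := C.GoBytes(cs, C.int(len(content)))
        fmt.Println(string(back) == string(content)) // true
    }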

View File

@@ -0,0 +1,25 @@
package flex
// import (
// "testing"
// "gopkg.in/src-d/enry.v1/internal/tokenizer"
// )
// func BenchmarkTokenizerC(b *testing.B) {
// b.ReportAllocs()
// for i := 0; i < b.N; i++ {
// for _, test := range tokenizer.Tests {
// TokenizeC(test.content)
// }
// }
// }
// func BenchmarkTokenizerFlex(b *testing.B) {
// b.ReportAllocs()
// for i := 0; i < b.N; i++ {
// for _, test := range tokenizer.Tests {
// TokenizeFlex(test.content)
// }
// }
// }
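
A working variant of these benchmarks would need its own corpus: tokenizer.Tests lives in another package's test file and its fields are unexported, which is presumably why the code above is commented out. A self-contained sketch with a hypothetical local corpus:

    package flex

    import "testing"

    // benchCorpus is a small hypothetical stand-in for the tokenizer
    // package's test table, which is not reachable from here.
    var benchCorpus = [][]byte{
        []byte("#!/usr/bin/env bash\necho hello\n"),
        []byte("package main\n\nfunc main() {}\n"),
    }

    func BenchmarkTokenizerFlex(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            for _, content := range benchCorpus {
                TokenizeFlex(content)
            }
        }
    }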

View File

@@ -1,6 +1,5 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
// +build !flex
package tokenizer
import (
@@ -9,8 +8,6 @@ import (
"gopkg.in/src-d/enry.v1/regex"
)
const byteLimit = 100000
// Tokenize returns language-agnostic lexical tokens from content. The tokens
// returned should match what the Linguist library returns. At most the first
// 100KB of content are tokenized.

View File

@@ -1,91 +1,16 @@
// +build flex
package tokenizer
// #include <stdlib.h>
// #include "linguist.h"
// #include "lex.linguist_yy.h"
// int linguist_yywrap(yyscan_t yyscanner) {
// return 1;
// }
import "C"
import "unsafe"
import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
// TokenizeC is only calling a C-flex based tokenizer from linguist
func TokenizeC(content []byte) []string {
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
// C.tokenizer_extract_tokens((*C.char)(cs))
return nil
}
const maxTokenLen = 32
// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
func TokenizeFlex(content []byte) []string {
var buf C.YY_BUFFER_STATE
var scanner C.yyscan_t
var extra C.struct_tokenizer_extra
// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
var _len C.ulong
var r C.int
_len = C.ulong(len(content))
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
C.linguist_yylex_init_extra(&extra, &scanner)
buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
ary := []string{}
for {
extra._type = C.NO_ACTION
extra.token = nil
r = C.linguist_yylex(scanner)
switch (extra._type) {
case C.NO_ACTION:
break
case C.REGULAR_TOKEN:
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
//rb_ary_push(ary, rb_str_new(extra.token, len))
}
C.free(unsafe.Pointer(extra.token))
break
case C.SHEBANG_TOKEN:
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
ary = append(ary, s)
//s = rb_str_new2("SHEBANG#!");
//rb_str_cat(s, extra.token, len);
//rb_ary_push(ary, s);
}
C.free(unsafe.Pointer(extra.token))
break
case C.SGML_TOKEN:
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
ary = append(ary, s)
//s = rb_str_new(extra.token, len);
//rb_str_cat2(s, ">");
//rb_ary_push(ary, s);
}
C.free(unsafe.Pointer(extra.token))
break
}
if r == 0 {
break
}
// Tokenize returns language-agnostic lexical tokens from content. The tokens
// returned should match what the Linguist library returns. At most the first
// 100KB of content are tokenized.
func Tokenize(content []byte) []string {
if len(content) > byteLimit {
content = content[:byteLimit]
}
C.linguist_yy_delete_buffer(buf, scanner)
C.linguist_yylex_destroy(scanner)
// C.free(unsafe.Pointer(extra))
// C.free(unsafe.Pointer(scanner))
return ary
return flex.TokenizeFlex(content)
}
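
Taken together with the previous file, this is the build-tag split the commit title refers to: the pure-Go implementation is guarded by "// +build !flex" and compiled by default, while this variant delegates to the cgo-backed flex package and is guarded by "// +build flex", so it is only compiled when the tag is supplied (for example, go test -tags flex ./...). Exactly one definition of Tokenize exists in any given build. A minimal illustration of the same pattern, with hypothetical file, package, and tag names:

    // greet_default.go: compiled when the "fast" tag is absent.

    // +build !fast

    package greet

    func Greeting() string { return "hello (pure-Go path)" }

and its counterpart:

    // greet_fast.go: compiled only with `go build -tags fast`.

    // +build fast

    package greet

    func Greeting() string { return "hello (accelerated path)" }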

View File

@@ -91,7 +91,7 @@ var (
"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
tests = []struct {
Tests = []struct {
name string
content []byte
expected []string
@@ -101,10 +101,10 @@
)
func TestTokenize(t *testing.T) {
for _, test := range tests {
for _, test := range Tests {
t.Run(test.name, func(t *testing.T) {
before := string(test.content)
tokens := TokenizeFlex(test.content)
tokens := Tokenize(test.content)
after := string(test.content)
require.Equal(t, before, after, "the input slice was modified")
require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
@@ -118,39 +118,21 @@ func TestTokenize(t *testing.T) {
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
for _, test := range Tests {
test.content = append([]byte(nil), test.content...)
}
}
}
func BenchmarkTokenizerGo(b *testing.B) {
func BenchmarkTokenizer(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
for _, test := range Tests {
Tokenize(test.content)
}
}
}
func BenchmarkTokenizerC(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
TokenizeC(test.content)
}
}
}
func BenchmarkTokenizerFlex(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
TokenizeFlex(test.content)
}
}
}
//TODO(bzz): introduce tokenizer benchmark suit
// baseline - just read the files
// RE2
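
One possible shape for the baseline entry of that suite, reading sample files from disk without tokenizing them. This is a sketch only: the samplesDir path and the extra imports (io/ioutil, os, path/filepath) are assumptions, not part of this commit.

    const samplesDir = ".linguist/samples"

    // BenchmarkTokenizer_BaselineReadFiles reads the sample files and
    // discards them, isolating I/O cost from the tokenizers above.
    func BenchmarkTokenizer_BaselineReadFiles(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            err := filepath.Walk(samplesDir, func(path string, info os.FileInfo, err error) error {
                if err != nil || info.IsDir() {
                    return err
                }
                _, err = ioutil.ReadFile(path)
                return err
            })
            if err != nil {
                b.Fatal(err)
            }
        }
    }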