Mirror of https://github.com/ralsina/tartrazine.git, synced 2025-07-12 04:09:48 +00:00
refactor to build tags
Signed-off-by: Alexander Bezzubov <bzz@apache.org>
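This refactor leans on Go build constraints: two files in the same package can define the same function under mutually exclusive tags, so the pure-Go tokenizer is compiled by default and the cgo/flex-backed one only when building or testing with -tags flex (for example, go test -tags flex ./internal/tokenizer/...), as the // +build !flex and // +build flex lines in the hunks below show. A minimal sketch of the pattern, with hypothetical file names and stubbed bodies rather than the code from this commit:

    // tokenize_go.go: compiled only when the "flex" tag is NOT set (the default build).
    // +build !flex

    package tokenizer

    // Tokenize stands in for the pure-Go implementation here.
    func Tokenize(content []byte) []string { return nil }

    // tokenize_cgo.go: a separate file, compiled only with -tags flex.
    // +build flex

    package tokenizer

    // Tokenize stands in for the cgo/flex-backed implementation here.
    func Tokenize(content []byte) []string { return nil }

Only one of the two files is part of any given build, so the duplicate Tokenize definitions never conflict.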
.gitignore (vendored, 1 line changed)

@@ -8,3 +8,4 @@ Makefile.main
 build/
 vendor/
 java/lib/
+.vscode/
internal/tokenizer/common.go (new file, 6 lines)

@@ -0,0 +1,6 @@
+// Package tokenizer implements file tokenization used by the enry content
+// classifier. This package is an implementation detail of enry and should not
+// be imported by other packages.
+package tokenizer
+
+const byteLimit = 100000
internal/tokenizer/flex/tokenize_c.go (new file, 91 lines)

@@ -0,0 +1,91 @@
+package flex
+
+// #include <stdlib.h>
+// #include "linguist.h"
+// #include "lex.linguist_yy.h"
+// int linguist_yywrap(yyscan_t yyscanner) {
+// return 1;
+// }
+import "C"
+import "unsafe"
+
+// TokenizeC is only calling a C-flex based tokenizer from linguist
+func TokenizeC(content []byte) []string {
+	cs := C.CBytes(content)
+	defer C.free(unsafe.Pointer(cs))
+	// C.tokenizer_extract_tokens((*C.char)(cs))
+	return nil
+}
+
+const maxTokenLen = 32
+
+
+// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
+func TokenizeFlex(content []byte) []string {
+	var buf C.YY_BUFFER_STATE
+	var scanner C.yyscan_t
+	var extra C.struct_tokenizer_extra
+	// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
+	// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
+	var _len C.ulong
+	var r C.int
+
+	_len = C.ulong(len(content))
+	cs := C.CBytes(content)
+	defer C.free(unsafe.Pointer(cs))
+
+	C.linguist_yylex_init_extra(&extra, &scanner)
+	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
+
+
+	ary := []string{}
+	for {
+		extra._type = C.NO_ACTION
+		extra.token = nil
+		r = C.linguist_yylex(scanner)
+		switch (extra._type) {
+		case C.NO_ACTION:
+			break
+		case C.REGULAR_TOKEN:
+			_len = C.strlen(extra.token)
+			if (_len <= maxTokenLen) {
+				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
+				//rb_ary_push(ary, rb_str_new(extra.token, len))
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		case C.SHEBANG_TOKEN:
+			_len = C.strlen(extra.token)
+			if (_len <= maxTokenLen) {
+				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
+				ary = append(ary, s)
+				//s = rb_str_new2("SHEBANG#!");
+				//rb_str_cat(s, extra.token, len);
+				//rb_ary_push(ary, s);
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		case C.SGML_TOKEN:
+			_len = C.strlen(extra.token)
+			if (_len <= maxTokenLen) {
+				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
+				ary = append(ary, s)
+				//s = rb_str_new(extra.token, len);
+				//rb_str_cat2(s, ">");
+				//rb_ary_push(ary, s);
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		}
+		if r == 0 {
+			break
+		}
+	}
+
+	C.linguist_yy_delete_buffer(buf, scanner)
+	C.linguist_yylex_destroy(scanner)
+	// C.free(unsafe.Pointer(extra))
+	// C.free(unsafe.Pointer(scanner))
+
+	return ary
+}
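The cgo calls above follow Go's copy-in/copy-out ownership rules: C.CBytes and C.CString copy Go memory into C-allocated memory that the Go side must later free, while C.GoStringN copies C memory back into a Go string before the C buffer is released. A minimal standalone sketch of that pattern (illustration only, using plain libc rather than the linguist scanner; not part of the commit):

    package main

    /*
    #include <stdlib.h>
    #include <string.h>
    */
    import "C"

    import (
    	"fmt"
    	"unsafe"
    )

    func main() {
    	// Copy a Go string into NUL-terminated C memory; the Go side owns the
    	// buffer and must free it, the same rule TokenizeFlex applies to the
    	// bytes it hands to linguist_yy_scan_bytes.
    	cs := C.CString("SHEBANG#!sh")
    	defer C.free(unsafe.Pointer(cs))

    	// Copy the C bytes back into a Go string before the buffer is freed,
    	// mirroring the C.GoStringN calls on extra.token in TokenizeFlex.
    	n := C.strlen(cs)
    	fmt.Println(C.GoStringN(cs, C.int(n))) // prints: SHEBANG#!sh
    }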
internal/tokenizer/flex/tokenize_c_test.go (new file, 25 lines)

@@ -0,0 +1,25 @@
+package flex
+
+// import (
+// 	"testing"
+
+// 	"gopkg.in/src-d/enry.v1/internal/tokenizer"
+// )
+
+// func BenchmarkTokenizerC(b *testing.B) {
+// 	b.ReportAllocs()
+// 	for i := 0; i < b.N; i++ {
+// 		for _, test := range tokenizer.Tests {
+// 			TokenizeC(test.content)
+// 		}
+// 	}
+// }
+
+// func BenchmarkTokenizerFlex(b *testing.B) {
+// 	b.ReportAllocs()
+// 	for i := 0; i < b.N; i++ {
+// 		for _, test := range tokenizer.Tests {
+// 			TokenizeFlex(test.content)
+// 		}
+// 	}
+// }
@@ -1,6 +1,5 @@
-// Package tokenizer implements file tokenization used by the enry content
-// classifier. This package is an implementation detail of enry and should not
-// be imported by other packages.
+// +build !flex
+
 package tokenizer
 
 import (
@@ -9,8 +8,6 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
 
-const byteLimit = 100000
-
 // Tokenize returns language-agnostic lexical tokens from content. The tokens
 // returned should match what the Linguist library returns. At most the first
 // 100KB of content are tokenized.
@@ -1,91 +1,16 @@
+// +build flex
+
 package tokenizer
 
-… (89 lines removed: the cgo-based TokenizeC/TokenizeFlex implementation, now in internal/tokenizer/flex/tokenize_c.go above, unchanged apart from the package clause)
+import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
+
+// Tokenize returns language-agnostic lexical tokens from content. The tokens
+// returned should match what the Linguist library returns. At most the first
+// 100KB of content are tokenized.
+func Tokenize(content []byte) []string {
+	if len(content) > byteLimit {
+		content = content[:byteLimit]
+	}
+
+	return flex.TokenizeFlex(content)
+}
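Since both build-tag variants export the same Tokenize signature, code elsewhere in enry is unaffected by which tokenizer gets compiled in; only the build tag changes. A small usage sketch (a hypothetical in-package helper, not part of the commit) that behaves the same with or without -tags flex:

    package tokenizer

    // tokenizeSample illustrates the shared contract of both Tokenize variants:
    // anything past byteLimit (100 KB) is ignored, and the result is a plain
    // []string regardless of which implementation was compiled in.
    func tokenizeSample() []string {
    	content := make([]byte, 2*byteLimit) // twice the cap, so Tokenize truncates it
    	copy(content, "#!/usr/bin/env bash\necho hello")
    	return Tokenize(content)
    }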
@@ -91,7 +91,7 @@ var (
 		"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
 		"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
 
-	tests = []struct {
+	Tests = []struct {
 		name     string
 		content  []byte
 		expected []string
@@ -101,10 +101,10 @@ var (
 )
 
 func TestTokenize(t *testing.T) {
-	for _, test := range tests {
+	for _, test := range Tests {
 		t.Run(test.name, func(t *testing.T) {
 			before := string(test.content)
-			tokens := TokenizeFlex(test.content)
+			tokens := Tokenize(test.content)
 			after := string(test.content)
 			require.Equal(t, before, after, "the input slice was modified")
 			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
@@ -118,39 +118,21 @@ func TestTokenize(t *testing.T) {
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
+		for _, test := range Tests {
 			test.content = append([]byte(nil), test.content...)
 		}
 	}
 }
 
-func BenchmarkTokenizerGo(b *testing.B) {
+func BenchmarkTokenizer(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
+		for _, test := range Tests {
 			Tokenize(test.content)
 		}
 	}
 }
 
-func BenchmarkTokenizerC(b *testing.B) {
-	b.ReportAllocs()
-	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
-			TokenizeC(test.content)
-		}
-	}
-}
-
-func BenchmarkTokenizerFlex(b *testing.B) {
-	b.ReportAllocs()
-	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
-			TokenizeFlex(test.content)
-		}
-	}
-}
-
 //TODO(bzz): introduce tokenizer benchmark suit
 // baseline - just read the files
 // RE2