diff --git a/internal/tokenizer/flex/linguist.h b/internal/tokenizer/flex/linguist.h index c65de42..eda1796 100644 --- a/internal/tokenizer/flex/linguist.h +++ b/internal/tokenizer/flex/linguist.h @@ -1,3 +1,4 @@ +// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1 enum tokenizer_type { NO_ACTION, REGULAR_TOKEN, @@ -10,11 +11,5 @@ struct tokenizer_extra { enum tokenizer_type type; }; -// #include - -// #ifdef __APPLE__ -// char *strndup(const char *s1, size_t n); -// #elif defined(_WIN32) || defined(_WIN64) -// char *strndup(const char *s1, size_t n); -// #pragma warning (disable: 4244) -// #endif // _WIN32 || _WIN64 +// TODO(bzz) port Win support from +// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0 \ No newline at end of file diff --git a/internal/tokenizer/flex/tokenize_c.go b/internal/tokenizer/flex/tokenize_c.go index e5c5fcd..0c78ebe 100644 --- a/internal/tokenizer/flex/tokenize_c.go +++ b/internal/tokenizer/flex/tokenize_c.go @@ -9,24 +9,15 @@ package flex import "C" import "unsafe" -// TokenizeC is only calling a C-flex based tokenizer from linguist -func TokenizeC(content []byte) []string { - cs := C.CBytes(content) - defer C.free(unsafe.Pointer(cs)) - // C.tokenizer_extract_tokens((*C.char)(cs)) - return nil -} - const maxTokenLen = 32 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C +// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12 func TokenizeFlex(content []byte) []string { var buf C.YY_BUFFER_STATE var scanner C.yyscan_t var extra C.struct_tokenizer_extra - // var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t)) - // var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra)) var _len C.ulong var r C.int @@ -50,7 +41,6 @@ func TokenizeFlex(content []byte) []string { _len = C.strlen(extra.token) if (_len <= maxTokenLen) { ary = append(ary, C.GoStringN(extra.token, (C.int)(_len))) - //rb_ary_push(ary, rb_str_new(extra.token, len)) } C.free(unsafe.Pointer(extra.token)) break @@ -59,9 +49,6 @@ func TokenizeFlex(content []byte) []string { if (_len <= maxTokenLen) { s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len)) ary = append(ary, s) - //s = rb_str_new2("SHEBANG#!"); - //rb_str_cat(s, extra.token, len); - //rb_ary_push(ary, s); } C.free(unsafe.Pointer(extra.token)) break @@ -70,9 +57,6 @@ func TokenizeFlex(content []byte) []string { if (_len <= maxTokenLen) { s := C.GoStringN(extra.token, (C.int)(_len)) + ">" ary = append(ary, s) - //s = rb_str_new(extra.token, len); - //rb_str_cat2(s, ">"); - //rb_ary_push(ary, s); } C.free(unsafe.Pointer(extra.token)) break @@ -84,8 +68,6 @@ func TokenizeFlex(content []byte) []string { C.linguist_yy_delete_buffer(buf, scanner) C.linguist_yylex_destroy(scanner) - // C.free(unsafe.Pointer(extra)) - // C.free(unsafe.Pointer(scanner)) return ary } diff --git a/internal/tokenizer/flex/tokenize_c_test.go b/internal/tokenizer/flex/tokenize_c_test.go deleted file mode 100644 index e1150cd..0000000 --- a/internal/tokenizer/flex/tokenize_c_test.go +++ /dev/null @@ -1,25 +0,0 @@ -package flex - -// import ( -// "testing" - -// "gopkg.in/src-d/enry.v1/internal/tokenizer" -// ) - -// func BenchmarkTokenizerC(b *testing.B) { -// b.ReportAllocs() -// for i := 0; i < b.N; i++ { -// for _, test := range tokenizer.Tests { -// TokenizeC(test.content) -// } -// } -// } - -// func BenchmarkTokenizerFlex(b *testing.B) { -// b.ReportAllocs() -// for i := 0; i < b.N; i++ { -// for _, test := range tokenizer.Tests { -// TokenizeFlex(test.content) -// } -// } -// } diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go index d9673b4..9307cfd 100644 --- a/internal/tokenizer/tokenize_test.go +++ b/internal/tokenizer/tokenize_test.go @@ -132,9 +132,3 @@ func BenchmarkTokenizer(b *testing.B) { } } } - -//TODO(bzz): introduce tokenizer benchmark suit -// baseline - just read the files -// RE2 -// oniguruma -// cgo to flex-based impl