From 8756fbdcb477cee7c18fc88ab8f5fc28674f5d25 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Sun, 24 Mar 2019 18:55:05 +0100
Subject: [PATCH] refactor to build tags

Signed-off-by: Alexander Bezzubov
---
 .gitignore                                  |  1 +
 internal/tokenizer/common.go                |  6 ++
 .../tokenizer/{ => flex}/lex.linguist_yy.c  |  0
 .../tokenizer/{ => flex}/lex.linguist_yy.h  |  0
 internal/tokenizer/{ => flex}/linguist.h    |  0
 internal/tokenizer/flex/tokenize_c.go       | 91 ++++++++++++++++++
 internal/tokenizer/flex/tokenize_c_test.go  | 25 +++++
 internal/tokenizer/tokenize.go              |  7 +-
 internal/tokenizer/tokenize_c.go            | 95 ++-----------------
 internal/tokenizer/tokenize_test.go         | 30 ++----
 10 files changed, 141 insertions(+), 114 deletions(-)
 create mode 100644 internal/tokenizer/common.go
 rename internal/tokenizer/{ => flex}/lex.linguist_yy.c (100%)
 rename internal/tokenizer/{ => flex}/lex.linguist_yy.h (100%)
 rename internal/tokenizer/{ => flex}/linguist.h (100%)
 create mode 100644 internal/tokenizer/flex/tokenize_c.go
 create mode 100644 internal/tokenizer/flex/tokenize_c_test.go

diff --git a/.gitignore b/.gitignore
index b76c892..78b3fb1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ Makefile.main
 build/
 vendor/
 java/lib/
+.vscode/
diff --git a/internal/tokenizer/common.go b/internal/tokenizer/common.go
new file mode 100644
index 0000000..6cae666
--- /dev/null
+++ b/internal/tokenizer/common.go
@@ -0,0 +1,6 @@
+// Package tokenizer implements file tokenization used by the enry content
+// classifier. This package is an implementation detail of enry and should not
+// be imported by other packages.
+package tokenizer
+
+const byteLimit = 100000
diff --git a/internal/tokenizer/lex.linguist_yy.c b/internal/tokenizer/flex/lex.linguist_yy.c
similarity index 100%
rename from internal/tokenizer/lex.linguist_yy.c
rename to internal/tokenizer/flex/lex.linguist_yy.c
diff --git a/internal/tokenizer/lex.linguist_yy.h b/internal/tokenizer/flex/lex.linguist_yy.h
similarity index 100%
rename from internal/tokenizer/lex.linguist_yy.h
rename to internal/tokenizer/flex/lex.linguist_yy.h
diff --git a/internal/tokenizer/linguist.h b/internal/tokenizer/flex/linguist.h
similarity index 100%
rename from internal/tokenizer/linguist.h
rename to internal/tokenizer/flex/linguist.h
diff --git a/internal/tokenizer/flex/tokenize_c.go b/internal/tokenizer/flex/tokenize_c.go
new file mode 100644
index 0000000..e5c5fcd
--- /dev/null
+++ b/internal/tokenizer/flex/tokenize_c.go
@@ -0,0 +1,91 @@
+package flex
+
+// #include <stdlib.h>
+// #include "linguist.h"
+// #include "lex.linguist_yy.h"
+// int linguist_yywrap(yyscan_t yyscanner) {
+// return 1;
+// }
+import "C"
+import "unsafe"
+
+// TokenizeC is only calling a C-flex based tokenizer from linguist
+func TokenizeC(content []byte) []string {
+	cs := C.CBytes(content)
+	defer C.free(unsafe.Pointer(cs))
+	// C.tokenizer_extract_tokens((*C.char)(cs))
+	return nil
+}
+
+const maxTokenLen = 32
+
+
+// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
+func TokenizeFlex(content []byte) []string {
+	var buf C.YY_BUFFER_STATE
+	var scanner C.yyscan_t
+	var extra C.struct_tokenizer_extra
+	// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
+	// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
+	var _len C.ulong
+	var r C.int
+
+	_len = C.ulong(len(content))
+	cs := C.CBytes(content)
+	defer C.free(unsafe.Pointer(cs))
+
+	C.linguist_yylex_init_extra(&extra, &scanner)
+	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
+
+
+	ary := []string{}
+	for {
+		extra._type = C.NO_ACTION
+		extra.token = nil
+		r = C.linguist_yylex(scanner)
+		switch (extra._type) {
+		case C.NO_ACTION:
+			break
+		case C.REGULAR_TOKEN:
+			_len = C.strlen(extra.token)
+			if (_len <= maxTokenLen) {
+				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
+				//rb_ary_push(ary, rb_str_new(extra.token, len))
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		case C.SHEBANG_TOKEN:
+			_len = C.strlen(extra.token)
+			if (_len <= maxTokenLen) {
+				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
+				ary = append(ary, s)
+				//s = rb_str_new2("SHEBANG#!");
+				//rb_str_cat(s, extra.token, len);
+				//rb_ary_push(ary, s);
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		case C.SGML_TOKEN:
+			_len = C.strlen(extra.token)
+			if (_len <= maxTokenLen) {
+				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
+				ary = append(ary, s)
+				//s = rb_str_new(extra.token, len);
+				//rb_str_cat2(s, ">");
+				//rb_ary_push(ary, s);
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		}
+		if r == 0 {
+			break
+		}
+	}
+
+	C.linguist_yy_delete_buffer(buf, scanner)
+	C.linguist_yylex_destroy(scanner)
+	// C.free(unsafe.Pointer(extra))
+	// C.free(unsafe.Pointer(scanner))
+
+	return ary
+}
diff --git a/internal/tokenizer/flex/tokenize_c_test.go b/internal/tokenizer/flex/tokenize_c_test.go
new file mode 100644
index 0000000..e1150cd
--- /dev/null
+++ b/internal/tokenizer/flex/tokenize_c_test.go
@@ -0,0 +1,25 @@
+package flex
+
+// import (
+// 	"testing"
+
+// 	"gopkg.in/src-d/enry.v1/internal/tokenizer"
+// )
+
+// func BenchmarkTokenizerC(b *testing.B) {
+// 	b.ReportAllocs()
+// 	for i := 0; i < b.N; i++ {
+// 		for _, test := range tokenizer.Tests {
+// 			TokenizeC(test.content)
+// 		}
+// 	}
+// }
+
+// func BenchmarkTokenizerFlex(b *testing.B) {
+// 	b.ReportAllocs()
+// 	for i := 0; i < b.N; i++ {
+// 		for _, test := range tokenizer.Tests {
+// 			TokenizeFlex(test.content)
+// 		}
+// 	}
+// }
diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go
index 6a721c4..de30f36 100644
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -1,6 +1,5 @@
-// Package tokenizer implements file tokenization used by the enry content
-// classifier. This package is an implementation detail of enry and should not
-// be imported by other packages.
+// +build !flex
+
 package tokenizer
 
 import (
@@ -9,8 +8,6 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )
 
-const byteLimit = 100000
-
 // Tokenize returns language-agnostic lexical tokens from content. The tokens
 // returned should match what the Linguist library returns. At most the first
 // 100KB of content are tokenized.
diff --git a/internal/tokenizer/tokenize_c.go b/internal/tokenizer/tokenize_c.go
index 6105f66..be4d023 100644
--- a/internal/tokenizer/tokenize_c.go
+++ b/internal/tokenizer/tokenize_c.go
@@ -1,91 +1,16 @@
+// +build flex
+
 package tokenizer
 
-// #include <stdlib.h>
-// #include "linguist.h"
-// #include "lex.linguist_yy.h"
-// int linguist_yywrap(yyscan_t yyscanner) {
-// return 1;
-// }
-import "C"
-import "unsafe"
+import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
 
-// TokenizeC is only calling a C-flex based tokenizer from linguist
-func TokenizeC(content []byte) []string {
-	cs := C.CBytes(content)
-	defer C.free(unsafe.Pointer(cs))
-	// C.tokenizer_extract_tokens((*C.char)(cs))
-	return nil
-}
-
-const maxTokenLen = 32
-
-
-// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
-func TokenizeFlex(content []byte) []string {
-	var buf C.YY_BUFFER_STATE
-	var scanner C.yyscan_t
-	var extra C.struct_tokenizer_extra
-	// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
-	// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
-	var _len C.ulong
-	var r C.int
-
-	_len = C.ulong(len(content))
-	cs := C.CBytes(content)
-	defer C.free(unsafe.Pointer(cs))
-
-	C.linguist_yylex_init_extra(&extra, &scanner)
-	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
-
-
-	ary := []string{}
-	for {
-		extra._type = C.NO_ACTION
-		extra.token = nil
-		r = C.linguist_yylex(scanner)
-		switch (extra._type) {
-		case C.NO_ACTION:
-			break
-		case C.REGULAR_TOKEN:
-			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
-				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
-				//rb_ary_push(ary, rb_str_new(extra.token, len))
-			}
-			C.free(unsafe.Pointer(extra.token))
-			break
-		case C.SHEBANG_TOKEN:
-			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
-				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
-				ary = append(ary, s)
-				//s = rb_str_new2("SHEBANG#!");
-				//rb_str_cat(s, extra.token, len);
-				//rb_ary_push(ary, s);
-			}
-			C.free(unsafe.Pointer(extra.token))
-			break
-		case C.SGML_TOKEN:
-			_len = C.strlen(extra.token)
-			if (_len <= maxTokenLen) {
-				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
-				ary = append(ary, s)
-				//s = rb_str_new(extra.token, len);
-				//rb_str_cat2(s, ">");
-				//rb_ary_push(ary, s);
-			}
-			C.free(unsafe.Pointer(extra.token))
-			break
-		}
-		if r == 0 {
-			break
-		}
+// Tokenize returns language-agnostic lexical tokens from content. The tokens
+// returned should match what the Linguist library returns. At most the first
+// 100KB of content are tokenized.
+func Tokenize(content []byte) []string {
+	if len(content) > byteLimit {
+		content = content[:byteLimit]
 	}
 
-	C.linguist_yy_delete_buffer(buf, scanner)
-	C.linguist_yylex_destroy(scanner)
-	// C.free(unsafe.Pointer(extra))
-	// C.free(unsafe.Pointer(scanner))
-
-	return ary
+	return flex.TokenizeFlex(content)
 }
diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 969bf19..d9673b4 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -91,7 +91,7 @@ var (
 		"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=",
 		":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", "'", ",", ">",
 		"=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
-	tests = []struct {
+	Tests = []struct {
 		name     string
 		content  []byte
 		expected []string
@@ -101,10 +101,10 @@
 )
 
 func TestTokenize(t *testing.T) {
-	for _, test := range tests {
+	for _, test := range Tests {
 		t.Run(test.name, func(t *testing.T) {
 			before := string(test.content)
-			tokens := TokenizeFlex(test.content)
+			tokens := Tokenize(test.content)
 			after := string(test.content)
 			require.Equal(t, before, after, "the input slice was modified")
 			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
@@ -118,39 +118,21 @@ func TestTokenize(t *testing.T) {
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
+		for _, test := range Tests {
 			test.content = append([]byte(nil), test.content...)
 		}
 	}
 }
 
-func BenchmarkTokenizerGo(b *testing.B) {
+func BenchmarkTokenizer(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
+		for _, test := range Tests {
 			Tokenize(test.content)
 		}
 	}
 }
 
-func BenchmarkTokenizerC(b *testing.B) {
-	b.ReportAllocs()
-	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
-			TokenizeC(test.content)
-		}
-	}
-}
-
-func BenchmarkTokenizerFlex(b *testing.B) {
-	b.ReportAllocs()
-	for i := 0; i < b.N; i++ {
-		for _, test := range tests {
-			TokenizeFlex(test.content)
-		}
-	}
-}
-
 //TODO(bzz): introduce tokenizer benchmark suit
 // baseline - just read the files
 // RE2
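
With this refactor the tokenizer implementation is chosen at compile time via Go build tags: tokenize.go (guarded by "// +build !flex") provides the default pure-Go Tokenize, while tokenize_c.go (guarded by "// +build flex") trims the input to byteLimit and delegates to flex.TokenizeFlex, the cgo wrapper around the flex-generated scanner now living in internal/tokenizer/flex. A minimal usage sketch with the standard Go toolchain, assuming cgo is enabled and a C compiler is available for the flex path (the exact Makefile/CI wiring is not part of this patch):

    # default build: pure-Go tokenizer ("flex" tag absent)
    go test ./internal/tokenizer/...

    # opt-in build: flex/cgo tokenizer selected via the build tag added here
    go test -tags flex ./internal/tokenizer/...

Because tokenize_test.go carries no build constraint, both invocations run the same TestTokenize and BenchmarkTokenizer against the identical expected token set, whichever implementation the tag selects.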