Mirror of https://github.com/ralsina/tartrazine.git, synced 2025-05-24 08:18:52 -03:00
tokenizer: cleanup & attributions

Signed-off-by: Alexander Bezzubov <bzz@apache.org>

parent 8756fbdcb4
commit 7929933eb5
@@ -1,3 +1,4 @@
+// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
 enum tokenizer_type {
   NO_ACTION,
   REGULAR_TOKEN,
@@ -10,11 +11,5 @@ struct tokenizer_extra {
   enum tokenizer_type type;
 };
 
-// #include <stddef.h>
-// #ifdef __APPLE__
-// char *strndup(const char *s1, size_t n);
-// #elif defined(_WIN32) || defined(_WIN64)
-// char *strndup(const char *s1, size_t n);
-// #pragma warning (disable: 4244)
-// #endif // _WIN32 || _WIN64
+// TODO(bzz) port Win support from
+// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0
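For readers following along, here is a rough, self-contained cgo sketch of how the declarations above surface on the Go side. The enum and struct are inlined into the preamble purely for illustration (the real build includes linguist.h), and the `char *token` field is an assumption based on how the Go code below uses `extra.token`:

```go
package main

/*
// Inlined stand-ins for the declarations in linguist.h above; the real
// project includes the header instead of redefining it.
enum tokenizer_type {
	NO_ACTION,
	REGULAR_TOKEN,
};

struct tokenizer_extra {
	char *token;
	enum tokenizer_type type;
};
*/
import "C"

import "fmt"

func main() {
	var extra C.struct_tokenizer_extra
	// cgo renames the C field `type` to `_type`, since `type` is a Go keyword.
	extra._type = C.REGULAR_TOKEN
	// Enum values arrive as ordinary cgo constants.
	fmt.Println(extra._type == C.REGULAR_TOKEN)
}
```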
@@ -9,24 +9,15 @@ package flex
 import "C"
 import "unsafe"
 
-// TokenizeC is only calling a C-flex based tokenizer from linguist
-func TokenizeC(content []byte) []string {
-	cs := C.CBytes(content)
-	defer C.free(unsafe.Pointer(cs))
-	// C.tokenizer_extract_tokens((*C.char)(cs))
-	return nil
-}
-
 const maxTokenLen = 32
 
-
 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
+// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
 func TokenizeFlex(content []byte) []string {
 	var buf C.YY_BUFFER_STATE
 	var scanner C.yyscan_t
 	var extra C.struct_tokenizer_extra
-	// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
-	// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
 	var _len C.ulong
 	var r C.int
 
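The deleted TokenizeC stub still shows the canonical cgo ownership pattern: C.CBytes copies a Go slice into C-allocated memory, and the caller is responsible for freeing it. A minimal runnable sketch of just that pattern, with the tokenizer call (the commented-out C.tokenizer_extract_tokens) replaced by a round trip through C.GoBytes:

```go
package main

/*
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	content := []byte("func main() {}")

	// Same ownership pattern as the deleted TokenizeC: C.CBytes makes a
	// malloc'd C copy of the Go slice, which the caller must free.
	cs := C.CBytes(content)
	defer C.free(unsafe.Pointer(cs))

	// A real tokenizer would hand cs to C here; GoBytes just shows the
	// copy round-trips intact.
	fmt.Println(string(C.GoBytes(cs, C.int(len(content)))))
}
```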
@@ -50,7 +41,6 @@ func TokenizeFlex(content []byte) []string {
 			_len = C.strlen(extra.token)
 			if (_len <= maxTokenLen) {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
-				//rb_ary_push(ary, rb_str_new(extra.token, len))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
@@ -59,9 +49,6 @@ func TokenizeFlex(content []byte) []string {
 			if (_len <= maxTokenLen) {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
-				//s = rb_str_new2("SHEBANG#!");
-				//rb_str_cat(s, extra.token, len);
-				//rb_ary_push(ary, s);
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
@@ -70,9 +57,6 @@ func TokenizeFlex(content []byte) []string {
 			if (_len <= maxTokenLen) {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
-				//s = rb_str_new(extra.token, len);
-				//rb_str_cat2(s, ">");
-				//rb_ary_push(ary, s);
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
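The three switch arms above differ only in how they decorate the token: plain, prefixed with "SHEBANG#!" for shebang interpreters, or suffixed with ">" for SGML tag names. A pure-Go sketch of that shaping logic; the `kind` strings are illustrative stand-ins for the C enum values, not names from the real code:

```go
package main

import "fmt"

const maxTokenLen = 32 // same cap as in the hunks above

// shapeToken reproduces, in pure Go, the three token shapes the switch in
// TokenizeFlex emits.
func shapeToken(kind, token string) (string, bool) {
	if len(token) > maxTokenLen {
		return "", false // mirrors the `if (_len <= maxTokenLen)` guard
	}
	switch kind {
	case "shebang":
		return "SHEBANG#!" + token, true // e.g. "SHEBANG#!bash"
	case "sgml":
		return token + ">", true // tag-name token keeps its closing bracket
	default:
		return token, true
	}
}

func main() {
	fmt.Println(shapeToken("shebang", "bash"))
	fmt.Println(shapeToken("sgml", "div"))
	fmt.Println(shapeToken("plain", "identifier"))
}
```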
@@ -84,8 +68,6 @@ func TokenizeFlex(content []byte) []string {
 
 	C.linguist_yy_delete_buffer(buf, scanner)
 	C.linguist_yylex_destroy(scanner)
-	// C.free(unsafe.Pointer(extra))
-	// C.free(unsafe.Pointer(scanner))
 
 	return ary
 }
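A side note on why the commented-out C.free lines could go: `scanner` and `extra` are declared as plain Go variables, so their storage is Go-managed and must not be freed; only C-allocated memory (the malloc-based variants removed earlier, or `extra.token`) needs an explicit C.free. A minimal sketch of that distinction, with the struct inlined as an assumption rather than taken from the real header:

```go
package main

/*
#include <stdlib.h>

// Inlined for illustration; the real declaration lives in linguist.h.
struct tokenizer_extra {
	char *token;
	int type;
};
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// Declared in Go: Go-managed memory, no C.free needed. This is why the
	// commented-out frees for `extra` and `scanner` were safe to delete.
	var extra C.struct_tokenizer_extra
	_ = extra

	// Allocated in C: must be paired with C.free, as the removed
	// malloc-based variants would have required.
	p := (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
	defer C.free(unsafe.Pointer(p))
	fmt.Println("allocated", C.sizeof_struct_tokenizer_extra, "bytes")
}
```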
@@ -1,25 +0,0 @@
-package flex
-
-// import (
-// 	"testing"
-
-// 	"gopkg.in/src-d/enry.v1/internal/tokenizer"
-// )
-
-// func BenchmarkTokenizerC(b *testing.B) {
-// 	b.ReportAllocs()
-// 	for i := 0; i < b.N; i++ {
-// 		for _, test := range tokenizer.Tests {
-// 			TokenizeC(test.content)
-// 		}
-// 	}
-// }
-
-// func BenchmarkTokenizerFlex(b *testing.B) {
-// 	b.ReportAllocs()
-// 	for i := 0; i < b.N; i++ {
-// 		for _, test := range tokenizer.Tests {
-// 			TokenizeFlex(test.content)
-// 		}
-// 	}
-// }
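If the commented-out benchmarks were ever revived, they would follow the standard Go benchmark shape below. This sketch compiles on its own: `tokenize` is a hypothetical stand-in for TokenizeFlex, and a fixed input replaces the `tokenizer.Tests` fixtures the original iterated:

```go
package flex_test

import (
	"bytes"
	"testing"
)

// tokenize is a hypothetical stand-in for TokenizeFlex so this file compiles
// on its own; the real benchmark called the cgo implementation.
func tokenize(content []byte) []string {
	fields := bytes.Fields(content)
	tokens := make([]string, len(fields))
	for i, f := range fields {
		tokens[i] = string(f)
	}
	return tokens
}

func BenchmarkTokenizerFlex(b *testing.B) {
	content := []byte("#!/usr/bin/env bash\necho hello world\n")
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		tokenize(content)
	}
}
```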
@@ -132,9 +132,3 @@ func BenchmarkTokenizer(b *testing.B) {
 		}
 	}
 }
-
-//TODO(bzz): introduce tokenizer benchmark suit
-// baseline - just read the files
-// RE2
-// oniguruma
-// cgo to flex-based impl
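The removed TODO sketches a comparison suite: a read-only baseline, an RE2 tokenizer, oniguruma, and the cgo/flex implementation. With Go sub-benchmarks that could look like the sketch below; only the two variants that need no external bindings are filled in, and the oniguruma and cgo/flex slots would be additional b.Run calls of the same shape:

```go
package tokenizer_test

import (
	"regexp"
	"testing"
)

// Go's regexp package is RE2-based, matching the "RE2" item in the TODO.
var wordRE = regexp.MustCompile(`\w+`)

func BenchmarkTokenizers(b *testing.B) {
	content := []byte("#!/usr/bin/env bash\necho hello world\n")

	// Baseline: just scan the bytes, to bound the cost of reading input.
	b.Run("baseline", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			n := 0
			for _, c := range content {
				n += int(c)
			}
			_ = n
		}
	})

	// RE2 variant: extract word-like tokens with a compiled regexp.
	b.Run("re2", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			wordRE.FindAll(content, -1)
		}
	})
}
```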