tokenizer: cleanup & attributions

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
2025-09-15 18:07:32 +00:00 · 2019-04-08 16:52:26 +02:00
parent 8756fbdcb4
commit 7929933eb5
4 changed files with 4 additions and 58 deletions
--- a/internal/tokenizer/flex/linguist.h
+++ b/internal/tokenizer/flex/linguist.h
@@ -1,3 +1,4 @@
+// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
 enum tokenizer_type {
  NO_ACTION,
  REGULAR_TOKEN,
@@ -10,11 +11,5 @@ struct tokenizer_extra {
  enum tokenizer_type type;
 };

-// #include <stddef.h>
-
-// #ifdef __APPLE__
-// char *strndup(const char *s1, size_t n);
-// #elif defined(_WIN32) || defined(_WIN64)
-// char *strndup(const char *s1, size_t n);
-// #pragma warning (disable: 4244)
-// #endif // _WIN32 || _WIN64
+// TODO(bzz): port Windows support from
+// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0
--- a/internal/tokenizer/flex/tokenize_c.go
+++ b/internal/tokenizer/flex/tokenize_c.go
@@ -9,24 +9,15 @@ package flex
 import "C"
 import "unsafe"

-// TokenizeC is only calling a C-flex based tokenizer from linguist
-func TokenizeC(content []byte) []string {
-	cs := C.CBytes(content)
-	defer C.free(unsafe.Pointer(cs))
-	// C.tokenizer_extract_tokens((*C.char)(cs))
-	return nil
-}
-
 const maxTokenLen = 32


 // TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
+// This is a transliteration of the C implementation at https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
 func TokenizeFlex(content []byte) []string {
 	var buf C.YY_BUFFER_STATE
 	var scanner C.yyscan_t
 	var extra C.struct_tokenizer_extra
-	// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
-	// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
 	var _len C.ulong
 	var r C.int

@@ -50,7 +41,6 @@ func TokenizeFlex(content []byte) []string {
 			_len = C.strlen(extra.token)
 			if (_len <= maxTokenLen) {
 				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
-				//rb_ary_push(ary, rb_str_new(extra.token, len))
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
@@ -59,9 +49,6 @@ func TokenizeFlex(content []byte) []string {
 			if (_len <= maxTokenLen) {
 				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
 				ary = append(ary, s)
-				//s = rb_str_new2("SHEBANG#!");
-				//rb_str_cat(s, extra.token, len);
-				//rb_ary_push(ary, s);
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
@@ -70,9 +57,6 @@ func TokenizeFlex(content []byte) []string {
 			if (_len <= maxTokenLen) {
 				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
 				ary = append(ary, s)
-				//s = rb_str_new(extra.token, len);
-				//rb_str_cat2(s, ">");
-				//rb_ary_push(ary, s);
 			}
 			C.free(unsafe.Pointer(extra.token))
 			break
@@ -84,8 +68,6 @@ func TokenizeFlex(content []byte) []string {

 	C.linguist_yy_delete_buffer(buf, scanner)
 	C.linguist_yylex_destroy(scanner)
-	// C.free(unsafe.Pointer(extra))
-	// C.free(unsafe.Pointer(scanner))

 	return ary
 }
--- a/internal/tokenizer/flex/tokenize_c_test.go
+++ b/internal/tokenizer/flex/tokenize_c_test.go
@@ -1,25 +0,0 @@
-package flex
-
-// import (
-// 	"testing"
-
-// 	"gopkg.in/src-d/enry.v1/internal/tokenizer"
-// )
-
-// func BenchmarkTokenizerC(b *testing.B) {
-// 	b.ReportAllocs()
-// 	for i := 0; i < b.N; i++ {
-// 		for _, test := range tokenizer.Tests {
-// 			TokenizeC(test.content)
-// 		}
-// 	}
-// }
-
-// func BenchmarkTokenizerFlex(b *testing.B) {
-// 	b.ReportAllocs()
-// 	for i := 0; i < b.N; i++ {
-// 		for _, test := range tokenizer.Tests {
-// 			TokenizeFlex(test.content)
-// 		}
-// 	}
-// }
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -132,9 +132,3 @@ func BenchmarkTokenizer(b *testing.B) {
 		}
 	}
 }
-
-//TODO(bzz): introduce tokenizer benchmark suit
-// baseline - just read the files
-// RE2
-// oniguruma
-// cgo to flex-based impl