Merge pull request #218 from bzz/tokenizer-flex-cgo

New, optional flex-based tokenizer
2025-08-24 14:02:07 +00:00 · 2019-04-17 13:38:34 +02:00
parent ab3c26b46d 7e136bade8
commit b6daf5c079
9 changed files with 2707 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ Makefile.main
 build/
 vendor/
 java/lib/
+.vscode/
--- a/internal/tokenizer/common.go
+++ b/internal/tokenizer/common.go
@@ -0,0 +1,7 @@
+// Package tokenizer implements file tokenization used by the enry content
+// classifier. This package is an implementation detail of enry and should not
+// be imported by other packages.
+package tokenizer
+
+// ByteLimit defines the maximum prefix of an input text that will be tokenized.
+const ByteLimit = 100000
--- a/internal/tokenizer/flex/lex.linguist_yy.c
+++ b/internal/tokenizer/flex/lex.linguist_yy.c
--- a/internal/tokenizer/flex/lex.linguist_yy.h
+++ b/internal/tokenizer/flex/lex.linguist_yy.h
@@ -0,0 +1,336 @@
+#ifndef linguist_yyHEADER_H
+#define linguist_yyHEADER_H 1
+#define linguist_yyIN_HEADER 1
+
+#line 6 "lex.linguist_yy.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+typedef uint64_t flex_uint64_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+#endif /* ! C99 */
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#define YY_BUF_SIZE 16384
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	yy_size_t yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
+void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void linguist_yypop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
+
+void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
+void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void linguist_yyfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+#define sgml 1
+#define c_comment 2
+#define xml_comment 3
+#define haskell_comment 4
+#define ocaml_comment 5
+#define python_dcomment 6
+#define python_scomment 7
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#define YY_EXTRA_TYPE struct tokenizer_extra *
+
+int linguist_yylex_init (yyscan_t* scanner);
+
+int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int linguist_yylex_destroy (yyscan_t yyscanner );
+
+int linguist_yyget_debug (yyscan_t yyscanner );
+
+void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
+
+void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *linguist_yyget_in (yyscan_t yyscanner );
+
+void linguist_yyset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *linguist_yyget_out (yyscan_t yyscanner );
+
+void linguist_yyset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
+
+char *linguist_yyget_text (yyscan_t yyscanner );
+
+int linguist_yyget_lineno (yyscan_t yyscanner );
+
+void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int linguist_yywrap (yyscan_t yyscanner );
+#else
+extern int linguist_yywrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#define YY_READ_BUF_SIZE 8192
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int linguist_yylex (yyscan_t yyscanner);
+
+#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 118 "tokenizer.l"
+
+
+#line 335 "lex.linguist_yy.h"
+#undef linguist_yyIN_HEADER
+#endif /* linguist_yyHEADER_H */
--- a/internal/tokenizer/flex/linguist.h
+++ b/internal/tokenizer/flex/linguist.h
@@ -0,0 +1,15 @@
+// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
+// tokenizer_type is the kind of token reported through struct tokenizer_extra
+// after a scan step.
+enum tokenizer_type {
+  // No token reported; the Go caller resets type to this before each step.
+  NO_ACTION,
+  // The token is used as-is.
+  REGULAR_TOKEN,
+  // The Go caller prefixes the token with "SHEBANG#!".
+  SHEBANG_TOKEN,
+  // The Go caller appends ">" to the token.
+  SGML_TOKEN,
+};
+
+// tokenizer_extra is the user data attached to the scanner (it is the
+// YY_EXTRA_TYPE of the generated lexer); the caller inspects it after each
+// linguist_yylex call.
+struct tokenizer_extra {
+  // Token text as a NUL-terminated C string; the Go caller frees it.
+  char *token;
+  // Kind of the reported token; NO_ACTION when there is none.
+  enum tokenizer_type type;
+};
+
+// TODO(bzz) port Win support from
+// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0
--- a/internal/tokenizer/flex/tokenize_c.go
+++ b/internal/tokenizer/flex/tokenize_c.go
@@ -0,0 +1,71 @@
+package flex
+
+// #include <stdlib.h>
+// #include "linguist.h"
+// #include "lex.linguist_yy.h"
+// int linguist_yywrap(yyscan_t yyscanner) {
+// 	return 1;
+// }
+import "C"
+import "unsafe"
+
+const maxTokenLen = 32 // bytes
+
+// TokenizeFlex tokenizes content with the Flex-generated scanner from linguist,
+// called through cgo, and returns the tokens in the order they were scanned.
+// Tokens longer than maxTokenLen bytes are dropped. Shebang tokens are prefixed
+// with "SHEBANG#!" and SGML tokens are suffixed with ">". If the scanner state
+// cannot be allocated, an empty slice is returned.
+//
+// This is a transliteration from C
+// https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
+func TokenizeFlex(content []byte) []string {
+	tokens := []string{}
+
+	// The scanner keeps the extra pointer between calls, and cgo forbids C
+	// from retaining pointers into Go memory, so the struct lives in C memory.
+	extra := (*C.struct_tokenizer_extra)(C.calloc(1, C.sizeof_struct_tokenizer_extra))
+	if extra == nil {
+		return tokens
+	}
+	defer C.free(unsafe.Pointer(extra))
+
+	var scanner C.yyscan_t
+	if C.linguist_yylex_init_extra(extra, &scanner) != 0 {
+		return tokens
+	}
+	defer C.linguist_yylex_destroy(scanner)
+
+	cs := C.CBytes(content)
+	defer C.free(cs)
+
+	// size_t rather than unsigned long: the two differ on LLP64 platforms.
+	buf := C.linguist_yy_scan_bytes((*C.char)(cs), C.size_t(len(content)), scanner)
+	defer C.linguist_yy_delete_buffer(buf, scanner)
+
+	for {
+		extra._type = C.NO_ACTION
+		extra.token = nil
+		r := C.linguist_yylex(scanner)
+
+		if extra.token != nil {
+			// Reported tokens are C strings owned by us: copy out, then free.
+			n := C.strlen(extra.token)
+			if n <= maxTokenLen {
+				s := C.GoStringN(extra.token, C.int(n))
+				switch extra._type {
+				case C.REGULAR_TOKEN:
+					tokens = append(tokens, s)
+				case C.SHEBANG_TOKEN:
+					tokens = append(tokens, "SHEBANG#!"+s)
+				case C.SGML_TOKEN:
+					tokens = append(tokens, s+">")
+				}
+			}
+			C.free(unsafe.Pointer(extra.token))
+		}
+		if r == 0 {
+			return tokens
+		}
+	}
+}
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -1,6 +1,5 @@
-// Package tokenizer implements file tokenization used by the enry content
-// classifier. This package is an implementation detail of enry and should not
-// be imported by other packages.
+// +build !flex
+
 package tokenizer

 import (
@@ -9,14 +8,14 @@ import (
 	"gopkg.in/src-d/enry.v1/regex"
 )

-const byteLimit = 100000
-
-// Tokenize returns language-agnostic lexical tokens from content. The tokens
-// returned should match what the Linguist library returns. At most the first
-// 100KB of content are tokenized.
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+//
+// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
+// differences between this function and the Linguist output.
 func Tokenize(content []byte) []string {
-	if len(content) > byteLimit {
-		content = content[:byteLimit]
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
 	}

 	// Copy the input so that changes wrought by the tokenization steps do not
--- a/internal/tokenizer/tokenize_c.go
+++ b/internal/tokenizer/tokenize_c.go
@@ -0,0 +1,15 @@
+// +build flex
+
+package tokenizer
+
+import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
+
+// Tokenize returns the lexical tokens of content as produced by the flex-based
+// Linguist scanner. Only the first ByteLimit bytes of content are tokenized.
+func Tokenize(content []byte) []string {
+	prefix := content
+	if len(prefix) > ByteLimit {
+		prefix = prefix[:ByteLimit]
+	}
+	return flex.TokenizeFlex(prefix)
+}
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -5,6 +5,7 @@ import (
 	"testing"

 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )

 const (
@@ -89,27 +90,48 @@ var (
 		"XHTML", "sample", "file", "type", "#example", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.",
 		"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
 		"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
-)

-func TestTokenize(t *testing.T) {
-	tests := []struct {
+	tests = []struct {
 		name     string
 		content  []byte
 		expected []string
 	}{
 		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
 	}
+)

+// TestTokenize verifies that Tokenize leaves its input untouched and returns
+// exactly the expected tokens, in order, for every case in tests.
+func TestTokenize(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			before := string(test.content)
 			tokens := Tokenize(test.content)
 			after := string(test.content)
-			assert.Equal(t, before, after, "the input slice was modified")
-			assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
+			require.Equal(t, before, after, "the input slice was modified")
+			// require, not assert: the loop below indexes tokens by position.
+			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("tokens slice length = %v, want %v", len(tokens), len(test.expected)))
 			for i, expectedToken := range test.expected {
 				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
 			}
 		})
 	}
 }
+
+// BenchmarkTokenizer_BaselineCopy measures the cost of truncating every test
+// input to ByteLimit and copying it, without tokenizing anything. It is
+// intended as a baseline for BenchmarkTokenizer.
+func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for _, tc := range tests {
+			input := tc.content
+			if len(input) > ByteLimit {
+				input = input[:ByteLimit]
+			}
+			_ = append([]byte(nil), input...)
+		}
+	}
+}
+
+// BenchmarkTokenizer measures Tokenize over every input in tests, reporting
+// allocations alongside timing.
+func BenchmarkTokenizer(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		for j := range tests {
+			Tokenize(tests[j].content)
+		}
+	}
+}