Merge pull request #218 from bzz/tokenizer-flex-cgo

New, optional flex-based tokenizer
This commit is contained in:
Alexander 2019-04-17 13:38:34 +02:00 committed by GitHub
commit b6daf5c079
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 2707 additions and 15 deletions

1
.gitignore vendored
View File

@ -8,3 +8,4 @@ Makefile.main
build/
vendor/
java/lib/
.vscode/

View File

@ -0,0 +1,7 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
package tokenizer
// ByteLimit defines the maximum prefix of an input text that will be tokenized.
const ByteLimit = 100000

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,336 @@
#ifndef linguist_yyHEADER_H
#define linguist_yyHEADER_H 1
#define linguist_yyIN_HEADER 1
#line 6 "lex.linguist_yy.h"
#define YY_INT_ALIGNED short int
/* A lexical scanner generated by flex */
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif
/* First, we deal with platform-specific or compiler-specific issues. */
/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
/* end standard C headers. */
/* flex integer type definitions */
#ifndef FLEXINT_H
#define FLEXINT_H
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
* if you want the limit (max/min) macros for int types.
*/
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif
#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
typedef uint64_t flex_uint64_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
#endif /* ! C99 */
/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
#endif /* ! FLEXINT_H */
#ifdef __cplusplus
/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST
#else /* ! __cplusplus */
/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)
#define YY_USE_CONST
#endif /* defined (__STDC__) */
#endif /* ! __cplusplus */
#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif
/* An opaque pointer. */
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void* yyscan_t;
#endif
/* For convenience, these vars (plus the bison vars far below)
are macros in the reentrant scanner. */
#define yyin yyg->yyin_r
#define yyout yyg->yyout_r
#define yyextra yyg->yyextra_r
#define yyleng yyg->yyleng_r
#define yytext yyg->yytext_r
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
#define yy_flex_debug yyg->yy_flex_debug_r
/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#define YY_BUF_SIZE 16384
#endif
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif
#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif
#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
struct yy_buffer_state
{
FILE *yy_input_file;
char *yy_ch_buf; /* input buffer */
char *yy_buf_pos; /* current position in input buffer */
/* Size of input buffer in bytes, not including room for EOB
* characters.
*/
yy_size_t yy_buf_size;
/* Number of characters read into yy_ch_buf, not including EOB
* characters.
*/
yy_size_t yy_n_chars;
/* Whether we "own" the buffer - i.e., we know we created it,
* and can realloc() it to grow it, and should free() it to
* delete it.
*/
int yy_is_our_buffer;
/* Whether this is an "interactive" input source; if so, and
* if we're using stdio for input, then we want to use getc()
* instead of fread(), to make sure we stop fetching input after
* each newline.
*/
int yy_is_interactive;
/* Whether we're considered to be at the beginning of a line.
* If so, '^' rules will be active on the next match, otherwise
* not.
*/
int yy_at_bol;
int yy_bs_lineno; /**< The line count. */
int yy_bs_column; /**< The column count. */
/* Whether to try to fill the input buffer when we reach the
* end of it.
*/
int yy_fill_buffer;
int yy_buffer_status;
};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */
void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
void linguist_yypop_buffer_state (yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
void linguist_yyfree (void * ,yyscan_t yyscanner );
/* Begin user sect3 */
#define yytext_ptr yytext_r
#ifdef YY_HEADER_EXPORT_START_CONDITIONS
#define INITIAL 0
#define sgml 1
#define c_comment 2
#define xml_comment 3
#define haskell_comment 4
#define ocaml_comment 5
#define python_dcomment 6
#define python_scomment 7
#endif
#ifndef YY_NO_UNISTD_H
/* Special case for "unistd.h", since it is non-ANSI. We include it way
* down here because we want the user's section 1 to have been scanned first.
* The user has a chance to override it with an option.
*/
#include <unistd.h>
#endif
#define YY_EXTRA_TYPE struct tokenizer_extra *
int linguist_yylex_init (yyscan_t* scanner);
int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
/* Accessor methods to globals.
These are made visible to non-reentrant scanners for convenience. */
int linguist_yylex_destroy (yyscan_t yyscanner );
int linguist_yyget_debug (yyscan_t yyscanner );
void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
FILE *linguist_yyget_in (yyscan_t yyscanner );
void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
FILE *linguist_yyget_out (yyscan_t yyscanner );
void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
char *linguist_yyget_text (yyscan_t yyscanner );
int linguist_yyget_lineno (yyscan_t yyscanner );
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
/* Macros after this point can all be overridden by user definitions in
* section 1.
*/
#ifndef YY_SKIP_YYWRAP
#ifdef __cplusplus
extern "C" int linguist_yywrap (yyscan_t yyscanner );
#else
extern int linguist_yywrap (yyscan_t yyscanner );
#endif
#endif
#ifndef yytext_ptr
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
#endif
#ifdef YY_NEED_STRLEN
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
#endif
#ifndef YY_NO_INPUT
#endif
/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#define YY_READ_BUF_SIZE 8192
#endif
/* Number of entries by which start-condition stack grows. */
#ifndef YY_START_STACK_INCR
#define YY_START_STACK_INCR 25
#endif
/* Default declaration of generated scanner - a define so the user can
* easily add parameters.
*/
#ifndef YY_DECL
#define YY_DECL_IS_OURS 1
extern int linguist_yylex (yyscan_t yyscanner);
#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
#endif /* !YY_DECL */
/* yy_get_previous_state - get the state just before the EOB char was reached */
#undef YY_NEW_FILE
#undef YY_FLUSH_BUFFER
#undef yy_set_bol
#undef yy_new_buffer
#undef yy_set_interactive
#undef YY_DO_BEFORE_ACTION
#ifdef YY_DECL_IS_OURS
#undef YY_DECL_IS_OURS
#undef YY_DECL
#endif
#line 118 "tokenizer.l"
#line 335 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */

View File

@ -0,0 +1,15 @@
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
enum tokenizer_type {
NO_ACTION,
REGULAR_TOKEN,
SHEBANG_TOKEN,
SGML_TOKEN,
};
struct tokenizer_extra {
char *token;
enum tokenizer_type type;
};
// TODO(bzz) port Win support from
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0

View File

@ -0,0 +1,71 @@
package flex
// #include <stdlib.h>
// #include "linguist.h"
// #include "lex.linguist_yy.h"
// int linguist_yywrap(yyscan_t yyscanner) {
// return 1;
// }
import "C"
import "unsafe"
const maxTokenLen = 32 // bytes
// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
func TokenizeFlex(content []byte) []string {
var buf C.YY_BUFFER_STATE
var scanner C.yyscan_t
var extra C.struct_tokenizer_extra
var _len C.ulong
var r C.int
_len = C.ulong(len(content))
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
C.linguist_yylex_init_extra(&extra, &scanner)
buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
ary := []string{}
for {
extra._type = C.NO_ACTION
extra.token = nil
r = C.linguist_yylex(scanner)
switch extra._type {
case C.NO_ACTION:
break
case C.REGULAR_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
}
C.free(unsafe.Pointer(extra.token))
break
case C.SHEBANG_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
ary = append(ary, s)
}
C.free(unsafe.Pointer(extra.token))
break
case C.SGML_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
ary = append(ary, s)
}
C.free(unsafe.Pointer(extra.token))
break
}
if r == 0 {
break
}
}
C.linguist_yy_delete_buffer(buf, scanner)
C.linguist_yylex_destroy(scanner)
return ary
}

View File

@ -1,6 +1,5 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
// +build !flex
package tokenizer
import (
@ -9,14 +8,14 @@ import (
"gopkg.in/src-d/enry.v1/regex"
)
const byteLimit = 100000
// Tokenize returns language-agnostic lexical tokens from content. The tokens
// returned should match what the Linguist library returns. At most the first
// 100KB of content are tokenized.
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
//
// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
// differences between this function and the Linguist output.
func Tokenize(content []byte) []string {
if len(content) > byteLimit {
content = content[:byteLimit]
if len(content) > ByteLimit {
content = content[:ByteLimit]
}
// Copy the input so that changes wrought by the tokenization steps do not

View File

@ -0,0 +1,15 @@
// +build flex
package tokenizer
import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
func Tokenize(content []byte) []string {
if len(content) > ByteLimit {
content = content[:ByteLimit]
}
return flex.TokenizeFlex(content)
}

View File

@ -5,6 +5,7 @@ import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const (
@ -89,27 +90,48 @@ var (
"XHTML", "sample", "file", "type", "#example", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.",
"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
)
func TestTokenize(t *testing.T) {
tests := []struct {
tests = []struct {
name string
content []byte
expected []string
}{
{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
}
)
func TestTokenize(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
before := string(test.content)
tokens := Tokenize(test.content)
after := string(test.content)
assert.Equal(t, before, after, "the input slice was modified")
assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
require.Equal(t, before, after, "the input slice was modified")
require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
for i, expectedToken := range test.expected {
assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
}
})
}
}
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
if len(test.content) > ByteLimit {
test.content = test.content[:ByteLimit]
}
_ = append([]byte(nil), test.content...)
}
}
}
func BenchmarkTokenizer(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
Tokenize(test.content)
}
}
}