Squashed 'go-enry/' content from commit 7e3a9a7

git-subtree-dir: go-enry
git-subtree-split: 7e3a9a7241
commit f955c625ad (2024-09-04 16:33:41 -03:00)
192 changed files with 528500 additions and 0 deletions


@@ -0,0 +1,7 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
package tokenizer
// ByteLimit is the maximum size, in bytes, of the input prefix that will be tokenized.
const ByteLimit = 100000
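
For orientation, a hedged sketch (not part of the diff) of how this package is meant to be consumed inside enry; the package and function names below are invented for illustration:

package enry_sketch // hypothetical caller, not in this change

import "github.com/go-enry/go-enry/v2/internal/tokenizer"

// classifierTokens sketches the intended use: enry's content classifier hands a
// sample to Tokenize, which itself caps the work at tokenizer.ByteLimit (100000) bytes.
func classifierTokens(sample []byte) []string {
	return tokenizer.Tokenize(sample)
}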

File diff suppressed because it is too large.


@@ -0,0 +1,336 @@
#ifndef linguist_yyHEADER_H
#define linguist_yyHEADER_H 1
#define linguist_yyIN_HEADER 1
#line 6 "lex.linguist_yy.h"
#define YY_INT_ALIGNED short int
/* A lexical scanner generated by flex */
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif
/* First, we deal with platform-specific or compiler-specific issues. */
/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
/* end standard C headers. */
/* flex integer type definitions */
#ifndef FLEXINT_H
#define FLEXINT_H
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
* if you want the limit (max/min) macros for int types.
*/
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif
#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
typedef uint64_t flex_uint64_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
#endif /* ! C99 */
/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
#endif /* ! FLEXINT_H */
#ifdef __cplusplus
/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST
#else /* ! __cplusplus */
/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)
#define YY_USE_CONST
#endif /* defined (__STDC__) */
#endif /* ! __cplusplus */
#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif
/* An opaque pointer. */
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void* yyscan_t;
#endif
/* For convenience, these vars (plus the bison vars far below)
are macros in the reentrant scanner. */
#define yyin yyg->yyin_r
#define yyout yyg->yyout_r
#define yyextra yyg->yyextra_r
#define yyleng yyg->yyleng_r
#define yytext yyg->yytext_r
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
#define yy_flex_debug yyg->yy_flex_debug_r
/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#define YY_BUF_SIZE 16384
#endif
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif
#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif
#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
struct yy_buffer_state
{
FILE *yy_input_file;
char *yy_ch_buf; /* input buffer */
char *yy_buf_pos; /* current position in input buffer */
/* Size of input buffer in bytes, not including room for EOB
* characters.
*/
yy_size_t yy_buf_size;
/* Number of characters read into yy_ch_buf, not including EOB
* characters.
*/
yy_size_t yy_n_chars;
/* Whether we "own" the buffer - i.e., we know we created it,
* and can realloc() it to grow it, and should free() it to
* delete it.
*/
int yy_is_our_buffer;
/* Whether this is an "interactive" input source; if so, and
* if we're using stdio for input, then we want to use getc()
* instead of fread(), to make sure we stop fetching input after
* each newline.
*/
int yy_is_interactive;
/* Whether we're considered to be at the beginning of a line.
* If so, '^' rules will be active on the next match, otherwise
* not.
*/
int yy_at_bol;
int yy_bs_lineno; /**< The line count. */
int yy_bs_column; /**< The column count. */
/* Whether to try to fill the input buffer when we reach the
* end of it.
*/
int yy_fill_buffer;
int yy_buffer_status;
};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */
void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
void linguist_yypop_buffer_state (yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
void linguist_yyfree (void * ,yyscan_t yyscanner );
/* Begin user sect3 */
#define yytext_ptr yytext_r
#ifdef YY_HEADER_EXPORT_START_CONDITIONS
#define INITIAL 0
#define sgml 1
#define c_comment 2
#define xml_comment 3
#define haskell_comment 4
#define ocaml_comment 5
#define python_dcomment 6
#define python_scomment 7
#endif
#ifndef YY_NO_UNISTD_H
/* Special case for "unistd.h", since it is non-ANSI. We include it way
* down here because we want the user's section 1 to have been scanned first.
* The user has a chance to override it with an option.
*/
#include <unistd.h>
#endif
#define YY_EXTRA_TYPE struct tokenizer_extra *
int linguist_yylex_init (yyscan_t* scanner);
int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
/* Accessor methods to globals.
These are made visible to non-reentrant scanners for convenience. */
int linguist_yylex_destroy (yyscan_t yyscanner );
int linguist_yyget_debug (yyscan_t yyscanner );
void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
FILE *linguist_yyget_in (yyscan_t yyscanner );
void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
FILE *linguist_yyget_out (yyscan_t yyscanner );
void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
char *linguist_yyget_text (yyscan_t yyscanner );
int linguist_yyget_lineno (yyscan_t yyscanner );
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
/* Macros after this point can all be overridden by user definitions in
* section 1.
*/
#ifndef YY_SKIP_YYWRAP
#ifdef __cplusplus
extern "C" int linguist_yywrap (yyscan_t yyscanner );
#else
extern int linguist_yywrap (yyscan_t yyscanner );
#endif
#endif
#ifndef yytext_ptr
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
#endif
#ifdef YY_NEED_STRLEN
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
#endif
#ifndef YY_NO_INPUT
#endif
/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#define YY_READ_BUF_SIZE 8192
#endif
/* Number of entries by which start-condition stack grows. */
#ifndef YY_START_STACK_INCR
#define YY_START_STACK_INCR 25
#endif
/* Default declaration of generated scanner - a define so the user can
* easily add parameters.
*/
#ifndef YY_DECL
#define YY_DECL_IS_OURS 1
extern int linguist_yylex (yyscan_t yyscanner);
#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
#endif /* !YY_DECL */
/* yy_get_previous_state - get the state just before the EOB char was reached */
#undef YY_NEW_FILE
#undef YY_FLUSH_BUFFER
#undef yy_set_bol
#undef yy_new_buffer
#undef yy_set_interactive
#undef YY_DO_BEFORE_ACTION
#ifdef YY_DECL_IS_OURS
#undef YY_DECL_IS_OURS
#undef YY_DECL
#endif
#line 118 "tokenizer.l"
#line 335 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */


@@ -0,0 +1,15 @@
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
enum tokenizer_type {
NO_ACTION,
REGULAR_TOKEN,
SHEBANG_TOKEN,
SGML_TOKEN,
};
struct tokenizer_extra {
char *token;
enum tokenizer_type type;
};
// TODO(bzz) port Win support from
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0


@@ -0,0 +1,73 @@
// +build flex
package flex
// #include <stdlib.h>
// #include "linguist.h"
// #include "lex.linguist_yy.h"
// int linguist_yywrap(yyscan_t yyscanner) {
// return 1;
// }
import "C"
import "unsafe"
const maxTokenLen = 32 // bytes
// TokenizeFlex implements the tokenizer by calling the Flex-generated C code from linguist.
// It is a transliteration of the C implementation: https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
func TokenizeFlex(content []byte) []string {
var buf C.YY_BUFFER_STATE
var scanner C.yyscan_t
var extra C.struct_tokenizer_extra
var _len C.ulong
var r C.int
_len = C.ulong(len(content))
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
C.linguist_yylex_init_extra(&extra, &scanner)
buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
ary := []string{}
for {
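// Note: cgo exposes the C struct field `type` as `_type` below, because `type` is a reserved word in Go.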
extra._type = C.NO_ACTION
extra.token = nil
r = C.linguist_yylex(scanner)
switch extra._type {
case C.NO_ACTION:
break
case C.REGULAR_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
}
C.free(unsafe.Pointer(extra.token))
break
case C.SHEBANG_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
ary = append(ary, s)
}
C.free(unsafe.Pointer(extra.token))
break
case C.SGML_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
ary = append(ary, s)
}
C.free(unsafe.Pointer(extra.token))
break
}
if r == 0 {
break
}
}
C.linguist_yy_delete_buffer(buf, scanner)
C.linguist_yylex_destroy(scanner)
return ary
}


@@ -0,0 +1,210 @@
// +build !flex
package tokenizer
import (
"bytes"
"github.com/go-enry/go-enry/v2/regex"
)
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
//
// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
// differences between this function and the Linguist output.
func Tokenize(content []byte) []string {
if len(content) > ByteLimit {
content = content[:ByteLimit]
}
tokens := make([][]byte, 0, 50)
for _, extract := range extractTokens {
var extractedTokens [][]byte
content, extractedTokens = extract(content)
tokens = append(tokens, extractedTokens...)
}
return toString(tokens)
}
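
A rough illustration (not part of the diff) of what this pipeline yields for a tiny shell snippet, assuming the pure-Go (non-flex) build; demoTokenize is an invented name:

// demoTokenize is a hypothetical example, not part of this change.
func demoTokenize() []string {
	// Returns roughly ["SHEBANG#!sh", "#", "/bin/sh", "echo", "hi", "!"]: the shebang is
	// normalized first, the trailing comment is dropped by skipCommentsAndLiterals, and
	// the leftover "!" falls through to extractRemainders.
	return Tokenize([]byte("#!/bin/sh\necho hi # comment\n"))
}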
func toString(tokens [][]byte) []string {
stokens := make([]string, 0, len(tokens))
for _, token := range tokens {
stokens = append(stokens, string(token))
}
return stokens
}
var (
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
// The order matters: the extractors must run in this exact sequence.
extractAndReplaceShebang,
extractAndReplaceSGML,
skipCommentsAndLiterals,
extractAndReplacePunctuation,
extractAndReplaceRegular,
extractAndReplaceOperator,
extractRemainders,
}
// Differences between golang regexp and oniguruma:
// 1. no (?s) in oniguruma - makes dot match \n
// 2. no (?U) in oniguruma - ungreedy *
// 3. (?m) implies dot matches \n in oniguruma
// 4. oniguruma handles \w differently - impossible, but true
//
// Workarounds:
// 1. (.|\n)
// 2. replace * with *?
// 3. replace . with [^\n]
// 4. replace \w with [0-9A-Za-z_]
//
// Original golang regexps:
//
// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
// reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
// reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
// rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
// reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
//
// These regexps were converted to work in the same way with both engines (a short demonstration follows this var block):
//
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
regexToSkip = []regex.EnryRegexp{
// The order must be preserved
reLiteralStringQuotes,
reMultilineComment,
reSingleLineComment,
reLiteralNumber,
}
)
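
A short, hedged demonstration (not part of the diff) that the converted string-literal pattern keeps the behaviour of the original (?sU) form: it matches across newlines and stays non-greedy. demoLiteralRegexp is an invented name:

// demoLiteralRegexp is a hypothetical example, not part of this change.
func demoLiteralRegexp() [][]byte {
	re := regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	// Yields two separate matches, "\"a\nb\"" and "\"c\"", spanning the newline
	// without greedily swallowing the text in between.
	return re.FindAll([]byte("\"a\nb\" plus \"c\""), -1)
}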
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
var shebangTokens [][]byte
matches := reShebang.FindAllSubmatch(content, -1)
if matches != nil {
shebangTokens = make([][]byte, 0, 2)
for _, match := range matches {
shebangToken := getShebangToken(match)
shebangTokens = append(shebangTokens, shebangToken)
}
reShebang.ReplaceAll(content, []byte(` `))
}
return content, shebangTokens
}
func getShebangToken(matchedShebang [][]byte) []byte {
const prefix = `SHEBANG#!`
var token []byte
for i := 1; i < len(matchedShebang); i++ {
if len(matchedShebang[i]) > 0 {
token = matchedShebang[i]
break
}
}
tokenShebang := append([]byte(prefix), token...)
return tokenShebang
}
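
A small illustration (not part of the diff): for an env-style shebang the captured interpreter, not the path, becomes the token. demoShebang is an invented name:

// demoShebang is a hypothetical example, not part of this change.
func demoShebang() []byte {
	_, toks := extractAndReplaceShebang([]byte("#!/usr/bin/env python\nprint(1)\n"))
	// toks[0] is "SHEBANG#!python": the second capture group of reShebang grabs the
	// interpreter and getShebangToken adds the "SHEBANG#!" prefix.
	return toks[0]
}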
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
tokens := re.FindAll(content, -1)
content = re.ReplaceAll(content, []byte(` `))
return content, tokens
}
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
return commonExtractAndReplace(content, rePunctuation)
}
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
return commonExtractAndReplace(content, reRegularToken)
}
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
return commonExtractAndReplace(content, reOperators)
}
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
var SGMLTokens [][]byte
matches := reSGML.FindAllSubmatch(content, -1)
if matches != nil {
SGMLTokens = make([][]byte, 0, 2)
for _, match := range matches {
if reSGMLComment.Match(match[0]) {
continue
}
token := append(append([]byte(nil), match[1]...), '>')
SGMLTokens = append(SGMLTokens, token)
attributes := getSGMLAttributes(match[0])
SGMLTokens = append(SGMLTokens, attributes...)
}
content = reSGML.ReplaceAll(content, []byte(` `))
}
return content, SGMLTokens
}
func getSGMLAttributes(SGMLTag []byte) [][]byte {
var attributes [][]byte
matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
if matches != nil {
attributes = make([][]byte, 0, 5)
for _, match := range matches {
if len(match[1]) != 0 {
attributes = append(attributes, match[1])
}
if len(match[2]) != 0 {
loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
attributes = append(attributes, loneAttributes...)
}
}
}
return attributes
}
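
For illustration (not part of the diff), a single element yields the tag token plus attribute-name tokens, matching the <title ...> expectations in the test file further down. demoSGML is an invented name:

// demoSGML is a hypothetical example, not part of this change.
func demoSGML() [][]byte {
	_, toks := extractAndReplaceSGML([]byte(`<title id="hola" class="">hi</title>`))
	// toks is ["<title>", "id=", "class=", "</title>"]: tag names gain a trailing '>',
	// attribute names keep their '=', and the element text is left for later passes.
	return toks
}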
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
for _, skip := range regexToSkip {
content = skip.ReplaceAll(content, []byte(` `))
}
return content, nil
}
func extractRemainders(content []byte) ([]byte, [][]byte) {
splitted := bytes.Fields(content)
remainderTokens := make([][]byte, 0, len(splitted)*3)
for _, remainder := range splitted {
remainders := bytes.Split(remainder, nil)
remainderTokens = append(remainderTokens, remainders...)
}
return content, remainderTokens
}
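
One last hedged illustration (not part of the diff): whatever survives every earlier pass is split into single-rune tokens. demoRemainders is an invented name:

// demoRemainders is a hypothetical example, not part of this change.
func demoRemainders() [][]byte {
	_, rest := extractRemainders([]byte("  !? "))
	// rest is ["!", "?"]: bytes.Fields drops the whitespace and bytes.Split with a
	// nil separator then splits each field into individual UTF-8 sequences.
	return rest
}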


@@ -0,0 +1,15 @@
// +build flex
package tokenizer
import "github.com/go-enry/go-enry/v2/internal/tokenizer/flex"
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
func Tokenize(content []byte) []string {
if len(content) > ByteLimit {
content = content[:ByteLimit]
}
return flex.TokenizeFlex(content)
}


@@ -0,0 +1,179 @@
package tokenizer
import (
"fmt"
"testing"
"github.com/go-enry/go-enry/v2/regex"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const (
testContent = `#!/usr/bin/ruby
#!/usr/bin/env node
aaa
#!/usr/bin/env A=B foo=bar awk -f
#!python
func Tokenize(content []byte) []string {
splitted := bytes.Fields(content)
tokens := /* make([]string, 0, len(splitted))
no comment -- comment
for _, tokenByte := range splitted {
token64 := base64.StdEncoding.EncodeToString(tokenByte)
tokens = append(tokens, token64)
notcatchasanumber3.5
}*/
othercode
/* testing multiple
multiline comments*/
<!-- com
ment -->
<!-- comment 2-->
ppp no comment # comment
"literal1"
abb (tokenByte, 0xAF02) | ,3.2L
'literal2' notcatchasanumber3.5
5 += number * anotherNumber
if isTrue && isToo {
0b00001000 >> 1
}
return tokens
oneBool = 3 <= 2
varBool = 3<=2>
#ifndef
#i'm not a comment if the single line comment symbol is not followed by a white
PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title id="hola" class="">This is a XHTML sample file</title>
<style type="text/css"><![CDATA[
#example {
background-color: yellow;
}
]]></style>
</head>
<body>
<div id="example">
Just a simple <strong>XHTML</strong> test page.
</div>
</body>
</html>`
)
var (
tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "html", "PUBLIC",
"W3C", "DTD", "XHTML", "1", "0", "Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1",
"strict", "dtd", "<html>", "xmlns=", "<head>", "<title>", "id=", "class=", "</title>", "<style>", "type=",
"<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "id=",
"<strong>", "</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")",
"{", "}", "(", ")", ";", "#", "/usr/bin/ruby", "#", "/usr/bin/env", "node", "aaa", "#", "/usr/bin/env", "A",
"B", "foo", "bar", "awk", "f", "#", "python", "func", "Tokenize", "content", "byte", "string", "splitted",
"bytes.Fields", "content", "tokens", "othercode", "ppp", "no", "comment", "abb", "tokenByte",
"notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return", "tokens",
"oneBool", "varBool", "#ifndef", "#i", "m", "not", "a", "comment", "if", "the", "single", "line", "comment",
"symbol", "is", "not", "followed", "by", "a", "white", "PyErr_SetString", "PyExc_RuntimeError", "This", "is",
"a", "XHTML", "sample", "file", "Just", "a", "simple", "XHTML", "test", "page.", "-", "|", "+", "&&", "<", "<",
"!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", "'", ","}
tests = []struct {
name string
content []byte
expected []string
}{
{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
}
)
func TestTokenize(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
before := string(test.content)
tokens := Tokenize(test.content)
after := string(test.content)
require.Equal(t, before, after, "the input slice was modified")
require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("tokens' slice length = %v, want %v", len(tokens), len(test.expected)))
for i, expectedToken := range test.expected {
assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
}
})
}
}
func TestTokenizerLatin1AsUtf8(t *testing.T) {
content := []byte("th\xe5 filling") // 0xE5 is Latin-1 'å'; not valid UTF-8 on its own
t.Logf("%v - %q", content, string(content))
tokens := Tokenize(content)
for i, token := range tokens {
t.Logf("token %d, %s", i+1, token)
}
require.Equal(t, 3, len(tokens))
}
func TestRegexpOnInvalidUtf8(t *testing.T) {
origContent := []struct {
text string
tokens []string
}{
{"th\xe0 filling", []string{"th", "filling"}}, // `th<74> filling`
{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
{"привет, как дела?", []string{}}, // empty, no ASCII tokens
}
re := regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) // a reRegularToken from tokenizer.go
for _, content := range origContent {
t.Run("", func(t *testing.T) {
t.Logf("%v - %q", content, content.text)
input := []byte(content.text)
tokens := re.FindAll(input, -1)
require.Equal(t, len(content.tokens), len(tokens))
newContent := re.ReplaceAll(input, []byte(` `))
t.Logf("content:%q, tokens:[", newContent)
for i, token := range tokens {
t.Logf("\t%q,", string(token))
require.Equal(t, content.tokens[i], string(token))
}
t.Logf(" ]\n")
})
}
}
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
if len(test.content) > ByteLimit {
test.content = test.content[:ByteLimit]
}
_ = append([]byte(nil), test.content...)
}
}
}
func BenchmarkTokenizer(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, test := range tests {
Tokenize(test.content)
}
}
}