tartrazine/internal/tokenizer/tokenize.go

// +build !flex

package tokenizer

import (
	"bytes"

	"github.com/src-d/enry/v2/regex"
)

// Tokenize splits content into lexical tokens, aiming to produce the same
// tokens as the Linguist library. Only the first ByteLimit bytes of content
// are considered; anything beyond that is ignored.
//
// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
// differences between this function and the Linguist output.
func Tokenize(content []byte) []string {
	limit := len(content)
	if limit > ByteLimit {
		limit = ByteLimit
	}

	// Work on a private copy: the extraction steps may edit the buffer they
	// are given, and the caller's slice must stay untouched. See #196.
	working := append([]byte(nil), content[:limit]...)

	collected := make([][]byte, 0, 50)
	for _, step := range extractTokens {
		remaining, found := step(working)
		working = remaining
		collected = append(collected, found...)
	}

	return toString(collected)
}

// toString converts every byte-slice token into a string, preserving order.
// The result always has exactly len(tokens) elements and is never nil.
func toString(tokens [][]byte) []string {
	out := make([]string, len(tokens))
	for i := range tokens {
		out[i] = string(tokens[i])
	}

	return out
}

var (
	// extractTokens is the tokenization pipeline. Tokenize calls the steps in
	// slice order, handing each step the content returned by the previous one
	// and collecting the tokens each step returns.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order of the steps must not be changed.
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the patterns whose matches skipCommentsAndLiterals
	// replaces with a single space, applied in slice order.
	regexToSkip = []regex.EnryRegexp{
		// The order of the patterns must not be changed.
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)

// extractAndReplaceShebang emits one `SHEBANG#!<interpreter>` token for every
// shebang line matched by reShebang and returns content with each matched
// line replaced by a single space, so that later steps do not tokenize the
// shebang text a second time.
//
// When content contains no shebang, it is returned unchanged together with a
// nil token slice.
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
	matches := reShebang.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	shebangTokens := make([][]byte, 0, len(matches))
	for _, match := range matches {
		shebangTokens = append(shebangTokens, getShebangToken(match))
	}

	// ReplaceAll returns the edited copy; its result has to be kept, otherwise
	// the shebang lines stay in content.
	content = reShebang.ReplaceAll(content, []byte(` `))

	return content, shebangTokens
}

// getShebangToken builds the token for a single shebang match. matchedShebang
// is a submatch list: element 0 is the whole match and is ignored, and the
// first non-empty element after it is used as the interpreter name. The
// result is `SHEBANG#!` followed by that name, or just `SHEBANG#!` when every
// capture group is empty or absent.
func getShebangToken(matchedShebang [][]byte) []byte {
	result := []byte(`SHEBANG#!`)
	if len(matchedShebang) < 2 {
		return result
	}

	for _, group := range matchedShebang[1:] {
		if len(group) != 0 {
			return append(result, group...)
		}
	}

	return result
}

// commonExtractAndReplace is the shared implementation of the simple
// extraction steps: every match of re in content becomes a token, and the
// returned content has each of those matches replaced by a single space.
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
	// Collect the matches before replacing them.
	found := re.FindAll(content, -1)
	replaced := re.ReplaceAll(content, []byte(` `))

	return replaced, found
}

// extractAndReplacePunctuation returns every match of rePunctuation (`;` and
// the bracket characters `{}`, `()`, `[]`) as a token, together with the
// content produced by commonExtractAndReplace for that pattern.
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, rePunctuation)
}

// extractAndReplaceRegular returns every match of reRegularToken (runs of
// ASCII letters, digits, `_`, `.`, `@`, `#`, `/` and `*`) as a token, together
// with the content produced by commonExtractAndReplace for that pattern.
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, reRegularToken)
}

// extractAndReplaceOperator returns every match of reOperators (`<`, `<<`,
// `+`, `-`, `*`, `/`, `%`, `&`, `&&`, `|`, `||`) as a token, together with the
// content produced by commonExtractAndReplace for that pattern.
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, reOperators)
}

// extractAndReplaceSGML extracts tokens from every SGML-style tag matched by
// reSGML and returns content with each matched tag replaced by a single space.
//
// For each match, the tag opener captured by group 1 (for example `<a` or
// `</a`) is emitted with a closing '>' appended, followed by the attribute
// tokens found by getSGMLAttributes in the whole matched tag. Matches that
// reSGMLComment also matches are skipped and yield no tokens, although they
// are still replaced in the returned content.
//
// When content contains no tag, it is returned unchanged together with a nil
// token slice.
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
	matches := reSGML.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	tokens := make([][]byte, 0, 2)
	for _, match := range matches {
		if reSGMLComment.Match(match[0]) {
			continue
		}

		// Build the tag token in its own buffer. Appending directly to
		// match[1] could write the '>' into the bytes of content that follow
		// the tag name whenever the submatch slice has spare capacity, which
		// would corrupt match[0] before its attributes are read below.
		opener := match[1]
		tag := make([]byte, 0, len(opener)+1)
		tag = append(tag, opener...)
		tag = append(tag, '>')

		tokens = append(tokens, tag)
		tokens = append(tokens, getSGMLAttributes(match[0])...)
	}

	return reSGML.ReplaceAll(content, []byte(` `)), tokens
}

// getSGMLAttributes returns the attribute tokens found in the text of one
// SGML tag. reSGMLAttributes yields two kinds of matches:
//
//   - a `name=` attribute (capture group 1), emitted as is, and
//   - any other whitespace-separated chunk (capture group 2), which is broken
//     into its word runs by reSGMLLoneAttribute, each run becoming a token.
//
// The result is nil when the tag has no attribute matches at all.
func getSGMLAttributes(SGMLTag []byte) [][]byte {
	found := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
	if found == nil {
		return nil
	}

	result := make([][]byte, 0, 5)
	for i := range found {
		named, bare := found[i][1], found[i][2]

		if len(named) > 0 {
			result = append(result, named)
		}

		if len(bare) > 0 {
			result = append(result, reSGMLLoneAttribute.FindAll(bare, -1)...)
		}
	}

	return result
}

// skipCommentsAndLiterals replaces every match of each pattern in regexToSkip
// with a single space, applying the patterns one after another in slice
// order. It produces no tokens, so the second result is always nil.
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
	for i := 0; i < len(regexToSkip); i++ {
		content = regexToSkip[i].ReplaceAll(content, []byte(` `))
	}

	return content, nil
}

// extractRemainders turns whatever is left in content into tokens: content is
// split on whitespace, and every resulting field is split again into its
// individual UTF-8 sequences, each of which becomes one token. content itself
// is returned unmodified.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	fields := bytes.Fields(content)

	pieces := make([][]byte, 0, 3*len(fields))
	for i := range fields {
		// An empty separator makes Split cut after every UTF-8 sequence.
		pieces = append(pieces, bytes.Split(fields[i], nil)...)
	}

	return content, pieces
}
refactor to build tags Signed-off-by: Alexander Bezzubov <bzz@apache.org> 2019-03-24 17:55:05 +00:00			`// +build !flex`

Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`package tokenizer`

			`import (`
			`"bytes"`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00
modules: prepare for v2 release - update go.mod \w v2 - update all import paths Signed-off-by: Alexander Bezzubov <bzz@apache.org> 2019-04-14 19:28:12 +00:00			`"github.com/src-d/enry/v2/regex"`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`)`

doc: improve API doc on review feedback Signed-off-by: Alexander Bezzubov <bzz@apache.org> 2019-04-16 11:05:45 +00:00			`// Tokenize returns lexical tokens from content. The tokens returned match what`
			`// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.`
			`//`
			`// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some`
			`// differences between this function and the Linguist output.`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`func Tokenize(content []byte) []string {`
address review feedback Signed-off-by: Alexander Bezzubov <bzz@apache.org> 2019-04-14 20:15:18 +00:00			`if len(content) > ByteLimit {`
			`content = content[:ByteLimit]`
Rearranged code 2017-05-29 08:05:16 +00:00			`}`

Copy the tokenizer input to avoid modifying the caller's copy. Addresses #196. Several of the tokenizer's processing steps wind up editing the source, and we don't want those changes to be observed by the caller, which may use the source for other purposes afterward. Signed-off-by: M. J. Fromberger <michael.j.fromberger@gmail.com> 2019-01-29 18:12:33 +00:00			`// Copy the input so that changes wrought by the tokenization steps do not`
			`// modify the caller's copy of the input. See #196.`
			`content = append([]byte(nil), content...)`

Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`tokens := make([][]byte, 0, 50)`
			`for _, extract := range extractTokens {`
			`var extractedTokens [][]byte`
			`content, extractedTokens = extract(content)`
			`tokens = append(tokens, extractedTokens...)`
			`}`

			`return toString(tokens)`
			`}`

			`func toString(tokens [][]byte) []string {`
			`stokens := make([]string, 0, len(tokens))`
			`for _, token := range tokens {`
			`stokens = append(stokens, string(token))`
			`}`

			`return stokens`
			`}`

			`var (`
			`extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){`
			`// The order to must be this`
			`extractAndReplaceShebang,`
			`extractAndReplaceSGML,`
			`skipCommentsAndLiterals,`
			`extractAndReplacePunctuation,`
			`extractAndReplaceRegular,`
			`extractAndReplaceOperator,`
			`extractRemainders,`
			`}`

Rename onigumura to oniguruma This change names the dependency like its called. The link to the package was correct, but all other references were renamed where I could find time with git grep. Signed-off-by: Zeger-Jan van de Weg <git@zjvandeweg.nl> 2018-03-28 18:52:49 +00:00			`// Differences between golang regexp and oniguruma:`
			`// 1. no (?s) in oniguruma - makes dot match \n`
			`// 2. no (?U) in oniguruma - ungreedy *`
			`// 3. (?m) implies dot matches \n in oniguruma`
			`// 4. oniguruma handles \w differently - impossible, but true`
Make tokenizer regexps work under rubex Signed-off-by: Vadim Markovtsev <vadim@sourced.tech> 2017-09-28 21:33:25 +00:00			`//`
			`// Workarounds:`
			`// 1. (.\|\n)`
			`// 2. replace * with *?`
			`// 3. replace . with [^\n]`
			`// 4. replace \w with [0-9A-Za-z_]`
			`//`
			`// Original golang regexps:`
			`//`
			// reLiteralStringQuotes = regexp.MustCompile(`(?sU)("."\|'.')`)
			// reSingleLineComment = regexp.MustCompile(`(?m)(//\|--\|#\|%\|")\s(.*$)`)
			// reMultilineComment = regexp.MustCompile(`(?sU)(/\.\/\|<!--.-->\|\{-.-\}\|\(\.\\)\|"""."""\|'''.''')`)
			// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]\|\.)\|\d(\d\|\.))([uU][lL]{0,2}\|([eE][-+]\d)?[fFlL])`)
			// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)/(?:(\w+)\|\w+(?:\s\w+=\w+\s)\s(\w+))(?:\s-\w+\s)$`)
			// rePunctuation = regexp.MustCompile(`;\|\{\|\}\|\(\|\)\|\[\|\]`)
			// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>\|>)`)
			// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
			// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)\|\s+([^\s>]+)`)
			// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
			// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
			// reOperators = regexp.MustCompile(`<<?\|\+\|\-\|\*\|\/\|%\|&&?\|\\|\\|?`)
			`//`
			`// These regexps were converted to work in the same way for both engines:`
			`//`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			reLiteralStringQuotes = regex.MustCompile(`("(.\|\n)?"\|'(.\|\n)?')`)
			reSingleLineComment = regex.MustCompile(`(?m)(//\|--\|#\|%\|")\s([^\n]*$)`)
			reMultilineComment = regex.MustCompile(`(/\(.\|\n)?\/\|<!--(.\|\n)?-->\|\{-(.\|\n)?-\}\|\(\(.\|\n)?\\)\|"""(.\|\n)?"""\|'''(.\|\n)?''')`)
			reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]\|\.)\|\d(\d\|\.))([uU][lL]{0,2}\|([eE][-+]\d)?[fFlL])`)
			reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)/(?:([0-9A-Za-z_]+)\|[0-9A-Za-z_]+(?:\s[0-9A-Za-z_]+=[0-9A-Za-z_]+\s)\s([0-9A-Za-z_]+))(?:\s-[0-9A-Za-z_]+\s)$`)
			rePunctuation = regex.MustCompile(`;\|\{\|\}\|\(\|\)\|\[\|\]`)
			reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.\|\n)*?\/?>\|>)`)
			reSGMLComment = regex.MustCompile(`(<!--(.\|\n)*?-->)`)
			reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)\|\s+([^\s>]+)`)
			reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
			reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
			reOperators = regex.MustCompile(`<<?\|\+\|\-\|\*\|\/\|%\|&&?\|\\|\\|?`)

			`regexToSkip = []regex.EnryRegexp{`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`// The order must be this`
			`reLiteralStringQuotes,`
			`reMultilineComment,`
			`reSingleLineComment,`
			`reLiteralNumber,`
			`}`
			`)`

			`func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {`
			`var shebangTokens [][]byte`
			`matches := reShebang.FindAllSubmatch(content, -1)`
			`if matches != nil {`
			`shebangTokens = make([][]byte, 0, 2)`
			`for _, match := range matches {`
			`shebangToken := getShebangToken(match)`
			`shebangTokens = append(shebangTokens, shebangToken)`
			`}`

			reShebang.ReplaceAll(content, []byte(` `))
			`}`

			`return content, shebangTokens`
			`}`

			`func getShebangToken(matchedShebang [][]byte) []byte {`
			const prefix = `SHEBANG#!`
			`var token []byte`
			`for i := 1; i < len(matchedShebang); i++ {`
			`if len(matchedShebang[i]) > 0 {`
			`token = matchedShebang[i]`
			`break`
			`}`
			`}`

			`tokenShebang := append([]byte(prefix), token...)`
			`return tokenShebang`
			`}`

Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`tokens := re.FindAll(content, -1)`
			content = re.ReplaceAll(content, []byte(` `))
			`return content, tokens`
			`}`

			`func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`return commonExtractAndReplace(content, rePunctuation)`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`}`

			`func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`return commonExtractAndReplace(content, reRegularToken)`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`}`

			`func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`return commonExtractAndReplace(content, reOperators)`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`}`

			`func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {`
			`var SGMLTokens [][]byte`
			`matches := reSGML.FindAllSubmatch(content, -1)`
			`if matches != nil {`
			`SGMLTokens = make([][]byte, 0, 2)`
			`for _, match := range matches {`
			`if reSGMLComment.Match(match[0]) {`
			`continue`
			`}`

			`token := append(match[1], '>')`
			`SGMLTokens = append(SGMLTokens, token)`
			`attributes := getSGMLAttributes(match[0])`
			`SGMLTokens = append(SGMLTokens, attributes...)`
			`}`

			content = reSGML.ReplaceAll(content, []byte(` `))
			`}`

			`return content, SGMLTokens`
			`}`

			`func getSGMLAttributes(SGMLTag []byte) [][]byte {`
			`var attributes [][]byte`
			`matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)`
			`if matches != nil {`
			`attributes = make([][]byte, 0, 5)`
			`for _, match := range matches {`
			`if len(match[1]) != 0 {`
			`attributes = append(attributes, match[1])`
			`}`

			`if len(match[2]) != 0 {`
			`loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)`
			`attributes = append(attributes, loneAttributes...)`
			`}`
			`}`
			`}`

			`return attributes`
			`}`

			`func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {`
			`for _, skip := range regexToSkip {`
			content = skip.ReplaceAll(content, []byte(` `))
			`}`

			`return content, nil`
			`}`

			`func extractRemainders(content []byte) ([]byte, [][]byte) {`
			`splitted := bytes.Fields(content)`
			`remainderTokens := make([][]byte, 0, len(splitted)*3)`
			`for _, remainder := range splitted {`
			`remainders := bytes.Split(remainder, nil)`
			`remainderTokens = append(remainderTokens, remainders...)`
			`}`

			`return content, remainderTokens`
			`}`