tartrazine/internal/tokenizer/tokenize.go

package tokenizer

import (
	"bytes"

	"gopkg.in/src-d/enry.v1/regex"
)

// byteLimit is the maximum number of leading bytes of the content that
// Tokenize processes; longer content is truncated to this length first.
const byteLimit = 100000

// Tokenize returns the tokens found in content. Only the first byteLimit
// bytes are considered. The extractors listed in extractTokens run in order,
// each one receiving the content returned by the previous one, and the tokens
// they report are concatenated in that same order.
func Tokenize(content []byte) []string {
	remaining := content
	if len(remaining) > byteLimit {
		remaining = remaining[:byteLimit]
	}

	collected := make([][]byte, 0, 50)
	for i := range extractTokens {
		rest, found := extractTokens[i](remaining)
		remaining = rest
		collected = append(collected, found...)
	}

	return toString(collected)
}

// toString converts every byte-slice token into a string, keeping the order.
// The result is always non-nil and has exactly len(tokens) elements.
func toString(tokens [][]byte) []string {
	converted := make([]string, len(tokens))
	for i := range tokens {
		converted[i] = string(tokens[i])
	}

	return converted
}

var (
	// extractTokens lists the extraction steps that Tokenize applies to the
	// content. Each step receives the content returned by the previous step
	// and returns the (possibly rewritten) content plus the tokens it found.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be exactly this: Tokenize runs the steps in
		// sequence and every step sees what the earlier ones left behind.
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the patterns whose matches skipCommentsAndLiterals
	// replaces with a single space without producing tokens.
	regexToSkip = []regex.EnryRegexp{
		// The order must be exactly this: the patterns are applied one
		// after another, each on the output of the previous replacement.
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)

// extractAndReplaceShebang emits one token per reShebang match (built by
// getShebangToken) and returns the content with every matched shebang line
// replaced by a single space, so that later extractors do not tokenize the
// shebang a second time. When nothing matches, content is returned unchanged
// and the token slice is nil.
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
	matches := reShebang.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	shebangTokens := make([][]byte, 0, 2)
	for _, match := range matches {
		shebangTokens = append(shebangTokens, getShebangToken(match))
	}

	// ReplaceAll returns a modified copy and leaves its input untouched, so
	// the result has to be assigned; discarding it left the shebang in place.
	content = reShebang.ReplaceAll(content, []byte(` `))

	return content, shebangTokens
}

// getShebangToken builds the token for one shebang match. matchedShebang is a
// submatch slice whose element 0 is the whole match; the first non-empty
// capture group after it is appended to the `SHEBANG#!` prefix. If there is no
// non-empty capture group the bare prefix is returned.
func getShebangToken(matchedShebang [][]byte) []byte {
	shebang := []byte(`SHEBANG#!`)
	for idx, group := range matchedShebang {
		// Index 0 holds the full match, not a capture group.
		if idx == 0 || len(group) == 0 {
			continue
		}

		return append(shebang, group...)
	}

	return shebang
}

// commonExtractAndReplace collects every match of re in content as a token
// and returns the content with each of those matches replaced by a single
// space. The matches are gathered before the replacement is made.
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
	matched := re.FindAll(content, -1)
	replaced := re.ReplaceAll(content, []byte(` `))

	return replaced, matched
}

// extractAndReplacePunctuation extracts the rePunctuation matches as tokens
// and blanks them out of content via commonExtractAndReplace.
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
	replaced, punctuation := commonExtractAndReplace(content, rePunctuation)
	return replaced, punctuation
}

// extractAndReplaceRegular extracts the reRegularToken matches as tokens and
// blanks them out of content via commonExtractAndReplace.
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
	replaced, regular := commonExtractAndReplace(content, reRegularToken)
	return replaced, regular
}

// extractAndReplaceOperator extracts the reOperators matches as tokens and
// blanks them out of content via commonExtractAndReplace.
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
	replaced, operators := commonExtractAndReplace(content, reOperators)
	return replaced, operators
}

// extractAndReplaceSGML extracts tokens from SGML-like tags. For every reSGML
// match that is not also matched by reSGMLComment it emits the tag name with a
// closing '>' appended (e.g. `<div>`), followed by the attribute tokens
// reported by getSGMLAttributes for the whole tag. Every reSGML match,
// including the skipped ones, is then replaced by a single space in the
// returned content. When nothing matches, content is returned unchanged and
// the token slice is nil.
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
	matches := reSGML.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	SGMLTokens := make([][]byte, 0, 2)
	for _, match := range matches {
		if reSGMLComment.Match(match[0]) {
			continue
		}

		// match[1] is a sub-slice of content. Appending to it directly can
		// overwrite the byte that follows the tag name in content (and thus
		// in match[0]) whenever the sub-slice has spare capacity, so the
		// token is built in its own backing array instead.
		token := make([]byte, 0, len(match[1])+1)
		token = append(token, match[1]...)
		token = append(token, '>')

		SGMLTokens = append(SGMLTokens, token)
		SGMLTokens = append(SGMLTokens, getSGMLAttributes(match[0])...)
	}

	content = reSGML.ReplaceAll(content, []byte(` `))

	return content, SGMLTokens
}

// getSGMLAttributes returns the attribute tokens found in SGMLTag. For each
// reSGMLAttributes match, a non-empty first group (an attribute name followed
// by '=') is emitted as is, and a non-empty second group is broken into the
// pieces matched by reSGMLLoneAttribute. The result is nil when
// reSGMLAttributes does not match at all.
func getSGMLAttributes(SGMLTag []byte) [][]byte {
	found := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
	if found == nil {
		return nil
	}

	attrs := make([][]byte, 0, 5)
	for _, groups := range found {
		if named := groups[1]; len(named) > 0 {
			attrs = append(attrs, named)
		}

		if lone := groups[2]; len(lone) > 0 {
			attrs = append(attrs, reSGMLLoneAttribute.FindAll(lone, -1)...)
		}
	}

	return attrs
}

// skipCommentsAndLiterals replaces every match of the regexToSkip patterns
// with a single space, applying the patterns in list order, and produces no
// tokens.
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
	blank := []byte(` `)
	for i := 0; i < len(regexToSkip); i++ {
		content = regexToSkip[i].ReplaceAll(content, blank)
	}

	return content, nil
}

// extractRemainders turns whatever is left in content into tokens: the content
// is split on whitespace and every resulting field is split again into its
// individual UTF-8 sequences, each of which becomes one token. The content
// itself is returned unmodified.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	fields := bytes.Fields(content)
	pieces := make([][]byte, 0, 3*len(fields))
	for i := range fields {
		pieces = append(pieces, bytes.Split(fields[i], nil)...)
	}

	return content, pieces
}
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`package tokenizer`

			`import (`
			`"bytes"`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00
			`"gopkg.in/src-d/enry.v1/regex"`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`)`

Rearranged code 2017-05-29 08:05:16 +00:00			`const byteLimit = 100000`

Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`func Tokenize(content []byte) []string {`
Rearranged code 2017-05-29 08:05:16 +00:00			`if len(content) > byteLimit {`
			`content = content[:byteLimit]`
			`}`

Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`tokens := make([][]byte, 0, 50)`
			`for _, extract := range extractTokens {`
			`var extractedTokens [][]byte`
			`content, extractedTokens = extract(content)`
			`tokens = append(tokens, extractedTokens...)`
			`}`

			`return toString(tokens)`
			`}`

			`func toString(tokens [][]byte) []string {`
			`stokens := make([]string, 0, len(tokens))`
			`for _, token := range tokens {`
			`stokens = append(stokens, string(token))`
			`}`

			`return stokens`
			`}`

			`var (`
			`extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){`
			`// The order to must be this`
			`extractAndReplaceShebang,`
			`extractAndReplaceSGML,`
			`skipCommentsAndLiterals,`
			`extractAndReplacePunctuation,`
			`extractAndReplaceRegular,`
			`extractAndReplaceOperator,`
			`extractRemainders,`
			`}`

Rename onigumura to oniguruma This change names the dependency like its called. The link to the package was correct, but all other references were renamed where I could find time with git grep. Signed-off-by: Zeger-Jan van de Weg <git@zjvandeweg.nl> 2018-03-28 18:52:49 +00:00			`// Differences between golang regexp and oniguruma:`
			`// 1. no (?s) in oniguruma - makes dot match \n`
			`// 2. no (?U) in oniguruma - ungreedy *`
			`// 3. (?m) implies dot matches \n in oniguruma`
			`// 4. oniguruma handles \w differently - impossible, but true`
Make tokenizer regexps work under rubex Signed-off-by: Vadim Markovtsev <vadim@sourced.tech> 2017-09-28 21:33:25 +00:00			`//`
			`// Workarounds:`
			`// 1. (.\|\n)`
			`// 2. replace * with *?`
			`// 3. replace . with [^\n]`
			`// 4. replace \w with [0-9A-Za-z_]`
			`//`
			`// Original golang regexps:`
			`//`
			// reLiteralStringQuotes = regexp.MustCompile(`(?sU)("."\|'.')`)
			// reSingleLineComment = regexp.MustCompile(`(?m)(//\|--\|#\|%\|")\s(.*$)`)
			// reMultilineComment = regexp.MustCompile(`(?sU)(/\.\/\|<!--.-->\|\{-.-\}\|\(\.\\)\|"""."""\|'''.''')`)
			// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]\|\.)\|\d(\d\|\.))([uU][lL]{0,2}\|([eE][-+]\d)?[fFlL])`)
			// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)/(?:(\w+)\|\w+(?:\s\w+=\w+\s)\s(\w+))(?:\s-\w+\s)$`)
			// rePunctuation = regexp.MustCompile(`;\|\{\|\}\|\(\|\)\|\[\|\]`)
			// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>\|>)`)
			// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
			// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)\|\s+([^\s>]+)`)
			// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
			// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
			// reOperators = regexp.MustCompile(`<<?\|\+\|\-\|\*\|\/\|%\|&&?\|\\|\\|?`)
			`//`
			`// These regexps were converted to work in the same way for both engines:`
			`//`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			reLiteralStringQuotes = regex.MustCompile(`("(.\|\n)?"\|'(.\|\n)?')`)
			reSingleLineComment = regex.MustCompile(`(?m)(//\|--\|#\|%\|")\s([^\n]*$)`)
			reMultilineComment = regex.MustCompile(`(/\(.\|\n)?\/\|<!--(.\|\n)?-->\|\{-(.\|\n)?-\}\|\(\(.\|\n)?\\)\|"""(.\|\n)?"""\|'''(.\|\n)?''')`)
			reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]\|\.)\|\d(\d\|\.))([uU][lL]{0,2}\|([eE][-+]\d)?[fFlL])`)
			reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)/(?:([0-9A-Za-z_]+)\|[0-9A-Za-z_]+(?:\s[0-9A-Za-z_]+=[0-9A-Za-z_]+\s)\s([0-9A-Za-z_]+))(?:\s-[0-9A-Za-z_]+\s)$`)
			rePunctuation = regex.MustCompile(`;\|\{\|\}\|\(\|\)\|\[\|\]`)
			reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.\|\n)*?\/?>\|>)`)
			reSGMLComment = regex.MustCompile(`(<!--(.\|\n)*?-->)`)
			reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)\|\s+([^\s>]+)`)
			reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
			reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
			reOperators = regex.MustCompile(`<<?\|\+\|\-\|\*\|\/\|%\|&&?\|\\|\\|?`)

			`regexToSkip = []regex.EnryRegexp{`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`// The order must be this`
			`reLiteralStringQuotes,`
			`reMultilineComment,`
			`reSingleLineComment,`
			`reLiteralNumber,`
			`}`
			`)`

			`func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {`
			`var shebangTokens [][]byte`
			`matches := reShebang.FindAllSubmatch(content, -1)`
			`if matches != nil {`
			`shebangTokens = make([][]byte, 0, 2)`
			`for _, match := range matches {`
			`shebangToken := getShebangToken(match)`
			`shebangTokens = append(shebangTokens, shebangToken)`
			`}`

			reShebang.ReplaceAll(content, []byte(` `))
			`}`

			`return content, shebangTokens`
			`}`

			`func getShebangToken(matchedShebang [][]byte) []byte {`
			const prefix = `SHEBANG#!`
			`var token []byte`
			`for i := 1; i < len(matchedShebang); i++ {`
			`if len(matchedShebang[i]) > 0 {`
			`token = matchedShebang[i]`
			`break`
			`}`
			`}`

			`tokenShebang := append([]byte(prefix), token...)`
			`return tokenShebang`
			`}`

Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`tokens := re.FindAll(content, -1)`
			content = re.ReplaceAll(content, []byte(` `))
			`return content, tokens`
			`}`

			`func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`return commonExtractAndReplace(content, rePunctuation)`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`}`

			`func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`return commonExtractAndReplace(content, reRegularToken)`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`}`

			`func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {`
Refactor Oniguruma integration Instead of use a command to change imports before build, using a build tag to generate the correct binary. This will allow applications to compile enry using oniguruma with less troubles. Signed-off-by: Antonio Jesus Navarro Perez <antnavper@gmail.com> 2018-08-28 15:27:18 +00:00			`return commonExtractAndReplace(content, reOperators)`
Added frequencies.go generation 2017-05-25 10:33:26 +00:00			`}`

			`func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {`
			`var SGMLTokens [][]byte`
			`matches := reSGML.FindAllSubmatch(content, -1)`
			`if matches != nil {`
			`SGMLTokens = make([][]byte, 0, 2)`
			`for _, match := range matches {`
			`if reSGMLComment.Match(match[0]) {`
			`continue`
			`}`

			`token := append(match[1], '>')`
			`SGMLTokens = append(SGMLTokens, token)`
			`attributes := getSGMLAttributes(match[0])`
			`SGMLTokens = append(SGMLTokens, attributes...)`
			`}`

			content = reSGML.ReplaceAll(content, []byte(` `))
			`}`

			`return content, SGMLTokens`
			`}`

			`func getSGMLAttributes(SGMLTag []byte) [][]byte {`
			`var attributes [][]byte`
			`matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)`
			`if matches != nil {`
			`attributes = make([][]byte, 0, 5)`
			`for _, match := range matches {`
			`if len(match[1]) != 0 {`
			`attributes = append(attributes, match[1])`
			`}`

			`if len(match[2]) != 0 {`
			`loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)`
			`attributes = append(attributes, loneAttributes...)`
			`}`
			`}`
			`}`

			`return attributes`
			`}`

			`func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {`
			`for _, skip := range regexToSkip {`
			content = skip.ReplaceAll(content, []byte(` `))
			`}`

			`return content, nil`
			`}`

			`func extractRemainders(content []byte) ([]byte, [][]byte) {`
			`splitted := bytes.Fields(content)`
			`remainderTokens := make([][]byte, 0, len(splitted)*3)`
			`for _, remainder := range splitted {`
			`remainders := bytes.Split(remainder, nil)`
			`remainderTokens = append(remainderTokens, remainders...)`
			`}`

			`return content, remainderTokens`
			`}`