mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-12 22:42:23 +00:00
7d277b11de
Addresses #196. Several of the tokenizer's processing steps wind up editing the source, and we don't want those changes to be observed by the caller, which may use the source for other purposes afterward. Signed-off-by: M. J. Fromberger <michael.j.fromberger@gmail.com>
210 lines
6.6 KiB
Go
210 lines
6.6 KiB
Go
package tokenizer
|
|
|
|
import (
|
|
"bytes"
|
|
|
|
"gopkg.in/src-d/enry.v1/regex"
|
|
)
|
|
|
|
const byteLimit = 100000
|
|
|
|
func Tokenize(content []byte) []string {
|
|
if len(content) > byteLimit {
|
|
content = content[:byteLimit]
|
|
}
|
|
|
|
// Copy the input so that changes wrought by the tokenization steps do not
|
|
// modify the caller's copy of the input. See #196.
|
|
content = append([]byte(nil), content...)
|
|
|
|
tokens := make([][]byte, 0, 50)
|
|
for _, extract := range extractTokens {
|
|
var extractedTokens [][]byte
|
|
content, extractedTokens = extract(content)
|
|
tokens = append(tokens, extractedTokens...)
|
|
}
|
|
|
|
return toString(tokens)
|
|
}
|
|
|
|
func toString(tokens [][]byte) []string {
|
|
stokens := make([]string, 0, len(tokens))
|
|
for _, token := range tokens {
|
|
stokens = append(stokens, string(token))
|
|
}
|
|
|
|
return stokens
|
|
}
|
|
|
|
var (
|
|
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
|
|
// The order to must be this
|
|
extractAndReplaceShebang,
|
|
extractAndReplaceSGML,
|
|
skipCommentsAndLiterals,
|
|
extractAndReplacePunctuation,
|
|
extractAndReplaceRegular,
|
|
extractAndReplaceOperator,
|
|
extractRemainders,
|
|
}
|
|
|
|
// Differences between golang regexp and oniguruma:
|
|
// 1. no (?s) in oniguruma - makes dot match \n
|
|
// 2. no (?U) in oniguruma - ungreedy *
|
|
// 3. (?m) implies dot matches \n in oniguruma
|
|
// 4. oniguruma handles \w differently - impossible, but true
|
|
//
|
|
// Workarounds:
|
|
// 1. (.|\n)
|
|
// 2. replace * with *?
|
|
// 3. replace . with [^\n]
|
|
// 4. replace \w with [0-9A-Za-z_]
|
|
//
|
|
// Original golang regexps:
|
|
//
|
|
// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
|
|
// reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
|
|
// reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
|
|
// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
|
// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
|
|
// rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
|
// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
|
|
// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
|
|
// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
|
|
// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
|
|
// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
|
|
// reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
|
//
|
|
// These regexps were converted to work in the same way for both engines:
|
|
//
|
|
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
|
|
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
|
|
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
|
|
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
|
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
|
|
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
|
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
|
|
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
|
|
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
|
|
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
|
|
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
|
|
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
|
|
|
regexToSkip = []regex.EnryRegexp{
|
|
// The order must be this
|
|
reLiteralStringQuotes,
|
|
reMultilineComment,
|
|
reSingleLineComment,
|
|
reLiteralNumber,
|
|
}
|
|
)
|
|
|
|
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
|
|
var shebangTokens [][]byte
|
|
matches := reShebang.FindAllSubmatch(content, -1)
|
|
if matches != nil {
|
|
shebangTokens = make([][]byte, 0, 2)
|
|
for _, match := range matches {
|
|
shebangToken := getShebangToken(match)
|
|
shebangTokens = append(shebangTokens, shebangToken)
|
|
}
|
|
|
|
reShebang.ReplaceAll(content, []byte(` `))
|
|
}
|
|
|
|
return content, shebangTokens
|
|
}
|
|
|
|
func getShebangToken(matchedShebang [][]byte) []byte {
|
|
const prefix = `SHEBANG#!`
|
|
var token []byte
|
|
for i := 1; i < len(matchedShebang); i++ {
|
|
if len(matchedShebang[i]) > 0 {
|
|
token = matchedShebang[i]
|
|
break
|
|
}
|
|
}
|
|
|
|
tokenShebang := append([]byte(prefix), token...)
|
|
return tokenShebang
|
|
}
|
|
|
|
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
|
|
tokens := re.FindAll(content, -1)
|
|
content = re.ReplaceAll(content, []byte(` `))
|
|
return content, tokens
|
|
}
|
|
|
|
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
|
|
return commonExtractAndReplace(content, rePunctuation)
|
|
}
|
|
|
|
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
|
|
return commonExtractAndReplace(content, reRegularToken)
|
|
}
|
|
|
|
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
|
|
return commonExtractAndReplace(content, reOperators)
|
|
}
|
|
|
|
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
|
var SGMLTokens [][]byte
|
|
matches := reSGML.FindAllSubmatch(content, -1)
|
|
if matches != nil {
|
|
SGMLTokens = make([][]byte, 0, 2)
|
|
for _, match := range matches {
|
|
if reSGMLComment.Match(match[0]) {
|
|
continue
|
|
}
|
|
|
|
token := append(match[1], '>')
|
|
SGMLTokens = append(SGMLTokens, token)
|
|
attributes := getSGMLAttributes(match[0])
|
|
SGMLTokens = append(SGMLTokens, attributes...)
|
|
}
|
|
|
|
content = reSGML.ReplaceAll(content, []byte(` `))
|
|
}
|
|
|
|
return content, SGMLTokens
|
|
}
|
|
|
|
func getSGMLAttributes(SGMLTag []byte) [][]byte {
|
|
var attributes [][]byte
|
|
matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
|
|
if matches != nil {
|
|
attributes = make([][]byte, 0, 5)
|
|
for _, match := range matches {
|
|
if len(match[1]) != 0 {
|
|
attributes = append(attributes, match[1])
|
|
}
|
|
|
|
if len(match[2]) != 0 {
|
|
loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
|
|
attributes = append(attributes, loneAttributes...)
|
|
}
|
|
}
|
|
}
|
|
|
|
return attributes
|
|
}
|
|
|
|
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
|
|
for _, skip := range regexToSkip {
|
|
content = skip.ReplaceAll(content, []byte(` `))
|
|
}
|
|
|
|
return content, nil
|
|
}
|
|
|
|
func extractRemainders(content []byte) ([]byte, [][]byte) {
|
|
splitted := bytes.Fields(content)
|
|
remainderTokens := make([][]byte, 0, len(splitted)*3)
|
|
for _, remainder := range splitted {
|
|
remainders := bytes.Split(remainder, nil)
|
|
remainderTokens = append(remainderTokens, remainders...)
|
|
}
|
|
|
|
return content, remainderTokens
|
|
}
|