mirror of https://github.com/ralsina/tartrazine.git
Added frequencies.go generation
internal/tokenizer/tokenize.go (new file)
@@ -0,0 +1,169 @@
package tokenizer

import (
	"bytes"
	"regexp"
)

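// Tokenize runs each extractor in extractTokens over content in order,
// collecting the tokens each one emits and passing the blanked-out
// remainder on to the next extractor.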
func Tokenize(content []byte) []string {
	tokens := make([][]byte, 0, 50)
	for _, extract := range extractTokens {
		var extractedTokens [][]byte
		content, extractedTokens = extract(content)
		tokens = append(tokens, extractedTokens...)
	}

	return toString(tokens)
}

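// toString converts the collected byte-slice tokens into strings.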
func toString(tokens [][]byte) []string {
	stokens := make([]string, 0, len(tokens))
	for _, token := range tokens {
		stokens = append(stokens, string(token))
	}

	return stokens
}

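// The extractor pipeline and the regular expressions it relies on.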
var (
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be this: each extractor runs on what the previous ones left behind.
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")(.*$)`)
	reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	regexToSkip = []*regexp.Regexp{
		// The order must be this
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)

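// extractAndReplaceShebang emits a SHEBANG#!<interpreter> token for every
// shebang line, resolving "#!/usr/bin/env foo"-style indirection to foo.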
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
	var shebangTokens [][]byte
	matches := reShebang.FindAllSubmatch(content, -1)
	if matches != nil {
		shebangTokens = make([][]byte, 0, 2)
		for _, match := range matches {
			shebangToken := getShebangToken(match)
			shebangTokens = append(shebangTokens, shebangToken)
		}

		// Assign the result so the shebang lines are actually blanked out;
		// ReplaceAll does not modify content in place.
		content = reShebang.ReplaceAll(content, []byte(` `))
	}

	return content, shebangTokens
}

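// getShebangToken returns the first non-empty capture group of a shebang
// match (the interpreter name) prefixed with SHEBANG#!.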
func getShebangToken(matchedShebang [][]byte) []byte {
	const prefix = `SHEBANG#!`
	var token []byte
	for i := 1; i < len(matchedShebang); i++ {
		if len(matchedShebang[i]) > 0 {
			token = matchedShebang[i]
			break
		}
	}

	tokenShebang := append([]byte(prefix), token...)
	return tokenShebang
}

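// commonExtractAndReplace collects every match of re as a token and blanks
// the matches out of content so later extractors do not see them again.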
func commonExtractAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
	tokens := re.FindAll(content, -1)
	content = re.ReplaceAll(content, []byte(` `))
	return content, tokens
}

func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, rePunctuation)
}

func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, reRegularToken)
}

func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, reOperators)
}

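// extractAndReplaceSGML tokenizes SGML/XML-style tags as "<name>" plus their
// attribute names, skipping SGML comments, and blanks the tags out of content.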
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
	var SGMLTokens [][]byte
	matches := reSGML.FindAllSubmatch(content, -1)
	if matches != nil {
		SGMLTokens = make([][]byte, 0, 2)
		for _, match := range matches {
			if reSGMLComment.Match(match[0]) {
				continue
			}

			token := append(match[1], '>')
			SGMLTokens = append(SGMLTokens, token)
			attributes := getSGMLAttributes(match[0])
			SGMLTokens = append(SGMLTokens, attributes...)
		}

		content = reSGML.ReplaceAll(content, []byte(` `))
	}

	return content, SGMLTokens
}

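// getSGMLAttributes returns the attribute tokens of a tag: "name=" for
// valued attributes and the bare word for lone ones.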
func getSGMLAttributes(SGMLTag []byte) [][]byte {
	var attributes [][]byte
	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
	if matches != nil {
		attributes = make([][]byte, 0, 5)
		for _, match := range matches {
			if len(match[1]) != 0 {
				attributes = append(attributes, match[1])
			}

			if len(match[2]) != 0 {
				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
				attributes = append(attributes, loneAttributes...)
			}
		}
	}

	return attributes
}

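// skipCommentsAndLiterals blanks out string literals, comments and numeric
// literals without emitting any tokens.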
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
	for _, skip := range regexToSkip {
		content = skip.ReplaceAll(content, []byte(` `))
	}

	return content, nil
}

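// extractRemainders splits whatever survived the previous extractors into
// individual UTF-8 characters and emits each one as a token.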
func extractRemainders(content []byte) ([]byte, [][]byte) {
	splitted := bytes.Fields(content)
	remainderTokens := make([][]byte, 0, len(splitted)*3)
	for _, remainder := range splitted {
		remainders := bytes.Split(remainder, nil)
		remainderTokens = append(remainderTokens, remainders...)
	}

	return content, remainderTokens
}
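
For context, a minimal sketch of how a caller inside the same module might drive this package (the module path "example.com/mymodule" is a placeholder, not part of this commit; internal packages are only importable from within their own module):

package main

import (
	"fmt"

	"example.com/mymodule/internal/tokenizer" // placeholder module path
)

func main() {
	content := []byte("#!/usr/bin/env python\nx = a + b // trailing comment")
	for _, token := range tokenizer.Tokenize(content) {
		fmt.Printf("%q\n", token)
	}
}
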
internal/tokenizer/tokenize_test.go (new file)
@@ -0,0 +1,107 @@
package tokenizer

import (
	"fmt"
	"testing"

	"github.com/stretchr/testify/assert"
)

const (
	testContent = `#!/usr/bin/ruby

#!/usr/bin/env node

aaa

#!/usr/bin/env A=B foo=bar awk -f

#!python

func Tokenize(content []byte) []string {
	splitted := bytes.Fields(content)
	tokens := /* make([]string, 0, len(splitted))
	no comment -- comment
	for _, tokenByte := range splitted {
		token64 := base64.StdEncoding.EncodeToString(tokenByte)
		tokens = append(tokens, token64)
		notcatchasanumber3.5
	}*/
	othercode
	/* testing multiple

	multiline comments*/

<!-- com
ment -->
<!-- comment 2-->
ppp no comment # comment

"literal1"

abb (tokenByte, 0xAF02) | ,3.2L

'literal2' notcatchasanumber3.5

5 += number * anotherNumber
if isTrue && isToo {
	0b00001000 >> 1
}

return tokens

oneBool = 3 <= 2
varBool = 3<=2>

PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
	<title id="hola" class="">This is a XHTML sample file</title>
	<style type="text/css"><![CDATA[
	#example {
		background-color: yellow;
	}
	]]></style>
</head>
<body>
	<div id="example">
		Just a simple <strong>XHTML</strong> test page.
	</div>
</body>
</html>`
)

var (
	tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "PUBLIC", "W3C", "DTD", "XHTML", "1", "0",
		"Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1", "strict", "dtd", "<html>", "<head>", "<title>", "class=",
		"</title>", "<style>", "<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "<strong>",
		"</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")", "{", "}", "(", ")", ";", ";", "}",
		"]", "]", "aaa", "func", "Tokenize", "content", "byte", "string", "splitted", "bytes.Fields", "content", "tokens", "othercode", "ppp",
		"no", "comment", "abb", "tokenByte", "notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return",
		"tokens", "oneBool", "varBool", "PyErr_SetString", "PyExc_RuntimeError", "html", "PUBLIC", "xmlns", "id", "class", "This", "is", "a",
		"XHTML", "sample", "file", "type", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.", "|", "+",
		"&&", "<", "<", "-", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", ",", ">", "=", ">", "=", "=", ">", "=", ">",
		":", ">", "=", ">"}
)

func TestTokenize(t *testing.T) {
	tests := []struct {
		name     string
		content  []byte
		expected []string
	}{
		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			tokens := Tokenize(test.content)
			assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("tokens slice length = %v, want %v", len(tokens), len(test.expected)))
			for i, expectedToken := range test.expected {
				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
			}
		})
	}
}
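
Assuming a checkout where the testify dependency is available, this test can be run on its own with the standard Go toolchain:

	go test ./internal/tokenizer/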