mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-24 08:18:52 -03:00
tokenizer: fix a bug and regenerate the code with the latest Go
See https://github.com/bzz/enry/pull/4 for details. Test Plan: - go test ./...
This commit is contained in:
parent
e08125d7ee
commit
e32a70a784
10
.travis.yml
10
.travis.yml
@ -1,8 +1,9 @@
|
|||||||
dist: trusty
|
dist: trusty
|
||||||
language: go
|
language: go
|
||||||
go:
|
go:
|
||||||
- '1.12.x'
|
- '1.14.x'
|
||||||
- '1.11.x'
|
- '1.13.x'
|
||||||
|
|
||||||
env:
|
env:
|
||||||
global:
|
global:
|
||||||
- GO_VERSION_FOR_JVM='1.11.x'
|
- GO_VERSION_FOR_JVM='1.11.x'
|
||||||
@ -12,8 +13,6 @@ env:
|
|||||||
matrix:
|
matrix:
|
||||||
- ONIGURUMA=0
|
- ONIGURUMA=0
|
||||||
- ONIGURUMA=1
|
- ONIGURUMA=1
|
||||||
matrix:
|
|
||||||
fast_finish: true
|
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- name: test
|
- name: test
|
||||||
@ -22,7 +21,6 @@ stages:
|
|||||||
- name: publish
|
- name: publish
|
||||||
if: tag IS present
|
if: tag IS present
|
||||||
|
|
||||||
stage: test
|
|
||||||
install:
|
install:
|
||||||
- >
|
- >
|
||||||
if [[ "${ONIGURUMA}" -gt 0 ]]; then
|
if [[ "${ONIGURUMA}" -gt 0 ]]; then
|
||||||
@ -36,7 +34,7 @@ install:
|
|||||||
sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
|
sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
|
||||||
fi;
|
fi;
|
||||||
script:
|
script:
|
||||||
- make test-coverage
|
- go test ./...
|
||||||
after_success:
|
after_success:
|
||||||
- bash <(curl -s https://codecov.io/bash)
|
- bash <(curl -s https://codecov.io/bash)
|
||||||
|
|
||||||
|
193399
data/frequencies.go
193399
data/frequencies.go
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -18,10 +18,6 @@ func Tokenize(content []byte) []string {
|
|||||||
content = content[:ByteLimit]
|
content = content[:ByteLimit]
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy the input so that changes wrought by the tokenization steps do not
|
|
||||||
// modify the caller's copy of the input. See #196.
|
|
||||||
content = append([]byte(nil), content...)
|
|
||||||
|
|
||||||
tokens := make([][]byte, 0, 50)
|
tokens := make([][]byte, 0, 50)
|
||||||
for _, extract := range extractTokens {
|
for _, extract := range extractTokens {
|
||||||
var extractedTokens [][]byte
|
var extractedTokens [][]byte
|
||||||
@ -162,7 +158,7 @@ func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
token := append(match[1], '>')
|
token := append(append([]byte(nil), match[1]...), '>')
|
||||||
SGMLTokens = append(SGMLTokens, token)
|
SGMLTokens = append(SGMLTokens, token)
|
||||||
attributes := getSGMLAttributes(match[0])
|
attributes := getSGMLAttributes(match[0])
|
||||||
SGMLTokens = append(SGMLTokens, attributes...)
|
SGMLTokens = append(SGMLTokens, attributes...)
|
||||||
|
@ -78,18 +78,19 @@ varBool = 3<=2>
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "PUBLIC", "W3C", "DTD", "XHTML", "1", "0",
|
tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "html", "PUBLIC",
|
||||||
"Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1", "strict", "dtd", "<html>", "<head>", "<title>", "class=",
|
"W3C", "DTD", "XHTML", "1", "0", "Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1",
|
||||||
"</title>", "<style>", "<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "<strong>",
|
"strict", "dtd", "<html>", "xmlns=", "<head>", "<title>", "id=", "class=", "</title>", "<style>", "type=",
|
||||||
"</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")", "{", "}", "(", ")", ";", "{", ";",
|
"<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "id=",
|
||||||
"}", "]", "]", "#", "/usr/bin/ruby", "#", "/usr/bin/env", "node", "aaa", "#", "/usr/bin/env", "A", "B", "foo", "bar", "awk", "f", "#",
|
"<strong>", "</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")",
|
||||||
"python", "func", "Tokenize", "content", "byte", "string", "splitted", "bytes.Fields", "content", "tokens", "othercode", "ppp", "no",
|
"{", "}", "(", ")", ";", "#", "/usr/bin/ruby", "#", "/usr/bin/env", "node", "aaa", "#", "/usr/bin/env", "A",
|
||||||
"comment", "abb", "tokenByte", "notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return", "tokens",
|
"B", "foo", "bar", "awk", "f", "#", "python", "func", "Tokenize", "content", "byte", "string", "splitted",
|
||||||
"oneBool", "varBool", "#ifndef", "#i", "m", "not", "a", "comment", "if", "the", "single", "line", "comment", "symbol", "is", "not",
|
"bytes.Fields", "content", "tokens", "othercode", "ppp", "no", "comment", "abb", "tokenByte",
|
||||||
"followed", "by", "a", "white", "PyErr_SetString", "PyExc_RuntimeError", "html", "PUBLIC", "xmlns", "id", "class", "This", "is", "a",
|
"notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return", "tokens",
|
||||||
"XHTML", "sample", "file", "type", "#example", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.",
|
"oneBool", "varBool", "#ifndef", "#i", "m", "not", "a", "comment", "if", "the", "single", "line", "comment",
|
||||||
"-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
|
"symbol", "is", "not", "followed", "by", "a", "white", "PyErr_SetString", "PyExc_RuntimeError", "This", "is",
|
||||||
"'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
|
"a", "XHTML", "sample", "file", "Just", "a", "simple", "XHTML", "test", "page.", "-", "|", "+", "&&", "<", "<",
|
||||||
|
"!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", "'", ","}
|
||||||
|
|
||||||
tests = []struct {
|
tests = []struct {
|
||||||
name string
|
name string
|
||||||
@ -108,6 +109,7 @@ func TestTokenize(t *testing.T) {
|
|||||||
after := string(test.content)
|
after := string(test.content)
|
||||||
require.Equal(t, before, after, "the input slice was modified")
|
require.Equal(t, before, after, "the input slice was modified")
|
||||||
require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
|
require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
|
||||||
|
|
||||||
for i, expectedToken := range test.expected {
|
for i, expectedToken := range test.expected {
|
||||||
assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
|
assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user