mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00
Added frequencies.go generation
This commit is contained in:
parent
a63c8bdf81
commit
fcf30a07c8
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
// languagesByAlias keeps alias for different languages and use the name of the languages as a alias too. All the
|
||||
// keys (alias or not) are written in lower case and the whitespaces has been replaced by underscores.
|
||||
@ -577,6 +577,7 @@ var languagesByAlias = map[string]string{
|
||||
"textile": "Textile",
|
||||
"thrift": "Thrift",
|
||||
"ti_program": "TI Program",
|
||||
"tl": "Type Language",
|
||||
"tla": "TLA",
|
||||
"toml": "TOML",
|
||||
"ts": "TypeScript",
|
||||
@ -584,6 +585,7 @@ var languagesByAlias = map[string]string{
|
||||
"turtle": "Turtle",
|
||||
"twig": "Twig",
|
||||
"txl": "TXL",
|
||||
"type_language": "Type Language",
|
||||
"typescript": "TypeScript",
|
||||
"udiff": "Diff",
|
||||
"unified_parallel_c": "Unified Parallel C",
|
||||
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
import "gopkg.in/toqueteos/substring.v1"
|
||||
|
||||
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
var languagesByExtension = map[string][]string{
|
||||
".1": {"Roff"},
|
||||
@ -850,6 +850,7 @@ var languagesByExtension = map[string][]string{
|
||||
".thor": {"Ruby"},
|
||||
".thrift": {"Thrift"},
|
||||
".thy": {"Isabelle"},
|
||||
".tl": {"Type Language"},
|
||||
".tla": {"TLA"},
|
||||
".tm": {"Tcl"},
|
||||
".tmCommand": {"XML"},
|
||||
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
var languagesByFilename = map[string]string{
|
||||
".Rprofile": "R",
|
||||
|
128805
frequencies.go
Normal file
128805
frequencies.go
Normal file
File diff suppressed because it is too large
Load Diff
24
internal/code-generator/assets/frequencies.go.tmpl
Normal file
24
internal/code-generator/assets/frequencies.go.tmpl
Normal file
@ -0,0 +1,24 @@
|
||||
package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: {{ getCommit }}
|
||||
|
||||
var DefaultClassifier Classifier = &classifier{
|
||||
languagesLogProbabilities: map[string]float64{
|
||||
{{ $freqs := . -}}
|
||||
{{range $index, $language := orderKeys .Languages -}}
|
||||
"{{ $language }}": {{ languageLogProbability $language -}},
|
||||
{{end -}}
|
||||
},
|
||||
tokensLogProbabilities: map[string]map[string]float64{
|
||||
{{range $index, $language := orderMapMapKeys .Tokens -}}
|
||||
"{{ $language }}": map[string]float64{
|
||||
{{range $i, $token := index $freqs.Tokens $language | orderKeys -}}
|
||||
{{ quote $token }}: {{ tokenLogProbability $language $token }},
|
||||
{{end -}}
|
||||
},
|
||||
{{end -}}
|
||||
},
|
||||
tokensTotal: {{ toFloat64 .TokensTotal -}},
|
||||
}
|
@ -21,6 +21,14 @@ func FromFile(fileToParse, outPath, tmplPath, tmplName, commit string, generate
|
||||
return err
|
||||
}
|
||||
|
||||
if err := formatedWrite(outPath, source); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func formatedWrite(outPath string, source []byte) error {
|
||||
formatedSource, err := format.Source(source)
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -4,15 +4,20 @@ import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/suite"
|
||||
)
|
||||
|
||||
const (
|
||||
lingustURL = "https://github.com/github/linguist.git"
|
||||
commitTree = "60f864a138650dd17fafc94814be9ee2d3aaef8c"
|
||||
commitTest = "fe8b44ab8a225b1ffa75b983b916ea22fee5b6f7"
|
||||
|
||||
// Languages test
|
||||
// Extensions test
|
||||
extensionsTestFile = "test_files/extensions.test.yml"
|
||||
extensionsGold = "test_files/extensions.gold"
|
||||
extensionsTestTmplPath = "../assets/extensions.go.tmpl"
|
||||
@ -59,9 +64,48 @@ const (
|
||||
aliasesGold = "test_files/aliases.gold"
|
||||
aliasesTestTmplPath = "../assets/aliases.go.tmpl"
|
||||
aliasesTestTmplName = "aliases.go.tmpl"
|
||||
|
||||
// Frequencies test
|
||||
frequenciesTestDir = "/samples"
|
||||
frequenciesGold = "test_files/frequencies.gold"
|
||||
frequenciesTestTmplPath = "../assets/frequencies.go.tmpl"
|
||||
frequenciesTestTmplName = "frequencies.go.tmpl"
|
||||
)
|
||||
|
||||
func TestFromFile(t *testing.T) {
|
||||
type GeneratorTestSuite struct {
|
||||
suite.Suite
|
||||
tmpLinguist string
|
||||
}
|
||||
|
||||
func (g *GeneratorTestSuite) SetupSuite() {
|
||||
tmpLinguist, err := ioutil.TempDir("", "linguist-")
|
||||
assert.NoError(g.T(), err)
|
||||
g.tmpLinguist = tmpLinguist
|
||||
|
||||
cmd := exec.Command("git", "clone", lingustURL, tmpLinguist)
|
||||
err = cmd.Run()
|
||||
assert.NoError(g.T(), err)
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
assert.NoError(g.T(), err)
|
||||
|
||||
err = os.Chdir(tmpLinguist)
|
||||
assert.NoError(g.T(), err)
|
||||
|
||||
cmd = exec.Command("git", "checkout", commitTree)
|
||||
err = cmd.Run()
|
||||
assert.NoError(g.T(), err)
|
||||
|
||||
err = os.Chdir(cwd)
|
||||
assert.NoError(g.T(), err)
|
||||
}
|
||||
|
||||
func (g *GeneratorTestSuite) TearDownSuite() {
|
||||
err := os.RemoveAll(g.tmpLinguist)
|
||||
assert.NoError(g.T(), err)
|
||||
}
|
||||
|
||||
func (g *GeneratorTestSuite) TestFromFile() {
|
||||
tests := []struct {
|
||||
name string
|
||||
fileToParse string
|
||||
@ -145,20 +189,57 @@ func TestFromFile(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
gold, err := ioutil.ReadFile(tt.wantOut)
|
||||
assert.NoError(t, err)
|
||||
for _, test := range tests {
|
||||
gold, err := ioutil.ReadFile(test.wantOut)
|
||||
assert.NoError(g.T(), err)
|
||||
|
||||
outPath, err := ioutil.TempFile("/tmp", "generator-test-")
|
||||
assert.NoError(t, err)
|
||||
defer os.Remove(outPath.Name())
|
||||
outPath, err := ioutil.TempFile("/tmp", "generator-test-")
|
||||
assert.NoError(g.T(), err)
|
||||
defer os.Remove(outPath.Name())
|
||||
|
||||
err = FromFile(tt.fileToParse, outPath.Name(), tt.tmplPath, tt.tmplName, tt.commit, tt.generate)
|
||||
assert.NoError(t, err)
|
||||
out, err := ioutil.ReadFile(outPath.Name())
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(tt.wantOut)))
|
||||
})
|
||||
err = FromFile(test.fileToParse, outPath.Name(), test.tmplPath, test.tmplName, test.commit, test.generate)
|
||||
assert.NoError(g.T(), err)
|
||||
out, err := ioutil.ReadFile(outPath.Name())
|
||||
assert.NoError(g.T(), err)
|
||||
assert.EqualValues(g.T(), gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(test.wantOut)))
|
||||
}
|
||||
}
|
||||
|
||||
func (g *GeneratorTestSuite) TestFrequencies() {
|
||||
tests := []struct {
|
||||
name string
|
||||
samplesDir string
|
||||
tmplPath string
|
||||
tmplName string
|
||||
commit string
|
||||
wantOut string
|
||||
}{
|
||||
{
|
||||
name: "Frequencies_1",
|
||||
samplesDir: filepath.Join(g.tmpLinguist, frequenciesTestDir),
|
||||
tmplPath: frequenciesTestTmplPath,
|
||||
tmplName: frequenciesTestTmplName,
|
||||
commit: commitTree,
|
||||
wantOut: frequenciesGold,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
gold, err := ioutil.ReadFile(test.wantOut)
|
||||
assert.NoError(g.T(), err)
|
||||
|
||||
outPath, err := ioutil.TempFile("/tmp", "frequencies-test-")
|
||||
assert.NoError(g.T(), err)
|
||||
defer os.Remove(outPath.Name())
|
||||
|
||||
err = Frequencies(test.samplesDir, test.tmplPath, test.tmplName, test.commit, outPath.Name())
|
||||
assert.NoError(g.T(), err)
|
||||
out, err := ioutil.ReadFile(outPath.Name())
|
||||
assert.NoError(g.T(), err)
|
||||
assert.EqualValues(g.T(), gold, out, fmt.Sprintf("Frequencies() = %v, want %v", string(out), string(test.wantOut)))
|
||||
}
|
||||
}
|
||||
|
||||
func TestGeneratorTestSuite(t *testing.T) {
|
||||
suite.Run(t, new(GeneratorTestSuite))
|
||||
}
|
||||
|
202
internal/code-generator/generator/samplesfreq.go
Normal file
202
internal/code-generator/generator/samplesfreq.go
Normal file
@ -0,0 +1,202 @@
|
||||
package generator
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"text/template"
|
||||
|
||||
"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
|
||||
)
|
||||
|
||||
// samplesFrequencies holds the counters collected from a samples directory; it
// is the value handed to the frequencies template.
type samplesFrequencies struct {
	// LanguageTotal is the number of sample files summed over all languages.
	LanguageTotal int `json:"language_total,omitempty"`
	// Languages maps a language name to its number of sample files.
	Languages map[string]int `json:"languages,omitempty"`
	// TokensTotal is the number of tokens summed over all languages.
	TokensTotal int `json:"tokens_total,omitempty"`
	// Tokens maps a language name to the occurrence count of each of its tokens.
	Tokens map[string]map[string]int `json:"tokens,omitempty"`
	// LanguageTokens maps a language name to its total number of tokens.
	LanguageTokens map[string]int `json:"language_tokens,omitempty"`
}
|
||||
|
||||
// Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write
|
||||
// the file outPath using frequenciesTmplName as a template.
|
||||
func Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmplName, commit, outPath string) error {
|
||||
freqs, err := getFrequencies(samplesDir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
if err := executeFrequenciesTemplate(buf, freqs, frequenciesTmplPath, frequenciesTmplName, commit); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := formatedWrite(outPath, buf.Bytes()); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
|
||||
entries, err := ioutil.ReadDir(samplesDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var languageTotal int
|
||||
var languages = make(map[string]int)
|
||||
var tokensTotal int
|
||||
var tokens = make(map[string]map[string]int)
|
||||
var languageTokens = make(map[string]int)
|
||||
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() {
|
||||
log.Println(err)
|
||||
continue
|
||||
}
|
||||
|
||||
samples, err := getSamples(samplesDir, entry)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
if len(samples) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
samplesTokens, err := getTokens(samples)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
continue
|
||||
}
|
||||
|
||||
lang := entry.Name()
|
||||
languageTotal += len(samples)
|
||||
languages[lang] = len(samples)
|
||||
tokensTotal += len(samplesTokens)
|
||||
languageTokens[lang] = len(samplesTokens)
|
||||
tokens[lang] = make(map[string]int)
|
||||
for _, token := range samplesTokens {
|
||||
tokens[lang][token]++
|
||||
}
|
||||
}
|
||||
|
||||
return &samplesFrequencies{
|
||||
TokensTotal: tokensTotal,
|
||||
LanguageTotal: languageTotal,
|
||||
Tokens: tokens,
|
||||
LanguageTokens: languageTokens,
|
||||
Languages: languages,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
|
||||
const subDir = "filenames"
|
||||
|
||||
samples := []string{}
|
||||
path := filepath.Join(samplesDir, langDir.Name())
|
||||
entries, err := ioutil.ReadDir(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if entry.Mode().IsRegular() {
|
||||
samples = append(samples, filepath.Join(path, entry.Name()))
|
||||
}
|
||||
|
||||
if entry.IsDir() && entry.Name() == subDir {
|
||||
subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
samples = append(samples, subSamples...)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
// getSubSamples returns the paths of the regular files stored directly in
// samplesDir/langDir/<subLangDir name>. Anything that is not a regular file is
// ignored; an unreadable directory is reported as an error.
func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
	dir := filepath.Join(samplesDir, langDir, subLangDir.Name())
	infos, err := ioutil.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	files := make([]string, 0)
	for _, info := range infos {
		if !info.Mode().IsRegular() {
			continue
		}

		files = append(files, filepath.Join(dir, info.Name()))
	}

	return files, nil
}
|
||||
|
||||
func getTokens(samples []string) ([]string, error) {
|
||||
tokens := make([]string, 0, 20)
|
||||
var anyError error
|
||||
for _, sample := range samples {
|
||||
content, err := ioutil.ReadFile(sample)
|
||||
if err != nil {
|
||||
anyError = err
|
||||
continue
|
||||
}
|
||||
|
||||
t := tokenizer.Tokenize(content)
|
||||
tokens = append(tokens, t...)
|
||||
}
|
||||
|
||||
return tokens, anyError
|
||||
}
|
||||
|
||||
func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, frequenciesTmplPath, frequenciesTmpl, commit string) error {
|
||||
fmap := template.FuncMap{
|
||||
"getCommit": func() string { return commit },
|
||||
"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
|
||||
"orderKeys": func(m map[string]int) []string {
|
||||
keys := make([]string, 0, len(m))
|
||||
for key := range m {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
|
||||
sort.Strings(keys)
|
||||
return keys
|
||||
},
|
||||
"languageLogProbability": func(language string) string {
|
||||
num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
|
||||
return fmt.Sprintf("%f", num)
|
||||
},
|
||||
"orderMapMapKeys": func(mm map[string]map[string]int) []string {
|
||||
keys := make([]string, 0, len(mm))
|
||||
for key := range mm {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
|
||||
sort.Strings(keys)
|
||||
return keys
|
||||
},
|
||||
"tokenLogProbability": func(language, token string) string {
|
||||
num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
|
||||
return fmt.Sprintf("%f", num)
|
||||
},
|
||||
"quote": strconv.Quote,
|
||||
}
|
||||
|
||||
t := template.Must(template.New(frequenciesTmpl).Funcs(fmap).ParseFiles(frequenciesTmplPath))
|
||||
if err := t.Execute(out, freqs); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
128805
internal/code-generator/generator/test_files/frequencies.gold
Normal file
128805
internal/code-generator/generator/test_files/frequencies.gold
Normal file
File diff suppressed because it is too large
Load Diff
@ -54,7 +54,13 @@ const (
|
||||
aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
|
||||
aliasesTmpl = "aliases.go.tmpl"
|
||||
|
||||
commitPath = ".git/refs/heads/master"
|
||||
// frequencies.go generation
|
||||
samplesDir = ".linguist/samples"
|
||||
frequenciesFile = "frequencies.go"
|
||||
frequenciesTmplPath = "internal/code-generator/assets/frequencies.go.tmpl"
|
||||
frequenciesTmpl = "frequencies.go.tmpl"
|
||||
|
||||
commitPath = ".linguist/.git/refs/heads/master"
|
||||
)
|
||||
|
||||
type generatorArgs struct {
|
||||
@ -88,6 +94,10 @@ func main() {
|
||||
log.Println(err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := generator.Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmpl, commit, frequenciesFile); err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
}
|
||||
|
||||
func getCommit(path string) (string, error) {
|
||||
|
169
internal/tokenizer/tokenize.go
Normal file
169
internal/tokenizer/tokenize.go
Normal file
@ -0,0 +1,169 @@
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
)
|
||||
|
||||
func Tokenize(content []byte) []string {
|
||||
tokens := make([][]byte, 0, 50)
|
||||
for _, extract := range extractTokens {
|
||||
var extractedTokens [][]byte
|
||||
content, extractedTokens = extract(content)
|
||||
tokens = append(tokens, extractedTokens...)
|
||||
}
|
||||
|
||||
return toString(tokens)
|
||||
}
|
||||
|
||||
// toString converts every byte-slice token into a string, preserving order.
func toString(tokens [][]byte) []string {
	converted := make([]string, len(tokens))
	for i, token := range tokens {
		converted[i] = string(token)
	}

	return converted
}
|
||||
|
||||
var (
|
||||
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
|
||||
// The order to must be this
|
||||
extractAndReplaceShebang,
|
||||
extractAndReplaceSGML,
|
||||
skipCommentsAndLiterals,
|
||||
extractAndReplacePunctuation,
|
||||
extractAndReplaceRegular,
|
||||
extractAndReplaceOperator,
|
||||
extractRemainders,
|
||||
}
|
||||
|
||||
reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
|
||||
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")(.*$)`)
|
||||
reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
|
||||
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||
reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
|
||||
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||
reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
|
||||
reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
|
||||
reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
|
||||
reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
|
||||
reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
|
||||
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||
|
||||
regexToSkip = []*regexp.Regexp{
|
||||
// The order must be this
|
||||
reLiteralStringQuotes,
|
||||
reMultilineComment,
|
||||
reSingleLineComment,
|
||||
reLiteralNumber,
|
||||
}
|
||||
)
|
||||
|
||||
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
|
||||
var shebangTokens [][]byte
|
||||
matches := reShebang.FindAllSubmatch(content, -1)
|
||||
if matches != nil {
|
||||
shebangTokens = make([][]byte, 0, 2)
|
||||
for _, match := range matches {
|
||||
shebangToken := getShebangToken(match)
|
||||
shebangTokens = append(shebangTokens, shebangToken)
|
||||
}
|
||||
|
||||
reShebang.ReplaceAll(content, []byte(` `))
|
||||
}
|
||||
|
||||
return content, shebangTokens
|
||||
}
|
||||
|
||||
// getShebangToken builds the token for one shebang match: the prefix
// "SHEBANG#!" followed by the first non-empty capture group. Element 0 (the
// whole match) is ignored. If no capture group has content, the bare prefix is
// returned.
func getShebangToken(matchedShebang [][]byte) []byte {
	const prefix = `SHEBANG#!`

	shebangToken := []byte(prefix)
	for idx, group := range matchedShebang {
		if idx == 0 || len(group) == 0 {
			continue
		}

		return append(shebangToken, group...)
	}

	return shebangToken
}
|
||||
|
||||
// commonExtracAndReplace returns every match of re in content as a token,
// together with a copy of content in which each match is replaced by a single
// space.
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
	found := re.FindAll(content, -1)
	blanked := re.ReplaceAll(content, []byte(` `))

	return blanked, found
}
|
||||
|
||||
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
|
||||
return commonExtracAndReplace(content, rePunctuation)
|
||||
}
|
||||
|
||||
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
|
||||
return commonExtracAndReplace(content, reRegularToken)
|
||||
}
|
||||
|
||||
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
|
||||
return commonExtracAndReplace(content, reOperators)
|
||||
}
|
||||
|
||||
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
||||
var SGMLTokens [][]byte
|
||||
matches := reSGML.FindAllSubmatch(content, -1)
|
||||
if matches != nil {
|
||||
SGMLTokens = make([][]byte, 0, 2)
|
||||
for _, match := range matches {
|
||||
if reSGMLComment.Match(match[0]) {
|
||||
continue
|
||||
}
|
||||
|
||||
token := append(match[1], '>')
|
||||
SGMLTokens = append(SGMLTokens, token)
|
||||
attributes := getSGMLAttributes(match[0])
|
||||
SGMLTokens = append(SGMLTokens, attributes...)
|
||||
}
|
||||
|
||||
content = reSGML.ReplaceAll(content, []byte(` `))
|
||||
}
|
||||
|
||||
return content, SGMLTokens
|
||||
}
|
||||
|
||||
func getSGMLAttributes(SGMLTag []byte) [][]byte {
|
||||
var attributes [][]byte
|
||||
matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
|
||||
if matches != nil {
|
||||
attributes = make([][]byte, 0, 5)
|
||||
for _, match := range matches {
|
||||
if len(match[1]) != 0 {
|
||||
attributes = append(attributes, match[1])
|
||||
}
|
||||
|
||||
if len(match[2]) != 0 {
|
||||
loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
|
||||
attributes = append(attributes, loneAttributes...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return attributes
|
||||
}
|
||||
|
||||
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
|
||||
for _, skip := range regexToSkip {
|
||||
content = skip.ReplaceAll(content, []byte(` `))
|
||||
}
|
||||
|
||||
return content, nil
|
||||
}
|
||||
|
||||
// extractRemainders splits content on whitespace and then breaks every
// resulting field into its individual UTF-8 sequences; each of them becomes a
// token. The content itself is returned unmodified.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	fields := bytes.Fields(content)
	pieces := make([][]byte, 0, 3*len(fields))
	for i := range fields {
		// A nil separator makes bytes.Split explode the field into UTF-8
		// sequences.
		pieces = append(pieces, bytes.Split(fields[i], nil)...)
	}

	return content, pieces
}
|
107
internal/tokenizer/tokenize_test.go
Normal file
107
internal/tokenizer/tokenize_test.go
Normal file
@ -0,0 +1,107 @@
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
const (
|
||||
testContent = `#!/usr/bin/ruby
|
||||
|
||||
#!/usr/bin/env node
|
||||
|
||||
aaa
|
||||
|
||||
#!/usr/bin/env A=B foo=bar awk -f
|
||||
|
||||
#!python
|
||||
|
||||
func Tokenize(content []byte) []string {
|
||||
splitted := bytes.Fields(content)
|
||||
tokens := /* make([]string, 0, len(splitted))
|
||||
no comment -- comment
|
||||
for _, tokenByte := range splitted {
|
||||
token64 := base64.StdEncoding.EncodeToString(tokenByte)
|
||||
tokens = append(tokens, token64)
|
||||
notcatchasanumber3.5
|
||||
}*/
|
||||
othercode
|
||||
/* testing multiple
|
||||
|
||||
multiline comments*/
|
||||
|
||||
<!-- com
|
||||
ment -->
|
||||
<!-- comment 2-->
|
||||
ppp no comment # comment
|
||||
|
||||
"literal1"
|
||||
|
||||
abb (tokenByte, 0xAF02) | ,3.2L
|
||||
|
||||
'literal2' notcatchasanumber3.5
|
||||
|
||||
5 += number * anotherNumber
|
||||
if isTrue && isToo {
|
||||
0b00001000 >> 1
|
||||
}
|
||||
|
||||
return tokens
|
||||
|
||||
oneBool = 3 <= 2
|
||||
varBool = 3<=2>
|
||||
|
||||
PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");
|
||||
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title id="hola" class="">This is a XHTML sample file</title>
|
||||
<style type="text/css"><![CDATA[
|
||||
#example {
|
||||
background-color: yellow;
|
||||
}
|
||||
]]></style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="example">
|
||||
Just a simple <strong>XHTML</strong> test page.
|
||||
</div>
|
||||
</body>
|
||||
</html>`
|
||||
)
|
||||
|
||||
var (
|
||||
tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "PUBLIC", "W3C", "DTD", "XHTML", "1", "0",
|
||||
"Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1", "strict", "dtd", "<html>", "<head>", "<title>", "class=",
|
||||
"</title>", "<style>", "<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "<strong>",
|
||||
"</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")", "{", "}", "(", ")", ";", ";", "}",
|
||||
"]", "]", "aaa", "func", "Tokenize", "content", "byte", "string", "splitted", "bytes.Fields", "content", "tokens", "othercode", "ppp",
|
||||
"no", "comment", "abb", "tokenByte", "notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return",
|
||||
"tokens", "oneBool", "varBool", "PyErr_SetString", "PyExc_RuntimeError", "html", "PUBLIC", "xmlns", "id", "class", "This", "is", "a",
|
||||
"XHTML", "sample", "file", "type", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.", "|", "+",
|
||||
"&&", "<", "<", "-", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", ",", ">", "=", ">", "=", "=", ">", "=", ">",
|
||||
":", ">", "=", ">"}
|
||||
)
|
||||
|
||||
func TestTokenize(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
content []byte
|
||||
expected []string
|
||||
}{
|
||||
{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
tokens := Tokenize(test.content)
|
||||
assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
|
||||
for i, expectedToken := range test.expected {
|
||||
assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
var languagesByInterpreter = map[string][]string{
|
||||
"Rscript": {"R"},
|
||||
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
var languagesType = map[string]Type{
|
||||
"1C Enterprise": Programming,
|
||||
@ -408,6 +408,7 @@ var languagesType = map[string]Type{
|
||||
"Turing": Programming,
|
||||
"Turtle": Data,
|
||||
"Twig": Markup,
|
||||
"Type Language": Data,
|
||||
"TypeScript": Programming,
|
||||
"Unified Parallel C": Programming,
|
||||
"Unity3D Asset": Data,
|
||||
|
@ -2,7 +2,7 @@ package slinguist
|
||||
|
||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
||||
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||
|
||||
import "gopkg.in/toqueteos/substring.v1"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user