mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-05-23 08:30:07 -03:00
Added frequencies.go generation
This commit is contained in:
parent
a63c8bdf81
commit
fcf30a07c8
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
// languagesByAlias keeps alias for different languages and use the name of the languages as a alias too. All the
|
// languagesByAlias keeps alias for different languages and use the name of the languages as a alias too. All the
|
||||||
// keys (alias or not) are written in lower case and the whitespaces has been replaced by underscores.
|
// keys (alias or not) are written in lower case and the whitespaces has been replaced by underscores.
|
||||||
@ -577,6 +577,7 @@ var languagesByAlias = map[string]string{
|
|||||||
"textile": "Textile",
|
"textile": "Textile",
|
||||||
"thrift": "Thrift",
|
"thrift": "Thrift",
|
||||||
"ti_program": "TI Program",
|
"ti_program": "TI Program",
|
||||||
|
"tl": "Type Language",
|
||||||
"tla": "TLA",
|
"tla": "TLA",
|
||||||
"toml": "TOML",
|
"toml": "TOML",
|
||||||
"ts": "TypeScript",
|
"ts": "TypeScript",
|
||||||
@ -584,6 +585,7 @@ var languagesByAlias = map[string]string{
|
|||||||
"turtle": "Turtle",
|
"turtle": "Turtle",
|
||||||
"twig": "Twig",
|
"twig": "Twig",
|
||||||
"txl": "TXL",
|
"txl": "TXL",
|
||||||
|
"type_language": "Type Language",
|
||||||
"typescript": "TypeScript",
|
"typescript": "TypeScript",
|
||||||
"udiff": "Diff",
|
"udiff": "Diff",
|
||||||
"unified_parallel_c": "Unified Parallel C",
|
"unified_parallel_c": "Unified Parallel C",
|
||||||
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
import "gopkg.in/toqueteos/substring.v1"
|
import "gopkg.in/toqueteos/substring.v1"
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
var languagesByExtension = map[string][]string{
|
var languagesByExtension = map[string][]string{
|
||||||
".1": {"Roff"},
|
".1": {"Roff"},
|
||||||
@ -850,6 +850,7 @@ var languagesByExtension = map[string][]string{
|
|||||||
".thor": {"Ruby"},
|
".thor": {"Ruby"},
|
||||||
".thrift": {"Thrift"},
|
".thrift": {"Thrift"},
|
||||||
".thy": {"Isabelle"},
|
".thy": {"Isabelle"},
|
||||||
|
".tl": {"Type Language"},
|
||||||
".tla": {"TLA"},
|
".tla": {"TLA"},
|
||||||
".tm": {"Tcl"},
|
".tm": {"Tcl"},
|
||||||
".tmCommand": {"XML"},
|
".tmCommand": {"XML"},
|
||||||
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
var languagesByFilename = map[string]string{
|
var languagesByFilename = map[string]string{
|
||||||
".Rprofile": "R",
|
".Rprofile": "R",
|
||||||
|
128805
frequencies.go
Normal file
128805
frequencies.go
Normal file
File diff suppressed because it is too large
Load Diff
24
internal/code-generator/assets/frequencies.go.tmpl
Normal file
24
internal/code-generator/assets/frequencies.go.tmpl
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
package slinguist
|
||||||
|
|
||||||
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
|
// Extracted from github/linguist commit: {{ getCommit }}
|
||||||
|
|
||||||
|
var DefaultClassifier Classifier = &classifier{
|
||||||
|
languagesLogProbabilities: map[string]float64{
|
||||||
|
{{ $freqs := . -}}
|
||||||
|
{{range $index, $language := orderKeys .Languages -}}
|
||||||
|
"{{ $language }}": {{ languageLogProbability $language -}},
|
||||||
|
{{end -}}
|
||||||
|
},
|
||||||
|
tokensLogProbabilities: map[string]map[string]float64{
|
||||||
|
{{range $index, $language := orderMapMapKeys .Tokens -}}
|
||||||
|
"{{ $language }}": map[string]float64{
|
||||||
|
{{range $i, $token := index $freqs.Tokens $language | orderKeys -}}
|
||||||
|
{{ quote $token }}: {{ tokenLogProbability $language $token }},
|
||||||
|
{{end -}}
|
||||||
|
},
|
||||||
|
{{end -}}
|
||||||
|
},
|
||||||
|
tokensTotal: {{ toFloat64 .TokensTotal -}},
|
||||||
|
}
|
@ -21,6 +21,14 @@ func FromFile(fileToParse, outPath, tmplPath, tmplName, commit string, generate
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := formatedWrite(outPath, source); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatedWrite(outPath string, source []byte) error {
|
||||||
formatedSource, err := format.Source(source)
|
formatedSource, err := format.Source(source)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -4,15 +4,20 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/suite"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
lingustURL = "https://github.com/github/linguist.git"
|
||||||
|
commitTree = "60f864a138650dd17fafc94814be9ee2d3aaef8c"
|
||||||
commitTest = "fe8b44ab8a225b1ffa75b983b916ea22fee5b6f7"
|
commitTest = "fe8b44ab8a225b1ffa75b983b916ea22fee5b6f7"
|
||||||
|
|
||||||
// Languages test
|
// Extensions test
|
||||||
extensionsTestFile = "test_files/extensions.test.yml"
|
extensionsTestFile = "test_files/extensions.test.yml"
|
||||||
extensionsGold = "test_files/extensions.gold"
|
extensionsGold = "test_files/extensions.gold"
|
||||||
extensionsTestTmplPath = "../assets/extensions.go.tmpl"
|
extensionsTestTmplPath = "../assets/extensions.go.tmpl"
|
||||||
@ -59,9 +64,48 @@ const (
|
|||||||
aliasesGold = "test_files/aliases.gold"
|
aliasesGold = "test_files/aliases.gold"
|
||||||
aliasesTestTmplPath = "../assets/aliases.go.tmpl"
|
aliasesTestTmplPath = "../assets/aliases.go.tmpl"
|
||||||
aliasesTestTmplName = "aliases.go.tmpl"
|
aliasesTestTmplName = "aliases.go.tmpl"
|
||||||
|
|
||||||
|
// Frequencies test
|
||||||
|
frequenciesTestDir = "/samples"
|
||||||
|
frequenciesGold = "test_files/frequencies.gold"
|
||||||
|
frequenciesTestTmplPath = "../assets/frequencies.go.tmpl"
|
||||||
|
frequenciesTestTmplName = "frequencies.go.tmpl"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestFromFile(t *testing.T) {
|
type GeneratorTestSuite struct {
|
||||||
|
suite.Suite
|
||||||
|
tmpLinguist string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *GeneratorTestSuite) SetupSuite() {
|
||||||
|
tmpLinguist, err := ioutil.TempDir("", "linguist-")
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
g.tmpLinguist = tmpLinguist
|
||||||
|
|
||||||
|
cmd := exec.Command("git", "clone", lingustURL, tmpLinguist)
|
||||||
|
err = cmd.Run()
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
|
||||||
|
cwd, err := os.Getwd()
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
|
||||||
|
err = os.Chdir(tmpLinguist)
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
|
||||||
|
cmd = exec.Command("git", "checkout", commitTree)
|
||||||
|
err = cmd.Run()
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
|
||||||
|
err = os.Chdir(cwd)
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *GeneratorTestSuite) TearDownSuite() {
|
||||||
|
err := os.RemoveAll(g.tmpLinguist)
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *GeneratorTestSuite) TestFromFile() {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
fileToParse string
|
fileToParse string
|
||||||
@ -145,20 +189,57 @@ func TestFromFile(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, test := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
gold, err := ioutil.ReadFile(test.wantOut)
|
||||||
gold, err := ioutil.ReadFile(tt.wantOut)
|
assert.NoError(g.T(), err)
|
||||||
assert.NoError(t, err)
|
|
||||||
|
|
||||||
outPath, err := ioutil.TempFile("/tmp", "generator-test-")
|
outPath, err := ioutil.TempFile("/tmp", "generator-test-")
|
||||||
assert.NoError(t, err)
|
assert.NoError(g.T(), err)
|
||||||
defer os.Remove(outPath.Name())
|
defer os.Remove(outPath.Name())
|
||||||
|
|
||||||
err = FromFile(tt.fileToParse, outPath.Name(), tt.tmplPath, tt.tmplName, tt.commit, tt.generate)
|
err = FromFile(test.fileToParse, outPath.Name(), test.tmplPath, test.tmplName, test.commit, test.generate)
|
||||||
assert.NoError(t, err)
|
assert.NoError(g.T(), err)
|
||||||
out, err := ioutil.ReadFile(outPath.Name())
|
out, err := ioutil.ReadFile(outPath.Name())
|
||||||
assert.NoError(t, err)
|
assert.NoError(g.T(), err)
|
||||||
assert.EqualValues(t, gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(tt.wantOut)))
|
assert.EqualValues(g.T(), gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(test.wantOut)))
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (g *GeneratorTestSuite) TestFrequencies() {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
samplesDir string
|
||||||
|
tmplPath string
|
||||||
|
tmplName string
|
||||||
|
commit string
|
||||||
|
wantOut string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "Frequencies_1",
|
||||||
|
samplesDir: filepath.Join(g.tmpLinguist, frequenciesTestDir),
|
||||||
|
tmplPath: frequenciesTestTmplPath,
|
||||||
|
tmplName: frequenciesTestTmplName,
|
||||||
|
commit: commitTree,
|
||||||
|
wantOut: frequenciesGold,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
gold, err := ioutil.ReadFile(test.wantOut)
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
|
||||||
|
outPath, err := ioutil.TempFile("/tmp", "frequencies-test-")
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
defer os.Remove(outPath.Name())
|
||||||
|
|
||||||
|
err = Frequencies(test.samplesDir, test.tmplPath, test.tmplName, test.commit, outPath.Name())
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
out, err := ioutil.ReadFile(outPath.Name())
|
||||||
|
assert.NoError(g.T(), err)
|
||||||
|
assert.EqualValues(g.T(), gold, out, fmt.Sprintf("Frequencies() = %v, want %v", string(out), string(test.wantOut)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGeneratorTestSuite(t *testing.T) {
|
||||||
|
suite.Run(t, new(GeneratorTestSuite))
|
||||||
|
}
|
||||||
|
202
internal/code-generator/generator/samplesfreq.go
Normal file
202
internal/code-generator/generator/samplesfreq.go
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
package generator
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"text/template"
|
||||||
|
|
||||||
|
"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
|
||||||
|
)
|
||||||
|
|
||||||
|
// samplesFrequencies holds the counts collected by getFrequencies; it is the
// value the frequencies template is executed with.
type samplesFrequencies struct {
	// LanguageTotal is the number of sample files summed over all languages.
	LanguageTotal int `json:"language_total,omitempty"`
	// Languages maps a language name to its number of sample files.
	Languages map[string]int `json:"languages,omitempty"`
	// TokensTotal is the number of tokens summed over all languages.
	TokensTotal int `json:"tokens_total,omitempty"`
	// Tokens maps a language name to the occurrence count of each of its tokens.
	Tokens map[string]map[string]int `json:"tokens,omitempty"`
	// LanguageTokens maps a language name to its total number of tokens.
	LanguageTokens map[string]int `json:"language_tokens,omitempty"`
}
|
||||||
|
|
||||||
|
// Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write
|
||||||
|
// the file outPath using frequenciesTmplName as a template.
|
||||||
|
func Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmplName, commit, outPath string) error {
|
||||||
|
freqs, err := getFrequencies(samplesDir)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
buf := &bytes.Buffer{}
|
||||||
|
if err := executeFrequenciesTemplate(buf, freqs, frequenciesTmplPath, frequenciesTmplName, commit); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := formatedWrite(outPath, buf.Bytes()); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
|
||||||
|
entries, err := ioutil.ReadDir(samplesDir)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var languageTotal int
|
||||||
|
var languages = make(map[string]int)
|
||||||
|
var tokensTotal int
|
||||||
|
var tokens = make(map[string]map[string]int)
|
||||||
|
var languageTokens = make(map[string]int)
|
||||||
|
|
||||||
|
for _, entry := range entries {
|
||||||
|
if !entry.IsDir() {
|
||||||
|
log.Println(err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
samples, err := getSamples(samplesDir, entry)
|
||||||
|
if err != nil {
|
||||||
|
log.Println(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(samples) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
samplesTokens, err := getTokens(samples)
|
||||||
|
if err != nil {
|
||||||
|
log.Println(err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
lang := entry.Name()
|
||||||
|
languageTotal += len(samples)
|
||||||
|
languages[lang] = len(samples)
|
||||||
|
tokensTotal += len(samplesTokens)
|
||||||
|
languageTokens[lang] = len(samplesTokens)
|
||||||
|
tokens[lang] = make(map[string]int)
|
||||||
|
for _, token := range samplesTokens {
|
||||||
|
tokens[lang][token]++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &samplesFrequencies{
|
||||||
|
TokensTotal: tokensTotal,
|
||||||
|
LanguageTotal: languageTotal,
|
||||||
|
Tokens: tokens,
|
||||||
|
LanguageTokens: languageTokens,
|
||||||
|
Languages: languages,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
|
||||||
|
const subDir = "filenames"
|
||||||
|
|
||||||
|
samples := []string{}
|
||||||
|
path := filepath.Join(samplesDir, langDir.Name())
|
||||||
|
entries, err := ioutil.ReadDir(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range entries {
|
||||||
|
if entry.Mode().IsRegular() {
|
||||||
|
samples = append(samples, filepath.Join(path, entry.Name()))
|
||||||
|
}
|
||||||
|
|
||||||
|
if entry.IsDir() && entry.Name() == subDir {
|
||||||
|
subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
samples = append(samples, subSamples...)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return samples, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// getSubSamples returns the paths of the regular files directly inside
// samplesDir/langDir/<subLangDir>. Anything that is not a regular file is
// skipped; the error of reading the directory is returned as is.
func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
	dir := filepath.Join(samplesDir, langDir, subLangDir.Name())
	infos, err := ioutil.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	found := []string{}
	for _, info := range infos {
		if !info.Mode().IsRegular() {
			continue
		}
		found = append(found, filepath.Join(dir, info.Name()))
	}

	return found, nil
}
|
||||||
|
|
||||||
|
func getTokens(samples []string) ([]string, error) {
|
||||||
|
tokens := make([]string, 0, 20)
|
||||||
|
var anyError error
|
||||||
|
for _, sample := range samples {
|
||||||
|
content, err := ioutil.ReadFile(sample)
|
||||||
|
if err != nil {
|
||||||
|
anyError = err
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
t := tokenizer.Tokenize(content)
|
||||||
|
tokens = append(tokens, t...)
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens, anyError
|
||||||
|
}
|
||||||
|
|
||||||
|
func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, frequenciesTmplPath, frequenciesTmpl, commit string) error {
|
||||||
|
fmap := template.FuncMap{
|
||||||
|
"getCommit": func() string { return commit },
|
||||||
|
"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
|
||||||
|
"orderKeys": func(m map[string]int) []string {
|
||||||
|
keys := make([]string, 0, len(m))
|
||||||
|
for key := range m {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Strings(keys)
|
||||||
|
return keys
|
||||||
|
},
|
||||||
|
"languageLogProbability": func(language string) string {
|
||||||
|
num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
|
||||||
|
return fmt.Sprintf("%f", num)
|
||||||
|
},
|
||||||
|
"orderMapMapKeys": func(mm map[string]map[string]int) []string {
|
||||||
|
keys := make([]string, 0, len(mm))
|
||||||
|
for key := range mm {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Strings(keys)
|
||||||
|
return keys
|
||||||
|
},
|
||||||
|
"tokenLogProbability": func(language, token string) string {
|
||||||
|
num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
|
||||||
|
return fmt.Sprintf("%f", num)
|
||||||
|
},
|
||||||
|
"quote": strconv.Quote,
|
||||||
|
}
|
||||||
|
|
||||||
|
t := template.Must(template.New(frequenciesTmpl).Funcs(fmap).ParseFiles(frequenciesTmplPath))
|
||||||
|
if err := t.Execute(out, freqs); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
128805
internal/code-generator/generator/test_files/frequencies.gold
Normal file
128805
internal/code-generator/generator/test_files/frequencies.gold
Normal file
File diff suppressed because it is too large
Load Diff
@ -54,7 +54,13 @@ const (
|
|||||||
aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
|
aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
|
||||||
aliasesTmpl = "aliases.go.tmpl"
|
aliasesTmpl = "aliases.go.tmpl"
|
||||||
|
|
||||||
commitPath = ".git/refs/heads/master"
|
// frequencies.go generation
|
||||||
|
samplesDir = ".linguist/samples"
|
||||||
|
frequenciesFile = "frequencies.go"
|
||||||
|
frequenciesTmplPath = "internal/code-generator/assets/frequencies.go.tmpl"
|
||||||
|
frequenciesTmpl = "frequencies.go.tmpl"
|
||||||
|
|
||||||
|
commitPath = ".linguist/.git/refs/heads/master"
|
||||||
)
|
)
|
||||||
|
|
||||||
type generatorArgs struct {
|
type generatorArgs struct {
|
||||||
@ -88,6 +94,10 @@ func main() {
|
|||||||
log.Println(err)
|
log.Println(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := generator.Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmpl, commit, frequenciesFile); err != nil {
|
||||||
|
log.Println(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getCommit(path string) (string, error) {
|
func getCommit(path string) (string, error) {
|
||||||
|
169
internal/tokenizer/tokenize.go
Normal file
169
internal/tokenizer/tokenize.go
Normal file
@ -0,0 +1,169 @@
|
|||||||
|
package tokenizer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"regexp"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Tokenize(content []byte) []string {
|
||||||
|
tokens := make([][]byte, 0, 50)
|
||||||
|
for _, extract := range extractTokens {
|
||||||
|
var extractedTokens [][]byte
|
||||||
|
content, extractedTokens = extract(content)
|
||||||
|
tokens = append(tokens, extractedTokens...)
|
||||||
|
}
|
||||||
|
|
||||||
|
return toString(tokens)
|
||||||
|
}
|
||||||
|
|
||||||
|
// toString converts every byte-slice token into a string, keeping the order.
// The result is never nil, even for a nil or empty input.
func toString(tokens [][]byte) []string {
	out := make([]string, len(tokens))
	for i := range tokens {
		out[i] = string(tokens[i])
	}

	return out
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
|
||||||
|
// The order to must be this
|
||||||
|
extractAndReplaceShebang,
|
||||||
|
extractAndReplaceSGML,
|
||||||
|
skipCommentsAndLiterals,
|
||||||
|
extractAndReplacePunctuation,
|
||||||
|
extractAndReplaceRegular,
|
||||||
|
extractAndReplaceOperator,
|
||||||
|
extractRemainders,
|
||||||
|
}
|
||||||
|
|
||||||
|
reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
|
||||||
|
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")(.*$)`)
|
||||||
|
reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
|
||||||
|
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||||
|
reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
|
||||||
|
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||||
|
reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
|
||||||
|
reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
|
||||||
|
reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
|
||||||
|
reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
|
||||||
|
reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
|
||||||
|
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||||
|
|
||||||
|
regexToSkip = []*regexp.Regexp{
|
||||||
|
// The order must be this
|
||||||
|
reLiteralStringQuotes,
|
||||||
|
reMultilineComment,
|
||||||
|
reSingleLineComment,
|
||||||
|
reLiteralNumber,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
|
||||||
|
var shebangTokens [][]byte
|
||||||
|
matches := reShebang.FindAllSubmatch(content, -1)
|
||||||
|
if matches != nil {
|
||||||
|
shebangTokens = make([][]byte, 0, 2)
|
||||||
|
for _, match := range matches {
|
||||||
|
shebangToken := getShebangToken(match)
|
||||||
|
shebangTokens = append(shebangTokens, shebangToken)
|
||||||
|
}
|
||||||
|
|
||||||
|
reShebang.ReplaceAll(content, []byte(` `))
|
||||||
|
}
|
||||||
|
|
||||||
|
return content, shebangTokens
|
||||||
|
}
|
||||||
|
|
||||||
|
// getShebangToken builds the token for one shebang match: the prefix
// "SHEBANG#!" followed by the first non-empty capture group of matchedShebang
// (index 0, the whole match, is ignored). With no non-empty group only the
// prefix is returned.
func getShebangToken(matchedShebang [][]byte) []byte {
	const prefix = `SHEBANG#!`

	token := []byte(prefix)
	for i, group := range matchedShebang {
		if i == 0 || len(group) == 0 {
			continue
		}

		return append(token, group...)
	}

	return token
}
|
||||||
|
|
||||||
|
// commonExtracAndReplace returns every match of re in content as a token,
// together with a copy of content in which each match is replaced by a single
// space. The token slice is nil when re does not match.
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
	found := re.FindAll(content, -1)
	return re.ReplaceAll(content, []byte(` `)), found
}
|
||||||
|
|
||||||
|
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
|
||||||
|
return commonExtracAndReplace(content, rePunctuation)
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
|
||||||
|
return commonExtracAndReplace(content, reRegularToken)
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
|
||||||
|
return commonExtracAndReplace(content, reOperators)
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
|
||||||
|
var SGMLTokens [][]byte
|
||||||
|
matches := reSGML.FindAllSubmatch(content, -1)
|
||||||
|
if matches != nil {
|
||||||
|
SGMLTokens = make([][]byte, 0, 2)
|
||||||
|
for _, match := range matches {
|
||||||
|
if reSGMLComment.Match(match[0]) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
token := append(match[1], '>')
|
||||||
|
SGMLTokens = append(SGMLTokens, token)
|
||||||
|
attributes := getSGMLAttributes(match[0])
|
||||||
|
SGMLTokens = append(SGMLTokens, attributes...)
|
||||||
|
}
|
||||||
|
|
||||||
|
content = reSGML.ReplaceAll(content, []byte(` `))
|
||||||
|
}
|
||||||
|
|
||||||
|
return content, SGMLTokens
|
||||||
|
}
|
||||||
|
|
||||||
|
func getSGMLAttributes(SGMLTag []byte) [][]byte {
|
||||||
|
var attributes [][]byte
|
||||||
|
matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
|
||||||
|
if matches != nil {
|
||||||
|
attributes = make([][]byte, 0, 5)
|
||||||
|
for _, match := range matches {
|
||||||
|
if len(match[1]) != 0 {
|
||||||
|
attributes = append(attributes, match[1])
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(match[2]) != 0 {
|
||||||
|
loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
|
||||||
|
attributes = append(attributes, loneAttributes...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return attributes
|
||||||
|
}
|
||||||
|
|
||||||
|
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
|
||||||
|
for _, skip := range regexToSkip {
|
||||||
|
content = skip.ReplaceAll(content, []byte(` `))
|
||||||
|
}
|
||||||
|
|
||||||
|
return content, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractRemainders splits content on whitespace and then splits every
// resulting field into its individual UTF-8 sequences, returning each of them
// as a token. content itself is returned unchanged.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	fields := bytes.Fields(content)
	chars := make([][]byte, 0, 3*len(fields))
	for i := range fields {
		chars = append(chars, bytes.Split(fields[i], nil)...)
	}

	return content, chars
}
|
107
internal/tokenizer/tokenize_test.go
Normal file
107
internal/tokenizer/tokenize_test.go
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
package tokenizer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
testContent = `#!/usr/bin/ruby
|
||||||
|
|
||||||
|
#!/usr/bin/env node
|
||||||
|
|
||||||
|
aaa
|
||||||
|
|
||||||
|
#!/usr/bin/env A=B foo=bar awk -f
|
||||||
|
|
||||||
|
#!python
|
||||||
|
|
||||||
|
func Tokenize(content []byte) []string {
|
||||||
|
splitted := bytes.Fields(content)
|
||||||
|
tokens := /* make([]string, 0, len(splitted))
|
||||||
|
no comment -- comment
|
||||||
|
for _, tokenByte := range splitted {
|
||||||
|
token64 := base64.StdEncoding.EncodeToString(tokenByte)
|
||||||
|
tokens = append(tokens, token64)
|
||||||
|
notcatchasanumber3.5
|
||||||
|
}*/
|
||||||
|
othercode
|
||||||
|
/* testing multiple
|
||||||
|
|
||||||
|
multiline comments*/
|
||||||
|
|
||||||
|
<!-- com
|
||||||
|
ment -->
|
||||||
|
<!-- comment 2-->
|
||||||
|
ppp no comment # comment
|
||||||
|
|
||||||
|
"literal1"
|
||||||
|
|
||||||
|
abb (tokenByte, 0xAF02) | ,3.2L
|
||||||
|
|
||||||
|
'literal2' notcatchasanumber3.5
|
||||||
|
|
||||||
|
5 += number * anotherNumber
|
||||||
|
if isTrue && isToo {
|
||||||
|
0b00001000 >> 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
oneBool = 3 <= 2
|
||||||
|
varBool = 3<=2>
|
||||||
|
|
||||||
|
PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");
|
||||||
|
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head>
|
||||||
|
<title id="hola" class="">This is a XHTML sample file</title>
|
||||||
|
<style type="text/css"><![CDATA[
|
||||||
|
#example {
|
||||||
|
background-color: yellow;
|
||||||
|
}
|
||||||
|
]]></style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="example">
|
||||||
|
Just a simple <strong>XHTML</strong> test page.
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>`
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "PUBLIC", "W3C", "DTD", "XHTML", "1", "0",
|
||||||
|
"Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1", "strict", "dtd", "<html>", "<head>", "<title>", "class=",
|
||||||
|
"</title>", "<style>", "<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "<strong>",
|
||||||
|
"</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")", "{", "}", "(", ")", ";", ";", "}",
|
||||||
|
"]", "]", "aaa", "func", "Tokenize", "content", "byte", "string", "splitted", "bytes.Fields", "content", "tokens", "othercode", "ppp",
|
||||||
|
"no", "comment", "abb", "tokenByte", "notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return",
|
||||||
|
"tokens", "oneBool", "varBool", "PyErr_SetString", "PyExc_RuntimeError", "html", "PUBLIC", "xmlns", "id", "class", "This", "is", "a",
|
||||||
|
"XHTML", "sample", "file", "type", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.", "|", "+",
|
||||||
|
"&&", "<", "<", "-", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", ",", ">", "=", ">", "=", "=", ">", "=", ">",
|
||||||
|
":", ">", "=", ">"}
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestTokenize(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
content []byte
|
||||||
|
expected []string
|
||||||
|
}{
|
||||||
|
{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
tokens := Tokenize(test.content)
|
||||||
|
assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
|
||||||
|
for i, expectedToken := range test.expected {
|
||||||
|
assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
var languagesByInterpreter = map[string][]string{
|
var languagesByInterpreter = map[string][]string{
|
||||||
"Rscript": {"R"},
|
"Rscript": {"R"},
|
||||||
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
var languagesType = map[string]Type{
|
var languagesType = map[string]Type{
|
||||||
"1C Enterprise": Programming,
|
"1C Enterprise": Programming,
|
||||||
@ -408,6 +408,7 @@ var languagesType = map[string]Type{
|
|||||||
"Turing": Programming,
|
"Turing": Programming,
|
||||||
"Turtle": Data,
|
"Turtle": Data,
|
||||||
"Twig": Markup,
|
"Twig": Markup,
|
||||||
|
"Type Language": Data,
|
||||||
"TypeScript": Programming,
|
"TypeScript": Programming,
|
||||||
"Unified Parallel C": Programming,
|
"Unified Parallel C": Programming,
|
||||||
"Unity3D Asset": Data,
|
"Unity3D Asset": Data,
|
||||||
|
@ -2,7 +2,7 @@ package slinguist
|
|||||||
|
|
||||||
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
|
||||||
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
// THIS FILE SHOULD NOT BE EDITED BY HAND
|
||||||
// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
|
// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c
|
||||||
|
|
||||||
import "gopkg.in/toqueteos/substring.v1"
|
import "gopkg.in/toqueteos/substring.v1"
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user