Added frequencies.go generation

This commit is contained in:
Manuel Carmona
2017-05-25 12:33:26 +02:00
parent a63c8bdf81
commit fcf30a07c8
17 changed files with 258239 additions and 24 deletions

View File

@ -0,0 +1,24 @@
{{- /*
Template for the generated frequencies source file.

It renders the DefaultClassifier literal from the data passed to Execute,
reading its Languages, Tokens and TokensTotal fields, and relies on these
template functions being registered: getCommit, orderKeys, orderMapMapKeys,
languageLogProbability, tokenLogProbability, quote and toFloat64.

This comment is wrapped in trim markers so it adds nothing to the output.
*/ -}}
package slinguist
// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
// THIS FILE SHOULD NOT BE EDITED BY HAND
// Extracted from github/linguist commit: {{ getCommit }}
var DefaultClassifier Classifier = &classifier{
languagesLogProbabilities: map[string]float64{
{{ $freqs := . -}}
{{range $index, $language := orderKeys .Languages -}}
"{{ $language }}": {{ languageLogProbability $language -}},
{{end -}}
},
tokensLogProbabilities: map[string]map[string]float64{
{{range $index, $language := orderMapMapKeys .Tokens -}}
"{{ $language }}": map[string]float64{
{{range $i, $token := index $freqs.Tokens $language | orderKeys -}}
{{ quote $token }}: {{ tokenLogProbability $language $token }},
{{end -}}
},
{{end -}}
},
tokensTotal: {{ toFloat64 .TokensTotal -}},
}

View File

@ -21,6 +21,14 @@ func FromFile(fileToParse, outPath, tmplPath, tmplName, commit string, generate
return err
}
if err := formatedWrite(outPath, source); err != nil {
return err
}
return nil
}
func formatedWrite(outPath string, source []byte) error {
formatedSource, err := format.Source(source)
if err != nil {
return err

View File

@ -4,15 +4,20 @@ import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
)
const (
lingustURL = "https://github.com/github/linguist.git"
commitTree = "60f864a138650dd17fafc94814be9ee2d3aaef8c"
commitTest = "fe8b44ab8a225b1ffa75b983b916ea22fee5b6f7"
// Languages test
// Extensions test
extensionsTestFile = "test_files/extensions.test.yml"
extensionsGold = "test_files/extensions.gold"
extensionsTestTmplPath = "../assets/extensions.go.tmpl"
@ -59,9 +64,48 @@ const (
aliasesGold = "test_files/aliases.gold"
aliasesTestTmplPath = "../assets/aliases.go.tmpl"
aliasesTestTmplName = "aliases.go.tmpl"
// Frequencies test
frequenciesTestDir = "/samples"
frequenciesGold = "test_files/frequencies.gold"
frequenciesTestTmplPath = "../assets/frequencies.go.tmpl"
frequenciesTestTmplName = "frequencies.go.tmpl"
)
func TestFromFile(t *testing.T) {
// GeneratorTestSuite is the testify suite for the code generator tests. It
// carries the location of the linguist checkout shared by its test methods.
type GeneratorTestSuite struct {
	suite.Suite
	// tmpLinguist is the temporary directory the linguist repository is cloned
	// into by SetupSuite; TearDownSuite removes it.
	tmpLinguist string
}
// SetupSuite clones the linguist repository into a fresh temporary directory,
// records that directory in g.tmpLinguist and checks out commitTree in it.
func (g *GeneratorTestSuite) SetupSuite() {
	tmpLinguist, err := ioutil.TempDir("", "linguist-")
	assert.NoError(g.T(), err)
	g.tmpLinguist = tmpLinguist

	clone := exec.Command("git", "clone", lingustURL, tmpLinguist)
	assert.NoError(g.T(), clone.Run())

	// Run the checkout inside the clone through Cmd.Dir rather than os.Chdir:
	// the working directory is process-wide state, and a failure between the
	// two Chdir calls would leave every later test (which uses relative
	// paths) running from the wrong directory.
	checkout := exec.Command("git", "checkout", commitTree)
	checkout.Dir = tmpLinguist
	assert.NoError(g.T(), checkout.Run())
}
// TearDownSuite deletes the directory stored in g.tmpLinguist together with
// everything below it, and reports a removal failure on the suite's test.
func (g *GeneratorTestSuite) TearDownSuite() {
	assert.NoError(g.T(), os.RemoveAll(g.tmpLinguist))
}
func (g *GeneratorTestSuite) TestFromFile() {
tests := []struct {
name string
fileToParse string
@ -145,20 +189,57 @@ func TestFromFile(t *testing.T) {
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gold, err := ioutil.ReadFile(tt.wantOut)
assert.NoError(t, err)
for _, test := range tests {
gold, err := ioutil.ReadFile(test.wantOut)
assert.NoError(g.T(), err)
outPath, err := ioutil.TempFile("/tmp", "generator-test-")
assert.NoError(t, err)
defer os.Remove(outPath.Name())
outPath, err := ioutil.TempFile("/tmp", "generator-test-")
assert.NoError(g.T(), err)
defer os.Remove(outPath.Name())
err = FromFile(tt.fileToParse, outPath.Name(), tt.tmplPath, tt.tmplName, tt.commit, tt.generate)
assert.NoError(t, err)
out, err := ioutil.ReadFile(outPath.Name())
assert.NoError(t, err)
assert.EqualValues(t, gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(tt.wantOut)))
})
err = FromFile(test.fileToParse, outPath.Name(), test.tmplPath, test.tmplName, test.commit, test.generate)
assert.NoError(g.T(), err)
out, err := ioutil.ReadFile(outPath.Name())
assert.NoError(g.T(), err)
assert.EqualValues(g.T(), gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(test.wantOut)))
}
}
// TestFrequencies runs Frequencies over the samples directory of the linguist
// checkout and compares the generated file with the gold file.
func (g *GeneratorTestSuite) TestFrequencies() {
	tests := []struct {
		name       string
		samplesDir string
		tmplPath   string
		tmplName   string
		commit     string
		wantOut    string
	}{
		{
			name:       "Frequencies_1",
			samplesDir: filepath.Join(g.tmpLinguist, frequenciesTestDir),
			tmplPath:   frequenciesTestTmplPath,
			tmplName:   frequenciesTestTmplName,
			commit:     commitTree,
			wantOut:    frequenciesGold,
		},
	}

	for _, test := range tests {
		// Each case runs in its own function so the deferred removal fires
		// when the case ends instead of piling up until the test returns.
		func() {
			gold, err := ioutil.ReadFile(test.wantOut)
			assert.NoError(g.T(), err)

			outFile, err := ioutil.TempFile("/tmp", "frequencies-test-")
			assert.NoError(g.T(), err)
			defer os.Remove(outFile.Name())
			// Only the file name is handed on, so release the descriptor now
			// instead of leaking it for the rest of the test run.
			assert.NoError(g.T(), outFile.Close())

			err = Frequencies(test.samplesDir, test.tmplPath, test.tmplName, test.commit, outFile.Name())
			assert.NoError(g.T(), err)

			out, err := ioutil.ReadFile(outFile.Name())
			assert.NoError(g.T(), err)
			// Show the expected content in the message, not the gold file path.
			assert.EqualValues(g.T(), gold, out, fmt.Sprintf("Frequencies() = %v, want %v", string(out), string(gold)))
		}()
	}
}
func TestGeneratorTestSuite(t *testing.T) {
suite.Run(t, new(GeneratorTestSuite))
}

View File

@ -0,0 +1,202 @@
package generator
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"text/template"
"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
)
// samplesFrequencies holds the counts gathered from a samples directory; it is
// the data the frequencies template is executed with.
type samplesFrequencies struct {
	// LanguageTotal is the number of sample files summed over all languages.
	LanguageTotal int `json:"language_total,omitempty"`
	// Languages maps a language name to its number of sample files.
	Languages map[string]int `json:"languages,omitempty"`
	// TokensTotal is the number of tokens summed over all languages.
	TokensTotal int `json:"tokens_total,omitempty"`
	// Tokens maps a language name to the occurrence count of each of its tokens.
	Tokens map[string]map[string]int `json:"tokens,omitempty"`
	// LanguageTokens maps a language name to its total number of tokens.
	LanguageTokens map[string]int `json:"language_tokens,omitempty"`
}
// Frequencies generates the frequencies source file for a samples tree. It
// collects language and token counts from the directories under samplesDir,
// renders them with the template frequenciesTmplName parsed from
// frequenciesTmplPath (commit is made available to the template), and passes
// the rendered bytes to formatedWrite together with outPath. The first error
// met along the way is returned.
func Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmplName, commit, outPath string) error {
	freqs, err := getFrequencies(samplesDir)
	if err != nil {
		return err
	}

	var rendered bytes.Buffer
	err = executeFrequenciesTemplate(&rendered, freqs, frequenciesTmplPath, frequenciesTmplName, commit)
	if err != nil {
		return err
	}

	return formatedWrite(outPath, rendered.Bytes())
}
// getFrequencies scans the directories directly under samplesDir, treating
// each directory name as a language, and counts per language its sample files
// and the occurrences of every token found in them.
//
// It returns an error only when samplesDir itself cannot be read. A language
// whose samples cannot be listed or tokenized is logged and left out of the
// result, and a language without samples is skipped.
func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
	entries, err := ioutil.ReadDir(samplesDir)
	if err != nil {
		return nil, err
	}

	var languageTotal, tokensTotal int
	languages := make(map[string]int)
	tokens := make(map[string]map[string]int)
	languageTokens := make(map[string]int)

	for _, entry := range entries {
		// Only directories name a language. A plain file here is not an
		// error, so skip it quietly rather than logging a nil error.
		if !entry.IsDir() {
			continue
		}

		samples, err := getSamples(samplesDir, entry)
		if err != nil {
			log.Println(err)
			continue
		}

		if len(samples) == 0 {
			continue
		}

		samplesTokens, err := getTokens(samples)
		if err != nil {
			log.Println(err)
			continue
		}

		lang := entry.Name()
		languageTotal += len(samples)
		languages[lang] = len(samples)
		tokensTotal += len(samplesTokens)
		languageTokens[lang] = len(samplesTokens)

		counts := make(map[string]int)
		for _, token := range samplesTokens {
			counts[token]++
		}
		tokens[lang] = counts
	}

	return &samplesFrequencies{
		TokensTotal:    tokensTotal,
		LanguageTotal:  languageTotal,
		Tokens:         tokens,
		LanguageTokens: languageTokens,
		Languages:      languages,
	}, nil
}
// getSamples lists the sample files of one language. It returns the paths of
// the regular files directly inside samplesDir/<langDir name>, plus the
// regular files of its "filenames" subdirectory when there is one. Any other
// subdirectory is ignored. A read failure yields a nil slice and the error.
func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
	const subDir = "filenames"

	langPath := filepath.Join(samplesDir, langDir.Name())
	entries, err := ioutil.ReadDir(langPath)
	if err != nil {
		return nil, err
	}

	samples := make([]string, 0)
	for _, entry := range entries {
		switch {
		case entry.Mode().IsRegular():
			samples = append(samples, filepath.Join(langPath, entry.Name()))
		case entry.IsDir() && entry.Name() == subDir:
			nested, err := getSubSamples(samplesDir, langDir.Name(), entry)
			if err != nil {
				return nil, err
			}
			samples = append(samples, nested...)
		}
	}

	return samples, nil
}
// getSubSamples returns the paths of the regular files found directly inside
// samplesDir/langDir/<subLangDir name>. Directories and other non-regular
// entries are left out. When the directory cannot be read the result is a nil
// slice and the read error.
func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
	dir := filepath.Join(samplesDir, langDir, subLangDir.Name())

	entries, err := ioutil.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	files := make([]string, 0)
	for _, entry := range entries {
		if !entry.Mode().IsRegular() {
			continue
		}
		files = append(files, filepath.Join(dir, entry.Name()))
	}

	return files, nil
}
// getTokens reads every file in samples, tokenizes its content and returns
// all tokens concatenated in sample order. A file that cannot be read is
// skipped; in that case the tokens of the remaining files are still returned,
// together with the error of the last file that failed.
func getTokens(samples []string) ([]string, error) {
	var lastErr error
	collected := make([]string, 0, 20)

	for _, path := range samples {
		content, readErr := ioutil.ReadFile(path)
		if readErr == nil {
			collected = append(collected, tokenizer.Tokenize(content)...)
			continue
		}
		lastErr = readErr
	}

	return collected, lastErr
}
// executeFrequenciesTemplate parses the template file at frequenciesTmplPath
// and executes the template named frequenciesTmpl with freqs as data, writing
// the result to out.
//
// The template can call these functions:
//   - getCommit: returns commit.
//   - toFloat64: formats an int as a float with "%f".
//   - orderKeys / orderMapMapKeys: return the sorted keys of a map.
//   - languageLogProbability: natural log of a language's share of all samples.
//   - tokenLogProbability: natural log of a token's share of its language's tokens.
//   - quote: strconv.Quote.
//
// It returns the error of parsing or executing the template, if any.
func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, frequenciesTmplPath, frequenciesTmpl, commit string) error {
	fmap := template.FuncMap{
		"getCommit": func() string { return commit },
		"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
		"orderKeys": func(m map[string]int) []string {
			keys := make([]string, 0, len(m))
			for key := range m {
				keys = append(keys, key)
			}

			sort.Strings(keys)
			return keys
		},
		"languageLogProbability": func(language string) string {
			num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
			return fmt.Sprintf("%f", num)
		},
		"orderMapMapKeys": func(mm map[string]map[string]int) []string {
			keys := make([]string, 0, len(mm))
			for key := range mm {
				keys = append(keys, key)
			}

			sort.Strings(keys)
			return keys
		},
		"tokenLogProbability": func(language, token string) string {
			num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
			return fmt.Sprintf("%f", num)
		},
		"quote": strconv.Quote,
	}

	// A missing or malformed template is an ordinary failure for a function
	// that already returns an error, so report it instead of panicking the
	// way template.Must would.
	t, err := template.New(frequenciesTmpl).Funcs(fmap).ParseFiles(frequenciesTmplPath)
	if err != nil {
		return err
	}

	return t.Execute(out, freqs)
}

File diff suppressed because it is too large Load Diff

View File

@ -54,7 +54,13 @@ const (
aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
aliasesTmpl = "aliases.go.tmpl"
commitPath = ".git/refs/heads/master"
// frequencies.go generation
samplesDir = ".linguist/samples"
frequenciesFile = "frequencies.go"
frequenciesTmplPath = "internal/code-generator/assets/frequencies.go.tmpl"
frequenciesTmpl = "frequencies.go.tmpl"
commitPath = ".linguist/.git/refs/heads/master"
)
type generatorArgs struct {
@ -88,6 +94,10 @@ func main() {
log.Println(err)
}
}
if err := generator.Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmpl, commit, frequenciesFile); err != nil {
log.Println(err)
}
}
func getCommit(path string) (string, error) {