tartrazine/internal/code-generator/generator/frequencies.go
Josh Soref d4d3d66352 spelling: skipping
Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
2022-10-06 13:25:49 -04:00

203 lines
5.3 KiB
Go

package generator
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"text/template"
"github.com/go-enry/go-enry/v2/internal/tokenizer"
)
type samplesFrequencies struct {
LanguageTotal int `json:"language_total,omitempty"`
Languages map[string]int `json:"languages,omitempty"`
TokensTotal int `json:"tokens_total,omitempty"`
Tokens map[string]map[string]int `json:"tokens,omitempty"`
LanguageTokens map[string]int `json:"language_tokens,omitempty"`
}
// Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write
// the file outPath using tmplName as a template. It complies with type File signature.
func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error {
freqs, err := getFrequencies(samplesDir)
if err != nil {
return err
}
if _, ok := os.LookupEnv("ENRY_DEBUG"); ok {
log.Printf("Total samples: %d\n", freqs.LanguageTotal)
log.Printf("Total tokens: %d\n", freqs.TokensTotal)
keys := make([]string, 0, len(freqs.Languages))
for k := range freqs.Languages {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
fmt.Printf(" %s: %d\n", k, freqs.Languages[k])
}
}
buf := &bytes.Buffer{}
if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil {
return err
}
return formatedWrite(outPath, buf.Bytes())
}
func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
langDirs, err := ioutil.ReadDir(samplesDir)
if err != nil {
return nil, err
}
var languageTotal int
var languages = make(map[string]int)
var tokensTotal int
var tokens = make(map[string]map[string]int)
var languageTokens = make(map[string]int)
for _, langDir := range langDirs {
if !langDir.IsDir() {
continue
}
lang := langDir.Name()
samples, err := readSamples(filepath.Join(samplesDir, lang))
if err != nil {
log.Println(err)
}
if len(samples) == 0 {
continue
}
samplesTokens, err := getTokens(samples)
if err != nil {
log.Println(err)
continue
}
languageTotal += len(samples)
languages[lang] = len(samples)
tokensTotal += len(samplesTokens)
languageTokens[lang] = len(samplesTokens)
tokens[lang] = make(map[string]int)
for _, token := range samplesTokens {
tokens[lang][token]++
}
}
return &samplesFrequencies{
TokensTotal: tokensTotal,
LanguageTotal: languageTotal,
Tokens: tokens,
LanguageTokens: languageTokens,
Languages: languages,
}, nil
}
// readSamples collects ./samples/ filenames from the Linguist codebase, skipping symlinks.
func readSamples(samplesLangDir string) ([]string, error) {
const specialSubDir = "filenames"
var samples []string
err := filepath.Walk(samplesLangDir, func(path string, info os.FileInfo, err error) error {
if err != nil {
fmt.Printf("failure accessing a path %q: %v\n", path, err)
return err
}
if info.IsDir() {
switch info.Name() {
case filepath.Base(samplesLangDir):
return nil
case specialSubDir:
return nil
default:
return filepath.SkipDir
}
}
// skip git file symlinks on win and *nix
if isKnownSymlinkInLinguist(path) || !info.Mode().IsRegular() {
return nil
}
samples = append(samples, path)
return nil
})
return samples, err
}
// isKnownSymlinkInLinguist checks if the file name is on the list of known symlinks.
// On Windows, there is no symlink support in Git [1] and those become regular text files,
// so we have to skip these files manually, maintaining a list here :/
// 1. https://github.com/git-for-windows/git/wiki/Symbolic-Links
//
// $ find -L .linguist/samples -xtype l
func isKnownSymlinkInLinguist(path string) bool {
return strings.HasSuffix(path, filepath.Join("Ant Build System", "filenames", "build.xml")) ||
strings.HasSuffix(path, filepath.Join("Markdown", "symlink.md"))
}
func getTokens(samples []string) ([]string, error) {
tokens := make([]string, 0, 20)
var anyError error
for _, sample := range samples {
content, err := ioutil.ReadFile(sample)
if err != nil {
anyError = err
continue
}
t := tokenizer.Tokenize(content)
tokens = append(tokens, t...)
}
return tokens, anyError
}
func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error {
fmap := template.FuncMap{
"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
"orderKeys": func(m map[string]int) []string {
keys := make([]string, 0, len(m))
for key := range m {
keys = append(keys, key)
}
sort.Strings(keys)
return keys
},
"languageLogProbability": func(language string) string {
num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
return fmt.Sprintf("%f", num)
},
"orderMapMapKeys": func(mm map[string]map[string]int) []string {
keys := make([]string, 0, len(mm))
for key := range mm {
keys = append(keys, key)
}
sort.Strings(keys)
return keys
},
"tokenLogProbability": func(language, token string) string {
num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
return fmt.Sprintf("%f", num)
},
"quote": strconv.Quote,
}
return executeTemplate(out, tmplName, tmplPath, commit, fmap, freqs)
}