diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2f42467..87a5764 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,10 +13,16 @@ jobs: with: go-version: ${{ matrix.go-version }} + - name: Set git on win to use LF + run: | + git config --global core.autocrlf false + git config --global core.eol lf - name: Checkout code uses: actions/checkout@v2 - name: Test run: go test ./... + env: + ENRY_DEBUG: 1 test-oniguruma: strategy: matrix: diff --git a/internal/code-generator/generator/samplesfreq.go b/internal/code-generator/generator/frequencies.go similarity index 69% rename from internal/code-generator/generator/samplesfreq.go rename to internal/code-generator/generator/frequencies.go index 3dd2142..2c6ba74 100644 --- a/internal/code-generator/generator/samplesfreq.go +++ b/internal/code-generator/generator/frequencies.go @@ -7,9 +7,11 @@ import ( "io/ioutil" "log" "math" + "os" "path/filepath" "sort" "strconv" + "strings" "text/template" "github.com/go-enry/go-enry/v2/internal/tokenizer" @@ -31,6 +33,21 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st return err } + if _, ok := os.LookupEnv("ENRY_DEBUG"); ok { + log.Printf("Total samples: %d\n", freqs.LanguageTotal) + log.Printf("Total tokens: %d\n", freqs.TokensTotal) + + keys := make([]string, 0, len(freqs.Languages)) + for k := range freqs.Languages { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Printf(" %s: %d\n", k, freqs.Languages[k]) + } + } + buf := &bytes.Buffer{} if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil { return err @@ -91,49 +108,46 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { }, nil } +// readSamples collects ./samples/ filenames from the Linguist codebase, skipping symlinks. 
func readSamples(samplesLangDir string) ([]string, error) { - const samplesLangFilesDir = "filenames" - sampleFiles, err := ioutil.ReadDir(samplesLangDir) - if err != nil { - return nil, err - } - + const specialSubDir = "filenames" var samples []string - for _, sampleFile := range sampleFiles { - filename := filepath.Join(samplesLangDir, sampleFile.Name()) - if sampleFile.Mode().IsRegular() { - samples = append(samples, filename) - continue - } - if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir { - subSamples, err := readSubSamples(filename) - if err != nil { - return nil, err + err := filepath.Walk(samplesLangDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + fmt.Printf("failure accessing a path %q: %v\n", path, err) + return err + } + if info.IsDir() { + switch info.Name() { + case filepath.Base(samplesLangDir): + return nil + case specialSubDir: + return nil + default: + return filepath.SkipDir } - - samples = append(samples, subSamples...) } + // skip git file symlinks on win and *nix + if isKnownSymlinkInLinguist(path) || !info.Mode().IsRegular() { + return nil + } + samples = append(samples, path) + return nil + }) - } - - return samples, nil + return samples, err } -func readSubSamples(path string) ([]string, error) { - subSamples := []string{} - entries, err := ioutil.ReadDir(path) - if err != nil { - return nil, err - } - - for _, entry := range entries { - if entry.Mode().IsRegular() { - subSamples = append(subSamples, filepath.Join(path, entry.Name())) - } - } - - return subSamples, nil +// isKnownSymlinkInLinguist checks if the file name is on the list of known symlinks. +// On Windows, there is no symlink support in Git [1] and those become regular text files, +// so we have to skip these files manually, maintaining a list here :/ +// 1. 
https://github.com/git-for-windows/git/wiki/Symbolic-Links +// +// $ find -L .linguist/samples -xtype l +func isKnownSymlinkInLinguist(path string) bool { + return strings.HasSuffix(path, filepath.Join("Ant Build System", "filenames", "build.xml")) || + strings.HasSuffix(path, filepath.Join("Markdown", "symlink.md")) } func getTokens(samples []string) ([]string, error) { diff --git a/internal/code-generator/generator/generator_test.go b/internal/code-generator/generator/generator_test.go index 6f4a744..b243abf 100644 --- a/internal/code-generator/generator/generator_test.go +++ b/internal/code-generator/generator/generator_test.go @@ -9,7 +9,10 @@ import ( "strings" "testing" + "github.com/go-enry/go-enry/v2/internal/tokenizer" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" ) @@ -302,9 +305,25 @@ func (s *GeneratorTestSuite) TestGenerationFiles() { expected := normalizeSpaces(string(gold)) actual := normalizeSpaces(string(out)) assert.Equal(s.T(), expected, actual, "Test %s", test.name) + if expected != actual { + s.T().Logf("%s generated is different from %q", test.name, test.wantOut) + s.T().Logf("Expected %q", expected[:400]) + s.T().Logf("Actual %q", actual[:400]) + } + } } +func (s *GeneratorTestSuite) TestTokenizerOnATS() { + const suspiciousSample = "samples/ATS/csv_parse.hats" + sFile := filepath.Join(s.tmpLinguist, suspiciousSample) + content, err := ioutil.ReadFile(sFile) + require.NoError(s.T(), err) + + tokens := tokenizer.Tokenize(content) + assert.Equal(s.T(), 381, len(tokens), "Number of tokens using LF as line endings") +} + // normalizeSpaces returns a copy of str with whitespaces normalized. // We use this to compare generated source as gofmt format may change. 
// E.g for changes between Go 1.10 and 1.11 see diff --git a/internal/code-generator/main.go b/internal/code-generator/main.go index f4d809a..9b3a2a9 100644 --- a/internal/code-generator/main.go +++ b/internal/code-generator/main.go @@ -3,81 +3,84 @@ package main import ( "io/ioutil" "log" + "path/filepath" "github.com/go-enry/go-enry/v2/internal/code-generator/generator" ) -const ( - // languages info file - languagesYAML = ".linguist/lib/linguist/languages.yml" +var ( + // directories + samplesDir = filepath.Join(".linguist", "samples") + libDir = filepath.Join(".linguist", "lib", "linguist") + assetsDir = filepath.Join("internal", "code-generator", "assets") - // linguist's samples directory - samplesDir = ".linguist/samples" + // languages info file + languagesYAML = filepath.Join(libDir, "languages.yml") // extension.go generation - extensionsFile = "data/extension.go" - extensionsTmplPath = "internal/code-generator/assets/extension.go.tmpl" + extensionsFile = filepath.Join("data", "extension.go") + extensionsTmplPath = filepath.Join(assetsDir, "extension.go.tmpl") extensionsTmpl = "extension.go.tmpl" // content.go generation - heuristicsYAML = ".linguist/lib/linguist/heuristics.yml" - contentFile = "data/content.go" - contentTmplPath = "internal/code-generator/assets/content.go.tmpl" + heuristicsYAML = filepath.Join(libDir, "heuristics.yml") + contentFile = filepath.Join("data", "content.go") + contentTmplPath = filepath.Join(assetsDir, "content.go.tmpl") contentTmpl = "content.go.tmpl" // vendor.go generation - vendorYAML = ".linguist/lib/linguist/vendor.yml" - vendorFile = "data/vendor.go" - vendorTmplPath = "internal/code-generator/assets/vendor.go.tmpl" + vendorYAML = filepath.Join(libDir, "vendor.yml") + vendorFile = filepath.Join("data", "vendor.go") + vendorTmplPath = filepath.Join(assetsDir, "vendor.go.tmpl") vendorTmpl = "vendor.go.tmpl" // documentation.go generation - documentationYAML = ".linguist/lib/linguist/documentation.yml" - 
documentationFile = "data/documentation.go" - documentationTmplPath = "internal/code-generator/assets/documentation.go.tmpl" + documentationYAML = filepath.Join(libDir, "documentation.yml") + documentationFile = filepath.Join("data", "documentation.go") + documentationTmplPath = filepath.Join(assetsDir, "documentation.go.tmpl") documentationTmpl = "documentation.go.tmpl" // type.go generation - typeFile = "data/type.go" - typeTmplPath = "internal/code-generator/assets/type.go.tmpl" + typeFile = filepath.Join("data", "type.go") + typeTmplPath = filepath.Join(assetsDir, "type.go.tmpl") typeTmpl = "type.go.tmpl" // interpreter.go generation - interpretersFile = "data/interpreter.go" - interpretersTmplPath = "internal/code-generator/assets/interpreter.go.tmpl" + interpretersFile = filepath.Join("data", "interpreter.go") + interpretersTmplPath = filepath.Join(assetsDir, "interpreter.go.tmpl") interpretersTmpl = "interpreter.go.tmpl" // filename.go generation - filenamesFile = "data/filename.go" - filenamesTmplPath = "internal/code-generator/assets/filename.go.tmpl" + filenamesFile = filepath.Join("data", "filename.go") + filenamesTmplPath = filepath.Join(assetsDir, "filename.go.tmpl") filenamesTmpl = "filename.go.tmpl" // alias.go generation - aliasesFile = "data/alias.go" - aliasesTmplPath = "internal/code-generator/assets/alias.go.tmpl" + aliasesFile = filepath.Join("data", "alias.go") + aliasesTmplPath = filepath.Join(assetsDir, "alias.go.tmpl") aliasesTmpl = "alias.go.tmpl" // frequencies.go generation - frequenciesFile = "data/frequencies.go" - frequenciesTmplPath = "internal/code-generator/assets/frequencies.go.tmpl" + frequenciesFile = filepath.Join("data", "frequencies.go") + frequenciesTmplPath = filepath.Join(assetsDir, "frequencies.go.tmpl") frequenciesTmpl = "frequencies.go.tmpl" // commit.go generation - commitFile = "data/commit.go" - commitTmplPath = "internal/code-generator/assets/commit.go.tmpl" + commitFile = filepath.Join("data", "commit.go") + 
commitTmplPath = filepath.Join(assetsDir, "commit.go.tmpl") commitTmpl = "commit.go.tmpl" // mimeType.go generation - mimeTypeFile = "data/mimeType.go" - mimeTypeTmplPath = "internal/code-generator/assets/mimeType.go.tmpl" + mimeTypeFile = filepath.Join("data", "mimeType.go") + mimeTypeTmplPath = filepath.Join(assetsDir, "mimeType.go.tmpl") mimeTypeTmpl = "mimeType.go.tmpl" // colors.go generation - colorsFile = "data/colors.go" - colorsTmplPath = "internal/code-generator/assets/colors.go.tmpl" + colorsFile = filepath.Join("data", "colors.go") + colorsTmplPath = filepath.Join(assetsDir, "colors.go.tmpl") colorsTmpl = "colors.go.tmpl" - commitPath = ".linguist/.git/HEAD" + commitPath = filepath.Join(".linguist", ".git", "HEAD") ) type generatorFiles struct { @@ -125,7 +128,7 @@ func getCommit(path string) (string, error) { } if string(commit) == "ref: refs/heads/master\n" { - path = ".linguist/.git/" + string(commit[5:len(commit)-1]) + path = filepath.Join(".linguist", ".git", string(commit[5:len(commit)-1])) commit, err = ioutil.ReadFile(path) if err != nil { return "", err