From b0f94ad69335d1350c09630e26e77d8011852c0f Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 25 Mar 2020 14:00:24 +0100 Subject: [PATCH 1/8] generator: CLI tool fix to support win paths On Win `make code-generate` produces unreasonable Bayesian classifier weights from Linguist samples silently, failing only the final classification tests. TestPlan: - go test ./internal/code-generator/... \ -run Test_GeneratorTestSuite -testify.m TestGenerationFiles Signed-off-by: Alexander Bezzubov --- internal/code-generator/main.go | 71 +++++++++++++++++---------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/internal/code-generator/main.go b/internal/code-generator/main.go index f4d809a..93b752b 100644 --- a/internal/code-generator/main.go +++ b/internal/code-generator/main.go @@ -3,81 +3,84 @@ package main import ( "io/ioutil" "log" + "path/filepath" "github.com/go-enry/go-enry/v2/internal/code-generator/generator" ) -const ( - // languages info file - languagesYAML = ".linguist/lib/linguist/languages.yml" +var ( + // directories + samplesDir = filepath.Join(".linguist", "samples") + libDir = filepath.Join(".linguist", "lib", "linguist") + assetsDir = filepath.Join("internal", "code-generator", "assets") - // linguist's samples directory - samplesDir = ".linguist/samples" + // languages info file + languagesYAML = filepath.Join(libDir, "languages.yml") // extension.go generation - extensionsFile = "data/extension.go" - extensionsTmplPath = "internal/code-generator/assets/extension.go.tmpl" + extensionsFile = filepath.Join("data", "extension.go") + extensionsTmplPath = filepath.Join(assetsDir, "extension.go.tmpl") extensionsTmpl = "extension.go.tmpl" // content.go generation - heuristicsYAML = ".linguist/lib/linguist/heuristics.yml" - contentFile = "data/content.go" - contentTmplPath = "internal/code-generator/assets/content.go.tmpl" + heuristicsYAML = filepath.Join(libDir, "heuristics.yml") + contentFile = filepath.Join("data", "content.go") 
+ contentTmplPath = filepath.Join(assetsDir, "content.go.tmpl") contentTmpl = "content.go.tmpl" // vendor.go generation - vendorYAML = ".linguist/lib/linguist/vendor.yml" - vendorFile = "data/vendor.go" - vendorTmplPath = "internal/code-generator/assets/vendor.go.tmpl" + vendorYAML = filepath.Join(libDir, "vendor.yml") + vendorFile = filepath.Join("data", "vendor.go") + vendorTmplPath = filepath.Join(assetsDir, "vendor.go.tmpl") vendorTmpl = "vendor.go.tmpl" // documentation.go generation - documentationYAML = ".linguist/lib/linguist/documentation.yml" - documentationFile = "data/documentation.go" - documentationTmplPath = "internal/code-generator/assets/documentation.go.tmpl" + documentationYAML = filepath.Join(libDir, "documentation.yml") + documentationFile = filepath.Join("data", "documentation.go") + documentationTmplPath = filepath.Join(assetsDir, "documentation.go.tmpl") documentationTmpl = "documentation.go.tmpl" // type.go generation - typeFile = "data/type.go" - typeTmplPath = "internal/code-generator/assets/type.go.tmpl" + typeFile = filepath.Join("data", "type.go") + typeTmplPath = filepath.Join(assetsDir, "type.go.tmpl") typeTmpl = "type.go.tmpl" // interpreter.go generation - interpretersFile = "data/interpreter.go" - interpretersTmplPath = "internal/code-generator/assets/interpreter.go.tmpl" + interpretersFile = filepath.Join("data", "interpreter.go") + interpretersTmplPath = filepath.Join(assetsDir, "interpreter.go.tmpl") interpretersTmpl = "interpreter.go.tmpl" // filename.go generation - filenamesFile = "data/filename.go" - filenamesTmplPath = "internal/code-generator/assets/filename.go.tmpl" + filenamesFile = filepath.Join("data", "filename.go") + filenamesTmplPath = filepath.Join(assetsDir, "filename.go.tmpl") filenamesTmpl = "filename.go.tmpl" // alias.go generation - aliasesFile = "data/alias.go" - aliasesTmplPath = "internal/code-generator/assets/alias.go.tmpl" + aliasesFile = filepath.Join("data", "alias.go") + aliasesTmplPath = 
filepath.Join(assetsDir, "alias.go.tmpl") aliasesTmpl = "alias.go.tmpl" // frequencies.go generation - frequenciesFile = "data/frequencies.go" - frequenciesTmplPath = "internal/code-generator/assets/frequencies.go.tmpl" + frequenciesFile = filepath.Join("data", "frequencies.go") + frequenciesTmplPath = filepath.Join(assetsDir, "frequencies.go.tmpl") frequenciesTmpl = "frequencies.go.tmpl" // commit.go generation - commitFile = "data/commit.go" - commitTmplPath = "internal/code-generator/assets/commit.go.tmpl" + commitFile = filepath.Join("data", "commit.go") + commitTmplPath = filepath.Join(assetsDir, "commit.go.tmpl") commitTmpl = "commit.go.tmpl" // mimeType.go generation - mimeTypeFile = "data/mimeType.go" - mimeTypeTmplPath = "internal/code-generator/assets/mimeType.go.tmpl" + mimeTypeFile = filepath.Join("data", "mimeType.go") + mimeTypeTmplPath = filepath.Join(assetsDir, "mimeType.go.tmpl") mimeTypeTmpl = "mimeType.go.tmpl" // colors.go generation - colorsFile = "data/colors.go" - colorsTmplPath = "internal/code-generator/assets/colors.go.tmpl" + colorsFile = filepath.Join("data", "colors.go") + colorsTmplPath = filepath.Join(assetsDir, "colors.go.tmpl") colorsTmpl = "colors.go.tmpl" - commitPath = ".linguist/.git/HEAD" + commitPath = filepath.Join(".linguist", ".git", "HEAD") ) type generatorFiles struct { @@ -125,7 +128,7 @@ func getCommit(path string) (string, error) { } if string(commit) == "ref: refs/heads/master\n" { - path = ".linguist/.git/" + string(commit[5:len(commit)-1]) + path = filepath.Join(".linguist", ".git/", string(commit[5:len(commit)-1])) commit, err = ioutil.ReadFile(path) if err != nil { return "", err From 3a5f4b2db17c3696bcaf2b74c80782d034ca1c94 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 25 Mar 2020 14:20:26 +0100 Subject: [PATCH 2/8] generator: mode debug output in case of failure Signed-off-by: Alexander Bezzubov --- internal/code-generator/generator/generator_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff 
--git a/internal/code-generator/generator/generator_test.go b/internal/code-generator/generator/generator_test.go index 6f4a744..4f8b105 100644 --- a/internal/code-generator/generator/generator_test.go +++ b/internal/code-generator/generator/generator_test.go @@ -302,6 +302,12 @@ func (s *GeneratorTestSuite) TestGenerationFiles() { expected := normalizeSpaces(string(gold)) actual := normalizeSpaces(string(out)) assert.Equal(s.T(), expected, actual, "Test %s", test.name) + if expected != actual { + s.T().Logf("%s generated is different from %q", test.name, test.wantOut) + s.T().Logf("Expected %q", expected[:400]) + s.T().Logf("Actual %q", actual[:400]) + } + } } From b78e4423f098ba1f04aee85e8313286617c789da Mon Sep 17 00:00:00 2001 From: Alexander Date: Wed, 25 Mar 2020 19:27:46 +0100 Subject: [PATCH 3/8] generator: drop platform-specific separator Co-Authored-By: Lauris BH --- internal/code-generator/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/code-generator/main.go b/internal/code-generator/main.go index 93b752b..9b3a2a9 100644 --- a/internal/code-generator/main.go +++ b/internal/code-generator/main.go @@ -128,7 +128,7 @@ func getCommit(path string) (string, error) { } if string(commit) == "ref: refs/heads/master\n" { - path = filepath.Join(".linguist", ".git/", string(commit[5:len(commit)-1])) + path = filepath.Join(".linguist", ".git", string(commit[5:len(commit)-1])) commit, err = ioutil.ReadFile(path) if err != nil { return "", err From 78eee0cf7e3f5caeac5755da8cd6bf2390c8e13c Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sat, 28 Mar 2020 20:22:01 +0100 Subject: [PATCH 4/8] generator: flag to debug building of bayesian classifier It seems that reading ./samples/ from Linguist consumes a different number of files from filesystem on different OSes. This change adds ENRY_DEBUG env var to print some debug output about calculations of token stats from samples. 
TestPlan: - ENRY_DEBUG=1 go test -v ./internal/code-generator/generator \ -run Test_GeneratorTestSuite -testify.m TestGenerationFiles Signed-off-by: Alexander Bezzubov --- internal/code-generator/generator/samplesfreq.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/internal/code-generator/generator/samplesfreq.go b/internal/code-generator/generator/samplesfreq.go index 3dd2142..ca695b7 100644 --- a/internal/code-generator/generator/samplesfreq.go +++ b/internal/code-generator/generator/samplesfreq.go @@ -7,6 +7,7 @@ import ( "io/ioutil" "log" "math" + "os" "path/filepath" "sort" "strconv" @@ -31,6 +32,21 @@ func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit st return err } + if _, ok := os.LookupEnv("ENRY_DEBUG"); ok { + log.Printf("Total samples: %d\n", freqs.LanguageTotal) + log.Printf("Total tokens: %d\n", freqs.TokensTotal) + + keys := make([]string, 0, len(freqs.Languages)) + for k := range freqs.Languages { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Printf(" %s: %d\n", k, freqs.Languages[k]) + } + } + buf := &bytes.Buffer{} if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil { return err From 9c082eb2d4615f6e9d96f400d0783d2b991ed85e Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sat, 28 Mar 2020 20:36:48 +0100 Subject: [PATCH 5/8] ci: add ENRY_DEBUG flag Signed-off-by: Alexander Bezzubov --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2f42467..8ae3dd5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,6 +17,8 @@ jobs: uses: actions/checkout@v2 - name: Test run: go test ./... 
+ env: + ENRY_DEBUG: 1 test-oniguruma: strategy: matrix: From 9be0211f04246267b0f54e74712560c32aa3354e Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sat, 28 Mar 2020 20:27:50 +0100 Subject: [PATCH 6/8] generator: skip symlinks on *nix and win As Git on win does not support symlinks [1], we have to hard-code the paths to files under ./samples/ in Linguist codebase that are known to be a symlink. 1. https://github.com/git-for-windows/git/wiki/Symbolic-Links TestPlan: - go test ./internal/code-generator/generator -run Test_GeneratorTestSuite Signed-off-by: Alexander Bezzubov --- .../{samplesfreq.go => frequencies.go} | 68 +++++++++---------- 1 file changed, 33 insertions(+), 35 deletions(-) rename internal/code-generator/generator/{samplesfreq.go => frequencies.go} (76%) diff --git a/internal/code-generator/generator/samplesfreq.go b/internal/code-generator/generator/frequencies.go similarity index 76% rename from internal/code-generator/generator/samplesfreq.go rename to internal/code-generator/generator/frequencies.go index ca695b7..2c6ba74 100644 --- a/internal/code-generator/generator/samplesfreq.go +++ b/internal/code-generator/generator/frequencies.go @@ -11,6 +11,7 @@ import ( "path/filepath" "sort" "strconv" + "strings" "text/template" "github.com/go-enry/go-enry/v2/internal/tokenizer" @@ -107,49 +108,46 @@ func getFrequencies(samplesDir string) (*samplesFrequencies, error) { }, nil } +// readSamples collects ./samples/ filenames from the Linguist codebase, skipping symlinks.
func readSamples(samplesLangDir string) ([]string, error) { - const samplesLangFilesDir = "filenames" - sampleFiles, err := ioutil.ReadDir(samplesLangDir) - if err != nil { - return nil, err - } - + const specialSubDir = "filenames" var samples []string - for _, sampleFile := range sampleFiles { - filename := filepath.Join(samplesLangDir, sampleFile.Name()) - if sampleFile.Mode().IsRegular() { - samples = append(samples, filename) - continue - } - if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir { - subSamples, err := readSubSamples(filename) - if err != nil { - return nil, err + err := filepath.Walk(samplesLangDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + fmt.Printf("failure accessing a path %q: %v\n", path, err) + return err + } + if info.IsDir() { + switch info.Name() { + case filepath.Base(samplesLangDir): + return nil + case specialSubDir: + return nil + default: + return filepath.SkipDir } - - samples = append(samples, subSamples...) } + // skip git file symlinks on win and *nix + if isKnownSymlinkInLinguist(path) || !info.Mode().IsRegular() { + return nil + } + samples = append(samples, path) + return nil + }) - } - - return samples, nil + return samples, err } -func readSubSamples(path string) ([]string, error) { - subSamples := []string{} - entries, err := ioutil.ReadDir(path) - if err != nil { - return nil, err - } - - for _, entry := range entries { - if entry.Mode().IsRegular() { - subSamples = append(subSamples, filepath.Join(path, entry.Name())) - } - } - - return subSamples, nil +// isKnownSymlinkInLinguist checks if the file name is on the list of known symlinks. +// On Windows, there is no symlink support in Git [1] and those become regular text files, +// so we have to skip these files manually, maintaining a list here :/ +// 1.
https://github.com/git-for-windows/git/wiki/Symbolic-Links +// +// $ find -L .linguist/samples -xtype l +func isKnownSymlinkInLinguist(path string) bool { + return strings.HasSuffix(path, filepath.Join("Ant Build System", "filenames", "build.xml")) || + strings.HasSuffix(path, filepath.Join("Markdown", "symlink.md")) } func getTokens(samples []string) ([]string, error) { From 3ea961e5ab26e6f097523b62aab3376452f477b9 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sat, 28 Mar 2020 22:47:57 +0100 Subject: [PATCH 7/8] generator: change-detector tests on EOL-dependant sample Signed-off-by: Alexander Bezzubov --- internal/code-generator/generator/generator_test.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/internal/code-generator/generator/generator_test.go b/internal/code-generator/generator/generator_test.go index 4f8b105..b243abf 100644 --- a/internal/code-generator/generator/generator_test.go +++ b/internal/code-generator/generator/generator_test.go @@ -9,7 +9,10 @@ import ( "strings" "testing" + "github.com/go-enry/go-enry/v2/internal/tokenizer" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" ) @@ -311,6 +314,16 @@ func (s *GeneratorTestSuite) TestGenerationFiles() { } } +func (s *GeneratorTestSuite) TestTokenizerOnATS() { + const suspiciousSample = "samples/ATS/csv_parse.hats" + sFile := filepath.Join(s.tmpLinguist, suspiciousSample) + content, err := ioutil.ReadFile(sFile) + require.NoError(s.T(), err) + + tokens := tokenizer.Tokenize(content) + assert.Equal(s.T(), 381, len(tokens), "Number of tokens using LF as line endings") +} + // normalizeSpaces returns a copy of str with whitespaces normalized. // We use this to compare generated source as gofmt format may change. 
// E.g for changes between Go 1.10 and 1.11 see From 172486906ac876ce3943181733d112c2da0ccaa1 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sun, 29 Mar 2020 19:22:56 +0200 Subject: [PATCH 8/8] ci: force git to use LF on win to pass tests on linguist samples This mitigates the problem that tokenizer uses regex that matches platform-specific line endings TestPlan: - go test ./internal/code-generator/generator \ -run Test_GeneratorTestSuite -testify.m TestTokenizerOnATS Signed-off-by: Alexander Bezzubov --- .github/workflows/test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8ae3dd5..87a5764 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,6 +13,10 @@ jobs: with: go-version: ${{ matrix.go-version }} + - name: Set git on win to use LF + run: | + git config --global core.autocrlf false + git config --global core.eol lf - name: Checkout code uses: actions/checkout@v2 - name: Test