From 9ce235f39ee1209f372b03beb2c99ded053d406e Mon Sep 17 00:00:00 2001 From: Alex Bezzubov Date: Fri, 3 Mar 2023 23:17:32 +0100 Subject: [PATCH] test: add new corner cases for linguist v7.23 test plan - go test -run '^Test_EnryOnLinguistCorpus$' github.com/go-enry/go-enry/v2 --- README.md | 4 ++++ linguist_corpus_test.go | 10 ++++++++-- utils_test.go | 3 ++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ff769e6..047232d 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,10 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample - [Heuristics for ".csc", ".gsc" and ".gsh" extension](https://github.com/github/linguist/blob/7469c7982d93f2ad922230d712f586a353dc1a42/lib/linguist/heuristics.yml#L650-L651) in GSC could not be parsed, due to unsupported non-backtracking subexpressions in RE2 regexp engine. +- [Heuristic for ".txt"](https://github.com/github/linguist/blob/bf853f1c663903e3ee35935189760191f1c45e1c/lib/linguist/heuristics.yml#L680-L702) detecting 'Adblock Filter List' could not be parsed, due to regexp syntax not supported by the RE2 regexp engine. + +- [IsVendor('bootstrap.css') == false](https://github.com/github/linguist/blob/v7.23.0/lib/linguist/vendor.yml#L77): as of Linguist v7.23, this is the first case of RE syntax unsupported by RE2 outside of content heuristics. + - As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). - Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194). 
diff --git a/linguist_corpus_test.go b/linguist_corpus_test.go index a8856fd..98059ef 100644 --- a/linguist_corpus_test.go +++ b/linguist_corpus_test.go @@ -24,9 +24,15 @@ func Test_EnryOnLinguistCorpus(t *testing.T) { func (s *linguistCorpusSuite) TestLinguistSamples() { const filenamesDir = "filenames" var cornerCases = map[string]bool{ - "drop_stuff.sql": true, // https://github.com/src-d/enry/issues/194 - "textobj-rubyblock.vba": true, // Because of unsupported negative lookahead RE syntax (https://github.com/github/linguist/blob/8083cb5a89cee2d99f5a988f165994d0243f0d1e/lib/linguist/heuristics.yml#L521) + "drop_stuff.sql": false, // not the case in v7.23, https://github.com/src-d/enry/issues/194 + "textobj-rubyblock.vba": true, // unsupported negative lookahead RE syntax (https://github.com/github/linguist/blob/8083cb5a89cee2d99f5a988f165994d0243f0d1e/lib/linguist/heuristics.yml#L521) // .es and .ice fail heuristics parsing, but do not fail any tests + // 'Adblock Filter List' hack https://github.com/github/linguist/blob/bf853f1c663903e3ee35935189760191f1c45e1c/lib/linguist/heuristics.yml#L680-L702 + "Imperial Units Remover.txt": true, + "abp-filters-anti-cv.txt": true, + "anti-facebook.txt": true, + "fake-news.txt": true, + "test_rules.txt": true, } var total, failed, ok, other int diff --git a/utils_test.go b/utils_test.go index 75f2a22..4d426c1 100644 --- a/utils_test.go +++ b/utils_test.go @@ -51,8 +51,9 @@ var vendorTests = []struct { {false, "some/python/venv/", false}, {false, "foo/.imageset/bar", true}, {false, "Vagrantfile", true}, + {false, "custom.bootstrap.css", true}, {true, "src/bootstrap-custom.js", true}, - // {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23 + {true, "/css/bootstrap.rtl.css", true}, // from linguist v7.23 } func TestIsVendor(t *testing.T) {