diff --git a/README.md b/README.md index fcb6085..03f8bac 100644 --- a/README.md +++ b/README.md @@ -173,8 +173,6 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample - Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. (Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213). -- XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192). - - Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18). - `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does diff --git a/common.go b/common.go index d6c8b7e..4764cf7 100644 --- a/common.go +++ b/common.go @@ -22,6 +22,7 @@ var DefaultStrategies = []Strategy{ GetLanguagesByFilename, GetLanguagesByShebang, GetLanguagesByExtension, + GetLanguagesByXML, GetLanguagesByManpage, GetLanguagesByContent, GetLanguagesByClassifier, @@ -329,15 +330,23 @@ func getInterpreter(data []byte) (interpreter string) { return } -func getFirstLine(content []byte) []byte { - nlpos := bytes.IndexByte(content, '\n') - if nlpos < 0 { - return content +func getFirstLines(content []byte, count int) []byte { + nlpos := -1 + for ; count > 0; count-- { + pos := bytes.IndexByte(content[nlpos+1:], '\n') + if pos < 0 { + return content + } + nlpos += pos + 1 } return content[:nlpos] } +func getFirstLine(content []byte) []byte { + return getFirstLines(content, 1) +} + func hasShebang(line []byte) bool { const shebang = `#!` prefix := []byte(shebang) @@ -404,6 +413,29 @@ func GetLanguagesByManpage(filename string, _ []byte, _ []string) []string { return nil } +var ( + xmlHeader = regex.MustCompile(`<?xml version=`) +) + +// GetLanguagesByXML returns a slice of possible languages for the given XML file. +// It complies with the signature to be a Strategy type. +func GetLanguagesByXML(_ string, content []byte, candidates []string) []string { + if len(candidates) > 0 { + return candidates + } + + header := getFirstLines(content, 2) + + // Check if contains XML header + if xmlHeader.Match(header) 
{ + return []string{ + "XML", + } + } + + return nil +} + func getDotIndexes(filename string) []int { dots := make([]int, 0, 2) for i, letter := range filename { diff --git a/common_test.go b/common_test.go index 15ae503..b79eff3 100644 --- a/common_test.go +++ b/common_test.go @@ -23,6 +23,7 @@ type EnryTestSuite struct { tmpLinguist string needToClone bool samplesDir string + testFixturesDir string } func (s *EnryTestSuite) TestRegexpEdgeCases() { @@ -72,6 +73,9 @@ func (s *EnryTestSuite) SetupSuite() { s.samplesDir = filepath.Join(s.tmpLinguist, "samples") s.T().Logf("using samples from %s", s.samplesDir) + s.testFixturesDir = filepath.Join(s.tmpLinguist, "test", "fixtures") + s.T().Logf("using test fixtures from %s", s.samplesDir) + cwd, err := os.Getwd() assert.NoError(s.T(), err) @@ -314,6 +318,31 @@ func (s *EnryTestSuite) TestGetLanguagesByManpage() { } } + +func (s *EnryTestSuite) TestGetLanguagesByXML() { + tests := []struct { + name string + filename string + candidates []string + expected []string + + }{ + {name: "TestGetLanguagesByXML_1", filename: filepath.Join(s.testFixturesDir, "XML/app.config"), expected: []string{"XML"}}, + {name: "TestGetLanguagesByXML_2", filename: filepath.Join(s.testFixturesDir, "XML/AssertionIDRequestOptionalAttributes.xml.svn-base"), expected: []string{"XML"}}, + // no XML header so should not be identified by this strategy + {name: "TestGetLanguagesByXML_3", filename: filepath.Join(s.samplesDir, "XML/libsomething.dll.config"), expected: nil}, + {name: "TestGetLanguagesByXML_4", filename: filepath.Join(s.samplesDir, "Eagle/Eagle.sch"), candidates: []string{"Eagle"}, expected: []string{"Eagle"}}, + } + + for _, test := range tests { + content, err := ioutil.ReadFile(test.filename) + assert.NoError(s.T(), err) + + languages := GetLanguagesByXML(test.filename, content, test.candidates) + assert.Equal(s.T(), test.expected, languages, fmt.Sprintf("%v: languages = %v, expected: %v", test.name, languages, test.expected)) + } +} + 
func (s *EnryTestSuite) TestGetLanguagesByClassifier() { test := []struct { name string