Merge pull request #40 from lafriks-fork/feat/strategy_xml

Add XML strategy
This commit is contained in:
Alexander 2020-12-02 00:10:52 +01:00 committed by GitHub
commit 3faf9450da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 65 additions and 6 deletions

View File

@ -173,8 +173,6 @@ Parsing [linguist/samples](https://github.com/github/linguist/tree/master/sample
- Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. - Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213). (Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
- XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
- Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18). - Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
- `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does - `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does

View File

@ -22,6 +22,7 @@ var DefaultStrategies = []Strategy{
GetLanguagesByFilename, GetLanguagesByFilename,
GetLanguagesByShebang, GetLanguagesByShebang,
GetLanguagesByExtension, GetLanguagesByExtension,
GetLanguagesByXML,
GetLanguagesByManpage, GetLanguagesByManpage,
GetLanguagesByContent, GetLanguagesByContent,
GetLanguagesByClassifier, GetLanguagesByClassifier,
@ -329,15 +330,23 @@ func getInterpreter(data []byte) (interpreter string) {
return return
} }
func getFirstLine(content []byte) []byte { func getFirstLines(content []byte, count int) []byte {
nlpos := bytes.IndexByte(content, '\n') nlpos := -1
if nlpos < 0 { for ; count > 0; count-- {
return content pos := bytes.IndexByte(content[nlpos+1:], '\n')
if pos < 0 {
return content
}
nlpos += pos + 1
} }
return content[:nlpos] return content[:nlpos]
} }
// getFirstLine returns the bytes of content that precede its first newline.
// When content holds no newline at all, content is returned unchanged.
// It is the single-line case of getFirstLines.
func getFirstLine(content []byte) []byte {
	const lineCount = 1
	return getFirstLines(content, lineCount)
}
func hasShebang(line []byte) bool { func hasShebang(line []byte) bool {
const shebang = `#!` const shebang = `#!`
prefix := []byte(shebang) prefix := []byte(shebang)
@ -404,6 +413,29 @@ func GetLanguagesByManpage(filename string, _ []byte, _ []string) []string {
return nil return nil
} }
var (
	// xmlHeader matches the literal opening of an XML declaration,
	// `<?xml version=`, anywhere in the inspected bytes.
	//
	// The question mark is escaped on purpose: unescaped, `<?` is a
	// quantifier that makes the `<` optional, so any text merely containing
	// `xml version=` would be reported as XML.
	xmlHeader = regex.MustCompile(`<\?xml version=`)
)
// GetLanguagesByXML detects XML by looking for an XML header at the top of content.
// It complies with the signature to be a Strategy type.
//
// The filename is ignored. If candidates is non-empty it is returned as is,
// without inspecting content. Otherwise the first two lines of content are
// matched against xmlHeader: on a match the result is a one-element slice
// holding "XML", and nil when there is no match.
func GetLanguagesByXML(_ string, content []byte, candidates []string) []string {
	// A previous strategy already produced candidates; do not override them.
	if len(candidates) != 0 {
		return candidates
	}

	// Only the leading two lines are searched for the XML header.
	if head := getFirstLines(content, 2); !xmlHeader.Match(head) {
		return nil
	}

	return []string{"XML"}
}
func getDotIndexes(filename string) []int { func getDotIndexes(filename string) []int {
dots := make([]int, 0, 2) dots := make([]int, 0, 2)
for i, letter := range filename { for i, letter := range filename {

View File

@ -23,6 +23,7 @@ type EnryTestSuite struct {
tmpLinguist string tmpLinguist string
needToClone bool needToClone bool
samplesDir string samplesDir string
testFixturesDir string
} }
func (s *EnryTestSuite) TestRegexpEdgeCases() { func (s *EnryTestSuite) TestRegexpEdgeCases() {
@ -72,6 +73,9 @@ func (s *EnryTestSuite) SetupSuite() {
s.samplesDir = filepath.Join(s.tmpLinguist, "samples") s.samplesDir = filepath.Join(s.tmpLinguist, "samples")
s.T().Logf("using samples from %s", s.samplesDir) s.T().Logf("using samples from %s", s.samplesDir)
s.testFixturesDir = filepath.Join(s.tmpLinguist, "test", "fixtures")
s.T().Logf("using test fixtures from %s", s.samplesDir)
cwd, err := os.Getwd() cwd, err := os.Getwd()
assert.NoError(s.T(), err) assert.NoError(s.T(), err)
@ -314,6 +318,31 @@ func (s *EnryTestSuite) TestGetLanguagesByManpage() {
} }
} }
// TestGetLanguagesByXML runs GetLanguagesByXML over files read from the
// suite's test fixtures and samples directories and compares the returned
// languages with the expected ones. It covers files that should be detected
// as XML, a file without an XML header, and the pass-through of candidates
// that were supplied by the caller.
func (s *EnryTestSuite) TestGetLanguagesByXML() {
	type xmlCase struct {
		name       string
		filename   string
		candidates []string
		expected   []string
	}

	cases := []xmlCase{
		{
			name:     "TestGetLanguagesByXML_1",
			filename: filepath.Join(s.testFixturesDir, "XML/app.config"),
			expected: []string{"XML"},
		},
		{
			name:     "TestGetLanguagesByXML_2",
			filename: filepath.Join(s.testFixturesDir, "XML/AssertionIDRequestOptionalAttributes.xml.svn-base"),
			expected: []string{"XML"},
		},
		{
			// no XML header so should not be identified by this strategy
			name:     "TestGetLanguagesByXML_3",
			filename: filepath.Join(s.samplesDir, "XML/libsomething.dll.config"),
			expected: nil,
		},
		{
			// candidates given by the caller are returned unchanged
			name:       "TestGetLanguagesByXML_4",
			filename:   filepath.Join(s.samplesDir, "Eagle/Eagle.sch"),
			candidates: []string{"Eagle"},
			expected:   []string{"Eagle"},
		},
	}

	for i := range cases {
		tc := &cases[i]

		data, err := ioutil.ReadFile(tc.filename)
		assert.NoError(s.T(), err)

		got := GetLanguagesByXML(tc.filename, data, tc.candidates)
		failMsg := fmt.Sprintf("%v: languages = %v, expected: %v", tc.name, got, tc.expected)
		assert.Equal(s.T(), tc.expected, got, failMsg)
	}
}
func (s *EnryTestSuite) TestGetLanguagesByClassifier() { func (s *EnryTestSuite) TestGetLanguagesByClassifier() {
test := []struct { test := []struct {
name string name string