refactoring: remove un-used code, add go doc, fix ci (#199)

Refactoring, consisting of
 - remove unused method `isAuxiliaryLanguage` and `FileCountList`
   in order to reduce public API surfaces (go/java)
 - add GoDoc to public APIs
 - ci: java profile use latest go src
  It also now mimics https://docs.travis-ci.com/user/languages/go/#go-import-path
  for non-go build image, as code relies on internal imports.

TEST PLAN:
 - make test
This commit is contained in:
Alexander 2019-02-05 22:54:14 +01:00 committed by GitHub
parent fe18dc0830
commit 13d3d66d37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 51 additions and 149 deletions

View File

@ -45,11 +45,19 @@ jobs:
stage: test
language: scala
jdk: oraclejdk8
before_install:
# mimics exact behavior of 'go_import_path' for non-go build image
- export GOPATH=${TRAVIS_HOME}/gopath
- mkdir -p ${GOPATH}/src/gopkg.in/src-d/enry.v1
- tar -Pczf ${TRAVIS_TMPDIR}/src_archive.tar.gz -C ${TRAVIS_BUILD_DIR} . && tar -Pxzf ${TRAVIS_TMPDIR}/src_archive.tar.gz -C ${TRAVIS_HOME}/gopath/src/gopkg.in/src-d/enry.v1
- export TRAVIS_BUILD_DIR=${TRAVIS_HOME}/gopath/src/gopkg.in/src-d/enry.v1
- cd ${TRAVIS_HOME}/gopath/src/gopkg.in/src-d/enry.v1
install:
- gimme version
- eval "$(curl -sL https://raw.githubusercontent.com/travis-ci/gimme/master/gimme | GIMME_GO_VERSION=$GO_VERSION bash)"
- go version
- go get -v gopkg.in/src-d/enry.v1/...
- echo $PWD; echo $GOPATH
- go get -v ./...
before_script:
- cd java
- make

View File

@ -278,7 +278,7 @@ func printFileAnalysis(file string, limit int64, isJSON bool) error {
// functions below can work on a sample
fileType := getFileType(file, data)
language := enry.GetLanguage(file, data)
mimeType := enry.GetMimeType(file, language)
mimeType := enry.GetMIMEType(file, language)
if isJSON {
return json.NewEncoder(os.Stdout).Encode(map[string]interface{}{

View File

@ -26,6 +26,7 @@ var DefaultStrategies = []Strategy{
GetLanguagesByClassifier,
}
// DefaultClassifier is a naive Bayes classifier based on Linguist samples.
var DefaultClassifier Classifier = &classifier{
languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities,

13
enry.go
View File

@ -1,3 +1,16 @@
/*
Package enry implements multiple strategies for programming language identification.
Identification is made based on file name and file content using a series
of strategies to narrow down possible options.
Each strategy is available as a separate API call, as well as a main entry point
GetLanguage(filename string, content []byte) (language string)
It is a port of the https://github.com/github/linguist from Ruby.
Upstream Linguist YAML files are used to generate data structures for the data
package.
*/
package enry // import "gopkg.in/src-d/enry.v1"
//go:generate make code-generate

View File

@ -28,7 +28,7 @@ $(RESOURCES_DIR): os-shared-lib
cp -R $(RESOURCES_SRC) $(RESOURCES_DIR)
$(JNAERATOR_JAR): $(RESOURCES_DIR)
mkdir $(JNAERATOR_DIR) && \
mkdir -p $(JNAERATOR_DIR) && \
wget $(JNAERATOR_JAR_URL) -O $(JNAERATOR_JAR)
os-shared-lib:

View File

@ -9,16 +9,6 @@ public class Enry {
private static final EnryLibrary nativeLib = EnryLibrary.INSTANCE;
/**
* Returns whether the given language is auxiliary or not.
*
* @param language name of the language, e.g. PHP, HTML, ...
* @return if it's an auxiliary language
*/
public static synchronized boolean isAuxiliaryLanguage(String language) {
return toJavaBool(nativeLib.IsAuxiliaryLanguage(toGoString(language)));
}
/**
* Returns the language of the given file based on the filename and its
* contents.

View File

@ -6,12 +6,6 @@ import static org.junit.Assert.*;
public class EnryTest {
@Test
public void isAuxiliaryLanguage() {
assertTrue(Enry.isAuxiliaryLanguage("HTML"));
assertFalse(Enry.isAuxiliaryLanguage("Go"));
}
@Test
public void getLanguage() {
String code = "<?php $foo = bar();";

View File

@ -93,12 +93,7 @@ func GetLanguagesByVimModeline(filename string, content []byte, candidates []str
//export GetMimeType
func GetMimeType(path string, language string) string {
return enry.GetMimeType(path, language)
}
//export IsAuxiliaryLanguage
func IsAuxiliaryLanguage(lang string) bool {
return enry.IsAuxiliaryLanguage(lang)
return enry.GetMIMEType(path, language)
}
//export IsBinary

View File

@ -8,53 +8,20 @@ import (
"gopkg.in/src-d/enry.v1/data"
)
var (
auxiliaryLanguages = map[string]bool{
"Other": true, "XML": true, "YAML": true, "TOML": true, "INI": true,
"JSON": true, "TeX": true, "Public Key": true, "AsciiDoc": true,
"AGS Script": true, "VimL": true, "Diff": true, "CMake": true, "fish": true,
"Awk": true, "Graphviz (DOT)": true, "Markdown": true, "desktop": true,
"XSLT": true, "SQL": true, "RMarkdown": true, "IRC log": true,
"reStructuredText": true, "Twig": true, "CSS": true, "Batchfile": true,
"Text": true, "HTML+ERB": true, "HTML": true, "Gettext Catalog": true,
"Smarty": true, "Raw token data": true,
}
const binSniffLen = 8000
configurationLanguages = map[string]bool{
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true,
}
)
// IsAuxiliaryLanguage returns whether or not lang is an auxiliary language.
func IsAuxiliaryLanguage(lang string) bool {
_, ok := auxiliaryLanguages[lang]
return ok
var configurationLanguages = map[string]bool{
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true,
}
// IsConfiguration returns whether or not path is using a configuration language.
// IsConfiguration tells if filename is in one of the configuration languages.
func IsConfiguration(path string) bool {
	// Only the language derived from the extension is needed; the second
	// return value of GetLanguageByExtension is deliberately discarded.
	lang, _ := GetLanguageByExtension(path)
	if _, found := configurationLanguages[lang]; found {
		return true
	}
	return false
}
// IsDotFile reports whether the last element of path starts with a dot.
// The special entries "." and ".." are not considered dot files.
func IsDotFile(path string) bool {
	switch name := filepath.Base(filepath.Clean(path)); name {
	case ".", "..":
		return false
	default:
		return strings.HasPrefix(name, ".")
	}
}
// IsVendor reports whether path is matched by data.VendorMatchers.
func IsVendor(path string) bool {
	vendored := data.VendorMatchers.Match(path)
	return vendored
}
// IsDocumentation reports whether path is matched by
// data.DocumentationMatchers.
func IsDocumentation(path string) bool {
	documented := data.DocumentationMatchers.Match(path)
	return documented
}
// IsImage tells if a given file is an image (PNG, JPEG or GIF format).
func IsImage(path string) bool {
extension := filepath.Ext(path)
if extension == ".png" || extension == ".jpg" || extension == ".jpeg" || extension == ".gif" {
@ -64,7 +31,8 @@ func IsImage(path string) bool {
return false
}
func GetMimeType(path string, language string) string {
// GetMIMEType returns a MIME type of a given file based on its languages.
func GetMIMEType(path string, language string) string {
if mime, ok := data.LanguagesMime[language]; ok {
return mime
}
@ -76,13 +44,27 @@ func GetMimeType(path string, language string) string {
return "text/plain"
}
const sniffLen = 8000
// IsDocumentation reports whether path is matched by
// data.DocumentationMatchers.
func IsDocumentation(path string) bool {
	isDoc := data.DocumentationMatchers.Match(path)
	return isDoc
}
// IsDotFile reports whether the last element of path starts with a dot.
//
// The path is cleaned first, so trailing separators and "." elements are
// ignored. The special entries "." (current directory) and ".." (parent
// directory) are not dot files even though they begin with a dot.
func IsDotFile(path string) bool {
	base := filepath.Base(filepath.Clean(path))
	// filepath.Clean keeps leading ".." elements of a relative path, so ".."
	// can still be the base name here and has to be excluded explicitly.
	if base == "." || base == ".." {
		return false
	}
	return strings.HasPrefix(base, ".")
}
// IsVendor reports whether path is matched by data.VendorMatchers.
func IsVendor(path string) bool {
	isVendored := data.VendorMatchers.Match(path)
	return isVendored
}
// IsBinary detects if data is a binary value based on:
// http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198
func IsBinary(data []byte) bool {
if len(data) > sniffLen {
data = data[:sniffLen]
if len(data) > binSniffLen {
data = data[:binSniffLen]
}
if bytes.IndexByte(data, byte(0)) == -1 {
@ -91,17 +73,3 @@ func IsBinary(data []byte) bool {
return true
}
// FileCount pairs a language name with the number of files that belong to
// that language.
type FileCount struct {
	Name  string
	Count int
}

// FileCountList is a slice of FileCount values. Its Len, Less and Swap
// methods order the entries by ascending Count.
type FileCountList []FileCount

// Len returns the number of entries in the list.
func (list FileCountList) Len() int {
	return len(list)
}

// Less reports whether the entry at index i has a smaller Count than the
// entry at index j.
func (list FileCountList) Less(i, j int) bool {
	left, right := list[i], list[j]
	return left.Count < right.Count
}

// Swap exchanges the entries at indexes i and j.
func (list FileCountList) Swap(i, j int) {
	tmp := list[i]
	list[i] = list[j]
	list[j] = tmp
}

View File

@ -3,38 +3,11 @@ package enry
import (
"bytes"
"fmt"
"sort"
"testing"
"github.com/stretchr/testify/assert"
)
// TestIsAuxiliaryLanguage checks that IsAuxiliaryLanguage returns true for
// every key of auxiliaryLanguages and false for a name that is not a key.
func TestIsAuxiliaryLanguage(t *testing.T) {
	type testCase struct {
		name     string
		lang     string
		expected bool
	}

	tests := make([]testCase, 0, len(auxiliaryLanguages)+1)
	tests = append(tests, testCase{name: "TestIsAuxiliaryLanguage_Invalid", lang: "invalid", expected: false})

	// One positive case per known auxiliary language. The loop variable is
	// named lang so that nothing shadows the *testing.T parameter.
	for lang := range auxiliaryLanguages {
		tests = append(tests, testCase{
			name:     fmt.Sprintf("TestIsAuxiliaryLanguage_%s", lang),
			lang:     lang,
			expected: true,
		})
	}

	for _, test := range tests {
		is := IsAuxiliaryLanguage(test.lang)
		// assert.Equal takes the expected value first; passing it in that
		// order keeps the "expected"/"actual" labels of a failure correct.
		assert.Equal(t, test.expected, is,
			fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
	}
}
func TestIsVendor(t *testing.T) {
tests := []struct {
name string
@ -106,7 +79,7 @@ func TestGetMimeType(t *testing.T) {
}
for _, test := range tests {
is := GetMimeType(test.path, test.lang)
is := GetMIMEType(test.path, test.lang)
assert.Equal(t, is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
}
}
@ -160,43 +133,3 @@ func TestIsDotFile(t *testing.T) {
assert.Equal(t, test.expected, is, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
}
}
// TestFileCountListSort checks that a FileCountList is ordered by ascending
// Count with sort.Sort and by descending Count with sort.Reverse.
func TestFileCountListSort(t *testing.T) {
	sampleData := FileCountList{{"a", 8}, {"b", 65}, {"c", 20}, {"d", 90}}
	const ascending = "ASC"
	const descending = "DESC"

	tests := []struct {
		name         string
		data         FileCountList
		order        string
		expectedData FileCountList
	}{
		{
			name:         "ascending order",
			data:         sampleData,
			order:        ascending,
			expectedData: FileCountList{{"a", 8}, {"c", 20}, {"b", 65}, {"d", 90}},
		},
		{
			name:         "descending order",
			data:         sampleData,
			order:        descending,
			expectedData: FileCountList{{"d", 90}, {"b", 65}, {"c", 20}, {"a", 8}},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// sort.Sort reorders in place. Every case's data aliases the
			// backing array of sampleData, so sort a private copy to keep the
			// cases independent of each other and of their execution order.
			got := make(FileCountList, len(test.data))
			copy(got, test.data)

			if test.order == descending {
				sort.Sort(sort.Reverse(got))
			} else {
				sort.Sort(got)
			}

			// Guard the index-wise comparison below against a length mismatch.
			if !assert.Len(t, got, len(test.expectedData)) {
				return
			}
			for i := range got {
				assert.Equal(t, test.expectedData[i], got[i], fmt.Sprintf("%v: FileCount at position %d = %v, expected: %v", test.name, i, got[i], test.expectedData[i]))
			}
		})
	}
}