tartrazine/common.go
2017-06-06 11:30:23 +02:00

189 lines
5.9 KiB
Go

package slinguist
import (
"math"
"path/filepath"
"strings"
)
// OtherLanguage is used as a zero value when a function can not return a specific language.
const OtherLanguage = "Other"
// Strategy type fix the signature for the functions that can be used as a strategy.
type Strategy func(filename string, content []byte) (languages []string)
var strategies = []Strategy{
GetLanguagesByModeline,
GetLanguagesByFilename,
GetLanguagesByShebang,
GetLanguagesByExtension,
GetLanguagesByContent,
}
// GetLanguage applies a sequence of strategies based on the given filename and content
// to find out the most probably language to return.
func GetLanguage(filename string, content []byte) string {
candidates := map[string]float64{}
for _, strategy := range strategies {
languages := strategy(filename, content)
if len(languages) == 1 {
return languages[0]
}
if len(languages) > 0 {
for _, language := range languages {
candidates[language]++
}
}
}
if len(candidates) == 0 {
return OtherLanguage
}
lang := GetLanguageByClassifier(content, candidates, nil)
return lang
}
// GetLanguageByModeline returns the language of the given content looking for the modeline,
// and safe to indicate the sureness of returned language.
func GetLanguageByModeline(content []byte) (lang string, safe bool) {
return getLangAndSafe("", content, GetLanguagesByModeline)
}
// GetLanguageByFilename returns a language based on the given filename, and safe to indicate
// the sureness of returned language.
func GetLanguageByFilename(filename string) (lang string, safe bool) {
return getLangAndSafe(filename, nil, GetLanguagesByFilename)
}
// GetLanguagesByFilename returns a slice of possible languages for the given filename, content will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByFilename(filename string, content []byte) []string {
return languagesByFilename[filename]
}
// GetLanguageByShebang returns the language of the given content looking for the shebang line,
// and safe to indicate the sureness of returned language.
func GetLanguageByShebang(content []byte) (lang string, safe bool) {
return getLangAndSafe("", content, GetLanguagesByShebang)
}
// GetLanguageByExtension returns a language based on the given filename, and safe to indicate
// the sureness of returned language.
func GetLanguageByExtension(filename string) (lang string, safe bool) {
return getLangAndSafe(filename, nil, GetLanguagesByExtension)
}
// GetLanguagesByExtension returns a slice of possible languages for the given filename, content will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByExtension(filename string, content []byte) []string {
ext := strings.ToLower(filepath.Ext(filename))
return languagesByExtension[ext]
}
// GetLanguageByContent returns a language based on the filename and heuristics applies to the content,
// and safe to indicate the sureness of returned language.
func GetLanguageByContent(filename string, content []byte) (lang string, safe bool) {
return getLangAndSafe(filename, content, GetLanguagesByContent)
}
// GetLanguagesByContent returns a slice of possible languages for the given content, filename will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByContent(filename string, content []byte) []string {
ext := strings.ToLower(filepath.Ext(filename))
fnMatcher, ok := contentMatchers[ext]
if !ok {
return nil
}
return fnMatcher(content)
}
func getLangAndSafe(filename string, content []byte, getLanguageByStrategy Strategy) (lang string, safe bool) {
languages := getLanguageByStrategy(filename, content)
if len(languages) == 0 {
lang = OtherLanguage
return
}
lang = languages[0]
safe = len(languages) == 1
return
}
// GetLanguageByClassifier takes in a content and a list of candidates, and apply the classifier's Classify method to
// get the most probably language. If classifier is null then DefaultClassfier will be used. If there aren't candidates
// OtherLanguage is returned.
func GetLanguageByClassifier(content []byte, candidates map[string]float64, classifier Classifier) string {
scores := GetLanguagesByClassifier(content, candidates, classifier)
if len(scores) == 0 {
return OtherLanguage
}
return getLangugeHigherScore(scores)
}
func getLangugeHigherScore(scores map[string]float64) string {
var language string
higher := -math.MaxFloat64
for lang, score := range scores {
if higher < score {
language = lang
higher = score
}
}
return language
}
// GetLanguagesByClassifier returns a map of possible languages as keys and a score as value based on content and candidates. The values can be ordered
// with the highest value as the most probably language. If classifier is null then DefaultClassfier will be used.
func GetLanguagesByClassifier(content []byte, candidates map[string]float64, classifier Classifier) map[string]float64 {
if classifier == nil {
classifier = DefaultClassifier
}
return classifier.Classify(content, candidates)
}
// GetLanguageExtensions returns the different extensions being used by the language.
func GetLanguageExtensions(language string) []string {
return extensionsByLanguage[language]
}
// Type represent language's type. Either data, programming, markup, prose, or unknown.
type Type int
// Type's values.
const (
Unknown Type = iota
Data
Programming
Markup
Prose
)
// GetLanguageType returns the given language's type.
func GetLanguageType(language string) (langType Type) {
langType, ok := languagesType[language]
if !ok {
langType = Unknown
}
return langType
}
// GetLanguageByAlias returns either the language related to the given alias and ok set to true
// or Otherlanguage and ok set to false if the alias is not recognized.
func GetLanguageByAlias(alias string) (lang string, ok bool) {
a := strings.Split(alias, `,`)[0]
a = strings.ToLower(a)
lang, ok = languagesByAlias[a]
if !ok {
lang = OtherLanguage
}
return
}