Merge pull request #28 from mcarmonaa/classifier

Added Classifier as the last step in detection strategies
2025-09-18 11:18:12 +00:00 · 2017-06-01 10:07:50 +02:00
parent 708f2e40bf f8b8f7f5c4
commit d9c959522f
24 changed files with 258513 additions and 129 deletions
--- a/alias.go
+++ b/alias.go
@@ -2,11 +2,12 @@ package slinguist

 import "strings"

-// GetLanguageByAlias returns the language related to the given alias or Otherlanguage otherwise.
-func GetLanguageByAlias(alias string) (lang string) {
+// GetLanguageByAlias returns the language related to the given alias and ok set to true,
+// or OtherLanguage and ok set to false otherwise.
+func GetLanguageByAlias(alias string) (lang string, ok bool) {
 	a := strings.Split(alias, `,`)[0]
 	a = strings.ToLower(a)
-	lang, ok := languagesByAlias[a]
+	lang, ok = languagesByAlias[a]
 	if !ok {
 		lang = OtherLanguage
 	}
--- a/alias_test.go
+++ b/alias_test.go
@@ -6,21 +6,23 @@ func (s *TSuite) TestGetLanguageByAlias(c *C) {
 	tests := []struct {
 		alias        string
 		expectedLang string
+		expectedOk   bool
 	}{
-		{alias: "BestLanguageEver", expectedLang: OtherLanguage},
-		{alias: "aspx-vb", expectedLang: "ASP"},
-		{alias: "C++", expectedLang: "C++"},
-		{alias: "c++", expectedLang: "C++"},
-		{alias: "objc", expectedLang: "Objective-C"},
-		{alias: "golang", expectedLang: "Go"},
-		{alias: "GOLANG", expectedLang: "Go"},
-		{alias: "bsdmake", expectedLang: "Makefile"},
-		{alias: "xhTmL", expectedLang: "HTML"},
-		{alias: "python", expectedLang: "Python"},
+		{alias: "BestLanguageEver", expectedLang: OtherLanguage, expectedOk: false},
+		{alias: "aspx-vb", expectedLang: "ASP", expectedOk: true},
+		{alias: "C++", expectedLang: "C++", expectedOk: true},
+		{alias: "c++", expectedLang: "C++", expectedOk: true},
+		{alias: "objc", expectedLang: "Objective-C", expectedOk: true},
+		{alias: "golang", expectedLang: "Go", expectedOk: true},
+		{alias: "GOLANG", expectedLang: "Go", expectedOk: true},
+		{alias: "bsdmake", expectedLang: "Makefile", expectedOk: true},
+		{alias: "xhTmL", expectedLang: "HTML", expectedOk: true},
+		{alias: "python", expectedLang: "Python", expectedOk: true},
 	}

 	for _, test := range tests {
-		lang := GetLanguageByAlias(test.alias)
+		lang, ok := GetLanguageByAlias(test.alias)
 		c.Assert(lang, Equals, test.expectedLang)
+		c.Assert(ok, Equals, test.expectedOk)
 	}
 }
--- a/aliases_map.go
+++ b/aliases_map.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 // languagesByAlias keeps alias for different languages and use the name of the languages as a alias too. All the
 // keys (alias or not) are written in lower case and the whitespaces has been replaced by underscores.
@@ -107,6 +107,7 @@ var languagesByAlias = map[string]string{
 	"clipper":                  "xBase",
 	"clips":                    "CLIPS",
 	"clojure":                  "Clojure",
+	"closure_templates":        "Closure Templates",
 	"cmake":                    "CMake",
 	"cobol":                    "COBOL",
 	"coffee":                   "CoffeeScript",
@@ -137,6 +138,7 @@ var languagesByAlias = map[string]string{
 	"csv":                      "CSV",
 	"cucumber":                 "Gherkin",
 	"cuda":                     "Cuda",
+	"cweb":                     "CWeb",
 	"cycript":                  "Cycript",
 	"cython":                   "Cython",
 	"d":                        "D",
@@ -281,6 +283,7 @@ var languagesByAlias = map[string]string{
 	"jflex":                    "JFlex",
 	"jison":                    "Jison",
 	"jison_lex":                "Jison Lex",
+	"jolie":                    "Jolie",
 	"jruby":                    "Ruby",
 	"js":                       "JavaScript",
 	"json":                     "JSON",
@@ -433,6 +436,7 @@ var languagesByAlias = map[string]string{
 	"pascal":                "Pascal",
 	"pasm":                  "Parrot Assembly",
 	"pawn":                  "PAWN",
+	"pep8":                  "Pep8",
 	"perl":                  "Perl",
 	"perl6":                 "Perl6",
 	"php":                   "PHP",
@@ -529,6 +533,7 @@ var languagesByAlias = map[string]string{
 	"scss":                  "SCSS",
 	"self":                  "Self",
 	"sh":                    "Shell",
+	"shaderlab":             "ShaderLab",
 	"shell":                 "Shell",
 	"shell-script":          "Shell",
 	"shellsession":          "ShellSession",
@@ -572,6 +577,7 @@ var languagesByAlias = map[string]string{
 	"textile":               "Textile",
 	"thrift":                "Thrift",
 	"ti_program":            "TI Program",
+	"tl":                    "Type Language",
 	"tla":                   "TLA",
 	"toml":                  "TOML",
 	"ts":                    "TypeScript",
@@ -579,6 +585,7 @@ var languagesByAlias = map[string]string{
 	"turtle":                "Turtle",
 	"twig":                  "Twig",
 	"txl":                   "TXL",
+	"type_language":         "Type Language",
 	"typescript":            "TypeScript",
 	"udiff":                 "Diff",
 	"unified_parallel_c":    "Unified Parallel C",
--- a/classifier.go
+++ b/classifier.go
@@ -0,0 +1,100 @@
+package slinguist
+
+import (
+	"math"
+
+	"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
+)
+
+// GetLanguageByClassifier takes the content and a list of candidates, and applies the classifier's Classify method to
+// get the most probable language. If classifier is nil then DefaultClassifier will be used.
+func GetLanguageByClassifier(content []byte, candidates []string, classifier Classifier) string {
+	if classifier == nil {
+		classifier = DefaultClassifier
+	}
+
+	scores := classifier.Classify(content, candidates)
+	if len(scores) == 0 {
+		return OtherLanguage
+	}
+
+	return getLangugeHigherScore(scores)
+}
+
+func getLangugeHigherScore(scores map[string]float64) string {
+	var language string
+	higher := -math.MaxFloat64
+	for lang, score := range scores {
+		if higher < score {
+			language = lang
+			higher = score
+		}
+	}
+
+	return language
+}
+
+// Classifier is the interface that contains the method Classify, which is in charge of assigning scores to the possible candidates.
+// The scores must order the candidates so that the highest score corresponds to the most probable language of the content.
+type Classifier interface {
+	Classify(content []byte, candidates []string) map[string]float64
+}
+
+type classifier struct {
+	languagesLogProbabilities map[string]float64
+	tokensLogProbabilities    map[string]map[string]float64
+	tokensTotal               float64
+}
+
+func (c *classifier) Classify(content []byte, candidates []string) map[string]float64 {
+	if len(content) == 0 {
+		return nil
+	}
+
+	var languages []string
+	if len(candidates) == 0 {
+		languages = c.knownLangs()
+	} else {
+		languages = make([]string, 0, len(candidates))
+		for _, candidate := range candidates {
+			if lang, ok := GetLanguageByAlias(candidate); ok {
+				languages = append(languages, lang)
+			}
+		}
+	}
+
+	tokens := tokenizer.Tokenize(content)
+	scores := make(map[string]float64, len(languages))
+	for _, language := range languages {
+		scores[language] = c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language]
+	}
+
+	return scores
+}
+
+func (c *classifier) knownLangs() []string {
+	langs := make([]string, 0, len(c.languagesLogProbabilities))
+	for lang := range c.languagesLogProbabilities {
+		langs = append(langs, lang)
+	}
+
+	return langs
+}
+
+func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
+	var sum float64
+	for _, token := range tokens {
+		sum += c.tokenProbability(token, language)
+	}
+
+	return sum
+}
+
+func (c *classifier) tokenProbability(token, language string) float64 {
+	tokenProb, ok := c.tokensLogProbabilities[language][token]
+	if !ok {
+		tokenProb = math.Log(1.000000 / c.tokensTotal)
+	}
+
+	return tokenProb
+}
--- a/classifier_test.go
+++ b/classifier_test.go
@@ -0,0 +1,32 @@
+package slinguist
+
+import (
+	"io/ioutil"
+	"path/filepath"
+
+	. "gopkg.in/check.v1"
+)
+
+func (s *TSuite) TestGetLanguageByClassifier(c *C) {
+	const samples = `.linguist/samples/`
+	test := []struct {
+		filename     string
+		candidates   []string
+		expectedLang string
+	}{
+		{filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, expectedLang: "C"},
+		{filename: filepath.Join(samples, "C/blob.c"), candidates: nil, expectedLang: "C"},
+		{filename: filepath.Join(samples, "C/main.c"), candidates: nil, expectedLang: "C"},
+		{filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expectedLang: "C++"},
+		{filename: filepath.Join(samples, "C/blob.c"), candidates: []string{"ruby"}, expectedLang: "Ruby"},
+		{filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expectedLang: "Python"},
+		{filename: filepath.Join(samples, "Python/django-models-base.py"), candidates: nil, expectedLang: "Python"},
+	}
+
+	for _, test := range test {
+		content, err := ioutil.ReadFile(test.filename)
+		c.Assert(err, Equals, nil)
+		lang := GetLanguageByClassifier(content, test.candidates, nil)
+		c.Assert(lang, Equals, test.expectedLang)
+	}
+}
--- a/common.go
+++ b/common.go
@@ -52,7 +52,11 @@ func GetLanguage(filename string, content []byte) string {
 		return lang
 	}

-	lang, _ := GetLanguageByContent(filename, content)
+	if lang, safe := GetLanguageByContent(filename, content); safe {
+		return lang
+	}
+
+	lang := GetLanguageByClassifier(content, nil, nil)
 	return lang
 }

--- a/content.go
+++ b/content.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 import (
 	"path/filepath"
@@ -117,8 +117,8 @@ var matchers = map[string]languageMatcher{
 			return "Forth", true
 		} else if f_FilebenchWML_Matcher_0.Match(i) {
 			return "Filebench WML", true
-		} else if f_FORTRAN_Matcher_0.Match(i) {
-			return "FORTRAN", true
+		} else if f_Fortran_Matcher_0.Match(i) {
+			return "Fortran", true
 		}

 		return OtherLanguage, false
@@ -126,8 +126,8 @@ var matchers = map[string]languageMatcher{
 	".for": func(i []byte) (string, bool) {
 		if for_Forth_Matcher_0.Match(i) {
 			return "Forth", true
-		} else if for_FORTRAN_Matcher_0.Match(i) {
-			return "FORTRAN", true
+		} else if for_Fortran_Matcher_0.Match(i) {
+			return "Fortran", true
 		}

 		return OtherLanguage, false
@@ -184,8 +184,8 @@ var matchers = map[string]languageMatcher{
 			return "Common Lisp", true
 		} else if l_Lex_Matcher_0.Match(i) {
 			return "Lex", true
-		} else if l_Groff_Matcher_0.Match(i) {
-			return "Groff", true
+		} else if l_Roff_Matcher_0.Match(i) {
+			return "Roff", true
 		} else if l_PicoLisp_Matcher_0.Match(i) {
 			return "PicoLisp", true
 		}
@@ -239,8 +239,8 @@ var matchers = map[string]languageMatcher{
 	".md": func(i []byte) (string, bool) {
 		if md_Markdown_Matcher_0.Match(i) || md_Markdown_Matcher_1.Match(i) {
 			return "Markdown", true
-		} else if md_GCCmachinedescription_Matcher_0.Match(i) {
-			return "GCC machine description", true
+		} else if md_GCCMachineDescription_Matcher_0.Match(i) {
+			return "GCC Machine Description", true
 		}

 		return "Markdown", true
@@ -264,15 +264,15 @@ var matchers = map[string]languageMatcher{
 		return "Linux Kernel Module", false
 	},
 	".ms": func(i []byte) (string, bool) {
-		if ms_Groff_Matcher_0.Match(i) {
-			return "Groff", true
+		if ms_Roff_Matcher_0.Match(i) {
+			return "Roff", true
 		}

 		return "MAXScript", true
 	},
 	".n": func(i []byte) (string, bool) {
-		if n_Groff_Matcher_0.Match(i) {
-			return "Groff", true
+		if n_Roff_Matcher_0.Match(i) {
+			return "Roff", true
 		} else if n_Nemerle_Matcher_0.Match(i) {
 			return "Nemerle", true
 		}
@@ -314,19 +314,10 @@ var matchers = map[string]languageMatcher{
 		return OtherLanguage, false
 	},
 	".pm": func(i []byte) (string, bool) {
-		if pm_Perl_Matcher_0.Match(i) {
-			return "Perl", true
-		} else if pm_Perl6_Matcher_0.Match(i) {
+		if pm_Perl6_Matcher_0.Match(i) {
 			return "Perl6", true
-		}
-
-		return OtherLanguage, false
-	},
-	".t": func(i []byte) (string, bool) {
-		if t_Perl_Matcher_0.Match(i) {
+		} else if pm_Perl_Matcher_0.Match(i) {
 			return "Perl", true
-		} else if t_Perl6_Matcher_0.Match(i) {
-			return "Perl6", true
 		}

 		return OtherLanguage, false
@@ -372,8 +363,8 @@ var matchers = map[string]languageMatcher{
 	".rno": func(i []byte) (string, bool) {
 		if rno_RUNOFF_Matcher_0.Match(i) {
 			return "RUNOFF", true
-		} else if rno_Groff_Matcher_0.Match(i) {
-			return "Groff", true
+		} else if rno_Roff_Matcher_0.Match(i) {
+			return "Roff", true
 		}

 		return OtherLanguage, false
@@ -423,6 +414,17 @@ var matchers = map[string]languageMatcher{

 		return OtherLanguage, false
 	},
+	".t": func(i []byte) (string, bool) {
+		if t_Turing_Matcher_0.Match(i) {
+			return "Turing", true
+		} else if t_Perl6_Matcher_0.Match(i) {
+			return "Perl6", true
+		} else if t_Perl_Matcher_0.Match(i) {
+			return "Perl", true
+		}
+
+		return OtherLanguage, false
+	},
 	".toc": func(i []byte) (string, bool) {
 		if toc_WorldofWarcraftAddonData_Matcher_0.Match(i) {
 			return "World of Warcraft Addon Data", true
@@ -481,9 +483,9 @@ var (
 	es_Erlang_Matcher_0                    = regexp.MustCompile(`(?m)^\s*(?:%%|main\s*\(.*?\)\s*->)`)
 	f_Forth_Matcher_0                      = regexp.MustCompile(`(?m)^: `)
 	f_FilebenchWML_Matcher_0               = regexp.MustCompile(`(?m)flowop`)
-	f_FORTRAN_Matcher_0                    = regexp.MustCompile(`(?mi)^([c*][^abd-z]|      (subroutine|program|end|data)\s|\s*!)`)
+	f_Fortran_Matcher_0                    = regexp.MustCompile(`(?mi)^([c*][^abd-z]|      (subroutine|program|end|data)\s|\s*!)`)
 	for_Forth_Matcher_0                    = regexp.MustCompile(`(?m)^: `)
-	for_FORTRAN_Matcher_0                  = regexp.MustCompile(`(?mi)^([c*][^abd-z]|      (subroutine|program|end|data)\s|\s*!)`)
+	for_Fortran_Matcher_0                  = regexp.MustCompile(`(?mi)^([c*][^abd-z]|      (subroutine|program|end|data)\s|\s*!)`)
 	fr_Forth_Matcher_0                     = regexp.MustCompile(`(?m)^(: |also |new-device|previous )`)
 	fr_Frege_Matcher_0                     = regexp.MustCompile(`(?m)^\s*(import|module|package|data|type) `)
 	fs_Forth_Matcher_0                     = regexp.MustCompile(`(?m)^(: |new-device)`)
@@ -503,7 +505,7 @@ var (
 	inc_POVDashRaySDL_Matcher_0            = regexp.MustCompile(`(?m)^\s*#(declare|local|macro|while)\s`)
 	l_CommonLisp_Matcher_0                 = regexp.MustCompile(`(?m)\(def(un|macro)\s`)
 	l_Lex_Matcher_0                        = regexp.MustCompile(`(?m)^(%[%{}]xs|<.*>)`)
-	l_Groff_Matcher_0                      = regexp.MustCompile(`(?mi)^\.[a-z][a-z](\s|$)`)
+	l_Roff_Matcher_0                       = regexp.MustCompile(`(?mi)^\.[a-z][a-z](\s|$)`)
 	l_PicoLisp_Matcher_0                   = regexp.MustCompile(`(?m)^\((de|class|rel|code|data|must)\s`)
 	ls_LoomScript_Matcher_0                = regexp.MustCompile(`(?m)^\s*package\s*[\w\.\/\*\s]*\s*{`)
 	lsp_CommonLisp_Matcher_0               = regexp.MustCompile(`(?mi)^\s*\((defun|in-package|defpackage) `)
@@ -519,14 +521,14 @@ var (
 	m_Limbo_Matcher_0                      = regexp.MustCompile(`(?m)^\w+\s*:\s*module\s*{`)
 	md_Markdown_Matcher_0                  = regexp.MustCompile(`(?mi)(^[-a-z0-9=#!\*\[|>])|<\/`)
 	md_Markdown_Matcher_1                  = regexp.MustCompile(`(?m)^$`)
-	md_GCCmachinedescription_Matcher_0     = regexp.MustCompile(`(?m)^(;;|\(define_)`)
+	md_GCCMachineDescription_Matcher_0     = regexp.MustCompile(`(?m)^(;;|\(define_)`)
 	ml_OCaml_Matcher_0                     = regexp.MustCompile(`(?m)(^\s*module)|let rec |match\s+(\S+\s)+with`)
 	ml_StandardML_Matcher_0                = regexp.MustCompile(`(?m)=> |case\s+(\S+\s)+of`)
 	mod_XML_Matcher_0                      = regexp.MustCompile(`(?m)<!ENTITY `)
 	mod_ModulaDash2_Matcher_0              = regexp.MustCompile(`(?mi)^\s*MODULE [\w\.]+;`)
 	mod_ModulaDash2_Matcher_1              = regexp.MustCompile(`(?mi)^\s*END [\w\.]+;`)
-	ms_Groff_Matcher_0                     = regexp.MustCompile(`(?mi)^[.'][a-z][a-z](\s|$)`)
-	n_Groff_Matcher_0                      = regexp.MustCompile(`(?m)^[.']`)
+	ms_Roff_Matcher_0                      = regexp.MustCompile(`(?mi)^[.'][a-z][a-z](\s|$)`)
+	n_Roff_Matcher_0                       = regexp.MustCompile(`(?m)^[.']`)
 	n_Nemerle_Matcher_0                    = regexp.MustCompile(`(?m)^(module|namespace|using)\s`)
 	ncl_Text_Matcher_0                     = regexp.MustCompile(`(?m)THE_TITLE`)
 	nl_NL_Matcher_0                        = regexp.MustCompile(`(?m)^(b|g)[0-9]+ `)
@@ -535,10 +537,8 @@ var (
 	pl_Prolog_Matcher_0                    = regexp.MustCompile(`(?m)^[^#]*:-`)
 	pl_Perl_Matcher_0                      = regexp.MustCompile(`(?m)use strict|use\s+v?5\.`)
 	pl_Perl6_Matcher_0                     = regexp.MustCompile(`(?m)^(use v6|(my )?class|module)`)
-	pm_Perl_Matcher_0                      = regexp.MustCompile(`(?m)use strict|use\s+v?5\.`)
-	pm_Perl6_Matcher_0                     = regexp.MustCompile(`(?m)^(use v6|(my )?class|module)`)
-	t_Perl_Matcher_0                       = regexp.MustCompile(`(?m)use strict|use\s+v?5\.`)
-	t_Perl6_Matcher_0                      = regexp.MustCompile(`(?m)^(use v6|(my )?class|module)`)
+	pm_Perl6_Matcher_0                     = regexp.MustCompile(`(?m)^\s*(?:use\s+v6\s*;|(?:\bmy\s+)?class|module)\b`)
+	pm_Perl_Matcher_0                      = regexp.MustCompile(`(?m)\buse\s+(?:strict\b|v?5\.)`)
 	pod_Pod_Matcher_0                      = regexp.MustCompile(`(?m)^=\w+\b`)
 	pro_Prolog_Matcher_0                   = regexp.MustCompile(`(?m)^[^#]+:-`)
 	pro_INI_Matcher_0                      = regexp.MustCompile(`(?m)last_client=`)
@@ -550,7 +550,7 @@ var (
 	r_Rebol_Matcher_0                      = regexp.MustCompile(`(?mi)\bRebol\b`)
 	r_R_Matcher_0                          = regexp.MustCompile(`(?m)<-|^\s*#`)
 	rno_RUNOFF_Matcher_0                   = regexp.MustCompile(`(?mi)^\.!|^\.end lit(?:eral)?\b`)
-	rno_Groff_Matcher_0                    = regexp.MustCompile(`(?m)^\.\\" `)
+	rno_Roff_Matcher_0                     = regexp.MustCompile(`(?m)^\.\\" `)
 	rpy_Python_Matcher_0                   = regexp.MustCompile(`(?ms)(^(import|from|class|def)\s)`)
 	rs_Rust_Matcher_0                      = regexp.MustCompile(`(?m)^(use |fn |mod |pub |macro_rules|impl|#!?\[)`)
 	rs_RenderScript_Matcher_0              = regexp.MustCompile(`(?m)#include|#pragma\s+(rs|version)|__attribute__`)
@@ -569,6 +569,9 @@ var (
 	sql_PLSQL_Matcher_1                    = regexp.MustCompile(`(?mi)constructor\W+function`)
 	sql_SQL_Matcher_0                      = regexp.MustCompile(`(?mi)! /begin|boolean|package|exception`)
 	srt_SubRipText_Matcher_0               = regexp.MustCompile(`(?m)^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$`)
+	t_Turing_Matcher_0                     = regexp.MustCompile(`(?m)^\s*%[ \t]+|^\s*var\s+\w+\s*:=\s*\w+`)
+	t_Perl6_Matcher_0                      = regexp.MustCompile(`(?m)^\s*(?:use\s+v6\s*;|\bmodule\b|\b(?:my\s+)?class\b)`)
+	t_Perl_Matcher_0                       = regexp.MustCompile(`(?m)\buse\s+(?:strict\b|v?5\.)`)
 	toc_WorldofWarcraftAddonData_Matcher_0 = regexp.MustCompile(`(?m)^## |@no-lib-strip@`)
 	toc_TeX_Matcher_0                      = regexp.MustCompile(`(?m)^\\(contentsline|defcounter|beamer|boolfalse)`)
 	ts_XML_Matcher_0                       = regexp.MustCompile(`(?m)<TS`)
--- a/documentation_matchers.go
+++ b/documentation_matchers.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 import "gopkg.in/toqueteos/substring.v1"

--- a/extensions_map.go
+++ b/extensions_map.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 var languagesByExtension = map[string][]string{
 	".1":                   {"Roff"},
@@ -39,6 +39,8 @@ var languagesByExtension = map[string][]string{
 	".abnf":                {"ABNF"},
 	".ada":                 {"Ada"},
 	".adb":                 {"Ada"},
+	".adml":                {"XML"},
+	".admx":                {"XML"},
 	".ado":                 {"Stata"},
 	".adoc":                {"AsciiDoc"},
 	".adp":                 {"Tcl"},
@@ -94,6 +96,7 @@ var languagesByExtension = map[string][]string{
 	".bats":                {"Shell"},
 	".bb":                  {"BitBake", "BlitzBasic"},
 	".bbx":                 {"TeX"},
+	".bdy":                 {"PLSQL"},
 	".befunge":             {"Befunge"},
 	".bf":                  {"Brainfuck", "HyPhy"},
 	".bib":                 {"TeX"},
@@ -132,6 +135,7 @@ var languagesByExtension = map[string][]string{
 	".cfm":                 {"ColdFusion"},
 	".cfml":                {"ColdFusion"},
 	".cgi":                 {"Perl", "Python", "Shell"},
+	".cginc":               {"HLSL"},
 	".ch":                  {"Charity", "xBase"},
 	".chem":                {"Pic"},
 	".chpl":                {"Chapel"},
@@ -197,7 +201,7 @@ var languagesByExtension = map[string][]string{
 	".dats":                {"ATS"},
 	".db2":                 {"SQLPL"},
 	".dcl":                 {"Clean"},
-	".ddl":                 {"SQL"},
+	".ddl":                 {"PLSQL", "SQL"},
 	".decls":               {"BlitzBasic"},
 	".desktop":             {"desktop"},
 	".desktop.in":          {"desktop"},
@@ -273,6 +277,7 @@ var languagesByExtension = map[string][]string{
 	".fish":                {"fish"},
 	".flex":                {"JFlex"},
 	".flux":                {"FLUX"},
+	".fnc":                 {"PLSQL"},
 	".for":                 {"Formatted", "Forth", "Fortran"},
 	".forth":               {"Forth"},
 	".fp":                  {"GLSL"},
@@ -385,10 +390,10 @@ var languagesByExtension = map[string][]string{
 	".ins":                 {"TeX"},
 	".intr":                {"Dylan"},
 	".io":                  {"Io"},
+	".iol":                 {"Jolie"},
 	".ipf":                 {"IGOR Pro"},
 	".ipp":                 {"C++"},
 	".ipynb":               {"Jupyter Notebook"},
-	".irbrc":               {"Ruby"},
 	".irclog":              {"IRC log"},
 	".iss":                 {"Inno Setup"},
 	".ivy":                 {"XML"},
@@ -494,6 +499,7 @@ var languagesByExtension = map[string][]string{
 	".minid":               {"MiniD"},
 	".mir":                 {"Mirah"},
 	".mirah":               {"Mirah"},
+	".mjml":                {"XML"},
 	".mk":                  {"Makefile"},
 	".mkd":                 {"Markdown"},
 	".mkdn":                {"Markdown"},
@@ -554,6 +560,7 @@ var languagesByExtension = map[string][]string{
 	".no":                  {"Text"},
 	".nproj":               {"XML"},
 	".nqp":                 {"Perl6"},
+	".nr":                  {"Roff"},
 	".nse":                 {"Lua"},
 	".nsh":                 {"NSIS"},
 	".nsi":                 {"NSIS"},
@@ -567,6 +574,7 @@ var languagesByExtension = map[string][]string{
 	".obj":                 {"Wavefront Object"},
 	".objdump":             {"ObjDump"},
 	".odd":                 {"XML"},
+	".ol":                  {"Jolie"},
 	".omgrofl":             {"Omgrofl"},
 	".ooc":                 {"ooc"},
 	".opa":                 {"Opa"},
@@ -602,6 +610,7 @@ var languagesByExtension = map[string][]string{
 	".pd":                  {"Pure Data"},
 	".pd_lua":              {"Lua"},
 	".pde":                 {"Processing"},
+	".pep":                 {"Pep8"},
 	".perl":                {"Perl"},
 	".ph":                  {"Perl"},
 	".php":                 {"Hack", "PHP"},
@@ -642,7 +651,7 @@ var languagesByExtension = map[string][]string{
 	".pov":                 {"POV-Ray SDL"},
 	".pp":                  {"Pascal", "Puppet"},
 	".pprx":                {"REXX"},
-	".prc":                 {"SQL"},
+	".prc":                 {"PLSQL", "SQL"},
 	".prefab":              {"Unity3D Asset"},
 	".prefs":               {"INI"},
 	".prg":                 {"xBase"},
@@ -671,6 +680,7 @@ var languagesByExtension = map[string][]string{
 	".py":                  {"Python"},
 	".py3":                 {"Python"},
 	".pyde":                {"Python"},
+	".pyi":                 {"Python"},
 	".pyp":                 {"Python"},
 	".pyt":                 {"Python"},
 	".pytb":                {"Python traceback"},
@@ -766,7 +776,7 @@ var languagesByExtension = map[string][]string{
 	".sh":                  {"Shell"},
 	".sh-session":          {"ShellSession"},
 	".sh.in":               {"Shell"},
-	".shader":              {"GLSL"},
+	".shader":              {"GLSL", "ShaderLab"},
 	".shen":                {"Shen"},
 	".sig":                 {"Standard ML"},
 	".sj":                  {"Objective-J"},
@@ -780,8 +790,10 @@ var languagesByExtension = map[string][]string{
 	".sml":                 {"Standard ML"},
 	".smt":                 {"SMT"},
 	".smt2":                {"SMT"},
+	".soy":                 {"Closure Templates"},
 	".sp":                  {"SourcePawn"},
 	".sparql":              {"SPARQL"},
+	".spc":                 {"PLSQL"},
 	".spec":                {"Python", "RPM Spec", "Ruby"},
 	".spin":                {"Propeller Spin"},
 	".sps":                 {"Scheme"},
@@ -830,12 +842,15 @@ var languagesByExtension = map[string][]string{
 	".tcl":                 {"Tcl"},
 	".tcsh":                {"Tcsh"},
 	".tea":                 {"Tea"},
+	".tesc":                {"GLSL"},
+	".tese":                {"GLSL"},
 	".tex":                 {"TeX"},
 	".textile":             {"Textile"},
 	".tf":                  {"HCL"},
 	".thor":                {"Ruby"},
 	".thrift":              {"Thrift"},
 	".thy":                 {"Isabelle"},
+	".tl":                  {"Type Language"},
 	".tla":                 {"TLA"},
 	".tm":                  {"Tcl"},
 	".tmCommand":           {"XML"},
@@ -850,8 +865,11 @@ var languagesByExtension = map[string][]string{
 	".toml":                {"TOML"},
 	".tool":                {"Shell"},
 	".topojson":            {"JSON"},
+	".tpb":                 {"PLSQL"},
 	".tpl":                 {"Smarty"},
 	".tpp":                 {"C++"},
+	".tps":                 {"PLSQL"},
+	".trg":                 {"PLSQL"},
 	".ts":                  {"TypeScript", "XML"},
 	".tst":                 {"GAP", "Scilab"},
 	".tsx":                 {"TypeScript", "XML"},
@@ -901,10 +919,13 @@ var languagesByExtension = map[string][]string{
 	".vrx":                 {"GLSL"},
 	".vsh":                 {"GLSL"},
 	".vshader":             {"GLSL"},
+	".vsixmanifest":        {"XML"},
 	".vssettings":          {"XML"},
+	".vstemplate":          {"XML"},
 	".vue":                 {"Vue"},
+	".vw":                  {"PLSQL"},
 	".vxml":                {"XML"},
-	".w":                   {"C"},
+	".w":                   {"CWeb"},
 	".watchr":              {"Ruby"},
 	".webidl":              {"WebIDL"},
 	".weechatlog":          {"IRC log"},
@@ -914,6 +935,7 @@ var languagesByExtension = map[string][]string{
 	".wl":                  {"Mathematica"},
 	".wlt":                 {"Mathematica"},
 	".wlua":                {"Lua"},
+	".workbook":            {"Markdown"},
 	".wsdl":                {"XML"},
 	".wsf":                 {"XML"},
 	".wsgi":                {"Python"},
--- a/filenames_map.go
+++ b/filenames_map.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 var languagesByFilename = map[string]string{
 	".Rprofile":          "R",
@@ -22,6 +22,7 @@ var languagesByFilename = map[string]string{
 	".factor-rc":         "Factor",
 	".gclient":           "Python",
 	".gnus":              "Emacs Lisp",
+	".irbrc":             "Ruby",
 	".jshintrc":          "JSON",
 	".nvimrc":            "Vim script",
 	".php_cs":            "PHP",
@@ -85,6 +86,7 @@ var languagesByFilename = map[string]string{
 	"README.me":          "Text",
 	"README.mysql":       "Text",
 	"ROOT":               "Isabelle ROOT",
+	"Rakefile":           "Ruby",
 	"Rexfile":            "Perl6",
 	"SConscript":         "Python",
 	"SConstruct":         "Python",
--- a/frequencies.go
+++ b/frequencies.go
--- a/internal/code-generator/assets/frequencies.go.tmpl
+++ b/internal/code-generator/assets/frequencies.go.tmpl
@@ -0,0 +1,24 @@
+package slinguist
+
+// CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
+// THIS FILE SHOULD NOT BE EDITED BY HAND
+// Extracted from github/linguist commit: {{ getCommit }}
+
+var DefaultClassifier Classifier = &classifier{
+	languagesLogProbabilities: map[string]float64{
+	{{ $freqs := . -}}
+	{{range $index, $language := orderKeys .Languages -}}
+		"{{ $language }}":	{{ languageLogProbability $language -}},
+	{{end -}}
+	},
+	tokensLogProbabilities: map[string]map[string]float64{
+	{{range $index, $language := orderMapMapKeys .Tokens -}}
+		"{{ $language }}": map[string]float64{	
+		{{range $i, $token := index $freqs.Tokens $language | orderKeys -}}
+			{{ quote $token }}: {{ tokenLogProbability $language $token }},
+		{{end -}}
+		},
+	{{end -}}
+	},
+	tokensTotal: {{ toFloat64 .TokensTotal -}},
+}
--- a/internal/code-generator/generator/generator.go
+++ b/internal/code-generator/generator/generator.go
@@ -21,6 +21,10 @@ func FromFile(fileToParse, outPath, tmplPath, tmplName, commit string, generate
 		return err
 	}

+	return formatedWrite(outPath, source)
+}
+
+func formatedWrite(outPath string, source []byte) error {
 	formatedSource, err := format.Source(source)
 	if err != nil {
 		return err
--- a/internal/code-generator/generator/generator_test.go
+++ b/internal/code-generator/generator/generator_test.go
@@ -4,15 +4,20 @@ import (
 	"fmt"
 	"io/ioutil"
 	"os"
+	"os/exec"
+	"path/filepath"
 	"testing"

 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/suite"
 )

 const (
+	lingustURL = "https://github.com/github/linguist.git"
+	commitTree = "60f864a138650dd17fafc94814be9ee2d3aaef8c"
 	commitTest = "fe8b44ab8a225b1ffa75b983b916ea22fee5b6f7"

-	// Languages test
+	// Extensions test
 	extensionsTestFile     = "test_files/extensions.test.yml"
 	extensionsGold         = "test_files/extensions.gold"
 	extensionsTestTmplPath = "../assets/extensions.go.tmpl"
@@ -59,9 +64,48 @@ const (
 	aliasesGold         = "test_files/aliases.gold"
 	aliasesTestTmplPath = "../assets/aliases.go.tmpl"
 	aliasesTestTmplName = "aliases.go.tmpl"
+
+	// Frequencies test
+	frequenciesTestDir      = "/samples"
+	frequenciesGold         = "test_files/frequencies.gold"
+	frequenciesTestTmplPath = "../assets/frequencies.go.tmpl"
+	frequenciesTestTmplName = "frequencies.go.tmpl"
 )

-func TestFromFile(t *testing.T) {
+type GeneratorTestSuite struct {
+	suite.Suite
+	tmpLinguist string
+}
+
+// SetupSuite clones the linguist repository into a temporary directory and
+// checks out commitTree there. The directory is kept in g.tmpLinguist so the
+// tests can locate the samples and TearDownSuite can remove it.
+func (g *GeneratorTestSuite) SetupSuite() {
+	tmpLinguist, err := ioutil.TempDir("", "linguist-")
+	assert.NoError(g.T(), err)
+	g.tmpLinguist = tmpLinguist
+
+	cmd := exec.Command("git", "clone", lingustURL, tmpLinguist)
+	err = cmd.Run()
+	assert.NoError(g.T(), err)
+
+	// Run the checkout inside the clone by setting cmd.Dir rather than calling
+	// os.Chdir. The working directory is process-wide state and the tests in
+	// this package open their fixtures through relative paths, so changing it,
+	// even temporarily, risks leaving it pointing at the clone when restoring
+	// it fails.
+	cmd = exec.Command("git", "checkout", commitTree)
+	cmd.Dir = tmpLinguist
+	err = cmd.Run()
+	assert.NoError(g.T(), err)
+}
+
+func (g *GeneratorTestSuite) TearDownSuite() {
+	err := os.RemoveAll(g.tmpLinguist)
+	assert.NoError(g.T(), err)
+}
+
+func (g *GeneratorTestSuite) TestFromFile() {
 	tests := []struct {
 		name        string
 		fileToParse string
@@ -145,20 +189,57 @@ func TestFromFile(t *testing.T) {
 		},
 	}

-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			gold, err := ioutil.ReadFile(tt.wantOut)
-			assert.NoError(t, err)
+	for _, test := range tests {
+		gold, err := ioutil.ReadFile(test.wantOut)
+		assert.NoError(g.T(), err)

 		outPath, err := ioutil.TempFile("/tmp", "generator-test-")
-			assert.NoError(t, err)
+		assert.NoError(g.T(), err)
 		defer os.Remove(outPath.Name())

-			err = FromFile(tt.fileToParse, outPath.Name(), tt.tmplPath, tt.tmplName, tt.commit, tt.generate)
-			assert.NoError(t, err)
+		err = FromFile(test.fileToParse, outPath.Name(), test.tmplPath, test.tmplName, test.commit, test.generate)
+		assert.NoError(g.T(), err)
 		out, err := ioutil.ReadFile(outPath.Name())
-			assert.NoError(t, err)
-			assert.EqualValues(t, gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(tt.wantOut)))
-		})
+		assert.NoError(g.T(), err)
+		assert.EqualValues(g.T(), gold, out, fmt.Sprintf("FromFile() = %v, want %v", string(out), string(test.wantOut)))
 	}
 }
+
+func (g *GeneratorTestSuite) TestFrequencies() {
+	tests := []struct {
+		name       string
+		samplesDir string
+		tmplPath   string
+		tmplName   string
+		commit     string
+		wantOut    string
+	}{
+		{
+			name:       "Frequencies_1",
+			samplesDir: filepath.Join(g.tmpLinguist, frequenciesTestDir),
+			tmplPath:   frequenciesTestTmplPath,
+			tmplName:   frequenciesTestTmplName,
+			commit:     commitTree,
+			wantOut:    frequenciesGold,
+		},
+	}
+
+	for _, test := range tests {
+		gold, err := ioutil.ReadFile(test.wantOut)
+		assert.NoError(g.T(), err)
+
+		outPath, err := ioutil.TempFile("/tmp", "frequencies-test-")
+		assert.NoError(g.T(), err)
+		defer os.Remove(outPath.Name())
+
+		err = Frequencies(test.samplesDir, test.tmplPath, test.tmplName, test.commit, outPath.Name())
+		assert.NoError(g.T(), err)
+		out, err := ioutil.ReadFile(outPath.Name())
+		assert.NoError(g.T(), err)
+		assert.EqualValues(g.T(), gold, out, fmt.Sprintf("Frequencies() = %v, want %v", string(out), string(test.wantOut)))
+	}
+}
+
+func TestGeneratorTestSuite(t *testing.T) {
+	suite.Run(t, new(GeneratorTestSuite))
+}
--- a/internal/code-generator/generator/samplesfreq.go
+++ b/internal/code-generator/generator/samplesfreq.go
@@ -0,0 +1,198 @@
+package generator
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"math"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"text/template"
+
+	"gopkg.in/src-d/simple-linguist.v1/internal/tokenizer"
+)
+
+const samplesSubDir = "filenames"
+
+type samplesFrequencies struct {
+	LanguageTotal  int                       `json:"language_total,omitempty"`
+	Languages      map[string]int            `json:"languages,omitempty"`
+	TokensTotal    int                       `json:"tokens_total,omitempty"`
+	Tokens         map[string]map[string]int `json:"tokens,omitempty"`
+	LanguageTokens map[string]int            `json:"language_tokens,omitempty"`
+}
+
+// Frequencies reads the directories in samplesDir, retrieves information about the frequencies of languages and tokens,
+// and writes the file outPath using frequenciesTmplName as a template.
+func Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmplName, commit, outPath string) error {
+	freqs, err := getFrequencies(samplesDir)
+	if err != nil {
+		return err
+	}
+
+	buf := &bytes.Buffer{}
+	if err := executeFrequenciesTemplate(buf, freqs, frequenciesTmplPath, frequenciesTmplName, commit); err != nil {
+		return err
+	}
+
+	return formatedWrite(outPath, buf.Bytes())
+}
+
+// getFrequencies walks the language directories under samplesDir and counts,
+// per language, its sample files and the occurrences of every token in them.
+// Entries that are not directories or whose samples cannot be read are skipped.
+func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
+	entries, err := ioutil.ReadDir(samplesDir)
+	if err != nil {
+		return nil, err
+	}
+
+	var languageTotal int
+	var languages = make(map[string]int)
+	var tokensTotal int
+	var tokens = make(map[string]map[string]int)
+	var languageTokens = make(map[string]int)
+	for _, entry := range entries {
+		if !entry.IsDir() {
+			// Not an error (err is nil here), so there is nothing to log.
+			continue
+		}
+
+		samples, err := getSamples(samplesDir, entry)
+		if err != nil {
+			log.Println(err)
+		}
+		if len(samples) == 0 {
+			continue
+		}
+
+		samplesTokens, err := getTokens(samples)
+		if err != nil {
+			log.Println(err)
+			continue
+		}
+
+		lang := entry.Name()
+		languageTotal += len(samples)
+		languages[lang] = len(samples)
+		tokensTotal += len(samplesTokens)
+		languageTokens[lang] = len(samplesTokens)
+		tokens[lang] = make(map[string]int)
+		for _, token := range samplesTokens {
+			tokens[lang][token]++
+		}
+	}
+	return &samplesFrequencies{
+		TokensTotal:    tokensTotal,
+		LanguageTotal:  languageTotal,
+		Tokens:         tokens,
+		LanguageTokens: languageTokens,
+		Languages:      languages,
+	}, nil
+}
+
+// getSamples lists the sample files of the language directory langDir inside
+// samplesDir: its regular files plus the regular files found in its
+// samplesSubDir subdirectory, when there is one.
+func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
+	langPath := filepath.Join(samplesDir, langDir.Name())
+	entries, err := ioutil.ReadDir(langPath)
+	if err != nil {
+		return nil, err
+	}
+
+	samples := []string{}
+	for _, entry := range entries {
+		switch {
+		case entry.Mode().IsRegular():
+			samples = append(samples, filepath.Join(langPath, entry.Name()))
+		case entry.IsDir() && entry.Name() == samplesSubDir:
+			subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
+			if err != nil {
+				return nil, err
+			}
+			samples = append(samples, subSamples...)
+		}
+	}
+
+	return samples, nil
+}
+
+func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
+	subSamples := []string{}
+	path := filepath.Join(samplesDir, langDir, subLangDir.Name())
+	entries, err := ioutil.ReadDir(path)
+	if err != nil {
+		return nil, err
+	}
+
+	for _, entry := range entries {
+		if entry.Mode().IsRegular() {
+			subSamples = append(subSamples, filepath.Join(path, entry.Name()))
+		}
+	}
+
+	return subSamples, nil
+}
+
+// getTokens reads and tokenizes every sample file, returning all the tokens
+// concatenated. Files that cannot be read are skipped, and the error of the
+// last one that failed is returned along with the tokens collected.
+func getTokens(samples []string) ([]string, error) {
+	var lastErr error
+	allTokens := make([]string, 0, 20)
+	for _, path := range samples {
+		data, err := ioutil.ReadFile(path)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		allTokens = append(allTokens, tokenizer.Tokenize(data)...)
+	}
+	return allTokens, lastErr
+}
+
+// executeFrequenciesTemplate renders the template file frequenciesTmplPath,
+// registered under the name frequenciesTmpl, with freqs as its data and writes
+// the result to out. It returns any template parsing or execution error.
+func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, frequenciesTmplPath, frequenciesTmpl, commit string) error {
+	fmap := template.FuncMap{
+		"getCommit": func() string { return commit },
+		"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
+		"orderKeys": func(m map[string]int) []string {
+			keys := make([]string, 0, len(m))
+			for key := range m {
+				keys = append(keys, key)
+			}
+			sort.Strings(keys)
+			return keys
+		},
+		"languageLogProbability": func(language string) string {
+			num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
+			return fmt.Sprintf("%f", num)
+		},
+		"orderMapMapKeys": func(mm map[string]map[string]int) []string {
+			keys := make([]string, 0, len(mm))
+			for key := range mm {
+				keys = append(keys, key)
+			}
+			sort.Strings(keys)
+			return keys
+		},
+		"tokenLogProbability": func(language, token string) string {
+			num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
+			return fmt.Sprintf("%f", num)
+		},
+		"quote": strconv.Quote,
+	}
+	// Report a bad template as an error instead of panicking via template.Must.
+	t, err := template.New(frequenciesTmpl).Funcs(fmap).ParseFiles(frequenciesTmplPath)
+	if err != nil {
+		return err
+	}
+	return t.Execute(out, freqs)
+}
--- a/internal/code-generator/generator/test_files/frequencies.gold
+++ b/internal/code-generator/generator/test_files/frequencies.gold
--- a/internal/code-generator/main.go
+++ b/internal/code-generator/main.go
@@ -54,7 +54,13 @@ const (
 	aliasesTmplPath = "internal/code-generator/assets/aliases.go.tmpl"
 	aliasesTmpl     = "aliases.go.tmpl"

-	commitPath = ".git/refs/heads/master"
+	// frequencies.go generation
+	samplesDir          = ".linguist/samples"
+	frequenciesFile     = "frequencies.go"
+	frequenciesTmplPath = "internal/code-generator/assets/frequencies.go.tmpl"
+	frequenciesTmpl     = "frequencies.go.tmpl"
+
+	commitPath = ".linguist/.git/refs/heads/master"
 )

 type generatorArgs struct {
@@ -88,6 +94,10 @@ func main() {
 			log.Println(err)
 		}
 	}
+
+	if err := generator.Frequencies(samplesDir, frequenciesTmplPath, frequenciesTmpl, commit, frequenciesFile); err != nil {
+		log.Println(err)
+	}
 }

 func getCommit(path string) (string, error) {
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -0,0 +1,169 @@
+package tokenizer
+
+import (
+	"bytes"
+	"regexp"
+)
+
+// Tokenize returns the tokens that each extractor in extractTokens, applied in order, finds in content.
+func Tokenize(content []byte) []string {
+	tokens := make([][]byte, 0, 50)
+	for _, extract := range extractTokens {
+		var extractedTokens [][]byte
+		content, extractedTokens = extract(content)
+		tokens = append(tokens, extractedTokens...)
+	}
+	return toString(tokens)
+}
+
+func toString(tokens [][]byte) []string {
+	stokens := make([]string, 0, len(tokens))
+	for _, token := range tokens {
+		stokens = append(stokens, string(token))
+	}
+
+	return stokens
+}
+
+var (
+	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
+		// The order must be this
+		extractAndReplaceShebang,
+		extractAndReplaceSGML,
+		skipCommentsAndLiterals,
+		extractAndReplacePunctuation,
+		extractAndReplaceRegular,
+		extractAndReplaceOperator,
+		extractRemainders,
+	}
+
+	reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
+	reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")(.*$)`)
+	reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
+	reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
+	reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
+	rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
+	reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
+	reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
+	reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
+	reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
+	reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
+	reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
+
+	regexToSkip = []*regexp.Regexp{
+		// The order must be this
+		reLiteralStringQuotes,
+		reMultilineComment,
+		reSingleLineComment,
+		reLiteralNumber,
+	}
+)
+
+// extractAndReplaceShebang returns one SHEBANG#! token per reShebang match in
+// content, along with content where every match is replaced by a space.
+func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
+	var shebangTokens [][]byte
+	matches := reShebang.FindAllSubmatch(content, -1)
+	if matches != nil {
+		shebangTokens = make([][]byte, 0, len(matches))
+		for _, match := range matches {
+			shebangTokens = append(shebangTokens, getShebangToken(match))
+		}
+		// ReplaceAll returns a copy, so the result has to be assigned back.
+		content = reShebang.ReplaceAll(content, []byte(` `))
+	}
+	return content, shebangTokens
+}
+
+func getShebangToken(matchedShebang [][]byte) []byte {
+	const prefix = `SHEBANG#!`
+	var token []byte
+	for i := 1; i < len(matchedShebang); i++ {
+		if len(matchedShebang[i]) > 0 {
+			token = matchedShebang[i]
+			break
+		}
+	}
+
+	tokenShebang := append([]byte(prefix), token...)
+	return tokenShebang
+}
+
+func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
+	tokens := re.FindAll(content, -1)
+	content = re.ReplaceAll(content, []byte(` `))
+	return content, tokens
+}
+
+func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
+	return commonExtracAndReplace(content, rePunctuation)
+}
+
+func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
+	return commonExtracAndReplace(content, reRegularToken)
+}
+
+func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
+	return commonExtracAndReplace(content, reOperators)
+}
+
+// extractAndReplaceSGML returns the tag and attribute tokens of the reSGML matches in content, and content with every match replaced by a space.
+func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
+	var SGMLTokens [][]byte
+	matches := reSGML.FindAllSubmatch(content, -1)
+	if matches != nil {
+		SGMLTokens = make([][]byte, 0, 2)
+		for _, match := range matches {
+			if reSGMLComment.Match(match[0]) {
+				continue
+			}
+			// NOTE(review): match[1] is a sub-slice of content, so this append most likely overwrites the byte
+			// that follows the tag name in content itself, before the attributes are read — confirm it is intended.
+			token := append(match[1], '>')
+			SGMLTokens = append(SGMLTokens, token)
+			attributes := getSGMLAttributes(match[0])
+			SGMLTokens = append(SGMLTokens, attributes...)
+		}
+		content = reSGML.ReplaceAll(content, []byte(` `))
+	}
+	return content, SGMLTokens
+}
+
+func getSGMLAttributes(SGMLTag []byte) [][]byte {
+	var attributes [][]byte
+	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
+	if matches != nil {
+		attributes = make([][]byte, 0, 5)
+		for _, match := range matches {
+			if len(match[1]) != 0 {
+				attributes = append(attributes, match[1])
+			}
+
+			if len(match[2]) != 0 {
+				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
+				attributes = append(attributes, loneAttributes...)
+			}
+		}
+	}
+
+	return attributes
+}
+
+func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
+	for _, skip := range regexToSkip {
+		content = skip.ReplaceAll(content, []byte(` `))
+	}
+
+	return content, nil
+}
+
+// extractRemainders splits what is left of content on white space and emits
+// every UTF-8 sequence of each field as its own token; content is unchanged.
+func extractRemainders(content []byte) ([]byte, [][]byte) {
+	fields := bytes.Fields(content)
+	tokens := make([][]byte, 0, len(fields)*3)
+	for _, field := range fields {
+		tokens = append(tokens, bytes.Split(field, nil)...)
+	}
+	return content, tokens
+}
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -0,0 +1,107 @@
+package tokenizer
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+const (
+	testContent = `#!/usr/bin/ruby
+
+#!/usr/bin/env node
+
+aaa
+
+#!/usr/bin/env A=B foo=bar awk -f
+
+#!python
+
+func Tokenize(content []byte) []string {
+	splitted := bytes.Fields(content)
+	tokens := /* make([]string, 0, len(splitted))
+	no comment -- comment
+	for _, tokenByte := range splitted {
+		token64 := base64.StdEncoding.EncodeToString(tokenByte)
+		tokens = append(tokens, token64)
+		notcatchasanumber3.5
+	}*/
+othercode
+	/* testing multiple 
+	
+		multiline comments*/
+
+<!-- com
+	ment -->
+<!-- comment 2-->
+ppp no comment # comment
+
+"literal1"
+
+abb (tokenByte, 0xAF02) | ,3.2L
+
+'literal2' notcatchasanumber3.5
+
+	5 += number * anotherNumber
+	if isTrue && isToo {
+		0b00001000 >> 1
+	}
+
+	return tokens
+
+oneBool = 3 <= 2
+varBool = 3<=2>
+ 
+  PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+        <title id="hola" class="">This is a XHTML sample file</title>
+        <style type="text/css"><![CDATA[
+            #example {
+                background-color: yellow;
+            }
+        ]]></style>
+    </head>
+    <body>
+        <div id="example">
+            Just a simple <strong>XHTML</strong> test page.
+        </div>
+    </body>
+</html>`
+)
+
+var (
+	tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "PUBLIC", "W3C", "DTD", "XHTML", "1", "0",
+		"Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1", "strict", "dtd", "<html>", "<head>", "<title>", "class=",
+		"</title>", "<style>", "<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "<strong>",
+		"</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")", "{", "}", "(", ")", ";", ";", "}",
+		"]", "]", "aaa", "func", "Tokenize", "content", "byte", "string", "splitted", "bytes.Fields", "content", "tokens", "othercode", "ppp",
+		"no", "comment", "abb", "tokenByte", "notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return",
+		"tokens", "oneBool", "varBool", "PyErr_SetString", "PyExc_RuntimeError", "html", "PUBLIC", "xmlns", "id", "class", "This", "is", "a",
+		"XHTML", "sample", "file", "type", "background", "color", "yellow", "id", "Just", "a", "simple", "XHTML", "test", "page.", "|", "+",
+		"&&", "<", "<", "-", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", ",", ">", "=", ">", "=", "=", ">", "=", ">",
+		":", ">", "=", ">"}
+)
+
+// TestTokenize checks that Tokenize yields exactly the expected tokens, in order.
+func TestTokenize(t *testing.T) {
+	tests := []struct {
+		name     string
+		content  []byte
+		expected []string
+	}{
+		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			tokens := Tokenize(test.content)
+			assert.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(tokens), len(test.expected)))
+			for i := 0; i < len(tokens) && i < len(test.expected); i++ { // stop at the shorter slice: a length mismatch must not panic
+				assert.Equal(t, test.expected[i], tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], test.expected[i]))
+			}
+		})
+	}
+}
--- a/interpreters_map.go
+++ b/interpreters_map.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 var languagesByInterpreter = map[string][]string{
 	"Rscript":     {"R"},
@@ -36,6 +36,7 @@ var languagesByInterpreter = map[string][]string{
 	"io":          {"Io"},
 	"ioke":        {"Ioke"},
 	"jconsole":    {"J"},
+	"jolie":       {"Jolie"},
 	"jruby":       {"Ruby"},
 	"julia":       {"Julia"},
 	"lisp":        {"Common Lisp"},
--- a/modeline.go
+++ b/modeline.go
@@ -10,10 +10,9 @@ import (
 func GetLanguageByModeline(content []byte) (lang string, safe bool) {
 	headFoot := getHeaderAndFooter(content)
 	for _, getLang := range modelinesFunc {
-		lang = getLang(headFoot)
-		safe = lang != OtherLanguage
+		lang, safe = getLang(headFoot)
 		if safe {
-			return
+			break
 		}
 	}

@@ -23,7 +22,7 @@ func GetLanguageByModeline(content []byte) (lang string, safe bool) {
 func getHeaderAndFooter(content []byte) []byte {
 	const (
 		searchScope = 5
-		eol         = `\n`
+		eol         = "\n"
 	)

 	if bytes.Count(content, []byte(eol)) < 2*searchScope {
@@ -37,7 +36,7 @@ func getHeaderAndFooter(content []byte) []byte {
 	return bytes.Join(headerAndFooter, []byte(eol))
 }

-var modelinesFunc = []func(content []byte) string{
+var modelinesFunc = []func(content []byte) (string, bool){
 	GetLanguageByEmacsModeline,
 	GetLanguageByVimModeline,
 }
@@ -50,11 +49,11 @@ var (
 )

 // GetLanguageByEmacsModeline detecs if the content has a emacs modeline and try to get a
-// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage.
-func GetLanguageByEmacsModeline(content []byte) (lang string) {
+// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage and false.
+func GetLanguageByEmacsModeline(content []byte) (string, bool) {
 	matched := reEmacsModeline.FindAllSubmatch(content, -1)
 	if matched == nil {
-		return OtherLanguage
+		return OtherLanguage, false
 	}

 	// only take the last matched line, discard previous lines
@@ -67,23 +66,22 @@ func GetLanguageByEmacsModeline(content []byte) (lang string) {
 		alias = string(lastLineMatched)
 	}

-	lang = GetLanguageByAlias(alias)
-	return
+	return GetLanguageByAlias(alias)
 }

 // GetLanguageByVimModeline detecs if the content has a vim modeline and try to get a
-// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage.
-func GetLanguageByVimModeline(content []byte) (lang string) {
+// language basing on alias. If couldn't retrieve a valid language, it returns OtherLanguage and false.
+func GetLanguageByVimModeline(content []byte) (string, bool) {
 	matched := reVimModeline.FindAllSubmatch(content, -1)
 	if matched == nil {
-		return OtherLanguage
+		return OtherLanguage, false
 	}

 	// only take the last matched line, discard previous lines
 	lastLineMatched := matched[len(matched)-1][1]
 	matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
 	if matchedAlias == nil {
-		return OtherLanguage
+		return OtherLanguage, false
 	}

 	alias := string(matchedAlias[0][1])
@@ -100,6 +98,5 @@ func GetLanguageByVimModeline(content []byte) (lang string) {
 		}
 	}

-	lang = GetLanguageByAlias(alias)
-	return
+	return GetLanguageByAlias(alias)
 }
--- a/modeline_test.go
+++ b/modeline_test.go
@@ -9,6 +9,7 @@ import (

 const (
 	modelinesDir = ".linguist/test/fixtures/Data/Modelines"
+	samplesDir   = ".linguist/samples"
 )

 func (s *TSuite) TestGetLanguageByModeline(c *C) {
@@ -18,42 +19,43 @@ func (s *TSuite) TestGetLanguageByModeline(c *C) {
 		expectedSafe bool
 	}{
 		// Emacs
-		{filename: "example_smalltalk.md", expectedLang: "Smalltalk", expectedSafe: true},
-		{filename: "fundamentalEmacs.c", expectedLang: "Text", expectedSafe: true},
-		{filename: "iamphp.inc", expectedLang: "PHP", expectedSafe: true},
-		{filename: "seeplusplusEmacs1", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs2", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs3", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs4", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs5", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs6", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs7", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs9", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs10", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs11", expectedLang: "C++", expectedSafe: true},
-		{filename: "seeplusplusEmacs12", expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "example_smalltalk.md"), expectedLang: "Smalltalk", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "fundamentalEmacs.c"), expectedLang: "Text", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "iamphp.inc"), expectedLang: "PHP", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs1"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs2"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs3"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs4"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs5"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs6"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs7"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs9"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs10"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs11"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplusEmacs12"), expectedLang: "C++", expectedSafe: true},

 		// Vim
-		{filename: "seeplusplus", expectedLang: "C++", expectedSafe: true},
-		{filename: "iamjs.pl", expectedLang: "JavaScript", expectedSafe: true},
-		{filename: "iamjs2.pl", expectedLang: "JavaScript", expectedSafe: true},
-		{filename: "not_perl.pl", expectedLang: "Prolog", expectedSafe: true},
-		{filename: "ruby", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby2", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby3", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby4", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby5", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby6", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby7", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby8", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby9", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby10", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby11", expectedLang: "Ruby", expectedSafe: true},
-		{filename: "ruby12", expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "seeplusplus"), expectedLang: "C++", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "iamjs.pl"), expectedLang: "JavaScript", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "iamjs2.pl"), expectedLang: "JavaScript", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "not_perl.pl"), expectedLang: "Prolog", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby2"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby3"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby4"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby5"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby6"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby7"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby8"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby9"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby10"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby11"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(modelinesDir, "ruby12"), expectedLang: "Ruby", expectedSafe: true},
+		{filename: filepath.Join(samplesDir, "C/main.c"), expectedLang: OtherLanguage, expectedSafe: false},
 	}

 	for _, test := range linguistTests {
-		content, err := ioutil.ReadFile(filepath.Join(modelinesDir, test.filename))
+		content, err := ioutil.ReadFile(test.filename)
 		c.Assert(err, Equals, nil)

 		lang, safe := GetLanguageByModeline(content)
@@ -64,6 +66,7 @@ func (s *TSuite) TestGetLanguageByModeline(c *C) {
 	const (
 		wrongVim  = `# vim: set syntax=ruby ft  =python filetype=perl :`
 		rightVim  = `/* vim: set syntax=python ft   =python filetype=python */`
+		noLangVim = `/* vim: set shiftwidth=4 softtabstop=0 cindent cinoptions={1s: */`
 	)

 	tests := []struct {
@@ -73,6 +76,7 @@ func (s *TSuite) TestGetLanguageByModeline(c *C) {
 	}{
 		{content: []byte(wrongVim), expectedLang: OtherLanguage, expectedSafe: false},
 		{content: []byte(rightVim), expectedLang: "Python", expectedSafe: true},
+		{content: []byte(noLangVim), expectedLang: OtherLanguage, expectedSafe: false},
 	}

 	for _, test := range tests {
--- a/types_map.go
+++ b/types_map.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 var languagesType = map[string]Type{
 	"1C Enterprise":    Programming,
@@ -59,6 +59,7 @@ var languagesType = map[string]Type{
 	"CSON":                     Data,
 	"CSS":                      Markup,
 	"CSV":                      Data,
+	"CWeb":                     Programming,
 	"Cap'n Proto":              Programming,
 	"CartoCSS":                 Programming,
 	"Ceylon":                   Programming,
@@ -70,6 +71,7 @@ var languagesType = map[string]Type{
 	"Clean":                    Programming,
 	"Click":                    Programming,
 	"Clojure":                  Programming,
+	"Closure Templates":        Markup,
 	"CoffeeScript":             Programming,
 	"ColdFusion":               Programming,
 	"ColdFusion CFC":           Programming,
@@ -193,6 +195,7 @@ var languagesType = map[string]Type{
 	"JavaScript":               Programming,
 	"Jison":                    Programming,
 	"Jison Lex":                Programming,
+	"Jolie":                    Programming,
 	"Julia":                    Programming,
 	"Jupyter Notebook":         Markup,
 	"KRL":                      Programming,
@@ -297,6 +300,7 @@ var languagesType = map[string]Type{
 	"Parrot Assembly":                Programming,
 	"Parrot Internal Representation": Programming,
 	"Pascal":                       Programming,
+	"Pep8":                         Programming,
 	"Perl":                         Programming,
 	"Perl6":                        Programming,
 	"Pic":                          Markup,
@@ -368,6 +372,7 @@ var languagesType = map[string]Type{
 	"Scheme":                       Programming,
 	"Scilab":                       Programming,
 	"Self":                         Programming,
+	"ShaderLab":                    Programming,
 	"Shell":                        Programming,
 	"ShellSession":                 Programming,
 	"Shen":                         Programming,
@@ -403,6 +408,7 @@ var languagesType = map[string]Type{
 	"Turing":                       Programming,
 	"Turtle":                       Data,
 	"Twig":                         Markup,
+	"Type Language":                Data,
 	"TypeScript":                   Programming,
 	"Unified Parallel C":           Programming,
 	"Unity3D Asset":                Data,
--- a/vendor_matchers.go
+++ b/vendor_matchers.go
@@ -2,7 +2,7 @@ package slinguist

 // CODE GENERATED AUTOMATICALLY WITH gopkg.in/src-d/simple-linguist.v1/internal/code-generator
 // THIS FILE SHOULD NOT BE EDITED BY HAND
-// Extracted from github/linguist commit: dae33dc2b20cddc85d1300435c3be7118a7115a9
+// Extracted from github/linguist commit: 60f864a138650dd17fafc94814be9ee2d3aaef8c

 import "gopkg.in/toqueteos/substring.v1"