tartrazine/data/generated.go
Miguel Molina 8ff885a3a8
implement IsGenerated helper to filter out generated files
Closes #17

Implements the IsGenerated helper function to filter out generated
files using the rules and matchers in:
- https://github.com/github/linguist/blob/master/lib/linguist/generated.rb

Since the vast majority of matchers have very different logic, it cannot
be autogenerated directly from linguist like other logics in enry, so it's
translated by hand.

There are three different types of matchers in this implementation:
- By extension, which mark a file as generated based only on its extension.
  These are the fastest matchers, so they're done first.
- By file name, which matches patterns against the filename. These
  are performed in second place. Unlike linguist, we try to use string
  functions instead of regexps as much as possible.
- Finally, the rest of the matchers, which go into the content and try
  to identify if they're generated or not based on the content. Unlike
  linguist, we try to only read the content we need and not split it
  all unless it's necessary and use byte functions instead of regexps
  as much as possible.

Signed-off-by: Miguel Molina <miguel@erizocosmi.co>
2020-05-28 08:55:13 +02:00

807 lines
18 KiB
Go

package data
import (
"bytes"
"strings"
"github.com/go-enry/go-enry/v2/regex"
)
// GeneratedCodeExtensions is the set of file extensions that are, on their
// own, enough to classify a file as generated.
var GeneratedCodeExtensions = map[string]struct{}{
	// Xcode files.
	".xcuserstate":     {},
	".xcworkspacedata": {},
	".nib":             {},
}
// GeneratedCodeNameMatcher is a predicate over a file name: it returns true
// when the file with the given name is considered generated. Matchers of this
// type never look at the file content.
type GeneratedCodeNameMatcher func(string) bool
// nameMatches builds a matcher that reports whether the file name matches the
// given regular expression. The pattern is compiled once, when the matcher is
// built, and compilation failure panics (MustCompile).
func nameMatches(pattern string) GeneratedCodeNameMatcher {
	compiled := regex.MustCompile(pattern)
	var matcher GeneratedCodeNameMatcher = func(candidate string) bool {
		return compiled.MatchString(candidate)
	}
	return matcher
}
// nameContains builds a matcher that reports whether the file name contains
// the given literal substring.
func nameContains(pattern string) GeneratedCodeNameMatcher {
	var matcher GeneratedCodeNameMatcher = func(candidate string) bool {
		return strings.Contains(candidate, pattern)
	}
	return matcher
}
// nameEndsWith builds a matcher that reports whether the file name ends with
// the given literal suffix.
func nameEndsWith(pattern string) GeneratedCodeNameMatcher {
	var matcher GeneratedCodeNameMatcher = func(candidate string) bool {
		return strings.HasSuffix(candidate, pattern)
	}
	return matcher
}
// GeneratedCodeNameMatchers lists every matcher that decides whether a file
// is generated by looking only at its name. Plain string matchers are used
// where the rule is a literal substring or suffix; regular expressions are
// used for the rest.
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
	// CocoaPods.
	nameMatches(`(^Pods|\/Pods)\/`),

	// Carthage build output.
	nameMatches(`(^|\/)Carthage\/Build\/`),

	// .NET designer files.
	nameMatches(`(?i)\.designer\.(cs|vb)$`),

	// Generated .NET SpecFlow feature files.
	nameEndsWith(".feature.cs"),

	// Node modules.
	nameContains("node_modules/"),

	// Go vendored dependencies.
	nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),

	// Go dependency lock files.
	nameEndsWith("Gopkg.lock"),
	nameEndsWith("glide.lock"),

	// Esy lock.
	nameMatches(`(^|\/)(\w+\.)?esy.lock$`),

	// npm shrinkwrap.
	nameEndsWith("npm-shrinkwrap.json"),

	// npm package lock.
	nameEndsWith("package-lock.json"),

	// Yarn Plug'n'Play.
	nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),

	// Godeps.
	nameContains("Godeps/"),

	// Composer lock.
	nameEndsWith("composer.lock"),

	// Files generated by Zephir.
	nameMatches(`.\.zep\.(?:c|h|php)$`),

	// Cargo lock.
	nameEndsWith("Cargo.lock"),

	// Pipenv lock.
	nameEndsWith("Pipfile.lock"),

	// GraphQL Relay.
	nameContains("__generated__/"),
}
// GeneratedCodeMatcher is a predicate that decides whether a file is
// generated code. It receives the file path, its extension and its content,
// and returns true when the file is considered generated.
type GeneratedCodeMatcher func(path, ext string, content []byte) bool
// GeneratedCodeMatchers lists every matcher that needs to inspect the file
// content (possibly together with its path or extension) to decide whether
// the file is generated.
var GeneratedCodeMatchers = []GeneratedCodeMatcher{
	isMinifiedFile,
	hasSourceMapReference,
	isSourceMap,
	isCompiledCoffeeScript,
	isGeneratedNetDocfile,
	isGeneratedJavaScriptPEGParser,
	isGeneratedPostScript,
	isGeneratedGo,
	isGeneratedProtobuf,
	isGeneratedJavaScriptProtocolBuffer,
	isGeneratedApacheThrift,
	isGeneratedJNIHeader,
	isVCRCassette,
	isCompiledCythonFile,
	isGeneratedModule,
	isGeneratedUnity3DMeta,
	isGeneratedRacc,
	isGeneratedJFlex,
	isGeneratedGrammarKit,
	isGeneratedRoxygen2,
	isGeneratedJison,
	isGeneratedGRPCCpp,
	isGeneratedDart,
	isGeneratedPerlPPPortHeader,
	isGeneratedGameMakerStudio,
	isGeneratedGimp,
	isGeneratedVisualStudio6,
	isGeneratedHaxe,
	isGeneratedHTML,
	isGeneratedJooq,
}
// canBeMinified reports whether files with the given extension are candidates
// for the minification checks. Only ".js" and ".css" qualify.
func canBeMinified(ext string) bool {
	switch ext {
	case ".js", ".css":
		return true
	default:
		return false
	}
}
// isMinifiedFile reports whether the file looks minified. A file is treated
// as minified when it is a css or js file and the average number of bytes per
// line (integer division) is greater than 110.
func isMinifiedFile(path, ext string, content []byte) bool {
	if !canBeMinified(ext) {
		return false
	}

	var totalChars, lineCount uint64
	tally := func(line []byte) {
		lineCount++
		totalChars += uint64(len(line))
	}
	forEachLine(content, tally)

	// No lines means no average to compute.
	if lineCount == 0 {
		return false
	}

	average := totalChars / lineCount
	return average > 110
}
var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`)

// hasSourceMapReference reports whether a css or js file refers to a
// source-map file. Only the lines returned by getLines(content, -2), i.e. up
// to two lines taken from the end of the file, are examined.
func hasSourceMapReference(_ string, ext string, content []byte) bool {
	if !canBeMinified(ext) {
		return false
	}

	tail := getLines(content, -2)
	for i := 0; i < len(tail); i++ {
		if sourceMapRegex.Match(tail[i]) {
			return true
		}
	}

	return false
}
var sourceMapRegexps = []regex.EnryRegexp{
	regex.MustCompile(`^{"version":\d+,`),
	regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`),
}

// isSourceMap reports whether the file itself is a source map.
//
// A file is a source map when its path ends in ".js.map" or ".css.map", or
// when its first line matches one of sourceMapRegexps. Empty content has no
// first line and is therefore never reported as a source map on the basis of
// its content.
func isSourceMap(path, _ string, content []byte) bool {
	if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") {
		return true
	}

	// getLines returns no lines for empty content; guard before indexing so
	// that empty files do not cause a panic.
	lines := getLines(content, 1)
	if len(lines) == 0 {
		return false
	}

	firstLine := lines[0]
	for _, r := range sourceMapRegexps {
		if r.Match(firstLine) {
			return true
		}
	}

	return false
}
// isCompiledCoffeeScript reports whether a ".js" file looks like the output
// of the CoffeeScript compiler.
//
// The file must open with the module closure "(function() {" on its first
// line and, among the two lines returned by getLines(content, -2) (which are
// in reverse order), have "}).call(this);" at index 1 and an empty line at
// index 0. When that frame is present, every line containing "var " is scored
// for CoffeeScript-specific identifiers and the file is reported as compiled
// when the score reaches 3.
//
// Files that are empty or too short to provide those lines are reported as
// not compiled instead of panicking.
func isCompiledCoffeeScript(path, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	firstLines := getLines(content, 1)
	lastLines := getLines(content, -2)
	// Guard the indexing below: short or empty files yield fewer lines.
	if len(firstLines) < 1 || len(lastLines) < 2 {
		return false
	}

	if string(firstLines[0]) != "(function() {" ||
		string(lastLines[1]) != "}).call(this);" ||
		len(lastLines[0]) != 0 {
		return false
	}

	score := 0
	forEachLine(content, func(line []byte) {
		if bytes.Contains(line, []byte("var ")) {
			// Underscored temp vars are likely to be Coffee
			score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results")

			// bind and extend functions are very Coffee specific
			score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice")
		}
	})

	// Require a score of 3. This is fairly arbitrary. Consider tweaking later.
	// See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213
	return score >= 3
}
// isGeneratedNetDocfile reports whether an ".xml" file looks like a generated
// .NET documentation file: it must split into more than three
// newline-separated pieces, with "<doc>" in the second, "<assembly>" in the
// third and "</doc>" in the second to last.
func isGeneratedNetDocfile(_, ext string, content []byte) bool {
	if ext != ".xml" {
		return false
	}

	lines := bytes.Split(content, []byte("\n"))
	count := len(lines)
	if count < 4 {
		return false
	}

	if !bytes.Contains(lines[1], []byte("<doc>")) {
		return false
	}
	if !bytes.Contains(lines[2], []byte("<assembly>")) {
		return false
	}
	return bytes.Contains(lines[count-2], []byte("</doc>"))
}
var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`)

// isGeneratedJavaScriptPEGParser reports whether a ".js" file is a parser
// generated by PEG.js. Such parsers carry a marker comment near the top of
// the file, so the first five lines are concatenated (without separators) and
// matched against pegJavaScriptGeneratedRegex.
func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	head := getLines(content, 5)
	joined := bytes.Join(head, nil)
	return pegJavaScriptGeneratedRegex.Match(joined)
}
var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`)

var postScriptRegexes = []regex.EnryRegexp{
	regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`),
	regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`),
}

// isGeneratedPostScript reports whether a ".ps", ".eps" or ".pfa" file was
// produced by a tool rather than written by hand.
func isGeneratedPostScript(_, ext string, content []byte) bool {
	switch ext {
	case ".ps", ".eps", ".pfa":
	default:
		return false
	}

	// Type 1 and Type 42 fonts converted to PostScript are stored as
	// hex-encoded byte streams; these streams are always preceded by the
	// `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42).
	if postScriptType1And42Regex.Match(content) {
		return true
	}

	// The "%%Creator:" comment names the author/generator of the file. If
	// present, it is expected within the first ten lines.
	creatorPrefix := []byte("%%Creator: ")
	var creator []byte
	for _, l := range getLines(content, 10) {
		if bytes.HasPrefix(l, creatorPrefix) {
			creator = l
			break
		}
	}
	if len(creator) == 0 {
		return false
	}

	// EAGLE doesn't include a version number when it generates PostScript.
	// However, it does prepend its name to the document's "%%Title" field.
	if bytes.Contains(creator, []byte("EAGLE")) {
		titlePrefix := []byte("%%Title: EAGLE Drawing ")
		for _, l := range getLines(content, 5) {
			if bytes.HasPrefix(l, titlePrefix) {
				return true
			}
		}
	}

	// Most generators write their version number, while human authors' or
	// companies' names don't contain numbers. So check whether the creator
	// line contains digits, plus some special cases without version numbers.
	for i := range postScriptRegexes {
		if postScriptRegexes[i].Match(creator) {
			return true
		}
	}

	return false
}
// isGeneratedGo reports whether a ".go" file carries the "Code generated by"
// marker in one of its first 40 lines. Files with at most one line are never
// reported as generated.
func isGeneratedGo(_, ext string, content []byte) bool {
	if ext != ".go" {
		return false
	}

	head := getLines(content, 40)
	if len(head) < 2 {
		return false
	}

	marker := []byte("Code generated by")
	for i := range head {
		if bytes.Contains(head[i], marker) {
			return true
		}
	}

	return false
}
// protoExtensions is the set of extensions checked for the protocol buffer
// compiler marker.
var protoExtensions = map[string]struct{}{
	".py":   {},
	".java": {},
	".h":    {},
	".cc":   {},
	".cpp":  {},
	".m":    {},
	".rb":   {},
	".php":  {},
}

// isGeneratedProtobuf reports whether the file was emitted by the protocol
// buffer compiler: its extension must be in protoExtensions and the compiler's
// "DO NOT EDIT!" banner must appear in one of the first three lines. Files
// with at most one line are never reported as generated.
func isGeneratedProtobuf(_, ext string, content []byte) bool {
	if _, known := protoExtensions[ext]; !known {
		return false
	}

	head := getLines(content, 3)
	if len(head) < 2 {
		return false
	}

	banner := []byte("Generated by the protocol buffer compiler. DO NOT EDIT!")
	for i := range head {
		if bytes.Contains(head[i], banner) {
			return true
		}
	}

	return false
}
// isGeneratedJavaScriptProtocolBuffer reports whether a ".js" file has the
// "GENERATED CODE -- DO NOT EDIT!" marker on its sixth line.
func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	const needed = 6
	head := getLines(content, needed)
	if len(head) >= needed {
		return bytes.Contains(head[needed-1], []byte("GENERATED CODE -- DO NOT EDIT!"))
	}

	return false
}
// apacheThriftExtensions is the set of extensions checked for the Thrift
// compiler marker.
var apacheThriftExtensions = map[string]struct{}{
	".rb":   {},
	".py":   {},
	".go":   {},
	".js":   {},
	".m":    {},
	".java": {},
	".h":    {},
	".cc":   {},
	".cpp":  {},
	".php":  {},
}

// isGeneratedApacheThrift reports whether the file was emitted by the Thrift
// compiler: its extension must be in apacheThriftExtensions and the
// "Autogenerated by Thrift Compiler" marker must appear in one of the first
// six lines.
func isGeneratedApacheThrift(_, ext string, content []byte) bool {
	if _, known := apacheThriftExtensions[ext]; !known {
		return false
	}

	marker := []byte("Autogenerated by Thrift Compiler")
	head := getLines(content, 6)
	for i := range head {
		if bytes.Contains(head[i], marker) {
			return true
		}
	}

	return false
}
// isGeneratedJNIHeader reports whether a ".h" file is a machine-generated JNI
// header: the first line must carry the "DO NOT EDIT" banner and the second
// line must include <jni.h>.
func isGeneratedJNIHeader(_, ext string, content []byte) bool {
	if ext != ".h" {
		return false
	}

	head := getLines(content, 2)
	if len(head) < 2 {
		return false
	}

	if !bytes.Contains(head[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) {
		return false
	}
	return bytes.Contains(head[1], []byte("#include <jni.h>"))
}
// isVCRCassette reports whether a ".yml" file is a VCR cassette. It fetches
// two lines from the end of the file with getLines(content, -2), which
// returns them in reverse order, and looks for "recorded_with: VCR" in the
// element at index 1.
func isVCRCassette(_, ext string, content []byte) bool {
	if ext != ".yml" {
		return false
	}

	tail := getLines(content, -2)
	if len(tail) >= 2 {
		return bytes.Contains(tail[1], []byte("recorded_with: VCR"))
	}

	return false
}
// isCompiledCythonFile reports whether a ".c" or ".cpp" file has
// "Generated by Cython" on its first line.
func isCompiledCythonFile(_, ext string, content []byte) bool {
	switch ext {
	case ".c", ".cpp":
	default:
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	return bytes.Contains(first[0], []byte("Generated by Cython"))
}
// isGeneratedModule reports whether a ".mod" file is generated, judging by
// its first line: it must contain either the "PCBNEW-LibModule-V" marker or
// the "GFORTRAN module version '" marker.
func isGeneratedModule(_, ext string, content []byte) bool {
	if ext != ".mod" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	if bytes.Contains(first[0], []byte("PCBNEW-LibModule-V")) {
		return true
	}
	return bytes.Contains(first[0], []byte("GFORTRAN module version '"))
}
// isGeneratedUnity3DMeta reports whether a ".meta" file has
// "fileFormatVersion: " on its first line.
func isGeneratedUnity3DMeta(_, ext string, content []byte) bool {
	if ext != ".meta" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	marker := []byte("fileFormatVersion: ")
	return bytes.Contains(first[0], marker)
}
// isGeneratedRacc reports whether a ".rb" file was generated by Racc: its
// third line must start with the Racc banner comment.
func isGeneratedRacc(_, ext string, content []byte) bool {
	if ext != ".rb" {
		return false
	}

	head := getLines(content, 3)
	if len(head) >= 3 {
		return bytes.HasPrefix(head[2], []byte("# This file is automatically generated by Racc"))
	}

	return false
}
// isGeneratedJFlex reports whether a ".java" file was generated by JFlex: its
// first line must start with the JFlex banner comment.
func isGeneratedJFlex(_, ext string, content []byte) bool {
	if ext != ".java" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	banner := []byte("/* The following code was generated by JFlex ")
	return bytes.HasPrefix(first[0], banner)
}
// isGeneratedGrammarKit reports whether a ".java" file has the "This is a
// generated file" comment on its first line.
func isGeneratedGrammarKit(_, ext string, content []byte) bool {
	if ext != ".java" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	banner := []byte("// This is a generated file. Not intended for manual editing.")
	return bytes.Contains(first[0], banner)
}
// isGeneratedRoxygen2 reports whether an ".rd" file has the roxygen2 "do not
// edit by hand" comment on its first line.
func isGeneratedRoxygen2(_, ext string, content []byte) bool {
	if ext != ".rd" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	banner := []byte("% Generated by roxygen2: do not edit by hand")
	return bytes.Contains(first[0], banner)
}
// isGeneratedJison reports whether a ".js" file was generated by jison or
// jison-lex, judging by the banner comment on its first line.
func isGeneratedJison(_, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	if bytes.Contains(first[0], []byte("/* parser generated by jison ")) {
		return true
	}
	return bytes.Contains(first[0], []byte("/* generated by jison-lex "))
}
// isGeneratedGRPCCpp reports whether a ".cpp", ".hpp", ".h" or ".cc" file has
// "// Generated by the gRPC" on its first line.
func isGeneratedGRPCCpp(_, ext string, content []byte) bool {
	if ext != ".cpp" && ext != ".hpp" && ext != ".h" && ext != ".cc" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	return bytes.Contains(first[0], []byte("// Generated by the gRPC"))
}
var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`)

// isGeneratedDart reports whether a ".dart" file has a "generated code ... do
// not modify" notice on its first line. The line is lowercased before being
// matched against dartRegex, which makes the check case-insensitive.
func isGeneratedDart(_, ext string, content []byte) bool {
	if ext != ".dart" {
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	lowered := bytes.ToLower(first[0])
	return dartRegex.Match(lowered)
}
// isGeneratedPerlPPPortHeader reports whether a file whose name ends in
// "ppport.h" was created by Devel::PPPort: the file must have at least ten
// lines and the ninth must contain the Devel::PPPort notice.
func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool {
	if !strings.HasSuffix(name, "ppport.h") {
		return false
	}

	head := getLines(content, 10)
	if len(head) >= 10 {
		return bytes.Contains(head[8], []byte("Automatically created by Devel::PPPort"))
	}

	return false
}
var (
	gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`)
	gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`)
)

// isGeneratedGameMakerStudio reports whether a ".yy" or ".yyp" file was
// generated by GameMaker Studio. The file needs at least three lines; it
// matches when the third line matches gameMakerStudioThirdLineRegex or the
// first line matches gameMakerStudioFirstLineRegex.
func isGeneratedGameMakerStudio(_, ext string, content []byte) bool {
	switch ext {
	case ".yy", ".yyp":
	default:
		return false
	}

	head := getLines(content, 3)
	if len(head) < 3 {
		return false
	}

	if gameMakerStudioThirdLineRegex.Match(head[2]) {
		return true
	}
	return gameMakerStudioFirstLineRegex.Match(head[0])
}
var gimpRegexes = []regex.EnryRegexp{
	regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`),
	regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`),
}

// isGeneratedGimp reports whether a ".c" or ".h" file is a GIMP image dump,
// i.e. its first line matches one of gimpRegexes.
func isGeneratedGimp(_, ext string, content []byte) bool {
	switch ext {
	case ".c", ".h":
	default:
		return false
	}

	first := getLines(content, 1)
	if len(first) == 0 {
		return false
	}

	for i := range gimpRegexes {
		if gimpRegexes[i].Match(first[0]) {
			return true
		}
	}

	return false
}
// isGeneratedVisualStudio6 reports whether a ".dsp" file has the "Microsoft
// Developer Studio Generated Build File" marker in one of its first three
// lines.
func isGeneratedVisualStudio6(_, ext string, content []byte) bool {
	if ext != ".dsp" {
		return false
	}

	marker := []byte("# Microsoft Developer Studio Generated Build File")
	head := getLines(content, 3)
	for i := range head {
		if bytes.Contains(head[i], marker) {
			return true
		}
	}

	return false
}
// haxeExtensions is the set of extensions checked for the Haxe marker.
var haxeExtensions = map[string]struct{}{
	".js":   {},
	".py":   {},
	".lua":  {},
	".cpp":  {},
	".h":    {},
	".java": {},
	".cs":   {},
	".php":  {},
}

// isGeneratedHaxe reports whether the file was generated by Haxe: its
// extension must be in haxeExtensions and "Generated by Haxe" must appear in
// one of the first three lines.
func isGeneratedHaxe(_, ext string, content []byte) bool {
	if _, known := haxeExtensions[ext]; !known {
		return false
	}

	marker := []byte("Generated by Haxe")
	head := getLines(content, 3)
	for i := range head {
		if bytes.Contains(head[i], marker) {
			return true
		}
	}

	return false
}
var (
	doxygenRegex         = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`)
	htmlMetaRegex        = regex.MustCompile(`<meta(\s+[^>]+)>`)
	htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`)
	orgModeMetaRegex     = regex.MustCompile(`org\s+mode`)
)

// isGeneratedHTML reports whether an ".html", ".htm" or ".xhtml" file was
// produced by a documentation or conversion tool. Only the first 30 lines are
// inspected. The checks, in order, are:
//   - a pkgdown banner comment in one of the first two lines;
//   - a mandoc banner comment at the start of the third line;
//   - a Doxygen banner comment on any inspected line;
//   - a <meta name="generator"> tag whose content/value names one of the
//     known generators.
//
// Files with fewer than two lines (including empty files) are handled without
// panicking.
func isGeneratedHTML(_, ext string, content []byte) bool {
	if ext != ".html" && ext != ".htm" && ext != ".xhtml" {
		return false
	}

	lines := getLines(content, 30)

	// Pkgdown: look at up to the first two lines. Clamp the slice bound so
	// that files with fewer than two lines do not cause a panic.
	head := lines
	if len(head) > 2 {
		head = head[:2]
	}
	for _, l := range head {
		if bytes.Contains(l, []byte("<!-- Generated by pkgdown: do not edit by hand -->")) {
			return true
		}
	}

	// Mandoc
	if len(lines) > 2 &&
		bytes.HasPrefix(lines[2], []byte("<!-- This is an automatically generated file.")) {
		return true
	}

	// Doxygen
	for _, l := range lines {
		if doxygenRegex.Match(l) {
			return true
		}
	}

	// HTML tag: <meta name="generator" content="" />
	// The inspected lines are lowercased and joined into a single line so
	// that meta tags spanning several lines can still be matched.
	part := bytes.ToLower(bytes.Join(lines, []byte{' '}))
	part = bytes.ReplaceAll(part, []byte{'\n'}, []byte{})
	part = bytes.ReplaceAll(part, []byte{'\r'}, []byte{})
	matches := htmlMetaRegex.FindAll(part, -1)
	if len(matches) == 0 {
		return false
	}

	for _, m := range matches {
		var name, value, metaContent string
		attrs := htmlMetaContentRegex.FindAllStringSubmatch(string(m), -1)
		for _, attr := range attrs {
			switch attr[1] {
			case "name":
				name = attr[2]
			case "value":
				value = attr[2]
			case "content":
				metaContent = attr[2]
			}
		}

		// Prefer the "value" attribute and fall back to "content".
		val := value
		if val == "" {
			val = metaContent
		}

		name = strings.Trim(name, `"'`)
		val = strings.Trim(val, `"'`)

		if name != "generator" || val == "" {
			continue
		}

		if strings.Contains(val, "jlatex2html") ||
			strings.Contains(val, "latex2html") ||
			strings.Contains(val, "groff") ||
			strings.Contains(val, "makeinfo") ||
			strings.Contains(val, "texi2html") ||
			strings.Contains(val, "ronn") ||
			orgModeMetaRegex.MatchString(val) {
			return true
		}
	}

	return false
}
// isGeneratedJooq reports whether a ".java" file has "This file is generated
// by jOOQ." in one of its first two lines.
func isGeneratedJooq(_, ext string, content []byte) bool {
	if ext != ".java" {
		return false
	}

	marker := []byte("This file is generated by jOOQ.")
	head := getLines(content, 2)
	for i := range head {
		if bytes.Contains(head[i], marker) {
			return true
		}
	}

	return false
}
// getLines returns up to the first n lines of content, split on '\n' (the
// newline itself is not included in the returned lines). When n is negative
// it returns up to -n lines taken from the end of content, in reverse order
// (last line first).
//
// In the negative case, a line is only returned when it starts before the
// last byte of content; in particular the empty line that follows a trailing
// newline is skipped.
//
// The returned slices alias content; nothing is copied.
func getLines(content []byte, n int) [][]byte {
	var lines [][]byte
	total := len(content)

	if n >= 0 {
		start := 0
		for start < total && len(lines) < n {
			end := total
			if off := bytes.IndexByte(content[start:], '\n'); off >= 0 {
				end = start + off
			}
			lines = append(lines, content[start:end])
			start = end + 1
		}
		return lines
	}

	end := total
	for end > 0 && len(lines) < -n {
		nl := bytes.LastIndexByte(content[:end], '\n')
		if begin := nl + 1; begin < total-1 {
			lines = append(lines, content[begin:end])
		}
		end = nl
	}
	return lines
}
// forEachLine calls cb once for every '\n'-separated line of content, in
// order, without the newline. A trailing newline does not produce an extra
// empty line, and empty content produces no calls. The slices passed to cb
// alias content.
func forEachLine(content []byte, cb func([]byte)) {
	for start := 0; start < len(content); {
		end := len(content)
		if off := bytes.IndexByte(content[start:], '\n'); off >= 0 {
			end = start + off
		}
		cb(content[start:end])
		start = end + 1
	}
}
// countAppearancesInLine returns the total number of non-overlapping
// occurrences in line of each of the given targets, summed together.
func countAppearancesInLine(line []byte, targets ...string) int {
	total := 0
	for i := 0; i < len(targets); i++ {
		total += bytes.Count(line, []byte(targets[i]))
	}
	return total
}