tartrazine/data/generated.go
Miguel Molina 78696c2272
data: bailout in some cases if there arent enough lines
Signed-off-by: Miguel Molina <miguel@erizocosmi.co>
2020-05-28 13:39:59 +02:00

824 lines
18 KiB
Go

package data
import (
"bytes"
"strings"
"github.com/go-enry/go-enry/v2/regex"
)
// GeneratedCodeExtensions is the set of file extensions (with the leading
// dot) that always denote generated files. The map is used as a set, so the
// values carry no information.
var GeneratedCodeExtensions = map[string]struct{}{
// Xcode files
".nib": {},
".xcworkspacedata": {},
".xcuserstate": {},
}
// GeneratedCodeNameMatcher is a predicate over a file name: it returns true
// when the file with the given name is considered generated.
type GeneratedCodeNameMatcher func(string) bool
// nameMatches builds a matcher that reports whether a file name matches the
// given regular expression. The pattern is compiled once, when the matcher is
// created, via regex.MustCompile.
func nameMatches(pattern string) GeneratedCodeNameMatcher {
	compiled := regex.MustCompile(pattern)
	// The bound method value has exactly the matcher's signature.
	return compiled.MatchString
}
// nameContains builds a matcher that reports whether a file name contains
// pattern as a plain substring (no regular expression semantics).
func nameContains(pattern string) GeneratedCodeNameMatcher {
	var matcher GeneratedCodeNameMatcher = func(fileName string) bool {
		return strings.Contains(fileName, pattern)
	}
	return matcher
}
// nameEndsWith builds a matcher that reports whether a file name ends with
// pattern, compared as a plain suffix.
func nameEndsWith(pattern string) GeneratedCodeNameMatcher {
	var matcher GeneratedCodeNameMatcher = func(fileName string) bool {
		return strings.HasSuffix(fileName, pattern)
	}
	return matcher
}
// GeneratedCodeNameMatchers lists every matcher that decides whether a file
// is generated by looking only at its name. Each entry is built with
// nameMatches (regular expression), nameContains (substring) or nameEndsWith
// (suffix).
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
// CocoaPods
nameMatches(`(^Pods|\/Pods)\/`),
// Carthage build
nameMatches(`(^|\/)Carthage\/Build\/`),
// .NET designer file (extension compared case-insensitively)
nameMatches(`(?i)\.designer\.(cs|vb)$`),
// Generated .NET SpecFlow feature file
nameEndsWith(".feature.cs"),
// Node modules
nameContains("node_modules/"),
// Go vendor
nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),
// Go lock
nameEndsWith("Gopkg.lock"),
nameEndsWith("glide.lock"),
// Esy lock
nameMatches(`(^|\/)(\w+\.)?esy.lock$`),
// NPM shrinkwrap
nameEndsWith("npm-shrinkwrap.json"),
// NPM package lock
nameEndsWith("package-lock.json"),
// Yarn Plug'n'Play
nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),
// Godeps
nameContains("Godeps/"),
// Composer lock
nameEndsWith("composer.lock"),
// Generated by Zephir
nameMatches(`.\.zep\.(?:c|h|php)$`),
// Cargo lock
nameEndsWith("Cargo.lock"),
// Pipenv lock
nameEndsWith("Pipfile.lock"),
// GraphQL Relay
nameContains("__generated__/"),
}
// GeneratedCodeMatcher is a predicate that decides whether a file is
// generated code. It receives the file path, its extension (the matchers in
// this file compare it with a leading dot, e.g. ".js") and the file content.
type GeneratedCodeMatcher func(path, ext string, content []byte) bool
// GeneratedCodeMatchers is the list of all generated code matchers that
// rely on checking the content of the file to make the guess. Every matcher
// first filters on the extension or path and returns false for files it does
// not apply to.
var GeneratedCodeMatchers = []GeneratedCodeMatcher{
isMinifiedFile,
hasSourceMapReference,
isSourceMap,
isCompiledCoffeeScript,
isGeneratedNetDocfile,
isGeneratedJavaScriptPEGParser,
isGeneratedPostScript,
isGeneratedGo,
isGeneratedProtobuf,
isGeneratedJavaScriptProtocolBuffer,
isGeneratedApacheThrift,
isGeneratedJNIHeader,
isVCRCassette,
isCompiledCythonFile,
isGeneratedModule,
isGeneratedUnity3DMeta,
isGeneratedRacc,
isGeneratedJFlex,
isGeneratedGrammarKit,
isGeneratedRoxygen2,
isGeneratedJison,
isGeneratedGRPCCpp,
isGeneratedDart,
isGeneratedPerlPPPortHeader,
isGeneratedGameMakerStudio,
isGeneratedGimp,
isGeneratedVisualStudio6,
isGeneratedHaxe,
isGeneratedHTML,
isGeneratedJooq,
}
// canBeMinified reports whether ext is one of the extensions considered for
// minification checks: ".js" or ".css" (exact, case-sensitive match).
func canBeMinified(ext string) bool {
	switch ext {
	case ".js", ".css":
		return true
	default:
		return false
	}
}
// isMinifiedFile reports whether the file looks minified. Only files accepted
// by canBeMinified are considered; such a file counts as minified when its
// average line length exceeds 110 bytes. The average is computed with integer
// division over the lines reported by forEachLine, so newline bytes are not
// counted. Empty content is never minified.
func isMinifiedFile(path, ext string, content []byte) bool {
	if !canBeMinified(ext) {
		return false
	}

	var totalChars, lineCount uint64
	forEachLine(content, func(l []byte) {
		lineCount++
		totalChars += uint64(len(l))
	})

	// The lineCount guard short-circuits before the division, so a file
	// without lines never divides by zero.
	return lineCount > 0 && totalChars/lineCount > 110
}
// sourceMapRegex matches a source-map reference comment at the start of a
// line, or a "sourceURL=" marker anywhere in it.
var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`)

// hasSourceMapReference reports whether one of the last two lines of a file
// accepted by canBeMinified references a source-map file.
func hasSourceMapReference(_ string, ext string, content []byte) bool {
	if !canBeMinified(ext) {
		return false
	}

	tail := getLines(content, -2)
	for i := range tail {
		if sourceMapRegex.Match(tail[i]) {
			return true
		}
	}
	return false
}
// sourceMapRegexps are the patterns a source map's first line may start with.
var sourceMapRegexps = []regex.EnryRegexp{
	regex.MustCompile(`^{"version":\d+,`),
	regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`),
}

// isSourceMap reports whether the file itself is a source map: either its
// path ends in ".js.map" or ".css.map", or its first line is non-empty and
// matches one of sourceMapRegexps.
func isSourceMap(path, _ string, content []byte) bool {
	for _, suffix := range [...]string{".js.map", ".css.map"} {
		if strings.HasSuffix(path, suffix) {
			return true
		}
	}

	head := getFirstLine(content)
	if len(head) == 0 {
		return false
	}

	for i := range sourceMapRegexps {
		if sourceMapRegexps[i].Match(head) {
			return true
		}
	}
	return false
}
// isCompiledCoffeeScript reports whether a .js file looks like CoffeeScript
// compiler output. The file must open with "(function() {", and the two
// trailing lines reported by getLines must be "}).call(this);" followed by an
// empty line. When that wrapper is present, every line containing "var " is
// scored by the Coffee-specific identifiers it contains.
func isCompiledCoffeeScript(path, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	head := getFirstLine(content)
	// getLines with a negative count yields the trailing lines in reverse
	// order: tail[0] is the last reported line, tail[1] the one before it.
	tail := getLines(content, -2)
	if len(tail) < 2 {
		return false
	}

	wrapped := string(head) == "(function() {" &&
		string(tail[1]) == "}).call(this);" &&
		string(tail[0]) == ""
	if !wrapped {
		return false
	}

	var score int
	varKeyword := []byte("var ")
	forEachLine(content, func(l []byte) {
		if !bytes.Contains(l, varKeyword) {
			return
		}
		// Underscored temp vars are likely to be Coffee.
		score += countAppearancesInLine(l, "_fn", "_i", "_len", "_ref", "_results")
		// bind and extend functions are very Coffee specific.
		score += 3 * countAppearancesInLine(l, "__bind", "__extends", "__hasProp", "__indexOf", "__slice")
	})

	// Require a score of 3. This is fairly arbitrary. Consider tweaking later.
	// See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213
	return score >= 3
}
// isGeneratedNetDocfile reports whether an .xml file looks like a generated
// .NET documentation file. The content is split on '\n' (a trailing newline
// therefore yields a final empty element) and must have more than three
// elements, with "<doc>" on the second, "<assembly>" on the third and
// "</doc>" on the second-to-last.
func isGeneratedNetDocfile(_, ext string, content []byte) bool {
	if ext != ".xml" {
		return false
	}

	rows := bytes.Split(content, []byte("\n"))
	total := len(rows)
	if total < 4 {
		return false
	}

	switch {
	case !bytes.Contains(rows[1], []byte("<doc>")):
		return false
	case !bytes.Contains(rows[2], []byte("<assembly>")):
		return false
	}
	return bytes.Contains(rows[total-2], []byte("</doc>"))
}
// pegJavaScriptGeneratedRegex matches "Generated by PEG.js" inside the first
// block comment of the text it is applied to.
var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`)

// isGeneratedJavaScriptPEGParser reports whether a .js file is a parser
// generated by PEG.js. Such parsers carry a marker comment near the top, so
// only the first five lines, concatenated without separators, are inspected.
func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	header := bytes.Join(getLines(content, 5), nil)
	return pegJavaScriptGeneratedRegex.Match(header)
}
// postScriptType1And42Regex matches the `eexec` operator or the `/sfnts` key
// at the start of a line.
var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`)

// postScriptRegexes match "%%Creator:" lines written by generators: either a
// digit (a version number) or one of a few known generator names.
var postScriptRegexes = []regex.EnryRegexp{
	regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`),
	regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`),
}

// isGeneratedPostScript reports whether a .ps, .eps or .pfa file was produced
// by a tool rather than written by hand.
func isGeneratedPostScript(_, ext string, content []byte) bool {
	switch ext {
	case ".ps", ".eps", ".pfa":
		// Supported PostScript extensions; keep going.
	default:
		return false
	}

	// Type 1 and Type 42 fonts converted to PostScript are stored as
	// hex-encoded byte streams; these streams are always preceded by the
	// `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42).
	if postScriptType1And42Regex.Match(content) {
		return true
	}

	// The "%%Creator:" comment names the author/generator of the file. If
	// present, it should be within the first few lines.
	creatorPrefix := []byte("%%Creator: ")
	var creator []byte
	for _, l := range getLines(content, 10) {
		if bytes.HasPrefix(l, creatorPrefix) {
			creator = l
			break
		}
	}
	if len(creator) == 0 {
		return false
	}

	// EAGLE doesn't include a version number when it generates PostScript.
	// However, it does prepend its name to the document's "%%Title" field.
	if bytes.Contains(creator, []byte("EAGLE")) {
		titlePrefix := []byte("%%Title: EAGLE Drawing ")
		for _, l := range getLines(content, 5) {
			if bytes.HasPrefix(l, titlePrefix) {
				return true
			}
		}
	}

	// Most generators write their version number, while human authors' or
	// companies' names don't contain numbers. So check whether the line
	// contains digits, plus some special cases without version numbers.
	for i := range postScriptRegexes {
		if postScriptRegexes[i].Match(creator) {
			return true
		}
	}
	return false
}
// isGeneratedGo reports whether a .go file carries the "Code generated by"
// marker within its first 40 lines. Files with fewer than two lines are never
// considered generated.
func isGeneratedGo(_, ext string, content []byte) bool {
	if ext != ".go" {
		return false
	}

	head := getLines(content, 40)
	if len(head) < 2 {
		return false
	}

	marker := []byte("Code generated by")
	for i := range head {
		if bytes.Contains(head[i], marker) {
			return true
		}
	}
	return false
}
// protoExtensions is the set of extensions checked for the protocol buffer
// compiler banner.
var protoExtensions = map[string]struct{}{
	".py":   {},
	".java": {},
	".h":    {},
	".cc":   {},
	".cpp":  {},
	".m":    {},
	".rb":   {},
	".php":  {},
}

// isGeneratedProtobuf reports whether a file with one of protoExtensions has
// the protocol buffer compiler banner within its first three lines. Files
// with fewer than two lines are never considered generated.
func isGeneratedProtobuf(_, ext string, content []byte) bool {
	if _, known := protoExtensions[ext]; !known {
		return false
	}

	head := getLines(content, 3)
	if len(head) < 2 {
		return false
	}

	banner := []byte("Generated by the protocol buffer compiler. DO NOT EDIT!")
	for i := range head {
		if bytes.Contains(head[i], banner) {
			return true
		}
	}
	return false
}
// isGeneratedJavaScriptProtocolBuffer reports whether the sixth line of a .js
// file contains the "GENERATED CODE -- DO NOT EDIT!" banner.
func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool {
	if ext != ".js" {
		return false
	}

	if head := getLines(content, 6); len(head) >= 6 {
		return bytes.Contains(head[5], []byte("GENERATED CODE -- DO NOT EDIT!"))
	}
	return false
}
// apacheThriftExtensions is the set of extensions checked for the Thrift
// compiler banner.
var apacheThriftExtensions = map[string]struct{}{
	".rb":   {},
	".py":   {},
	".go":   {},
	".js":   {},
	".m":    {},
	".java": {},
	".h":    {},
	".cc":   {},
	".cpp":  {},
	".php":  {},
}

// isGeneratedApacheThrift reports whether a file with one of
// apacheThriftExtensions has the Thrift compiler banner within its first six
// lines.
func isGeneratedApacheThrift(_, ext string, content []byte) bool {
	if _, known := apacheThriftExtensions[ext]; !known {
		return false
	}

	banner := []byte("Autogenerated by Thrift Compiler")
	head := getLines(content, 6)
	for i := range head {
		if bytes.Contains(head[i], banner) {
			return true
		}
	}
	return false
}
// isGeneratedJNIHeader reports whether a .h file starts with the JNI
// "machine generated" banner on its first line and includes <jni.h> on its
// second line.
func isGeneratedJNIHeader(_, ext string, content []byte) bool {
	if ext != ".h" {
		return false
	}

	head := getLines(content, 2)
	if len(head) < 2 {
		return false
	}

	if !bytes.Contains(head[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) {
		return false
	}
	return bytes.Contains(head[1], []byte("#include <jni.h>"))
}
// isVCRCassette reports whether a .yml file is a VCR cassette. It asks
// getLines for the last two lines (returned in reverse order) and looks for
// "recorded_with: VCR" in the second one returned, i.e. the earlier of the
// two.
func isVCRCassette(_, ext string, content []byte) bool {
	if ext != ".yml" {
		return false
	}

	if tail := getLines(content, -2); len(tail) >= 2 {
		return bytes.Contains(tail[1], []byte("recorded_with: VCR"))
	}
	return false
}
func isCompiledCythonFile(_, ext string, content []byte) bool {
if ext != ".c" && ext != ".cpp" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("Generated by Cython"))
}
func isGeneratedModule(_, ext string, content []byte) bool {
if ext != ".mod" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) ||
bytes.Contains(lines[0], []byte("GFORTRAN module version '"))
}
func isGeneratedUnity3DMeta(_, ext string, content []byte) bool {
if ext != ".meta" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("fileFormatVersion: "))
}
// isGeneratedRacc reports whether the third line of a .rb file starts with
// the Racc "automatically generated" banner.
func isGeneratedRacc(_, ext string, content []byte) bool {
	if ext != ".rb" {
		return false
	}

	if head := getLines(content, 3); len(head) >= 3 {
		return bytes.HasPrefix(head[2], []byte("# This file is automatically generated by Racc"))
	}
	return false
}
func isGeneratedJFlex(_, ext string, content []byte) bool {
if ext != ".java" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex "))
}
func isGeneratedGrammarKit(_, ext string, content []byte) bool {
if ext != ".java" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("// This is a generated file. Not intended for manual editing."))
}
func isGeneratedRoxygen2(_, ext string, content []byte) bool {
if ext != ".rd" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand"))
}
func isGeneratedJison(_, ext string, content []byte) bool {
if ext != ".js" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) ||
bytes.Contains(lines[0], []byte("/* generated by jison-lex "))
}
func isGeneratedGRPCCpp(_, ext string, content []byte) bool {
switch ext {
case ".cpp", ".hpp", ".h", ".cc":
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("// Generated by the gRPC"))
default:
return false
}
}
// dartRegex matches the "generated code ... do not modify" banner; it is
// applied to lower-cased text.
var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`)

// isGeneratedDart reports whether the first line of a .dart file, compared
// case-insensitively, carries the generated-code banner.
func isGeneratedDart(_, ext string, content []byte) bool {
	if ext != ".dart" {
		return false
	}

	head := getLines(content, 1)
	if len(head) == 0 {
		return false
	}

	lowered := bytes.ToLower(head[0])
	return dartRegex.Match(lowered)
}
// isGeneratedPerlPPPortHeader reports whether a file whose name ends in
// "ppport.h" has the Devel::PPPort banner on its ninth line. At least ten
// lines are required.
func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool {
	if !strings.HasSuffix(name, "ppport.h") {
		return false
	}

	if head := getLines(content, 10); len(head) >= 10 {
		return bytes.Contains(head[8], []byte("Automatically created by Devel::PPPort"))
	}
	return false
}
var (
	// gameMakerStudioFirstLineRegex is tried against the first line.
	gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`)
	// gameMakerStudioThirdLineRegex is tried against the third line.
	gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`)
)

// isGeneratedGameMakerStudio reports whether a .yy or .yyp file with at least
// three lines matches either the third-line or the first-line GameMaker
// Studio pattern.
func isGeneratedGameMakerStudio(_, ext string, content []byte) bool {
	switch ext {
	case ".yy", ".yyp":
		// Supported extensions; keep going.
	default:
		return false
	}

	head := getLines(content, 3)
	if len(head) < 3 {
		return false
	}

	if gameMakerStudioThirdLineRegex.Match(head[2]) {
		return true
	}
	return gameMakerStudioFirstLineRegex.Match(head[0])
}
// gimpRegexes match the header comments of GIMP C-source and header image
// dumps.
var gimpRegexes = []regex.EnryRegexp{
	regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`),
	regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`),
}

// isGeneratedGimp reports whether the first line of a .c or .h file matches
// one of gimpRegexes.
func isGeneratedGimp(_, ext string, content []byte) bool {
	switch ext {
	case ".c", ".h":
		// Supported extensions; keep going.
	default:
		return false
	}

	head := getLines(content, 1)
	if len(head) == 0 {
		return false
	}

	first := head[0]
	for i := range gimpRegexes {
		if gimpRegexes[i].Match(first) {
			return true
		}
	}
	return false
}
// isGeneratedVisualStudio6 reports whether one of the first three lines of a
// .dsp file contains the "Microsoft Developer Studio Generated Build File"
// banner.
func isGeneratedVisualStudio6(_, ext string, content []byte) bool {
	if ext != ".dsp" {
		return false
	}

	banner := []byte("# Microsoft Developer Studio Generated Build File")
	head := getLines(content, 3)
	for i := range head {
		if bytes.Contains(head[i], banner) {
			return true
		}
	}
	return false
}
// haxeExtensions is the set of extensions checked for the Haxe banner.
var haxeExtensions = map[string]struct{}{
	".js":   {},
	".py":   {},
	".lua":  {},
	".cpp":  {},
	".h":    {},
	".java": {},
	".cs":   {},
	".php":  {},
}

// isGeneratedHaxe reports whether a file with one of haxeExtensions contains
// "Generated by Haxe" within its first three lines.
func isGeneratedHaxe(_, ext string, content []byte) bool {
	if _, known := haxeExtensions[ext]; !known {
		return false
	}

	banner := []byte("Generated by Haxe")
	head := getLines(content, 3)
	for i := range head {
		if bytes.Contains(head[i], banner) {
			return true
		}
	}
	return false
}
var (
	// doxygenRegex matches the "Generated by Doxygen <version>" comment.
	doxygenRegex = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`)
	// htmlMetaRegex matches a whole <meta ...> tag.
	htmlMetaRegex = regex.MustCompile(`<meta(\s+[^>]+)>`)
	// htmlMetaContentRegex captures name/content/value attributes of a meta
	// tag together with their (possibly quoted) values.
	htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`)
	// orgModeMetaRegex matches an "org mode" generator value.
	orgModeMetaRegex = regex.MustCompile(`org\s+mode`)
)

// isGeneratedHTML reports whether an .html, .htm or .xhtml file was produced
// by a known generator. Only the first 30 lines are inspected, looking in
// turn for a pkgdown comment, a mandoc comment, a Doxygen comment and finally
// a <meta name="generator"> tag naming a known tool.
func isGeneratedHTML(_, ext string, content []byte) bool {
	switch ext {
	case ".html", ".htm", ".xhtml":
		// Supported extensions; keep going.
	default:
		return false
	}

	head := getLines(content, 30)

	// Pkgdown: banner on one of the first two lines (needs at least two).
	if len(head) >= 2 {
		pkgdown := []byte("<!-- Generated by pkgdown: do not edit by hand -->")
		if bytes.Contains(head[0], pkgdown) || bytes.Contains(head[1], pkgdown) {
			return true
		}
	}

	// Mandoc: banner at the start of the third line.
	if len(head) > 2 {
		if bytes.HasPrefix(head[2], []byte("<!-- This is an automatically generated file.")) {
			return true
		}
	}

	// Doxygen: banner anywhere in the inspected lines.
	for i := range head {
		if doxygenRegex.Match(head[i]) {
			return true
		}
	}

	// HTML tag: <meta name="generator" content="" />
	// The inspected lines are lower-cased and joined with spaces so that a
	// tag spanning several lines can still be matched.
	joined := bytes.ToLower(bytes.Join(head, []byte{' '}))
	joined = bytes.ReplaceAll(joined, []byte{'\n'}, []byte{})
	joined = bytes.ReplaceAll(joined, []byte{'\r'}, []byte{})

	knownGenerators := [...]string{
		"jlatex2html",
		"latex2html",
		"groff",
		"makeinfo",
		"texi2html",
		"ronn",
	}

	for _, tag := range htmlMetaRegex.FindAll(joined, -1) {
		var attrName, attrValue, attrContent string
		for _, attr := range htmlMetaContentRegex.FindAllStringSubmatch(string(tag), -1) {
			// attr[1] is the attribute key, attr[2] its raw value; a
			// repeated attribute overwrites the earlier one.
			switch key, raw := attr[1], attr[2]; key {
			case "name":
				attrName = raw
			case "value":
				attrValue = raw
			case "content":
				attrContent = raw
			}
		}

		// "value" takes precedence over "content".
		generator := attrValue
		if generator == "" {
			generator = attrContent
		}

		attrName = strings.Trim(attrName, `"'`)
		generator = strings.Trim(generator, `"'`)
		if attrName != "generator" || generator == "" {
			continue
		}

		for _, tool := range knownGenerators {
			if strings.Contains(generator, tool) {
				return true
			}
		}
		if orgModeMetaRegex.MatchString(generator) {
			return true
		}
	}
	return false
}
// isGeneratedJooq reports whether one of the first two lines of a .java file
// contains "This file is generated by jOOQ.".
func isGeneratedJooq(_, ext string, content []byte) bool {
	if ext != ".java" {
		return false
	}

	banner := []byte("This file is generated by jOOQ.")
	head := getLines(content, 2)
	for i := range head {
		if bytes.Contains(head[i], banner) {
			return true
		}
	}
	return false
}
// getFirstLine returns the first line of content as reported by getLines, or
// nil when getLines reports no line at all.
func getFirstLine(content []byte) []byte {
	if first := getLines(content, 1); len(first) != 0 {
		return first[0]
	}
	return nil
}
// getLines returns up to the first n lines of content, in file order. A
// negative n returns up to the last -n lines in reverse order (the final line
// comes first). Lines are delimited by '\n' and returned without it; the
// returned slices alias content rather than copying it. n == 0 and empty
// content both yield no lines.
//
// When scanning from the end, empty segments at the very tail of content are
// not reported: neither the empty remainder after a final newline nor a blank
// line immediately preceding that final newline. The backwards scan also
// stops once it reaches offset 0, so an empty first line is not reported
// either. Non-empty lines are always reported, including a one-byte final
// line that is not terminated by a newline.
func getLines(content []byte, n int) [][]byte {
	var result [][]byte

	if n < 0 {
		limit := -n
		for pos := len(content); pos > 0 && len(result) < limit; {
			nlpos := bytes.LastIndexByte(content[:pos], '\n')
			start := nlpos + 1
			// Keep every non-empty line. Empty segments are kept only when
			// they start before the last byte of content, which skips the
			// empty tail segments described above.
			if start < pos || start < len(content)-1 {
				result = append(result, content[start:pos])
			}
			pos = nlpos
		}
		return result
	}

	for pos := 0; pos < len(content) && len(result) < n; {
		end := bytes.IndexByte(content[pos:], '\n')
		if end < 0 {
			// No further newline: the rest of content is the last line.
			end = len(content)
		} else {
			end += pos
		}
		result = append(result, content[pos:end])
		pos = end + 1
	}
	return result
}
// forEachLine calls cb once for every '\n'-delimited line of content, in
// order, passing the line without its newline. The slices passed to cb alias
// content. A trailing newline does not produce a final empty line, and empty
// content produces no calls.
func forEachLine(content []byte, cb func([]byte)) {
	for rest := content; len(rest) > 0; {
		idx := bytes.IndexByte(rest, '\n')
		if idx < 0 {
			// No newline left: the remainder is the last line.
			cb(rest)
			return
		}
		cb(rest[:idx])
		rest = rest[idx+1:]
	}
}
// countAppearancesInLine returns the sum, over all targets, of the number of
// non-overlapping occurrences of the target in line (as counted by
// bytes.Count).
func countAppearancesInLine(line []byte, targets ...string) int {
	total := 0
	for i := range targets {
		total += bytes.Count(line, []byte(targets[i]))
	}
	return total
}