implement IsGenerated helper to filter out generated files

Closes #17

Implements the IsGenerated helper function to filter out generated
files using the rules and matchers in:
- https://github.com/github/linguist/blob/master/lib/linguist/generated.rb

Since the vast majority of matchers have very different logic, they cannot
be autogenerated directly from linguist like other logic in enry, so they
are translated by hand.

There are three different types of matchers in this implementation:
- By extension, which mark a file as generated based only on its extension.
  These are the fastest matchers, so they run first.
- By file name, which match patterns against the filename. These
  run second. Unlike linguist, we try to use string
  functions instead of regexps as much as possible.
- Finally, the rest of the matchers, which inspect the content and try
  to identify whether the file is generated based on it. Unlike
  linguist, we try to read only the content we need, avoid splitting it
  all into lines unless necessary, and use byte functions instead of
  regexps as much as possible.

Signed-off-by: Miguel Molina <miguel@erizocosmi.co>
This commit is contained in:
Miguel Molina
2020-05-27 15:07:57 +02:00
parent bda45fdc8e
commit 8ff885a3a8
54 changed files with 4513 additions and 98 deletions

View File

@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<head>
<meta
content = "Org mode"
name = "generator" >
</head>
</html>

View File

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta
data-foo="Bar"
http-equiv="Content-Type: text/html; charset=UTF-8"
content = "Org mode"
id="Some wicked id"
wrong-content="whoops"
name = "generator" >
</head>
</html>

View File

@ -0,0 +1,6 @@
<!DOCTYPE html>
<html>
<head>
<meta name = "generator" content = "Org mode" />
</head>
</html>

View File

@ -0,0 +1,7 @@
<!DOCTYPE html>
<html>
<meta >
<meta name="generator" content="something invalid">
<meta name="generator" content="makeinfo 4.8"
/>
</html>

View File

@ -0,0 +1,21 @@
<!-- Creator : groff version 1.22.4 -->
<!-- CreationDate: Tue Jul 2 20:06:41 2019 -->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta name="generator" content="groff -Thtml, see www.gnu.org">
<meta http-equiv="Content-Type" content="text/html; charset=US-ASCII">
<meta name="Content-Style" content="text/css">
<style type="text/css">
p { margin-top: 0; margin-bottom: 0; vertical-align: top }
pre { margin-top: 0; margin-bottom: 0; vertical-align: top }
table { margin-top: 0; margin-bottom: 0; vertical-align: top }
h1 { text-align: center }
</style>
<title></title>
</head>
<body>
... document truncated
</body>
</html>

View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="us-ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"
"http://www.w3.org/TR/MathML2/dtd/xhtml-math11-f.dtd"
[<!ENTITY mathml "http://www.w3.org/1998/Math/MathML">]>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<meta name="generator" content="groff -Txhtml, see www.gnu.org"/>
<meta http-equiv="Content-Type" content="text/html; charset=US-ASCII"/>
<meta name="Content-Style" content="text/css"/>
<style type="text/css">
.center { text-align: center }
.right { text-align: right }
p { margin-top: 0; margin-bottom: 0; vertical-align: top }
pre { margin-top: 0; margin-bottom: 0; vertical-align: top }
table { margin-top: 0; margin-bottom: 0; vertical-align: top }
h1 { text-align: center }
</style>
<!-- Creator : groff version 1.22.4 -->
<!-- CreationDate: Tue Jul 2 20:08:09 2019 -->
<title></title>
</head>
<body>
... truncated
</body>
</html>

View File

@ -0,0 +1,47 @@
<html lang="en">
<head>
<title>Asm Mode - GNU Emacs Manual</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="description" content="GNU Emacs Manual">
<meta name="generator" content="makeinfo 4.8">
<link title="Top" rel="start" href="index.html#Top">
<link rel="up" href="Programs.html#Programs" title="Programs">
<link rel="prev" href="C-Modes.html#C-Modes" title="C Modes">
<link rel="next" href="Fortran.html#Fortran" title="Fortran">
<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
<!--
This is the `GNU Emacs Manual',
updated for Emacs version {No value for `EMACSVER'}.
Copyright (C) 1985--1987, 1993--2019 Free Software Foundation, Inc.
Permission is granted to copy, distribute and/or modify this
document under the terms of the GNU Free Documentation License,
Version 1.3 or any later version published by the Free Software
Foundation; with the Invariant Sections being ``The GNU
Manifesto,'' ``Distribution'' and ``GNU GENERAL PUBLIC LICENSE,''
with the Front-Cover Texts being ``A GNU Manual,'' and with the
Back-Cover Texts as in (a) below. A copy of the license is
included in the section entitled ``GNU Free Documentation
License.''
(a) The FSF's Back-Cover Text is: ``You have the freedom to copy
and modify this GNU manual. Buying copies from the FSF supports
it in developing GNU and promoting software freedom.''
-->
<meta http-equiv="Content-Style-Type" content="text/css">
<style type="text/css"><!--
pre.display { font-family:inherit }
pre.format { font-family:inherit }
pre.smalldisplay { font-family:inherit; font-size:smaller }
pre.smallformat { font-family:inherit; font-size:smaller }
pre.smallexample { font-size:smaller }
pre.smalllisp { font-size:smaller }
span.sc { font-variant:small-caps }
span.roman { font-family:serif; font-weight:normal; }
span.sansserif { font-family:sans-serif; font-weight:normal; }
--></style>
</head>
<body>
</body>
</html>

View File

@ -0,0 +1,40 @@
<!DOCTYPE html>
<html>
<!-- This is an automatically generated file. Do not edit.
$OpenBSD: mandoc.1,v 1.161 2019/02/23 18:52:45 schwarze Exp $
Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
Copyright (c) 2012, 2014-2018 Ingo Schwarze <schwarze@openbsd.org>
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-->
<head>
<meta charset="utf-8"/>
<style>
table.head, table.foot { width: 100%; }
td.head-rtitle, td.foot-os { text-align: right; }
td.head-vol { text-align: center; }
div.Pp { margin: 1ex 0ex; }
div.Nd, div.Bf, div.Op { display: inline; }
span.Pa, span.Ad { font-style: italic; }
span.Ms { font-weight: bold; }
dl.Bl-diag > dt { font-weight: bold; }
code.Nm, code.Fl, code.Cm, code.Ic, code.In, code.Fd, code.Fn,
code.Cd { font-weight: bold; font-family: inherit; }
</style>
<title>MANDOC(1)</title>
</head>
<body>
... document truncated
</body>
</html>

View File

@ -0,0 +1,4 @@
<!DOCTYPE html>
<html>
<meta name="generator" value="Some sick tool nobody's using yet" />
</html>

View File

@ -0,0 +1,73 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<!--Converted with jLaTeX2HTML 2002-2-1 (1.70) JA patch-2.0
patched version by: Kenshi Muto, Debian Project.
* modified by: Shige TAKENO
LaTeX2HTML 2002-2-1 (1.70),
original version by: Nikos Drakos, CBLU, University of Leeds
* revised and updated by: Marcus Hennecke, Ross Moore, Herb Swan
* with significant contributions from:
Jens Lippmann, Marek Rouchal, Martin Wilck and others -->
<HTML>
<HEAD>
<TITLE>Quality/Complexity metric plugins</TITLE>
<META NAME="description" CONTENT="Quality/Complexity metric plugins">
<META NAME="keywords" CONTENT="main">
<META NAME="resource-type" CONTENT="document">
<META NAME="distribution" CONTENT="global">
<META NAME="Generator" CONTENT="jLaTeX2HTML v2002-2-1 JA patch-2.0">
<META HTTP-EQUIV="Content-Style-Type" CONTENT="text/css">
<LINK REL="STYLESHEET" HREF="main.css">
<LINK REL="next" HREF="node79.html">
<LINK REL="previous" HREF="node77.html">
<LINK REL="up" HREF="node74.html">
<LINK REL="next" HREF="node79.html">
</HEAD>
<BODY >
<!--Navigation Panel-->
<A NAME="tex2html1237"
HREF="node79.html">
<IMG WIDTH="37" HEIGHT="24" ALIGN="BOTTOM" BORDER="0" ALT="next"
SRC="file:/usr/share/latex2html/icons/next.png"></A>
<A NAME="tex2html1233"
HREF="node74.html">
<IMG WIDTH="26" HEIGHT="24" ALIGN="BOTTOM" BORDER="0" ALT="up"
SRC="file:/usr/share/latex2html/icons/up.png"></A>
<A NAME="tex2html1227"
HREF="node77.html">
<IMG WIDTH="63" HEIGHT="24" ALIGN="BOTTOM" BORDER="0" ALT="previous"
SRC="file:/usr/share/latex2html/icons/prev.png"></A>
<A NAME="tex2html1235"
HREF="node1.html">
<IMG WIDTH="65" HEIGHT="24" ALIGN="BOTTOM" BORDER="0" ALT="contents"
SRC="file:/usr/share/latex2html/icons/contents.png"></A>
<BR>
<B> Next:</B> <A NAME="tex2html1238"
HREF="node79.html">Time signal generator</A>
<B> Up:</B> <A NAME="tex2html1234"
HREF="node74.html">Additional simulator components</A>
<B> Previous:</B> <A NAME="tex2html1228"
HREF="node77.html">Weather scenario ()</A>
&nbsp; <B> <A NAME="tex2html1236"
HREF="node1.html">Contents</A></B>
<BR>
<BR>
<!--End of Navigation Panel-->
<H3><A NAME="SECTION000101400000000000000">
Quality/Complexity metric plugins</A>
</H3>
Various plugins to measure schedule quality and problem complexity to suit the application.
<P>
<BR><HR>
<ADDRESS>
Steve Fraser
2008-01-31
</ADDRESS>
</BODY>
</HTML>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<!-- 2019-06-08 Sat 04:15 -->
<meta http-equiv="Content-Type" /
content="text/html;charset=utf-8" />
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>unpackaged.el</title>
<meta name="generator" content="Org mode" />
<meta name="author" content="Adam Porter" />
<style type="text/css">
<!--/*--><![CDATA[/*><!--*/
</style>
</head>
<body>
</body>
</html>

31
_testdata/HTML/pages.html Normal file
View File

@ -0,0 +1,31 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<title>Related Pages</title>
<link href="qt.css" rel="stylesheet" type="text/css"/>
</head>
<body>
<div class=header>
<a class=headerLink href="index.html">Main Page</a> &middot;
<a class=headerLink href="classoverview.html">Class Overview</a> &middot;
<a class=headerLink href="hierarchy.html">Hierarchy</a> &middot;
<a class=headerLink href="annotated.html">All Classes</a>
</div>
<!-- Generated by Doxygen 1.8.1.2 -->
</div><!-- top -->
<div class="header">
<div class="headertitle">
<div class="title">Related Pages</div> </div>
</div><!--header-->
<div class="contents">
<div class="textblock">Here is a list of all related documentation pages:</div><div class="directory">
<table class="directory">
<tr id="row_0_" class="even"><td class="entry"><img src="ftv2node.png" alt="o" width="16" height="22" /><a class="el" href="classoverview.html" target="_self">Class Overview</a></td><td class="desc"></td></tr>
<tr id="row_1_"><td class="entry"><img src="ftv2lastnode.png" alt="\" width="16" height="22" /><a class="el" href="thelayoutsystem.html" target="_self">The Layout System</a></td><td class="desc"></td></tr>
</table>
</div><!-- directory -->
</div><!-- contents -->
<div class="footer" />Generated with <a href="http://www.doxygen.org/index.html">Doxygen</a> 1.8.1.2</div>
</body>
</html>

View File

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html>
<head>
<meta
name ="generator"
content =
"Org mode"
>
</head>
</html>

View File

@ -0,0 +1,6 @@
<!DOCTYPE html>
<html>
<head>
<meta name = generator content = makeinfo />
</head>
</html>

View File

@ -0,0 +1,6 @@
<!DOCTYPE html>
<html>
<head>
<meta name = 'generator' content = 'Org mode'>
</head>
</html>

4
_testdata/HTML/ronn.html Normal file
View File

@ -0,0 +1,4 @@
<!DOCTYPE html>
<html>
<meta name="generator" value="Ronn/v0.7.3 (http://github.com/rtomayko/ronn/tree/0.7.3)" />
</html>

View File

@ -0,0 +1,4 @@
<!DOCTYPE html>
<html>
<meta name="generator" content="Some sick tool nobody's using yet" />
</html>

View File

@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<head>
<META NAME= 'GENERATOR'
CONTENT
= 'ORG MODE'>
</head>
</html>