From 6264bfc7549d132ebca92da66d99072d8bdc4659 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 15 Aug 2024 15:39:02 -0300 Subject: [PATCH] Beginning deserialization of data --- TODO.md | 5 +- heuristics/LICENSE.txt | 22 + heuristics/heuristics.yml | 913 ++++++++++++++++++++++++++++++++++++++ src/actions.cr | 96 ++-- src/bytes_regex.cr | 6 +- src/heuristics.cr | 42 ++ src/main.cr | 13 +- src/rules.cr | 31 +- 8 files changed, 1065 insertions(+), 63 deletions(-) create mode 100644 heuristics/LICENSE.txt create mode 100644 heuristics/heuristics.yml create mode 100644 src/heuristics.cr diff --git a/TODO.md b/TODO.md index 91d427d..95e5e2b 100644 --- a/TODO.md +++ b/TODO.md @@ -9,7 +9,4 @@ * ✅ Implement lexer loader by file extension * ✅ Add --line-numbers to terminal formatter * Implement lexer loader by mime type -* ✅ Implement Delegating lexers -* ✅ Add RstLexer -* Add Mako template lexer -* Implement heuristic lexer detection +* Implement Pygment's "DelegateLexer" diff --git a/heuristics/LICENSE.txt b/heuristics/LICENSE.txt new file mode 100644 index 0000000..acc8e6f --- /dev/null +++ b/heuristics/LICENSE.txt @@ -0,0 +1,22 @@ +Copyright (c) 2017 GitHub, Inc. + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/heuristics/heuristics.yml b/heuristics/heuristics.yml new file mode 100644 index 0000000..8a6fcea --- /dev/null +++ b/heuristics/heuristics.yml @@ -0,0 +1,913 @@ +# A collection of simple regexp-based rules that can be applied to content +# to disambiguate languages with the same file extension. +# +# There are two top-level keys: disambiguations and named_patterns. +# +# disambiguations - a list of disambiguation rules, one for each +# extension or group of extensions. +# extensions - an array of file extensions that this block applies to. +# rules - list of rules that are applied in order to the content +# of a file with a matching extension. Rules are evaluated +# until one of them matches. If none matches, no language +# is returned. +# language - Language to be returned if the rule matches. +# pattern - Ruby-compatible regular expression that makes the rule +# match. If no pattern is specified, the rule always matches. +# Pattern can be a string with a single regular expression +# or an array of strings that will be merged in a single +# regular expression (with union). +# and - An and block merges multiple rules and checks that all of +# them must match. +# negative_pattern - Same as pattern, but checks for absence of matches. +# named_pattern - A pattern can be reused by specifying it in the +# named_patterns section and referencing it here by its +# key. +# named_patterns - Key-value map of reusable named patterns. +# +# Please keep this list alphabetized. +# +--- +disambiguations: +- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9'] + rules: + - language: Roff Manpage + and: + - named_pattern: mdoc-date + - named_pattern: mdoc-title + - named_pattern: mdoc-heading + - language: Roff Manpage + and: + - named_pattern: man-title + - named_pattern: man-heading + - language: Roff + pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")' +- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc'] + rules: + - language: Roff Manpage + and: + - named_pattern: mdoc-date + - named_pattern: mdoc-title + - named_pattern: mdoc-heading + - language: Roff Manpage + and: + - named_pattern: man-title + - named_pattern: man-heading + - language: Roff +- extensions: ['.al'] + rules: + # AL pattern source from https://github.com/microsoft/AL/blob/master/grammar/alsyntax.tmlanguage - keyword.other.applicationobject.al + - language: AL + and: + - pattern: '\b(?i:(CODEUNIT|PAGE|PAGEEXTENSION|PAGECUSTOMIZATION|DOTNET|ENUM|ENUMEXTENSION|VALUE|QUERY|REPORT|TABLE|TABLEEXTENSION|XMLPORT|PROFILE|CONTROLADDIN|REPORTEXTENSION|INTERFACE|PERMISSIONSET|PERMISSIONSETEXTENSION|ENTITLEMENT))\b' + # Open-ended fallback to Perl AutoLoader + - language: Perl +- extensions: ['.app'] + rules: + - language: Erlang + pattern: '^\{\s*(?:application|''application'')\s*,\s*(?:[a-z]+[\w@]*|''[^'']+'')\s*,\s*\[(?:.|[\r\n])*\]\s*\}\.[ \t]*$' +- extensions: ['.as'] + rules: + - language: ActionScript + pattern: '^\s*(?:package(?:\s+[\w.]+)?\s+(?:\{|$)|import\s+[\w.*]+\s*;|(?=.*?(?:intrinsic|extends))(intrinsic\s+)?class\s+[\w<>.]+(?:\s+extends\s+[\w<>.]+)?|(?:(?:public|protected|private|static)\s+)*(?:(?:var|const|local)\s+\w+\s*:\s*[\w<>.]+(?:\s*=.*)?\s*;|function\s+\w+\s*\((?:\s*\w+\s*:\s*[\w<>.]+\s*(,\s*\w+\s*:\s*[\w<>.]+\s*)*)?\)))' +- extensions: ['.asc'] + rules: + - language: Public Key + pattern: '^(----[- ]BEGIN|ssh-(rsa|dss)) ' + - language: AsciiDoc + pattern: '^[=-]+\s|\{\{[A-Za-z]' + - language: AGS Script + pattern: '^(\/\/.+|((import|export)\s+)?(function|int|float|char)\s+((room|repeatedly|on|game)_)?([A-Za-z]+[A-Za-z_0-9]+)\s*[;\(])' +- extensions: ['.asm'] + rules: + - language: Motorola 68K Assembly + named_pattern: m68k +- extensions: ['.asy'] + rules: + - language: LTspice Symbol + pattern: '^SymbolType[ \t]' + - language: Asymptote +- extensions: ['.bas'] + rules: + - language: FreeBasic + pattern: '^[ \t]*#(?i)(?:define|endif|endmacro|ifn?def|include|lang|macro)(?:$|\s)' + - language: BASIC + pattern: '\A\s*\d' + - language: VBA + and: + - named_pattern: vb-module + - named_pattern: vba + - language: Visual Basic 6.0 + named_pattern: vb-module +- extensions: ['.bb'] + rules: + - language: BlitzBasic + pattern: '(<^\s*; |End Function)' + - language: BitBake + pattern: '^(# |include|require|inherit)\b' + - language: Clojure + pattern: '\((def|defn|defmacro|let)\s' +- extensions: ['.bf'] + rules: + - language: Beef + pattern: '(?-m)^\s*using\s+(System|Beefy)(\.(.*))?;\s*$' + - language: HyPhy + pattern: + - '(?-m)^\s*#include\s+".*";\s*$' + - '\sfprintf\s*\(' + - language: Brainfuck + pattern: '(>\+>|>\+<)' +- extensions: ['.bi'] + rules: + - language: FreeBasic + pattern: '^[ \t]*#(?i)(?:define|endif|endmacro|ifn?def|if|include|lang|macro)(?:$|\s)' +- extensions: ['.bs'] + rules: + - language: Bikeshed + pattern: '^(?i:\r\n]*>' + - language: BrighterScript + pattern: + - (?i:^\s*(?=^sub\s)(?:sub\s*\w+\(.*?\))|(?::\s*sub\(.*?\))$) + - (?i:^\s*(end\ssub)$) + - (?i:^\s*(?=^function\s)(?:function\s*\w+\(.*?\)\s*as\s*\w*)|(?::\s*function\(.*?\)\s*as\s*\w*)$) + - (?i:^\s*(end\sfunction)$) + - language: Bluespec BH + pattern: '^package\s+[A-Za-z_][A-Za-z0-9_'']*(?:\s*\(|\s+where)' +- extensions: ['.builds'] + rules: + - language: XML + pattern: '^(\s*)(?i:\s+\{' + - language: Eiffel + pattern: + - '^\s*\w+\s*(?:,\s*\w+)*[:]\s*\w+\s' + - '^\s*\w+\s*(?:\(\s*\w+[:][^)]+\))?(?:[:]\s*\w+)?(?:--.+\s+)*\s+(?:do|local)\s' + - '^\s*(?:across|deferred|elseif|ensure|feature|from|inherit|inspect|invariant|note|once|require|undefine|variant|when)\s*$' + - language: Euphoria + named_pattern: euphoria +- extensions: ['.ecl'] + rules: + - language: ECLiPSe + pattern: '^[^#]+:-' + - language: ECL + pattern: ':=' +- extensions: ['.es'] + rules: + - language: Erlang + pattern: '^\s*(?:%%|main\s*\(.*?\)\s*->)' + - language: JavaScript + pattern: '\/\/|("|'')use strict\1|export\s+default\s|\/\*(?:.|[\r\n])*?\*\/' +- extensions: ['.ex'] + rules: + - language: Elixir + pattern: + - '^\s*@moduledoc\s' + - '^\s*(?:cond|import|quote|unless)\s' + - '^\s*def(?:exception|impl|macro|module|protocol)[(\s]' + - language: Euphoria + named_pattern: euphoria +- extensions: ['.f'] + rules: + - language: Forth + pattern: '^: ' + - language: Filebench WML + pattern: 'flowop' + - language: Fortran + named_pattern: fortran +- extensions: ['.for'] + rules: + - language: Forth + pattern: '^: ' + - language: Fortran + named_pattern: fortran +- extensions: ['.fr'] + rules: + - language: Forth + pattern: '^(: |also |new-device|previous )' + - language: Frege + pattern: '^\s*(import|module|package|data|type) ' + - language: Text +- extensions: ['.frm'] + rules: + - language: VBA + and: + - named_pattern: vb-form + - pattern: '^\s*Begin\s+\{[0-9A-Z\-]*\}\s?' + - language: Visual Basic 6.0 + and: + - named_pattern: vb-form + - pattern: '^\s*Begin\s+VB\.Form\s+' +- extensions: ['.fs'] + rules: + - language: Forth + pattern: '^(: |new-device)' + - language: 'F#' + pattern: '^\s*(#light|import|let|module|namespace|open|type)' + - language: GLSL + pattern: '^\s*(#version|precision|uniform|varying|vec[234])' + - language: Filterscript + pattern: '#include|#pragma\s+(rs|version)|__attribute__' +- extensions: ['.ftl'] + rules: + - language: FreeMarker + pattern: '^(?:<|[a-zA-Z-][a-zA-Z0-9_-]+[ \t]+\w)|\$\{\w+[^\r\n]*?\}|^[ \t]*(?:<#--.*?-->|<#([a-z]+)(?=\s|>)[^>]*>.*?|\[#--.*?--\]|\[#([a-z]+)(?=\s|\])[^\]]*\].*?\[#\2\])' + - language: Fluent + pattern: '^-?[a-zA-Z][a-zA-Z0-9_-]* *=|\{\$-?[a-zA-Z][-\w]*(?:\.[a-zA-Z][-\w]*)?\}' +- extensions: ['.g'] + rules: + - language: GAP + pattern: '\s*(Declare|BindGlobal|KeyDependentOperation|Install(Method|GlobalFunction)|SetPackageInfo)' + - language: G-code + pattern: '^[MG][0-9]+(?:\r?\n|\r)' +- extensions: ['.gd'] + rules: + - language: GAP + pattern: '\s*(Declare|BindGlobal|KeyDependentOperation)' + - language: GDScript + pattern: '\s*(extends|var|const|enum|func|class|signal|tool|yield|assert|onready)' +- extensions: ['.gml'] + rules: + - language: XML + pattern: '(?i:^\s*(<\?xml|xmlns))' + - language: Graph Modeling Language + pattern: '(?i:^\s*(graph|node)\s+\[$)' + - language: Gerber Image + pattern: '^[DGMT][0-9]{2}\*$' + - language: Game Maker Language +- extensions: ['.gs'] + rules: + - language: GLSL + pattern: '^#version\s+[0-9]+\b' + - language: Gosu + pattern: '^uses (java|gw)\.' + - language: Genie + pattern: '^\[indent=[0-9]+\]' +- extensions: ['.gsc'] + rules: + - language: GSC + named_pattern: gsc +- extensions: ['.gsh'] + rules: + - language: GSC + named_pattern: gsc +- extensions: ['.gts'] + rules: + - language: Gerber Image + pattern: '^G0.' + - language: Glimmer TS + negative_pattern: '^G0.' +- extensions: ['.h'] + rules: + - language: Objective-C + named_pattern: objectivec + - language: C++ + named_pattern: cpp + - language: C +- extensions: ['.hh'] + rules: + - language: Hack + pattern: '<\?hh' +- extensions: ['.html'] + rules: + - language: Ecmarkup + pattern: ')' + - language: HTML +- extensions: ['.i'] + rules: + - language: Motorola 68K Assembly + named_pattern: m68k + - language: SWIG + pattern: '^[ \t]*%[a-z_]+\b|^%[{}]$' +- extensions: ['.ice'] + rules: + - language: JSON + pattern: '\A\s*[{\[]' + - language: Slice +- extensions: ['.inc'] + rules: + - language: Motorola 68K Assembly + named_pattern: m68k + - language: PHP + pattern: '^<\?(?:php)?' + - language: SourcePawn + pattern: + - '^public\s+(?:SharedPlugin(?:\s+|:)__pl_\w+\s*=(?:\s*\{)?|(?:void\s+)?__pl_\w+_SetNTVOptional\(\)(?:\s*\{)?)' + - '^methodmap\s+\w+\s+<\s+\w+' + - '^\s*MarkNativeAsOptional\s*\(' + - language: NASL + pattern: + - '^\s*include\s*\(\s*(?:"|'')[\\/\w\-\.:\s]+\.(?:nasl|inc)\s*(?:"|'')\s*\)\s*;' + - '^\s*(?:global|local)_var\s+(?:\w+(?:\s*=\s*[\w\-"'']+)?\s*)(?:,\s*\w+(?:\s*=\s*[\w\-"'']+)?\s*)*+\s*;' + - '^\s*namespace\s+\w+\s*\{' + - '^\s*object\s+\w+\s*(?:extends\s+\w+(?:::\w+)?)?\s*\{' + - '^\s*(?:public\s+|private\s+|\s*)function\s+\w+\s*\([\w\s,]*\)\s*\{' + - language: POV-Ray SDL + pattern: '^\s*#(declare|local|macro|while)\s' + - language: Pascal + pattern: + - '(?i:^\s*\{\$(?:mode|ifdef|undef|define)[ ]+[a-z0-9_]+\})' + - '^\s*end[.;]\s*$' + - language: BitBake + pattern: '^inherit(\s+[\w.-]+)+\s*$' +- extensions: ['.json'] + rules: + - language: OASv2-json + pattern: '"swagger":\s?"2.[0-9.]+"' + - language: OASv3-json + pattern: '"openapi":\s?"3.[0-9.]+"' + - language: JSON +- extensions: ['.l'] + rules: + - language: Common Lisp + pattern: '\(def(un|macro)\s' + - language: Lex + pattern: '^(%[%{}]xs|<.*>)' + - language: Roff + pattern: '^\.[A-Za-z]{2}(\s|$)' + - language: PicoLisp + pattern: '^\((de|class|rel|code|data|must)\s' +- extensions: ['.lean'] + rules: + - language: Lean + pattern: '^import [a-z]' + - language: Lean 4 + pattern: '^import [A-Z]' +- extensions: ['.ls'] + rules: + - language: LoomScript + pattern: '^\s*package\s*[\w\.\/\*\s]*\s*\{' + - language: LiveScript +- extensions: ['.lsp', '.lisp'] + rules: + - language: Common Lisp + pattern: '^\s*\((?i:defun|in-package|defpackage) ' + - language: NewLisp + pattern: '^\s*\(define ' +- extensions: ['.m'] + rules: + - language: Objective-C + named_pattern: objectivec + - language: Mercury + pattern: ':- module' + - language: MUF + pattern: '^: ' + - language: M + pattern: '^\s*;' + - language: Mathematica + and: + - pattern: '\(\*' + - pattern: '\*\)$' + - language: MATLAB + pattern: '^\s*%' + - language: Limbo + pattern: '^\w+\s*:\s*module\s*\{' +- extensions: ['.m4'] + rules: + - language: M4Sugar + pattern: + - 'AC_DEFUN|AC_PREREQ|AC_INIT' + - '^_?m4_' + - language: 'M4' +- extensions: ['.mask'] + rules: + - language: Unity3D Asset + pattern: 'tag:unity3d.com' +- extensions: ['.mc'] + rules: + - language: Win32 Message File + pattern: '(?i)^[ \t]*(?>\/\*\s*)?MessageId=|^\.$' + - language: M4 + pattern: '^dnl|^divert\((?:-?\d+)?\)|^\w+\(`[^\r\n]*?''[),]' + - language: Monkey C + pattern: '\b(?:using|module|function|class|var)\s+\w' +- extensions: ['.md'] + rules: + - language: Markdown + pattern: + - '(^[-A-Za-z0-9=#!\*\[|>])|<\/' + - '\A\z' + - language: GCC Machine Description + pattern: '^(;;|\(define_)' + - language: Markdown +- extensions: ['.ml'] + rules: + - language: OCaml + pattern: '(^\s*module)|let rec |match\s+(\S+\s)+with' + - language: Standard ML + pattern: '=> |case\s+(\S+\s)+of' +- extensions: ['.mod'] + rules: + - language: XML + pattern: '\s*$)' + - language: OpenStep Property List +- extensions: ['.plt'] + rules: + - language: Prolog + pattern: '^\s*:-' +- extensions: ['.pm'] + rules: + - language: Perl + and: + - negative_pattern: '^\s*use\s+v6\b' + - named_pattern: perl + - language: Raku + named_pattern: raku + - language: X PixMap + pattern: '^\s*\/\* XPM \*\/' +- extensions: ['.pod'] + rules: + - language: Pod 6 + pattern: '^[\s&&[^\r\n]]*=(comment|begin pod|begin para|item\d+)' + - language: Pod +- extensions: ['.pp'] + rules: + - language: Pascal + pattern: '^\s*end[.;]' + - language: Puppet + pattern: '^\s+\w+\s+=>\s' +- extensions: ['.pro'] + rules: + - language: Proguard + pattern: '^-(include\b.*\.pro$|keep\b|keepclassmembers\b|keepattributes\b)' + - language: Prolog + pattern: '^[^\[#]+:-' + - language: INI + pattern: 'last_client=' + - language: QMake + and: + - pattern: HEADERS + - pattern: SOURCES + - language: IDL + pattern: '^\s*(?i:function|pro|compile_opt) \w[ \w,:]*$' +- extensions: ['.properties'] + rules: + - language: INI + and: + - named_pattern: key_equals_value + - pattern: '^[;\[]' + - language: Java Properties + and: + - named_pattern: key_equals_value + - pattern: '^[#!]' + - language: INI + named_pattern: key_equals_value + - language: Java Properties + pattern: '^[^#!][^:]*:' +- extensions: ['.q'] + rules: + - language: q + pattern: '((?i:[A-Z.][\w.]*:\{)|^\\(cd?|d|l|p|ts?) )' + - language: HiveQL + pattern: '(?i:SELECT\s+[\w*,]+\s+FROM|(CREATE|ALTER|DROP)\s(DATABASE|SCHEMA|TABLE))' +- extensions: ['.qs'] + rules: + - language: Q# + pattern: '^((\/{2,3})?\s*(namespace|operation)\b)' + - language: Qt Script + pattern: '(\w+\.prototype\.\w+|===|\bvar\b)' +- extensions: ['.r'] + rules: + - language: Rebol + pattern: '(?i:\bRebol\b)' + - language: Rez + pattern: '(#include\s+["<](Types\.r|Carbon\/Carbon\.r)[">])|((resource|data|type)\s+''[A-Za-z0-9]{4}''\s+((\(.*\)\s+){0,1}){)' + - language: R + pattern: '<-|^\s*#' +- extensions: ['.re'] + rules: + - language: Reason + pattern: + - '^\s*module\s+type\s' + - '^\s*(?:include|open)\s+\w+\s*;\s*$' + - '^\s*let\s+(?:module\s\w+\s*=\s*\{|\w+:\s+.*=.*;\s*$)' + - language: C++ + pattern: + - '^\s*#(?:(?:if|ifdef|define|pragma)\s+\w|\s*include\s+<[^>]+>)' + - '^\s*template\s*<' +- extensions: ['.res'] + rules: + - language: ReScript + pattern: + - '^\s*(let|module|type)\s+\w*\s+=\s+' + - '^\s*(?:include|open)\s+\w+\s*$' +- extensions: ['.rno'] + rules: + - language: RUNOFF + pattern: '(?i:^\.!|^\f|\f$|^\.end lit(?:eral)?\b|^\.[a-zA-Z].*?;\.[a-zA-Z](?:[; \t])|\^\*[^\s*][^*]*\\\*(?=$|\s)|^\.c;[ \t]*\w+)' + - language: Roff + pattern: '^\.\\" ' +- extensions: ['.rpy'] + rules: + - language: Python + pattern: '^(import|from|class|def)\s' + - language: "Ren'Py" +- extensions: ['.rs'] + rules: + - language: Rust + pattern: '^(use |fn |mod |pub |macro_rules|impl|#!?\[)' + - language: RenderScript + pattern: '#include|#pragma\s+(rs|version)|__attribute__' + - language: XML + pattern: '^\s*<\?xml' +- extensions: ['.s'] + rules: + - language: Motorola 68K Assembly + named_pattern: m68k +- extensions: ['.sc'] + rules: + - language: SuperCollider + pattern: '(?i:\^(this|super)\.|^\s*~\w+\s*=\.)' + - language: Scala + pattern: '(^\s*import (scala|java)\.|^\s*class\b)' +- extensions: ['.scd'] + rules: + - language: SuperCollider + pattern: '(?i:\^(this|super)\.|^\s*(~\w+\s*=\.|SynthDef\b))' + - language: Markdown + # Markdown syntax for scdoc + pattern: '^#+\s+(NAME|SYNOPSIS|DESCRIPTION)' +- extensions: ['.sol'] + rules: + - language: Solidity + pattern: '\bpragma\s+solidity\b|\b(?:abstract\s+)?contract\s+(?!\d)[a-zA-Z0-9$_]+(?:\s+is\s+(?:[a-zA-Z0-9$_][^\{]*?)?)?\s*\{' + - language: Gerber Image + pattern: '^[DGMT][0-9]{2}\*(?:\r?\n|\r)' +- extensions: ['.sql'] + rules: + # Postgres + - language: PLpgSQL + pattern: '(?i:^\\i\b|AS\s+\$\$|LANGUAGE\s+''?plpgsql''?|BEGIN(\s+WORK)?\s*;)' + # IBM db2 + - language: SQLPL + pattern: '(?i:ALTER\s+MODULE|MODE\s+DB2SQL|\bSYS(CAT|PROC)\.|ASSOCIATE\s+RESULT\s+SET|\bEND!\s*$)' + # Oracle + - language: PLSQL + pattern: '(?i:\$\$PLSQL_|XMLTYPE|systimestamp|\.nextval|CONNECT\s+BY|AUTHID\s+(DEFINER|CURRENT_USER)|constructor\W+function)' + # T-SQL + - language: TSQL + pattern: '(?i:^\s*GO\b|BEGIN(\s+TRY|\s+CATCH)|OUTPUT\s+INSERTED|DECLARE\s+@|\[dbo\])' + - language: SQL +- extensions: ['.srt'] + rules: + - language: SubRip Text + pattern: '^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$' +- extensions: ['.st'] + rules: + - language: StringTemplate + pattern: '\$\w+[($]|(.)!\s*.+?\s*!\1||\[!\s*.+?\s*!\]|\{!\s*.+?\s*!\}' + - language: Smalltalk + pattern: '\A\s*[\[{(^"''\w#]|[a-zA-Z_]\w*\s*:=\s*[a-zA-Z_]\w*|class\s*>>\s*[a-zA-Z_]\w*|^[a-zA-Z_]\w*\s+[a-zA-Z_]\w*:|^Class\s*\{|if(?:True|False):\s*\[' +- extensions: ['.star'] + rules: + - language: STAR + pattern: '^loop_\s*$' + - language: Starlark +- extensions: ['.stl'] + rules: + - language: STL + pattern: '\A\s*solid(?:$|\s)[\s\S]*^endsolid(?:$|\s)' +- extensions: ['.sw'] + rules: + - language: Sway + pattern: '^\s*(?:(?:abi|dep|fn|impl|mod|pub|trait)\s|#\[)' + - language: XML + pattern: '^\s*<\?xml\s+version' +- extensions: ['.t'] + rules: + - language: Perl + and: + - negative_pattern: '^\s*use\s+v6\b' + - named_pattern: perl + - language: Raku + pattern: '^\s*(?:use\s+v6\b|\bmodule\b|\bmy\s+class\b)' + - language: Turing + pattern: '^\s*%[ \t]+|^\s*var\s+\w+(\s*:\s*\w+)?\s*:=\s*\w+' +- extensions: ['.tag'] + rules: + - language: Java Server Pages + pattern: '<%[@!=\s]?\s*(taglib|tag|include|attribute|variable)\s' +- extensions: ['.tlv'] + rules: + - language: TL-Verilog + pattern: '^\\.{0,10}TLV_version' +- extensions: ['.toc'] + rules: + - language: World of Warcraft Addon Data + pattern: '^## |@no-lib-strip@' + - language: TeX + pattern: '^\\(contentsline|defcounter|beamer|boolfalse)' +- extensions: ['.ts'] + rules: + - language: XML + pattern: ' ' + # Heads up - we don't usually write heuristics like this (with no regex match) + - language: Scilab +- extensions: ['.tsx'] + rules: + - language: TSX + pattern: '^\s*(import.+(from\s+|require\()[''"]react|\/\/\/\s*]?[0-9]+|m)?|[ \t]ex)(?=:(?=[ \t]*set?[ \t][^\r\n:]+:)|:(?![ \t]*set?[ \t]))(?:(?:[ \t]*:[ \t]*|[ \t])\w*(?:[ \t]*=(?:[^\\\s]|\\.)*)?)*[ \t:](?:filetype|ft|syntax)[ \t]*=(help)(?=$|\s|:)' + - language: Adblock Filter List + pattern: |- + (?x)\A + \[ + (? + (?: + [Aa]d[Bb]lock + (?:[ \t][Pp]lus)? + | + u[Bb]lock + (?:[ \t][Oo]rigin)? + | + [Aa]d[Gg]uard + ) + (?:[ \t] \d+(?:\.\d+)*+)? + ) + (?: + [ \t]?;[ \t]? + \g + )*+ + \] + # HACK: This is a contrived use of heuristics needed to address + # an unusual edge-case. See https://git.io/JULye for discussion. + - language: Text +- extensions: ['.typ'] + rules: + - language: Typst + pattern: '^#(import|show|let|set)' + - language: XML +- extensions: ['.url'] + rules: + - language: INI + pattern: '^\[InternetShortcut\](?:\r?\n|\r)(?>[^\s\[][^\r\n]*(?:\r?\n|\r))*URL=' +- extensions: ['.v'] + rules: + - language: Coq + pattern: '(?:^|\s)(?:Proof|Qed)\.(?:$|\s)|(?:^|\s)Require[ \t]+(Import|Export)\s' + - language: Verilog + pattern: '^[ \t]*module\s+[^\s()]+\s+\#?\(|^[ \t]*`(?:define|ifdef|ifndef|include|timescale)|^[ \t]*always[ \t]+@|^[ \t]*initial[ \t]+(begin|@)' + - language: V + pattern: '\$(?:if|else)[ \t]|^[ \t]*fn\s+[^\s()]+\(.*?\).*?\{|^[ \t]*for\s*\{' +- extensions: ['.vba'] + rules: + - language: Vim Script + pattern: '^UseVimball' + - language: VBA +- extensions: ['.w'] + rules: + - language: OpenEdge ABL + pattern: '&ANALYZE-SUSPEND _UIB-CODE-BLOCK _CUSTOM _DEFINITIONS' + - language: CWeb + pattern: '^@(<|\w+\.)' +- extensions: ['.x'] + rules: + - language: DirectX 3D File + pattern: '^xof 030(2|3)(?:txt|bin|tzip|bzip)\b' + - language: RPC + pattern: '\b(program|version)\s+\w+\s*\{|\bunion\s+\w+\s+switch\s*\(' + - language: Logos + pattern: '^%(end|ctor|hook|group)\b' + - language: Linker Script + pattern: 'OUTPUT_ARCH\(|OUTPUT_FORMAT\(|SECTIONS' +- extensions: ['.yaml', '.yml'] + rules: + - language: MiniYAML + pattern: '^\t+.*?[^\s:].*?:' + negative_pattern: '---' + - language: OASv2-yaml + pattern: 'swagger:\s?''?"?2.[0-9.]+''?"?' + - language: OASv3-yaml + pattern: 'openapi:\s?''?"?3.[0-9.]+''?"?' + - language: YAML +- extensions: ['.yy'] + rules: + - language: JSON + pattern: '\"modelName\"\:\s*\"GM' + - language: Yacc +named_patterns: + cpp: + - '^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>' + - '^\s*template\s*<' + - '^[ \t]*(try|constexpr)' + - '^[ \t]*catch\s*\(' + - '^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+' + - '^[ \t]*(private|public|protected):$' + - '__has_cpp_attribute|__cplusplus >' + - 'std::\w+' + euphoria: + - '^\s*namespace\s' + - '^\s*(?:public\s+)?include\s' + - '^\s*(?:(?:public|export|global)\s+)?(?:atom|constant|enum|function|integer|object|procedure|sequence|type)\s' + fortran: '^(?i:[c*][^abd-z]| (subroutine|program|end|data)\s|\s*!)' + gsc: + - '^\s*#\s*(?:using|insert|include|define|namespace)[ \t]+\w' + - '^\s*(?>(?:autoexec|private)\s+){0,2}function\s+(?>(?:autoexec|private)\s+){0,2}\w+\s*\(' + - '\b(?:level|self)[ \t]+thread[ \t]+(?:\[\[[ \t]*(?>\w+\.)*\w+[ \t]*\]\]|\w+)[ \t]*\([^\r\n\)]*\)[ \t]*;' + - '^[ \t]*#[ \t]*(?:precache|using_animtree)[ \t]*\(' + key_equals_value: '^[^#!;][^=]*=' + m68k: + - '(?im)\bmoveq(?:\.l)?\s+#(?:\$-?[0-9a-f]{1,3}|%[0-1]{1,8}|-?[0-9]{1,3}),\s*d[0-7]\b' + - '(?im)^\s*move(?:\.[bwl])?\s+(?:sr|usp),\s*[^\s]+' + - '(?im)^\s*move\.[bwl]\s+.*\b[ad]\d' + - '(?im)^\s*movem\.[bwl]\b' + - '(?im)^\s*move[mp](?:\.[wl])?\b' + - '(?im)^\s*btst\b' + - '(?im)^\s*dbra\b' + man-heading: '^[.''][ \t]*SH +(?:[^"\s]+|"[^"\s]+)' + man-title: '^[.''][ \t]*TH +(?:[^"\s]+|"[^"]+") +"?(?:[1-9]|@[^\s@]+@)' + mdoc-date: '^[.''][ \t]*Dd +(?:[^"\s]+|"[^"]+")' + mdoc-heading: '^[.''][ \t]*Sh +(?:[^"\s]|"[^"]+")' + mdoc-title: '^[.''][ \t]*Dt +(?:[^"\s]+|"[^"]+") +"?(?:[1-9]|@[^\s@]+@)' + objectivec: '^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])' + perl: + - '\buse\s+(?:strict\b|v?5\b)' + - '^\s*use\s+(?:constant|overload)\b' + - '^\s*(?:\*|(?:our\s*)?@)EXPORT\s*=' + - '^\s*package\s+[^\W\d]\w*(?:::\w+)*\s*(?:[;{]|\sv?\d)' + - '[\s$][^\W\d]\w*(?::\w+)*->[a-zA-Z_\[({]' + raku: '^\s*(?:use\s+v6\b|\bmodule\b|\b(?:my\s+)?class\b)' + vb-class: '^[ ]*VERSION [0-9]\.[0-9] CLASS' + vb-form: '^[ ]*VERSION [0-9]\.[0-9]{2}' + vb-module: '^[ ]*Attribute VB_Name = ' + vba: + - '\b(?:VBA|[vV]ba)(?:\b|[0-9A-Z_])' + # VBA7 new 64-bit features + - '^[ ]*(?:Public|Private)? Declare PtrSafe (?:Sub|Function)\b' + - '^[ ]*#If Win64\b' + - '^[ ]*(?:Dim|Const) [0-9a-zA-Z_]*[ ]*As Long(?:Ptr|Long)\b' + # Top module declarations unique to VBA + - '^[ ]*Option (?:Private Module|Compare (?:Database|Text|Binary))\b' + # General VBA libraries and objects + - '(?: |\()(?:Access|Excel|Outlook|PowerPoint|Visio|Word|VBIDE)\.\w' + - '\b(?:(?:Active)?VBProjects?|VBComponents?|Application\.(?:VBE|ScreenUpdating))\b' + # AutoCAD, Outlook, PowerPoint and Word objects + - '\b(?:ThisDrawing|AcadObject|Active(?:Explorer|Inspector|Window\.Presentation|Presentation|Document)|Selection\.(?:Find|Paragraphs))\b' + # Excel objects + - '\b(?:(?:This|Active)?Workbooks?|Worksheets?|Active(?:Sheet|Chart|Cell)|WorksheetFunction)\b' + - '\b(?:Range\(".*|Cells\([0-9a-zA-Z_]*, (?:[0-9a-zA-Z_]*|"[a-zA-Z]{1,3}"))\)' diff --git a/src/actions.cr b/src/actions.cr index da514a5..33fc4f8 100644 --- a/src/actions.cr +++ b/src/actions.cr @@ -23,17 +23,14 @@ module Tartrazine struct Action property actions : Array(Action) = [] of Action - @content_index : Array(Int32) = [] of Int32 - @depth : Int32 = 0 - @lexer_index : Int32 = 0 - @lexer_name : String = "" - @states : Array(String) = [] of String - @states_to_push : Array(String) = [] of String - @token_type : String = "" - @type : ActionType = ActionType::Token + property token_type : String = "" + property states_to_push : Array(String) = [] of String + property depth = 0 + property lexer_name : String = "" + property states_to_combine : Array(String) = [] of String - def initialize(t : String, xml : XML::Node?) - @type = ActionType.parse(t.capitalize) + def initialize(@type : String, @xml : XML::Node?) + # Extract information from the XML node we will use later # Some actions may have actions in them, like this: # @@ -44,30 +41,31 @@ module Tartrazine # # The token actions match with the first 2 groups in the regex # the using action matches the 3rd and shunts it to another lexer - xml.children.each do |node| + + known_types = %w(token push pop bygroups using usingself include combined) + raise Exception.new( + "Unknown action type: #{@type}") unless known_types.includes? @type + + @xml.children.each do |node| next unless node.element? @actions << Action.new(node.name, node) end - # Prefetch the attributes we ned from the XML and keep them case @type - when ActionType::Token - @token_type = xml["type"] - when ActionType::Push + when "token" + @token_type = xml["type"]? || "" + when "push" @states_to_push = xml.attributes.select { |attrib| attrib.name == "state" - }.map &.content - when ActionType::Pop - @depth = xml["depth"].to_i - when ActionType::Using - @lexer_name = xml["lexer"].downcase - when ActionType::Combined - @states = xml.attributes.select { |attrib| + }.map &.content || [] of String + when "pop" + @depth = xml["depth"]?.try &.to_i || 0 + when "using" + @lexer_name = xml["lexer"]?.try &.downcase || "" + when "combined" + @states_to_combine = xml.attributes.select { |attrib| attrib.name == "state" }.map &.content - when ActionType::Usingbygroup - @lexer_index = xml["lexer"].to_i - @content_index = xml["content"].split(",").map(&.to_i) end end @@ -77,21 +75,25 @@ module Tartrazine when ActionType::Token raise Exception.new "Can't have a token without a match" if match.empty? [Token.new(type: @token_type, value: String.new(match[match_group].value))] - when ActionType::Push - to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push - to_push.each do |state| - if state == "#pop" && tokenizer.state_stack.size > 1 + when "push" + if @states_to_push.empty? + # Push without a state means push the current state + @states_to_push = [lexer.state_stack.last] + end + @states_to_push.each do |state| + if state == "#pop" # Pop the state - tokenizer.state_stack.pop + lexer.state_stack.pop else # Really push - tokenizer.state_stack << state + lexer.state_stack << state end end [] of Token - when ActionType::Pop - to_pop = [@depth, tokenizer.state_stack.size - 1].min - tokenizer.state_stack.pop(to_pop) + when "pop" + if lexer.state_stack.size > @depth + lexer.state_stack.pop(@depth) + end [] of Token when ActionType::Bygroups # FIXME: handle @@ -102,7 +104,7 @@ module Tartrazine # # where that None means skipping a group # - raise Exception.new "Can't have a token without a match" if match.nil? + raise Exception.new "Can't have a token without a match" if match.empty? # Each group matches an action. If the group match is empty, # the action is skipped. @@ -111,8 +113,7 @@ module Tartrazine begin next if match[i + 1].size == 0 rescue IndexError - # FIXME: This should not actually happen - # No match for this group + # No match for the last group next end result += e.emit(match, tokenizer, i + 1) @@ -121,19 +122,16 @@ module Tartrazine when ActionType::Using # Shunt to another lexer entirely return [] of Token if match.empty? - Tartrazine.lexer(@lexer_name).tokenizer( - String.new(match[match_group].value), - secondary: true).to_a - when ActionType::Usingself + Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true) + when "usingself" # Shunt to another copy of this lexer return [] of Token if match.empty? - tokenizer.lexer.tokenizer( - String.new(match[match_group].value), - secondary: true).to_a - when ActionType::Combined - # Combine two or more states into one anonymous state - new_state = @states.map { |name| - tokenizer.lexer.states[name] + new_lexer = Lexer.from_xml(lexer.xml) + new_lexer.tokenize(String.new(match[match_group].value), usingself: true) + when "combined" + # Combine two states into one anonymous state + new_state = @states_to_combine.map { |name| + lexer.states[name] }.reduce { |state1, state2| state1 + state2 } @@ -151,7 +149,7 @@ module Tartrazine content, secondary: true).to_a else - raise Exception.new("Unknown action type: #{@type}") + raise Exception.new("Unhandled action type: #{type}") end end end diff --git a/src/bytes_regex.cr b/src/bytes_regex.cr index c73f694..6e277a8 100644 --- a/src/bytes_regex.cr +++ b/src/bytes_regex.cr @@ -31,6 +31,7 @@ module BytesRegex end def match(str : Bytes, pos = 0) : Array(Match) + match = [] of Match rc = LibPCRE2.match( @re, str, @@ -39,9 +40,9 @@ module BytesRegex LibPCRE2::NO_UTF_CHECK, @match_data, nil) - if rc > 0 + if rc >= 0 ovector = LibPCRE2.get_ovector_pointer(@match_data) - (0...rc).map do |i| + (0...rc).each do |i| m_start = ovector[2 * i] m_end = ovector[2 * i + 1] if m_start == m_end @@ -54,6 +55,7 @@ module BytesRegex else [] of Match end + match end end diff --git a/src/heuristics.cr b/src/heuristics.cr new file mode 100644 index 0000000..2a06f3e --- /dev/null +++ b/src/heuristics.cr @@ -0,0 +1,42 @@ +require "yaml" + +module Tartrazine + # Use linguist's heuristics to disambiguate between languages + + class Heuristic + include YAML::Serializable + + property disambiguations : Array(Disambiguation) + property named_patterns : Hash(String, String | Array(String)) + + # Run the heuristics on the given filename and content + def run(filename, content) + ext = File.extname filename + disambiguation = disambiguations.find do |item| + item.extensions.includes? ext + end + p! disambiguation + end + end + + class Disambiguation + include YAML::Serializable + property extensions : Array(String) + property rules : Array(LangRule) + end + + class Rule + include YAML::Serializable + property pattern : (String | Array(String))? + property named_pattern : String? + property and : Array(Rule)? + end + + class LangRule < Rule + include YAML::Serializable + property language : String | Array(String) + end +end + +h = Tartrazine::Heuristic.from_yaml(File.read("heuristics/heuristics.yml")) +h.run("../elkjs/src/elk.h", File.read("../elkjs/src/elk.h")) diff --git a/src/main.cr b/src/main.cr index 8b92e58..357a860 100644 --- a/src/main.cr +++ b/src/main.cr @@ -1,6 +1,18 @@ require "docopt" require "./**" +# Performance data (in milliseconds): +# +# Docopt parsing: 0.5 +# Instantiating a theme: 0.1 +# Instantiating a formatter: 1.0 +# Instantiating a lexer: 2.0 +# Tokenizing crycco.cr: 16.0 +# Formatting: 0.5 +# I/O: 1.5 +# --------------------------------- +# Total: 21.6 + HELP = <<-HELP tartrazine: a syntax highlighting tool @@ -84,7 +96,6 @@ if options["-f"] end lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String)) - input = File.open(options["FILE"].as(String)).gets_to_end if options["-o"].nil? diff --git a/src/rules.cr b/src/rules.cr index 6eaa8d7..37000db 100644 --- a/src/rules.cr +++ b/src/rules.cr @@ -15,11 +15,28 @@ module Tartrazine alias Match = BytesRegex::Match alias MatchData = Array(Match) - abstract struct BaseRule - abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token)) - abstract def initialize(node : XML::Node) + class Rule + property pattern : Regex = Regex.new "" + property actions : Array(Action) = [] of Action - @actions : Array(Action) = [] of Action + def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token)) + match = pattern.match(text, pos) + # We don't match if the match doesn't move the cursor + # because that causes infinite loops + return false, pos, [] of Token if match.empty? || match[0].size == 0 + tokens = [] of Token + actions.each do |action| + tokens += action.emit(match, lexer) + end + return true, pos + match[0].size, tokens + end + + def initialize(node : XML::Node, multiline, dotall, ignorecase) + pattern = node["pattern"] + pattern = "(?m)" + pattern if multiline + @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true) + add_actions(node) + end def add_actions(node : XML::Node) node.children.each do |child| @@ -56,9 +73,9 @@ module Tartrazine struct IncludeStateRule < BaseRule @state : String = "" - def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token)) - tokenizer.@lexer.states[@state].rules.each do |rule| - matched, new_pos, new_tokens = rule.match(text, pos, tokenizer) + def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) + lexer.states[state].rules.each do |rule| + matched, new_pos, new_tokens = rule.match(text, pos, lexer) return true, new_pos, new_tokens if matched end return false, pos, [] of Token