diff --git a/heuristics/LICENSE.txt b/lexers/LICENSE-heuristics
similarity index 100%
rename from heuristics/LICENSE.txt
rename to lexers/LICENSE-heuristics
diff --git a/lexers/groff.xml b/lexers/groff.xml
index 3af0a43..f647b21 100644
--- a/lexers/groff.xml
+++ b/lexers/groff.xml
@@ -3,6 +3,7 @@
Groff
groff
nroff
+ roff
man
*.[1-9]
*.1p
@@ -87,4 +88,4 @@
-
\ No newline at end of file
+
diff --git a/heuristics/heuristics.yml b/lexers/heuristics.yml
similarity index 99%
rename from heuristics/heuristics.yml
rename to lexers/heuristics.yml
index 8a6fcea..1c74ed8 100644
--- a/heuristics/heuristics.yml
+++ b/lexers/heuristics.yml
@@ -30,12 +30,12 @@
disambiguations:
- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']
rules:
- - language: Roff Manpage
+ - language: man
and:
- named_pattern: mdoc-date
- named_pattern: mdoc-title
- named_pattern: mdoc-heading
- - language: Roff Manpage
+ - language: man
and:
- named_pattern: man-title
- named_pattern: man-heading
@@ -43,12 +43,12 @@ disambiguations:
pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")'
- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc']
rules:
- - language: Roff Manpage
+ - language: man
and:
- named_pattern: mdoc-date
- named_pattern: mdoc-title
- named_pattern: mdoc-heading
- - language: Roff Manpage
+ - language: man
and:
- named_pattern: man-title
- named_pattern: man-heading
diff --git a/scripts/lexer_metadata.py b/scripts/lexer_metadata.py
index 29d5f9d..9acbb48 100644
--- a/scripts/lexer_metadata.py
+++ b/scripts/lexer_metadata.py
@@ -52,6 +52,6 @@ with open("src/constants/lexers.cr", "w") as f:
f.write(" LEXERS_BY_FILENAME = {\n")
for k in sorted(lexer_by_filename.keys()):
v = lexer_by_filename[k]
- f.write(f'"{k}" => {str(list(v)).replace("'", "\"")}, \n')
+ f.write(f'"{k}" => {str(sorted(list(v))).replace("'", "\"")}, \n')
f.write("}\n")
f.write("end\n")
diff --git a/src/constants/lexers.cr b/src/constants/lexers.cr
index 1f8a368..0d2a416 100644
--- a/src/constants/lexers.cr
+++ b/src/constants/lexers.cr
@@ -328,6 +328,7 @@ module Tartrazine
"restructuredtext" => "rst",
"rexx" => "rexx",
"rkt" => "racket",
+ "roff" => "groff",
"rpmspec" => "rpm_spec",
"rs" => "rust",
"rst" => "rst",
@@ -730,8 +731,8 @@ module Tartrazine
"*.applescript" => ["applescript"],
"*.aql" => ["arangodb_aql"],
"*.arexx" => ["rexx"],
- "*.as" => ["actionscript_3", "actionscript"],
- "*.asm" => ["tasm", "nasm", "z80_assembly"],
+ "*.as" => ["actionscript", "actionscript_3"],
+ "*.asm" => ["nasm", "tasm", "z80_assembly"],
"*.au3" => ["autoit"],
"*.automount" => ["systemd"],
"*.aux" => ["tex"],
@@ -739,7 +740,7 @@ module Tartrazine
"*.awk" => ["awk"],
"*.b" => ["brainfuck"],
"*.bal" => ["ballerina"],
- "*.bas" => ["vb_net", "qbasic"],
+ "*.bas" => ["qbasic", "vb_net"],
"*.bash" => ["bash"],
"*.bat" => ["batchfile"],
"*.batch" => ["psl"],
@@ -750,7 +751,7 @@ module Tartrazine
"*.bnf" => ["bnf"],
"*.bqn" => ["bqn"],
"*.bzl" => ["python"],
- "*.c" => ["c++", "c"],
+ "*.c" => ["c", "c++"],
"*.c++" => ["c++"],
"*.capnp" => ["cap_n_proto"],
"*.cc" => ["c++"],
@@ -839,7 +840,7 @@ module Tartrazine
"*.fx" => ["hlsl"],
"*.fxh" => ["hlsl"],
"*.fzn" => ["minizinc"],
- "*.gd" => ["gdscript3", "gdscript"],
+ "*.gd" => ["gdscript", "gdscript3"],
"*.gemspec" => ["ruby"],
"*.geo" => ["glsl"],
"*.gleam" => ["gleam"],
@@ -849,7 +850,7 @@ module Tartrazine
"*.graphql" => ["graphql"],
"*.graphqls" => ["graphql"],
"*.groovy" => ["groovy"],
- "*.h" => ["c++", "c", "objective-c"],
+ "*.h" => ["c", "c++", "objective-c"],
"*.h++" => ["c++"],
"*.ha" => ["hare"],
"*.handlebars" => ["handlebars"],
@@ -872,7 +873,7 @@ module Tartrazine
"*.idc" => ["c"],
"*.idr" => ["idris"],
"*.ijs" => ["j"],
- "*.inc" => ["objectpascal", "povray", "php", "sourcepawn"],
+ "*.inc" => ["objectpascal", "php", "povray", "sourcepawn"],
"*.inf" => ["ini"],
"*.ini" => ["ini"],
"*.ino" => ["arduino"],
@@ -898,13 +899,13 @@ module Tartrazine
"*.lpk" => ["objectpascal"],
"*.lpr" => ["objectpascal"],
"*.lua" => ["lua"],
- "*.m" => ["mathematica", "octave", "matlab", "objective-c", "mason"],
+ "*.m" => ["mason", "mathematica", "matlab", "objective-c", "octave"],
"*.ma" => ["mathematica"],
"*.mak" => ["makefile"],
"*.man" => ["groff"],
"*.mao" => ["mako"],
"*.markdown" => ["markdown"],
- "*.mc" => ["monkeyc", "mason"],
+ "*.mc" => ["mason", "monkeyc"],
"*.mcfunction" => ["mcfunction"],
"*.md" => ["markdown"],
"*.metal" => ["metal"],
@@ -961,7 +962,7 @@ module Tartrazine
"*.pml" => ["promela"],
"*.pony" => ["pony"],
"*.pov" => ["povray"],
- "*.pp" => ["puppet", "objectpascal"],
+ "*.pp" => ["objectpascal", "puppet"],
"*.pq" => ["powerquery"],
"*.pr" => ["promela"],
"*.prm" => ["promela"],
@@ -1010,7 +1011,7 @@ module Tartrazine
"*.rst" => ["rst"],
"*.rvt" => ["tcl"],
"*.rx" => ["rexx"],
- "*.s" => ["armasm", "r", "gas"],
+ "*.s" => ["armasm", "gas", "r"],
"*.sage" => ["python"],
"*.sas" => ["sas"],
"*.sass" => ["sass"],
@@ -1023,7 +1024,7 @@ module Tartrazine
"*.scope" => ["systemd"],
"*.scss" => ["scss"],
"*.sed" => ["sed"],
- "*.service" => ["systemd", "ini"],
+ "*.service" => ["ini", "systemd"],
"*.sh" => ["bash"],
"*.sh-session" => ["bash_session"],
"*.sieve" => ["sieve"],
@@ -1033,7 +1034,7 @@ module Tartrazine
"*.smali" => ["smali"],
"*.sml" => ["standard_ml"],
"*.snobol" => ["snobol"],
- "*.socket" => ["systemd", "ini"],
+ "*.socket" => ["ini", "systemd"],
"*.sol" => ["solidity"],
"*.sp" => ["sourcepawn"],
"*.sparql" => ["sparql"],
@@ -1068,7 +1069,7 @@ module Tartrazine
"*.tpl" => ["smarty"],
"*.tpp" => ["c++"],
"*.trig" => ["psl"],
- "*.ts" => ["typoscript", "typescript"],
+ "*.ts" => ["typescript", "typoscript"],
"*.tst" => ["scilab"],
"*.tsx" => ["typescript"],
"*.ttl" => ["turtle"],
@@ -1104,7 +1105,7 @@ module Tartrazine
"*.xml" => ["xml"],
"*.xsd" => ["xml"],
"*.xsl" => ["xml"],
- "*.xslt" => ["xml", "html"],
+ "*.xslt" => ["html", "xml"],
"*.yaml" => ["yaml"],
"*.yang" => ["yang"],
"*.yml" => ["yaml"],
diff --git a/src/heuristics.cr b/src/heuristics.cr
index 11de614..2aebe9a 100644
--- a/src/heuristics.cr
+++ b/src/heuristics.cr
@@ -1,13 +1,12 @@
require "yaml"
- # Use linguist's heuristics to disambiguate between languages
- # This is *shamelessly* stolen from https://github.com/github-linguist/linguist
- # and ported to Crystal. Deepest thanks to the authors of Linguist
- # for licensing it liberally.
- #
- # Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
- module Linguist
-
+# Use linguist's heuristics to disambiguate between languages
+# This is *shamelessly* stolen from https://github.com/github-linguist/linguist
+# and ported to Crystal. Deepest thanks to the authors of Linguist
+# for licensing it liberally.
+#
+# Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
+module Linguist
class Heuristic
include YAML::Serializable
@@ -80,7 +79,3 @@ require "yaml"
end
end
end
-
-h = Linguist::Heuristic.from_yaml(File.read("heuristics/heuristics.yml"))
-fname = "/usr/include/sqlite3.h"
-p! h.run(fname, File.read(fname))
diff --git a/src/lexer.cr b/src/lexer.cr
index e38a261..39652ba 100644
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -36,12 +36,30 @@ module Tartrazine
when 1
lexer_file_name = candidates.first
else
- raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
+ lexer_file_name = self.lexer_by_content(filename)
+ begin
+ return self.lexer(lexer_file_name)
+ rescue ex : Exception
+ raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
+ end
end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
+  # Picks a lexer name for *fname* by running the Linguist heuristics (loaded
+  # from "/heuristics.yml" through LexerFiles) against the contents of the file.
+  # Raises if the heuristics return nil or an empty list of candidates.
+  private def self.lexer_by_content(fname : String) : String?
+    h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
+    result = h.run(fname, File.read(fname))
+    case result
+    when String then result # `when String` already narrows the type; no cast needed
+    when Array(String) then result.first? || raise Exception.new("No lexer found for #{fname}")
+    when Nil then raise Exception.new("No lexer found for #{fname}")
+    end
+  end
+
private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2)
language_lexer = lexer(language)