mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-12-05 00:00:34 +00:00
Integrate heuristics into lexer selection
This commit is contained in:
parent
a5926af518
commit
5a3b50d7a3
@ -3,6 +3,7 @@
|
||||
<name>Groff</name>
|
||||
<alias>groff</alias>
|
||||
<alias>nroff</alias>
|
||||
<alias>roff</alias>
|
||||
<alias>man</alias>
|
||||
<filename>*.[1-9]</filename>
|
||||
<filename>*.1p</filename>
|
||||
@ -87,4 +88,4 @@
|
||||
</rule>
|
||||
</state>
|
||||
</rules>
|
||||
</lexer>
|
||||
</lexer>
|
||||
|
@ -30,12 +30,12 @@
|
||||
disambiguations:
|
||||
- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']
|
||||
rules:
|
||||
- language: Roff Manpage
|
||||
- language: man
|
||||
and:
|
||||
- named_pattern: mdoc-date
|
||||
- named_pattern: mdoc-title
|
||||
- named_pattern: mdoc-heading
|
||||
- language: Roff Manpage
|
||||
- language: man
|
||||
and:
|
||||
- named_pattern: man-title
|
||||
- named_pattern: man-heading
|
||||
@ -43,12 +43,12 @@ disambiguations:
|
||||
pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")'
|
||||
- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc']
|
||||
rules:
|
||||
- language: Roff Manpage
|
||||
- language: man
|
||||
and:
|
||||
- named_pattern: mdoc-date
|
||||
- named_pattern: mdoc-title
|
||||
- named_pattern: mdoc-heading
|
||||
- language: Roff Manpage
|
||||
- language: man
|
||||
and:
|
||||
- named_pattern: man-title
|
||||
- named_pattern: man-heading
|
@ -52,6 +52,6 @@ with open("src/constants/lexers.cr", "w") as f:
|
||||
f.write(" LEXERS_BY_FILENAME = {\n")
|
||||
for k in sorted(lexer_by_filename.keys()):
|
||||
v = lexer_by_filename[k]
|
||||
f.write(f'"{k}" => {str(list(v)).replace("'", "\"")}, \n')
|
||||
f.write(f'"{k}" => {str(sorted(list(v))).replace("'", "\"")}, \n')
|
||||
f.write("}\n")
|
||||
f.write("end\n")
|
||||
|
@ -328,6 +328,7 @@ module Tartrazine
|
||||
"restructuredtext" => "rst",
|
||||
"rexx" => "rexx",
|
||||
"rkt" => "racket",
|
||||
"roff" => "groff",
|
||||
"rpmspec" => "rpm_spec",
|
||||
"rs" => "rust",
|
||||
"rst" => "rst",
|
||||
@ -730,8 +731,8 @@ module Tartrazine
|
||||
"*.applescript" => ["applescript"],
|
||||
"*.aql" => ["arangodb_aql"],
|
||||
"*.arexx" => ["rexx"],
|
||||
"*.as" => ["actionscript_3", "actionscript"],
|
||||
"*.asm" => ["tasm", "nasm", "z80_assembly"],
|
||||
"*.as" => ["actionscript", "actionscript_3"],
|
||||
"*.asm" => ["nasm", "tasm", "z80_assembly"],
|
||||
"*.au3" => ["autoit"],
|
||||
"*.automount" => ["systemd"],
|
||||
"*.aux" => ["tex"],
|
||||
@ -739,7 +740,7 @@ module Tartrazine
|
||||
"*.awk" => ["awk"],
|
||||
"*.b" => ["brainfuck"],
|
||||
"*.bal" => ["ballerina"],
|
||||
"*.bas" => ["vb_net", "qbasic"],
|
||||
"*.bas" => ["qbasic", "vb_net"],
|
||||
"*.bash" => ["bash"],
|
||||
"*.bat" => ["batchfile"],
|
||||
"*.batch" => ["psl"],
|
||||
@ -750,7 +751,7 @@ module Tartrazine
|
||||
"*.bnf" => ["bnf"],
|
||||
"*.bqn" => ["bqn"],
|
||||
"*.bzl" => ["python"],
|
||||
"*.c" => ["c++", "c"],
|
||||
"*.c" => ["c", "c++"],
|
||||
"*.c++" => ["c++"],
|
||||
"*.capnp" => ["cap_n_proto"],
|
||||
"*.cc" => ["c++"],
|
||||
@ -839,7 +840,7 @@ module Tartrazine
|
||||
"*.fx" => ["hlsl"],
|
||||
"*.fxh" => ["hlsl"],
|
||||
"*.fzn" => ["minizinc"],
|
||||
"*.gd" => ["gdscript3", "gdscript"],
|
||||
"*.gd" => ["gdscript", "gdscript3"],
|
||||
"*.gemspec" => ["ruby"],
|
||||
"*.geo" => ["glsl"],
|
||||
"*.gleam" => ["gleam"],
|
||||
@ -849,7 +850,7 @@ module Tartrazine
|
||||
"*.graphql" => ["graphql"],
|
||||
"*.graphqls" => ["graphql"],
|
||||
"*.groovy" => ["groovy"],
|
||||
"*.h" => ["c++", "c", "objective-c"],
|
||||
"*.h" => ["c", "c++", "objective-c"],
|
||||
"*.h++" => ["c++"],
|
||||
"*.ha" => ["hare"],
|
||||
"*.handlebars" => ["handlebars"],
|
||||
@ -872,7 +873,7 @@ module Tartrazine
|
||||
"*.idc" => ["c"],
|
||||
"*.idr" => ["idris"],
|
||||
"*.ijs" => ["j"],
|
||||
"*.inc" => ["objectpascal", "povray", "php", "sourcepawn"],
|
||||
"*.inc" => ["objectpascal", "php", "povray", "sourcepawn"],
|
||||
"*.inf" => ["ini"],
|
||||
"*.ini" => ["ini"],
|
||||
"*.ino" => ["arduino"],
|
||||
@ -898,13 +899,13 @@ module Tartrazine
|
||||
"*.lpk" => ["objectpascal"],
|
||||
"*.lpr" => ["objectpascal"],
|
||||
"*.lua" => ["lua"],
|
||||
"*.m" => ["mathematica", "octave", "matlab", "objective-c", "mason"],
|
||||
"*.m" => ["mason", "mathematica", "matlab", "objective-c", "octave"],
|
||||
"*.ma" => ["mathematica"],
|
||||
"*.mak" => ["makefile"],
|
||||
"*.man" => ["groff"],
|
||||
"*.mao" => ["mako"],
|
||||
"*.markdown" => ["markdown"],
|
||||
"*.mc" => ["monkeyc", "mason"],
|
||||
"*.mc" => ["mason", "monkeyc"],
|
||||
"*.mcfunction" => ["mcfunction"],
|
||||
"*.md" => ["markdown"],
|
||||
"*.metal" => ["metal"],
|
||||
@ -961,7 +962,7 @@ module Tartrazine
|
||||
"*.pml" => ["promela"],
|
||||
"*.pony" => ["pony"],
|
||||
"*.pov" => ["povray"],
|
||||
"*.pp" => ["puppet", "objectpascal"],
|
||||
"*.pp" => ["objectpascal", "puppet"],
|
||||
"*.pq" => ["powerquery"],
|
||||
"*.pr" => ["promela"],
|
||||
"*.prm" => ["promela"],
|
||||
@ -1010,7 +1011,7 @@ module Tartrazine
|
||||
"*.rst" => ["rst"],
|
||||
"*.rvt" => ["tcl"],
|
||||
"*.rx" => ["rexx"],
|
||||
"*.s" => ["armasm", "r", "gas"],
|
||||
"*.s" => ["armasm", "gas", "r"],
|
||||
"*.sage" => ["python"],
|
||||
"*.sas" => ["sas"],
|
||||
"*.sass" => ["sass"],
|
||||
@ -1023,7 +1024,7 @@ module Tartrazine
|
||||
"*.scope" => ["systemd"],
|
||||
"*.scss" => ["scss"],
|
||||
"*.sed" => ["sed"],
|
||||
"*.service" => ["systemd", "ini"],
|
||||
"*.service" => ["ini", "systemd"],
|
||||
"*.sh" => ["bash"],
|
||||
"*.sh-session" => ["bash_session"],
|
||||
"*.sieve" => ["sieve"],
|
||||
@ -1033,7 +1034,7 @@ module Tartrazine
|
||||
"*.smali" => ["smali"],
|
||||
"*.sml" => ["standard_ml"],
|
||||
"*.snobol" => ["snobol"],
|
||||
"*.socket" => ["systemd", "ini"],
|
||||
"*.socket" => ["ini", "systemd"],
|
||||
"*.sol" => ["solidity"],
|
||||
"*.sp" => ["sourcepawn"],
|
||||
"*.sparql" => ["sparql"],
|
||||
@ -1068,7 +1069,7 @@ module Tartrazine
|
||||
"*.tpl" => ["smarty"],
|
||||
"*.tpp" => ["c++"],
|
||||
"*.trig" => ["psl"],
|
||||
"*.ts" => ["typoscript", "typescript"],
|
||||
"*.ts" => ["typescript", "typoscript"],
|
||||
"*.tst" => ["scilab"],
|
||||
"*.tsx" => ["typescript"],
|
||||
"*.ttl" => ["turtle"],
|
||||
@ -1104,7 +1105,7 @@ module Tartrazine
|
||||
"*.xml" => ["xml"],
|
||||
"*.xsd" => ["xml"],
|
||||
"*.xsl" => ["xml"],
|
||||
"*.xslt" => ["xml", "html"],
|
||||
"*.xslt" => ["html", "xml"],
|
||||
"*.yaml" => ["yaml"],
|
||||
"*.yang" => ["yang"],
|
||||
"*.yml" => ["yaml"],
|
||||
|
@ -1,13 +1,12 @@
|
||||
require "yaml"
|
||||
|
||||
# Use linguist's heuristics to disambiguate between languages
|
||||
# This is *shamelessly* stolen from https://github.com/github-linguist/linguist
|
||||
# and ported to Crystal. Deepest thanks to the authors of Linguist
|
||||
# for licensing it liberally.
|
||||
#
|
||||
# Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
|
||||
module Linguist
|
||||
|
||||
# Use linguist's heuristics to disambiguate between languages
|
||||
# This is *shamelessly* stolen from https://github.com/github-linguist/linguist
|
||||
# and ported to Crystal. Deepest thanks to the authors of Linguist
|
||||
# for licensing it liberally.
|
||||
#
|
||||
# Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
|
||||
module Linguist
|
||||
class Heuristic
|
||||
include YAML::Serializable
|
||||
|
||||
@ -80,7 +79,3 @@ require "yaml"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
h = Linguist::Heuristic.from_yaml(File.read("heuristics/heuristics.yml"))
|
||||
fname = "/usr/include/sqlite3.h"
|
||||
p! h.run(fname, File.read(fname))
|
||||
|
20
src/lexer.cr
20
src/lexer.cr
@ -36,12 +36,30 @@ module Tartrazine
|
||||
when 1
|
||||
lexer_file_name = candidates.first
|
||||
else
|
||||
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
|
||||
lexer_file_name = self.lexer_by_content(filename)
|
||||
begin
|
||||
return self.lexer(lexer_file_name)
|
||||
rescue ex : Exception
|
||||
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
|
||||
end
|
||||
end
|
||||
|
||||
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
|
||||
end
|
||||
|
||||
# Picks a lexer name for the file at `fname` by running the Linguist
# heuristics over the file's contents.
#
# The heuristics definition is loaded from the embedded "/heuristics.yml"
# and parsed on every call; the whole file at `fname` is read from disk.
#
# Returns the heuristic result when it is a String, or the first element
# when it is an Array(String).
# Raises Exception ("No lexer found for ...") when the heuristics return nil.
# NOTE(review): the exact return type of `Heuristic#run` is not visible
# here -- confirm the three branches below cover every case.
private def self.lexer_by_content(fname : String) : String?
|
||||
h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
|
||||
result = h.run(fname, File.read(fname))
|
||||
case result
|
||||
when Nil
|
||||
raise Exception.new "No lexer found for #{fname}"
|
||||
when String
|
||||
result.as(String)
|
||||
when Array(String)
|
||||
# Several candidates: take the first one.
|
||||
result.first
|
||||
end
|
||||
end
|
||||
|
||||
private def self.create_delegating_lexer(name : String) : BaseLexer
|
||||
language, root = name.split("+", 2)
|
||||
language_lexer = lexer(language)
|
||||
|
Loading…
Reference in New Issue
Block a user