Integrate heuristics into lexer selection

This commit is contained in:
Roberto Alsina 2024-08-24 21:35:06 -03:00
parent a5926af518
commit 72afec773e
6 changed files with 44 additions and 29 deletions

View File

@ -3,6 +3,7 @@
<name>Groff</name> <name>Groff</name>
<alias>groff</alias> <alias>groff</alias>
<alias>nroff</alias> <alias>nroff</alias>
<alias>roff</alias>
<alias>man</alias> <alias>man</alias>
<filename>*.[1-9]</filename> <filename>*.[1-9]</filename>
<filename>*.1p</filename> <filename>*.1p</filename>
@ -87,4 +88,4 @@
</rule> </rule>
</state> </state>
</rules> </rules>
</lexer> </lexer>

View File

@ -30,12 +30,12 @@
disambiguations: disambiguations:
- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9'] - extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']
rules: rules:
- language: Roff Manpage - language: man
and: and:
- named_pattern: mdoc-date - named_pattern: mdoc-date
- named_pattern: mdoc-title - named_pattern: mdoc-title
- named_pattern: mdoc-heading - named_pattern: mdoc-heading
- language: Roff Manpage - language: man
and: and:
- named_pattern: man-title - named_pattern: man-title
- named_pattern: man-heading - named_pattern: man-heading
@ -43,12 +43,12 @@ disambiguations:
pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")' pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")'
- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc'] - extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc']
rules: rules:
- language: Roff Manpage - language: man
and: and:
- named_pattern: mdoc-date - named_pattern: mdoc-date
- named_pattern: mdoc-title - named_pattern: mdoc-title
- named_pattern: mdoc-heading - named_pattern: mdoc-heading
- language: Roff Manpage - language: man
and: and:
- named_pattern: man-title - named_pattern: man-title
- named_pattern: man-heading - named_pattern: man-heading

View File

@ -328,6 +328,7 @@ module Tartrazine
"restructuredtext" => "rst", "restructuredtext" => "rst",
"rexx" => "rexx", "rexx" => "rexx",
"rkt" => "racket", "rkt" => "racket",
"roff" => "groff",
"rpmspec" => "rpm_spec", "rpmspec" => "rpm_spec",
"rs" => "rust", "rs" => "rust",
"rst" => "rst", "rst" => "rst",
@ -731,7 +732,7 @@ module Tartrazine
"*.aql" => ["arangodb_aql"], "*.aql" => ["arangodb_aql"],
"*.arexx" => ["rexx"], "*.arexx" => ["rexx"],
"*.as" => ["actionscript_3", "actionscript"], "*.as" => ["actionscript_3", "actionscript"],
"*.asm" => ["tasm", "nasm", "z80_assembly"], "*.asm" => ["nasm", "z80_assembly", "tasm"],
"*.au3" => ["autoit"], "*.au3" => ["autoit"],
"*.automount" => ["systemd"], "*.automount" => ["systemd"],
"*.aux" => ["tex"], "*.aux" => ["tex"],
@ -750,7 +751,7 @@ module Tartrazine
"*.bnf" => ["bnf"], "*.bnf" => ["bnf"],
"*.bqn" => ["bqn"], "*.bqn" => ["bqn"],
"*.bzl" => ["python"], "*.bzl" => ["python"],
"*.c" => ["c++", "c"], "*.c" => ["c", "c++"],
"*.c++" => ["c++"], "*.c++" => ["c++"],
"*.capnp" => ["cap_n_proto"], "*.capnp" => ["cap_n_proto"],
"*.cc" => ["c++"], "*.cc" => ["c++"],
@ -839,7 +840,7 @@ module Tartrazine
"*.fx" => ["hlsl"], "*.fx" => ["hlsl"],
"*.fxh" => ["hlsl"], "*.fxh" => ["hlsl"],
"*.fzn" => ["minizinc"], "*.fzn" => ["minizinc"],
"*.gd" => ["gdscript3", "gdscript"], "*.gd" => ["gdscript", "gdscript3"],
"*.gemspec" => ["ruby"], "*.gemspec" => ["ruby"],
"*.geo" => ["glsl"], "*.geo" => ["glsl"],
"*.gleam" => ["gleam"], "*.gleam" => ["gleam"],
@ -849,7 +850,7 @@ module Tartrazine
"*.graphql" => ["graphql"], "*.graphql" => ["graphql"],
"*.graphqls" => ["graphql"], "*.graphqls" => ["graphql"],
"*.groovy" => ["groovy"], "*.groovy" => ["groovy"],
"*.h" => ["c++", "c", "objective-c"], "*.h" => ["objective-c", "c", "c++"],
"*.h++" => ["c++"], "*.h++" => ["c++"],
"*.ha" => ["hare"], "*.ha" => ["hare"],
"*.handlebars" => ["handlebars"], "*.handlebars" => ["handlebars"],
@ -872,7 +873,7 @@ module Tartrazine
"*.idc" => ["c"], "*.idc" => ["c"],
"*.idr" => ["idris"], "*.idr" => ["idris"],
"*.ijs" => ["j"], "*.ijs" => ["j"],
"*.inc" => ["objectpascal", "povray", "php", "sourcepawn"], "*.inc" => ["php", "objectpascal", "povray", "sourcepawn"],
"*.inf" => ["ini"], "*.inf" => ["ini"],
"*.ini" => ["ini"], "*.ini" => ["ini"],
"*.ino" => ["arduino"], "*.ino" => ["arduino"],
@ -898,13 +899,13 @@ module Tartrazine
"*.lpk" => ["objectpascal"], "*.lpk" => ["objectpascal"],
"*.lpr" => ["objectpascal"], "*.lpr" => ["objectpascal"],
"*.lua" => ["lua"], "*.lua" => ["lua"],
"*.m" => ["mathematica", "octave", "matlab", "objective-c", "mason"], "*.m" => ["mathematica", "mason", "octave", "objective-c", "matlab"],
"*.ma" => ["mathematica"], "*.ma" => ["mathematica"],
"*.mak" => ["makefile"], "*.mak" => ["makefile"],
"*.man" => ["groff"], "*.man" => ["groff"],
"*.mao" => ["mako"], "*.mao" => ["mako"],
"*.markdown" => ["markdown"], "*.markdown" => ["markdown"],
"*.mc" => ["monkeyc", "mason"], "*.mc" => ["mason", "monkeyc"],
"*.mcfunction" => ["mcfunction"], "*.mcfunction" => ["mcfunction"],
"*.md" => ["markdown"], "*.md" => ["markdown"],
"*.metal" => ["metal"], "*.metal" => ["metal"],
@ -953,7 +954,7 @@ module Tartrazine
"*.php" => ["php"], "*.php" => ["php"],
"*.php[345]" => ["php"], "*.php[345]" => ["php"],
"*.pig" => ["pig"], "*.pig" => ["pig"],
"*.pl" => ["perl", "prolog"], "*.pl" => ["prolog", "perl"],
"*.plc" => ["plutus_core"], "*.plc" => ["plutus_core"],
"*.plot" => ["gnuplot"], "*.plot" => ["gnuplot"],
"*.plt" => ["gnuplot"], "*.plt" => ["gnuplot"],
@ -1039,7 +1040,7 @@ module Tartrazine
"*.sparql" => ["sparql"], "*.sparql" => ["sparql"],
"*.spec" => ["rpm_spec"], "*.spec" => ["rpm_spec"],
"*.spt" => ["cheetah"], "*.spt" => ["cheetah"],
"*.sql" => ["mysql", "sql"], "*.sql" => ["sql", "mysql"],
"*.ss" => ["scheme"], "*.ss" => ["scheme"],
"*.st" => ["smalltalk"], "*.st" => ["smalltalk"],
"*.stas" => ["stas"], "*.stas" => ["stas"],
@ -1078,7 +1079,7 @@ module Tartrazine
"*.twig" => ["twig"], "*.twig" => ["twig"],
"*.txt" => ["plaintext"], "*.txt" => ["plaintext"],
"*.uc" => ["ucode"], "*.uc" => ["ucode"],
"*.v" => ["coq", "v", "verilog"], "*.v" => ["verilog", "v", "coq"],
"*.vala" => ["vala"], "*.vala" => ["vala"],
"*.vapi" => ["vala"], "*.vapi" => ["vala"],
"*.vb" => ["vb_net"], "*.vb" => ["vb_net"],
@ -1104,7 +1105,7 @@ module Tartrazine
"*.xml" => ["xml"], "*.xml" => ["xml"],
"*.xsd" => ["xml"], "*.xsd" => ["xml"],
"*.xsl" => ["xml"], "*.xsl" => ["xml"],
"*.xslt" => ["xml", "html"], "*.xslt" => ["html", "xml"],
"*.yaml" => ["yaml"], "*.yaml" => ["yaml"],
"*.yang" => ["yang"], "*.yang" => ["yang"],
"*.yml" => ["yaml"], "*.yml" => ["yaml"],

View File

@ -1,13 +1,12 @@
require "yaml" require "yaml"
# Use linguist's heuristics to disambiguate between languages # Use linguist's heuristics to disambiguate between languages
# This is *shamelessly* stolen from https://github.com/github-linguist/linguist # This is *shamelessly* stolen from https://github.com/github-linguist/linguist
# and ported to Crystal. Deepest thanks to the authors of Linguist # and ported to Crystal. Deepest thanks to the authors of Linguist
# for licensing it liberally. # for licensing it liberally.
# #
# Consider this code (c) 2017 GitHub, Inc. even if I wrote it. # Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
module Linguist module Linguist
class Heuristic class Heuristic
include YAML::Serializable include YAML::Serializable
@ -80,7 +79,3 @@ require "yaml"
end end
end end
end end
h = Linguist::Heuristic.from_yaml(File.read("heuristics/heuristics.yml"))
fname = "/usr/include/sqlite3.h"
p! h.run(fname, File.read(fname))

View File

@ -36,12 +36,30 @@ module Tartrazine
when 1 when 1
lexer_file_name = candidates.first lexer_file_name = candidates.first
else else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}") lexer_file_name = self.lexer_by_content(filename)
begin
return self.lexer(lexer_file_name)
rescue ex : Exception
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
end
end end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end) Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end end
# Guesses a lexer name for `fname` from the file's content.
#
# Loads "/heuristics.yml" from `LexerFiles`, parses it into a
# `Linguist::Heuristic`, and runs it against the file name and the
# content read from disk at `fname`.
#
# Returns the heuristic's answer when it is a single String, or the
# first element when it is an Array(String).
#
# Raises Exception ("No lexer found for ...") when the heuristics
# produce nil. `File.read` is called directly, so an unreadable path
# raises whatever `File.read` raises.
private def self.lexer_by_content(fname : String) : String?
  heuristics = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
  guess = heuristics.run(fname, File.read(fname))
  if guess.nil?
    raise Exception.new "No lexer found for #{fname}"
  elsif guess.is_a?(String)
    # `is_a?` narrows the type, so no explicit cast is needed here.
    guess
  elsif guess.is_a?(Array(String))
    guess.first
  end
end
private def self.create_delegating_lexer(name : String) : BaseLexer private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2) language, root = name.split("+", 2)
language_lexer = lexer(language) language_lexer = lexer(language)