mirror of
https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00
Integrate heuristics into lexer selection
This commit is contained in:
parent
a5926af518
commit
72afec773e
@ -3,6 +3,7 @@
|
|||||||
<name>Groff</name>
|
<name>Groff</name>
|
||||||
<alias>groff</alias>
|
<alias>groff</alias>
|
||||||
<alias>nroff</alias>
|
<alias>nroff</alias>
|
||||||
|
<alias>roff</alias>
|
||||||
<alias>man</alias>
|
<alias>man</alias>
|
||||||
<filename>*.[1-9]</filename>
|
<filename>*.[1-9]</filename>
|
||||||
<filename>*.1p</filename>
|
<filename>*.1p</filename>
|
||||||
|
@ -30,12 +30,12 @@
|
|||||||
disambiguations:
|
disambiguations:
|
||||||
- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']
|
- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']
|
||||||
rules:
|
rules:
|
||||||
- language: Roff Manpage
|
- language: man
|
||||||
and:
|
and:
|
||||||
- named_pattern: mdoc-date
|
- named_pattern: mdoc-date
|
||||||
- named_pattern: mdoc-title
|
- named_pattern: mdoc-title
|
||||||
- named_pattern: mdoc-heading
|
- named_pattern: mdoc-heading
|
||||||
- language: Roff Manpage
|
- language: man
|
||||||
and:
|
and:
|
||||||
- named_pattern: man-title
|
- named_pattern: man-title
|
||||||
- named_pattern: man-heading
|
- named_pattern: man-heading
|
||||||
@ -43,12 +43,12 @@ disambiguations:
|
|||||||
pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")'
|
pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")'
|
||||||
- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc']
|
- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc']
|
||||||
rules:
|
rules:
|
||||||
- language: Roff Manpage
|
- language: man
|
||||||
and:
|
and:
|
||||||
- named_pattern: mdoc-date
|
- named_pattern: mdoc-date
|
||||||
- named_pattern: mdoc-title
|
- named_pattern: mdoc-title
|
||||||
- named_pattern: mdoc-heading
|
- named_pattern: mdoc-heading
|
||||||
- language: Roff Manpage
|
- language: man
|
||||||
and:
|
and:
|
||||||
- named_pattern: man-title
|
- named_pattern: man-title
|
||||||
- named_pattern: man-heading
|
- named_pattern: man-heading
|
@ -328,6 +328,7 @@ module Tartrazine
|
|||||||
"restructuredtext" => "rst",
|
"restructuredtext" => "rst",
|
||||||
"rexx" => "rexx",
|
"rexx" => "rexx",
|
||||||
"rkt" => "racket",
|
"rkt" => "racket",
|
||||||
|
"roff" => "groff",
|
||||||
"rpmspec" => "rpm_spec",
|
"rpmspec" => "rpm_spec",
|
||||||
"rs" => "rust",
|
"rs" => "rust",
|
||||||
"rst" => "rst",
|
"rst" => "rst",
|
||||||
@ -731,7 +732,7 @@ module Tartrazine
|
|||||||
"*.aql" => ["arangodb_aql"],
|
"*.aql" => ["arangodb_aql"],
|
||||||
"*.arexx" => ["rexx"],
|
"*.arexx" => ["rexx"],
|
||||||
"*.as" => ["actionscript_3", "actionscript"],
|
"*.as" => ["actionscript_3", "actionscript"],
|
||||||
"*.asm" => ["tasm", "nasm", "z80_assembly"],
|
"*.asm" => ["nasm", "z80_assembly", "tasm"],
|
||||||
"*.au3" => ["autoit"],
|
"*.au3" => ["autoit"],
|
||||||
"*.automount" => ["systemd"],
|
"*.automount" => ["systemd"],
|
||||||
"*.aux" => ["tex"],
|
"*.aux" => ["tex"],
|
||||||
@ -750,7 +751,7 @@ module Tartrazine
|
|||||||
"*.bnf" => ["bnf"],
|
"*.bnf" => ["bnf"],
|
||||||
"*.bqn" => ["bqn"],
|
"*.bqn" => ["bqn"],
|
||||||
"*.bzl" => ["python"],
|
"*.bzl" => ["python"],
|
||||||
"*.c" => ["c++", "c"],
|
"*.c" => ["c", "c++"],
|
||||||
"*.c++" => ["c++"],
|
"*.c++" => ["c++"],
|
||||||
"*.capnp" => ["cap_n_proto"],
|
"*.capnp" => ["cap_n_proto"],
|
||||||
"*.cc" => ["c++"],
|
"*.cc" => ["c++"],
|
||||||
@ -839,7 +840,7 @@ module Tartrazine
|
|||||||
"*.fx" => ["hlsl"],
|
"*.fx" => ["hlsl"],
|
||||||
"*.fxh" => ["hlsl"],
|
"*.fxh" => ["hlsl"],
|
||||||
"*.fzn" => ["minizinc"],
|
"*.fzn" => ["minizinc"],
|
||||||
"*.gd" => ["gdscript3", "gdscript"],
|
"*.gd" => ["gdscript", "gdscript3"],
|
||||||
"*.gemspec" => ["ruby"],
|
"*.gemspec" => ["ruby"],
|
||||||
"*.geo" => ["glsl"],
|
"*.geo" => ["glsl"],
|
||||||
"*.gleam" => ["gleam"],
|
"*.gleam" => ["gleam"],
|
||||||
@ -849,7 +850,7 @@ module Tartrazine
|
|||||||
"*.graphql" => ["graphql"],
|
"*.graphql" => ["graphql"],
|
||||||
"*.graphqls" => ["graphql"],
|
"*.graphqls" => ["graphql"],
|
||||||
"*.groovy" => ["groovy"],
|
"*.groovy" => ["groovy"],
|
||||||
"*.h" => ["c++", "c", "objective-c"],
|
"*.h" => ["objective-c", "c", "c++"],
|
||||||
"*.h++" => ["c++"],
|
"*.h++" => ["c++"],
|
||||||
"*.ha" => ["hare"],
|
"*.ha" => ["hare"],
|
||||||
"*.handlebars" => ["handlebars"],
|
"*.handlebars" => ["handlebars"],
|
||||||
@ -872,7 +873,7 @@ module Tartrazine
|
|||||||
"*.idc" => ["c"],
|
"*.idc" => ["c"],
|
||||||
"*.idr" => ["idris"],
|
"*.idr" => ["idris"],
|
||||||
"*.ijs" => ["j"],
|
"*.ijs" => ["j"],
|
||||||
"*.inc" => ["objectpascal", "povray", "php", "sourcepawn"],
|
"*.inc" => ["php", "objectpascal", "povray", "sourcepawn"],
|
||||||
"*.inf" => ["ini"],
|
"*.inf" => ["ini"],
|
||||||
"*.ini" => ["ini"],
|
"*.ini" => ["ini"],
|
||||||
"*.ino" => ["arduino"],
|
"*.ino" => ["arduino"],
|
||||||
@ -898,13 +899,13 @@ module Tartrazine
|
|||||||
"*.lpk" => ["objectpascal"],
|
"*.lpk" => ["objectpascal"],
|
||||||
"*.lpr" => ["objectpascal"],
|
"*.lpr" => ["objectpascal"],
|
||||||
"*.lua" => ["lua"],
|
"*.lua" => ["lua"],
|
||||||
"*.m" => ["mathematica", "octave", "matlab", "objective-c", "mason"],
|
"*.m" => ["mathematica", "mason", "octave", "objective-c", "matlab"],
|
||||||
"*.ma" => ["mathematica"],
|
"*.ma" => ["mathematica"],
|
||||||
"*.mak" => ["makefile"],
|
"*.mak" => ["makefile"],
|
||||||
"*.man" => ["groff"],
|
"*.man" => ["groff"],
|
||||||
"*.mao" => ["mako"],
|
"*.mao" => ["mako"],
|
||||||
"*.markdown" => ["markdown"],
|
"*.markdown" => ["markdown"],
|
||||||
"*.mc" => ["monkeyc", "mason"],
|
"*.mc" => ["mason", "monkeyc"],
|
||||||
"*.mcfunction" => ["mcfunction"],
|
"*.mcfunction" => ["mcfunction"],
|
||||||
"*.md" => ["markdown"],
|
"*.md" => ["markdown"],
|
||||||
"*.metal" => ["metal"],
|
"*.metal" => ["metal"],
|
||||||
@ -953,7 +954,7 @@ module Tartrazine
|
|||||||
"*.php" => ["php"],
|
"*.php" => ["php"],
|
||||||
"*.php[345]" => ["php"],
|
"*.php[345]" => ["php"],
|
||||||
"*.pig" => ["pig"],
|
"*.pig" => ["pig"],
|
||||||
"*.pl" => ["perl", "prolog"],
|
"*.pl" => ["prolog", "perl"],
|
||||||
"*.plc" => ["plutus_core"],
|
"*.plc" => ["plutus_core"],
|
||||||
"*.plot" => ["gnuplot"],
|
"*.plot" => ["gnuplot"],
|
||||||
"*.plt" => ["gnuplot"],
|
"*.plt" => ["gnuplot"],
|
||||||
@ -1039,7 +1040,7 @@ module Tartrazine
|
|||||||
"*.sparql" => ["sparql"],
|
"*.sparql" => ["sparql"],
|
||||||
"*.spec" => ["rpm_spec"],
|
"*.spec" => ["rpm_spec"],
|
||||||
"*.spt" => ["cheetah"],
|
"*.spt" => ["cheetah"],
|
||||||
"*.sql" => ["mysql", "sql"],
|
"*.sql" => ["sql", "mysql"],
|
||||||
"*.ss" => ["scheme"],
|
"*.ss" => ["scheme"],
|
||||||
"*.st" => ["smalltalk"],
|
"*.st" => ["smalltalk"],
|
||||||
"*.stas" => ["stas"],
|
"*.stas" => ["stas"],
|
||||||
@ -1078,7 +1079,7 @@ module Tartrazine
|
|||||||
"*.twig" => ["twig"],
|
"*.twig" => ["twig"],
|
||||||
"*.txt" => ["plaintext"],
|
"*.txt" => ["plaintext"],
|
||||||
"*.uc" => ["ucode"],
|
"*.uc" => ["ucode"],
|
||||||
"*.v" => ["coq", "v", "verilog"],
|
"*.v" => ["verilog", "v", "coq"],
|
||||||
"*.vala" => ["vala"],
|
"*.vala" => ["vala"],
|
||||||
"*.vapi" => ["vala"],
|
"*.vapi" => ["vala"],
|
||||||
"*.vb" => ["vb_net"],
|
"*.vb" => ["vb_net"],
|
||||||
@ -1104,7 +1105,7 @@ module Tartrazine
|
|||||||
"*.xml" => ["xml"],
|
"*.xml" => ["xml"],
|
||||||
"*.xsd" => ["xml"],
|
"*.xsd" => ["xml"],
|
||||||
"*.xsl" => ["xml"],
|
"*.xsl" => ["xml"],
|
||||||
"*.xslt" => ["xml", "html"],
|
"*.xslt" => ["html", "xml"],
|
||||||
"*.yaml" => ["yaml"],
|
"*.yaml" => ["yaml"],
|
||||||
"*.yang" => ["yang"],
|
"*.yang" => ["yang"],
|
||||||
"*.yml" => ["yaml"],
|
"*.yml" => ["yaml"],
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
require "yaml"
|
require "yaml"
|
||||||
|
|
||||||
# Use linguist's heuristics to disambiguate between languages
|
# Use linguist's heuristics to disambiguate between languages
|
||||||
# This is *shamelessly* stolen from https://github.com/github-linguist/linguist
|
# This is *shamelessly* stolen from https://github.com/github-linguist/linguist
|
||||||
# and ported to Crystal. Deepest thanks to the authors of Linguist
|
# and ported to Crystal. Deepest thanks to the authors of Linguist
|
||||||
# for licensing it liberally.
|
# for licensing it liberally.
|
||||||
#
|
#
|
||||||
# Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
|
# Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
|
||||||
module Linguist
|
module Linguist
|
||||||
|
|
||||||
class Heuristic
|
class Heuristic
|
||||||
include YAML::Serializable
|
include YAML::Serializable
|
||||||
|
|
||||||
@ -80,7 +79,3 @@ require "yaml"
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
h = Linguist::Heuristic.from_yaml(File.read("heuristics/heuristics.yml"))
|
|
||||||
fname = "/usr/include/sqlite3.h"
|
|
||||||
p! h.run(fname, File.read(fname))
|
|
||||||
|
20
src/lexer.cr
20
src/lexer.cr
@ -36,12 +36,30 @@ module Tartrazine
|
|||||||
when 1
|
when 1
|
||||||
lexer_file_name = candidates.first
|
lexer_file_name = candidates.first
|
||||||
else
|
else
|
||||||
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
|
lexer_file_name = self.lexer_by_content(filename)
|
||||||
|
begin
|
||||||
|
return self.lexer(lexer_file_name)
|
||||||
|
rescue ex : Exception
|
||||||
|
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
|
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
private def self.lexer_by_content(fname : String) : String?
|
||||||
|
h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
|
||||||
|
result = h.run(fname, File.read(fname))
|
||||||
|
case result
|
||||||
|
when Nil
|
||||||
|
raise Exception.new "No lexer found for #{fname}"
|
||||||
|
when String
|
||||||
|
result.as(String)
|
||||||
|
when Array(String)
|
||||||
|
result.first
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
private def self.create_delegating_lexer(name : String) : BaseLexer
|
private def self.create_delegating_lexer(name : String) : BaseLexer
|
||||||
language, root = name.split("+", 2)
|
language, root = name.split("+", 2)
|
||||||
language_lexer = lexer(language)
|
language_lexer = lexer(language)
|
||||||
|
Loading…
Reference in New Issue
Block a user