From 72afec773e6ee18915340415e1277c663b8762ac Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Sat, 24 Aug 2024 21:35:06 -0300 Subject: [PATCH] Integrate heuristics into lexer selection --- .../LICENSE.txt => lexers/LICENSE-heuristics | 0 lexers/groff.xml | 3 ++- {heuristics => lexers}/heuristics.yml | 8 +++---- src/constants/lexers.cr | 23 ++++++++++--------- src/heuristics.cr | 19 ++++++--------- src/lexer.cr | 20 +++++++++++++++- 6 files changed, 44 insertions(+), 29 deletions(-) rename heuristics/LICENSE.txt => lexers/LICENSE-heuristics (100%) rename {heuristics => lexers}/heuristics.yml (99%) diff --git a/heuristics/LICENSE.txt b/lexers/LICENSE-heuristics similarity index 100% rename from heuristics/LICENSE.txt rename to lexers/LICENSE-heuristics diff --git a/lexers/groff.xml b/lexers/groff.xml index 3af0a43..f647b21 100644 --- a/lexers/groff.xml +++ b/lexers/groff.xml @@ -3,6 +3,7 @@ Groff groff nroff + roff man *.[1-9] *.1p @@ -87,4 +88,4 @@ - \ No newline at end of file + diff --git a/heuristics/heuristics.yml b/lexers/heuristics.yml similarity index 99% rename from heuristics/heuristics.yml rename to lexers/heuristics.yml index 8a6fcea..1c74ed8 100644 --- a/heuristics/heuristics.yml +++ b/lexers/heuristics.yml @@ -30,12 +30,12 @@ disambiguations: - extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9'] rules: - - language: Roff Manpage + - language: man and: - named_pattern: mdoc-date - named_pattern: mdoc-title - named_pattern: mdoc-heading - - language: Roff Manpage + - language: man and: - named_pattern: man-title - named_pattern: man-heading @@ -43,12 +43,12 @@ disambiguations: pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")' - extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc'] rules: - - language: Roff Manpage + - language: man and: - named_pattern: mdoc-date - named_pattern: mdoc-title - named_pattern: mdoc-heading - - language: Roff Manpage + - language: man and: - named_pattern: man-title - named_pattern: man-heading diff --git a/src/constants/lexers.cr b/src/constants/lexers.cr index 1f8a368..45eb12f 100644 --- a/src/constants/lexers.cr +++ b/src/constants/lexers.cr @@ -328,6 +328,7 @@ module Tartrazine "restructuredtext" => "rst", "rexx" => "rexx", "rkt" => "racket", + "roff" => "groff", "rpmspec" => "rpm_spec", "rs" => "rust", "rst" => "rst", @@ -731,7 +732,7 @@ module Tartrazine "*.aql" => ["arangodb_aql"], "*.arexx" => ["rexx"], "*.as" => ["actionscript_3", "actionscript"], - "*.asm" => ["tasm", "nasm", "z80_assembly"], + "*.asm" => ["nasm", "z80_assembly", "tasm"], "*.au3" => ["autoit"], "*.automount" => ["systemd"], "*.aux" => ["tex"], @@ -750,7 +751,7 @@ module Tartrazine "*.bnf" => ["bnf"], "*.bqn" => ["bqn"], "*.bzl" => ["python"], - "*.c" => ["c++", "c"], + "*.c" => ["c", "c++"], "*.c++" => ["c++"], "*.capnp" => ["cap_n_proto"], "*.cc" => ["c++"], @@ -839,7 +840,7 @@ module Tartrazine "*.fx" => ["hlsl"], "*.fxh" => ["hlsl"], "*.fzn" => ["minizinc"], - "*.gd" => ["gdscript3", "gdscript"], + "*.gd" => ["gdscript", "gdscript3"], "*.gemspec" => ["ruby"], "*.geo" => ["glsl"], "*.gleam" => ["gleam"], @@ -849,7 +850,7 @@ module Tartrazine "*.graphql" => ["graphql"], "*.graphqls" => ["graphql"], "*.groovy" => ["groovy"], - "*.h" => ["c++", "c", "objective-c"], + "*.h" => ["objective-c", "c", "c++"], "*.h++" => ["c++"], "*.ha" => ["hare"], "*.handlebars" => ["handlebars"], @@ -872,7 +873,7 @@ module Tartrazine "*.idc" => ["c"], "*.idr" => ["idris"], "*.ijs" => ["j"], - "*.inc" => ["objectpascal", "povray", "php", "sourcepawn"], + "*.inc" => ["php", "objectpascal", "povray", "sourcepawn"], "*.inf" => ["ini"], "*.ini" => ["ini"], "*.ino" => ["arduino"], @@ -898,13 +899,13 @@ module Tartrazine "*.lpk" => ["objectpascal"], "*.lpr" => ["objectpascal"], "*.lua" => ["lua"], - "*.m" => ["mathematica", "octave", "matlab", "objective-c", "mason"], + "*.m" => ["mathematica", "mason", "octave", "objective-c", "matlab"], "*.ma" => ["mathematica"], "*.mak" => ["makefile"], "*.man" => ["groff"], "*.mao" => ["mako"], "*.markdown" => ["markdown"], - "*.mc" => ["monkeyc", "mason"], + "*.mc" => ["mason", "monkeyc"], "*.mcfunction" => ["mcfunction"], "*.md" => ["markdown"], "*.metal" => ["metal"], @@ -953,7 +954,7 @@ module Tartrazine "*.php" => ["php"], "*.php[345]" => ["php"], "*.pig" => ["pig"], - "*.pl" => ["perl", "prolog"], + "*.pl" => ["prolog", "perl"], "*.plc" => ["plutus_core"], "*.plot" => ["gnuplot"], "*.plt" => ["gnuplot"], @@ -1039,7 +1040,7 @@ module Tartrazine "*.sparql" => ["sparql"], "*.spec" => ["rpm_spec"], "*.spt" => ["cheetah"], - "*.sql" => ["mysql", "sql"], + "*.sql" => ["sql", "mysql"], "*.ss" => ["scheme"], "*.st" => ["smalltalk"], "*.stas" => ["stas"], @@ -1078,7 +1079,7 @@ module Tartrazine "*.twig" => ["twig"], "*.txt" => ["plaintext"], "*.uc" => ["ucode"], - "*.v" => ["coq", "v", "verilog"], + "*.v" => ["verilog", "v", "coq"], "*.vala" => ["vala"], "*.vapi" => ["vala"], "*.vb" => ["vb_net"], @@ -1104,7 +1105,7 @@ module Tartrazine "*.xml" => ["xml"], "*.xsd" => ["xml"], "*.xsl" => ["xml"], - "*.xslt" => ["xml", "html"], + "*.xslt" => ["html", "xml"], "*.yaml" => ["yaml"], "*.yang" => ["yang"], "*.yml" => ["yaml"], diff --git a/src/heuristics.cr b/src/heuristics.cr index 11de614..2aebe9a 100644 --- a/src/heuristics.cr +++ b/src/heuristics.cr @@ -1,13 +1,12 @@ require "yaml" - # Use linguist's heuristics to disambiguate between languages - # This is *shamelessly* stolen from https://github.com/github-linguist/linguist - # and ported to Crystal. Deepest thanks to the authors of Linguist - # for licensing it liberally. - # - # Consider this code (c) 2017 GitHub, Inc. even if I wrote it. - module Linguist - +# Use linguist's heuristics to disambiguate between languages +# This is *shamelessly* stolen from https://github.com/github-linguist/linguist +# and ported to Crystal. Deepest thanks to the authors of Linguist +# for licensing it liberally. +# +# Consider this code (c) 2017 GitHub, Inc. even if I wrote it. +module Linguist class Heuristic include YAML::Serializable @@ -80,7 +79,3 @@ require "yaml" end end end - -h = Linguist::Heuristic.from_yaml(File.read("heuristics/heuristics.yml")) -fname = "/usr/include/sqlite3.h" -p! h.run(fname, File.read(fname)) diff --git a/src/lexer.cr b/src/lexer.cr index e38a261..39652ba 100644 --- a/src/lexer.cr +++ b/src/lexer.cr @@ -36,12 +36,30 @@ module Tartrazine when 1 lexer_file_name = candidates.first else - raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}") + lexer_file_name = self.lexer_by_content(filename) + begin + return self.lexer(lexer_file_name) + rescue ex : Exception + raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.") + end end Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end) end + private def self.lexer_by_content(fname : String) : String? + h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end) + result = h.run(fname, File.read(fname)) + case result + when Nil + raise Exception.new "No lexer found for #{fname}" + when String + result.as(String) + when Array(String) + result.first + end + end + private def self.create_delegating_lexer(name : String) : BaseLexer language, root = name.split("+", 2) language_lexer = lexer(language)