2 Commits

Author SHA1 Message Date
df88047ca8 v0.6.1 2024-08-24 21:45:57 -03:00
5a3b50d7a3 Integrate heuristics into lexer selection 2024-08-24 21:39:39 -03:00
9 changed files with 51 additions and 36 deletions

View File

@ -29,7 +29,7 @@ This only covers the RegexLexers, which are the most common ones,
but it means the supported languages are a subset of Chroma's, which but it means the supported languages are a subset of Chroma's, which
is a subset of Pygments'. is a subset of Pygments'.
Currently Tartrazine supports ... 248 languages. Currently Tartrazine supports ... 247 languages.
It has 331 themes (63 from Chroma, the rest are base16 themes via It has 331 themes (63 from Chroma, the rest are base16 themes via
[Sixteen](https://github.com/ralsina/sixteen) [Sixteen](https://github.com/ralsina/sixteen)

View File

@ -3,6 +3,7 @@
<name>Groff</name> <name>Groff</name>
<alias>groff</alias> <alias>groff</alias>
<alias>nroff</alias> <alias>nroff</alias>
<alias>roff</alias>
<alias>man</alias> <alias>man</alias>
<filename>*.[1-9]</filename> <filename>*.[1-9]</filename>
<filename>*.1p</filename> <filename>*.1p</filename>
@ -87,4 +88,4 @@
</rule> </rule>
</state> </state>
</rules> </rules>
</lexer> </lexer>

View File

@ -30,12 +30,12 @@
disambiguations: disambiguations:
- extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9'] - extensions: ['.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']
rules: rules:
- language: Roff Manpage - language: man
and: and:
- named_pattern: mdoc-date - named_pattern: mdoc-date
- named_pattern: mdoc-title - named_pattern: mdoc-title
- named_pattern: mdoc-heading - named_pattern: mdoc-heading
- language: Roff Manpage - language: man
and: and:
- named_pattern: man-title - named_pattern: man-title
- named_pattern: man-heading - named_pattern: man-heading
@ -43,12 +43,12 @@ disambiguations:
pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")' pattern: '^\.(?:[A-Za-z]{2}(?:\s|$)|\\")'
- extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc'] - extensions: ['.1in', '.1m', '.1x', '.3in', '.3m', '.3p', '.3pm', '.3qt', '.3x', '.man', '.mdoc']
rules: rules:
- language: Roff Manpage - language: man
and: and:
- named_pattern: mdoc-date - named_pattern: mdoc-date
- named_pattern: mdoc-title - named_pattern: mdoc-title
- named_pattern: mdoc-heading - named_pattern: mdoc-heading
- language: Roff Manpage - language: man
and: and:
- named_pattern: man-title - named_pattern: man-title
- named_pattern: man-heading - named_pattern: man-heading

View File

@ -52,6 +52,6 @@ with open("src/constants/lexers.cr", "w") as f:
f.write(" LEXERS_BY_FILENAME = {\n") f.write(" LEXERS_BY_FILENAME = {\n")
for k in sorted(lexer_by_filename.keys()): for k in sorted(lexer_by_filename.keys()):
v = lexer_by_filename[k] v = lexer_by_filename[k]
f.write(f'"{k}" => {str(list(v)).replace("'", "\"")}, \n') f.write(f'"{k}" => {str(sorted(list(v))).replace("'", "\"")}, \n')
f.write("}\n") f.write("}\n")
f.write("end\n") f.write("end\n")

View File

@ -1,5 +1,5 @@
name: tartrazine name: tartrazine
version: 0.6.0 version: 0.6.1
authors: authors:
- Roberto Alsina <roberto.alsina@gmail.com> - Roberto Alsina <roberto.alsina@gmail.com>

View File

@ -328,6 +328,7 @@ module Tartrazine
"restructuredtext" => "rst", "restructuredtext" => "rst",
"rexx" => "rexx", "rexx" => "rexx",
"rkt" => "racket", "rkt" => "racket",
"roff" => "groff",
"rpmspec" => "rpm_spec", "rpmspec" => "rpm_spec",
"rs" => "rust", "rs" => "rust",
"rst" => "rst", "rst" => "rst",
@ -730,8 +731,8 @@ module Tartrazine
"*.applescript" => ["applescript"], "*.applescript" => ["applescript"],
"*.aql" => ["arangodb_aql"], "*.aql" => ["arangodb_aql"],
"*.arexx" => ["rexx"], "*.arexx" => ["rexx"],
"*.as" => ["actionscript_3", "actionscript"], "*.as" => ["actionscript", "actionscript_3"],
"*.asm" => ["tasm", "nasm", "z80_assembly"], "*.asm" => ["nasm", "tasm", "z80_assembly"],
"*.au3" => ["autoit"], "*.au3" => ["autoit"],
"*.automount" => ["systemd"], "*.automount" => ["systemd"],
"*.aux" => ["tex"], "*.aux" => ["tex"],
@ -739,7 +740,7 @@ module Tartrazine
"*.awk" => ["awk"], "*.awk" => ["awk"],
"*.b" => ["brainfuck"], "*.b" => ["brainfuck"],
"*.bal" => ["ballerina"], "*.bal" => ["ballerina"],
"*.bas" => ["vb_net", "qbasic"], "*.bas" => ["qbasic", "vb_net"],
"*.bash" => ["bash"], "*.bash" => ["bash"],
"*.bat" => ["batchfile"], "*.bat" => ["batchfile"],
"*.batch" => ["psl"], "*.batch" => ["psl"],
@ -750,7 +751,7 @@ module Tartrazine
"*.bnf" => ["bnf"], "*.bnf" => ["bnf"],
"*.bqn" => ["bqn"], "*.bqn" => ["bqn"],
"*.bzl" => ["python"], "*.bzl" => ["python"],
"*.c" => ["c++", "c"], "*.c" => ["c", "c++"],
"*.c++" => ["c++"], "*.c++" => ["c++"],
"*.capnp" => ["cap_n_proto"], "*.capnp" => ["cap_n_proto"],
"*.cc" => ["c++"], "*.cc" => ["c++"],
@ -839,7 +840,7 @@ module Tartrazine
"*.fx" => ["hlsl"], "*.fx" => ["hlsl"],
"*.fxh" => ["hlsl"], "*.fxh" => ["hlsl"],
"*.fzn" => ["minizinc"], "*.fzn" => ["minizinc"],
"*.gd" => ["gdscript3", "gdscript"], "*.gd" => ["gdscript", "gdscript3"],
"*.gemspec" => ["ruby"], "*.gemspec" => ["ruby"],
"*.geo" => ["glsl"], "*.geo" => ["glsl"],
"*.gleam" => ["gleam"], "*.gleam" => ["gleam"],
@ -849,7 +850,7 @@ module Tartrazine
"*.graphql" => ["graphql"], "*.graphql" => ["graphql"],
"*.graphqls" => ["graphql"], "*.graphqls" => ["graphql"],
"*.groovy" => ["groovy"], "*.groovy" => ["groovy"],
"*.h" => ["c++", "c", "objective-c"], "*.h" => ["c", "c++", "objective-c"],
"*.h++" => ["c++"], "*.h++" => ["c++"],
"*.ha" => ["hare"], "*.ha" => ["hare"],
"*.handlebars" => ["handlebars"], "*.handlebars" => ["handlebars"],
@ -872,7 +873,7 @@ module Tartrazine
"*.idc" => ["c"], "*.idc" => ["c"],
"*.idr" => ["idris"], "*.idr" => ["idris"],
"*.ijs" => ["j"], "*.ijs" => ["j"],
"*.inc" => ["objectpascal", "povray", "php", "sourcepawn"], "*.inc" => ["objectpascal", "php", "povray", "sourcepawn"],
"*.inf" => ["ini"], "*.inf" => ["ini"],
"*.ini" => ["ini"], "*.ini" => ["ini"],
"*.ino" => ["arduino"], "*.ino" => ["arduino"],
@ -898,13 +899,13 @@ module Tartrazine
"*.lpk" => ["objectpascal"], "*.lpk" => ["objectpascal"],
"*.lpr" => ["objectpascal"], "*.lpr" => ["objectpascal"],
"*.lua" => ["lua"], "*.lua" => ["lua"],
"*.m" => ["mathematica", "octave", "matlab", "objective-c", "mason"], "*.m" => ["mason", "mathematica", "matlab", "objective-c", "octave"],
"*.ma" => ["mathematica"], "*.ma" => ["mathematica"],
"*.mak" => ["makefile"], "*.mak" => ["makefile"],
"*.man" => ["groff"], "*.man" => ["groff"],
"*.mao" => ["mako"], "*.mao" => ["mako"],
"*.markdown" => ["markdown"], "*.markdown" => ["markdown"],
"*.mc" => ["monkeyc", "mason"], "*.mc" => ["mason", "monkeyc"],
"*.mcfunction" => ["mcfunction"], "*.mcfunction" => ["mcfunction"],
"*.md" => ["markdown"], "*.md" => ["markdown"],
"*.metal" => ["metal"], "*.metal" => ["metal"],
@ -961,7 +962,7 @@ module Tartrazine
"*.pml" => ["promela"], "*.pml" => ["promela"],
"*.pony" => ["pony"], "*.pony" => ["pony"],
"*.pov" => ["povray"], "*.pov" => ["povray"],
"*.pp" => ["puppet", "objectpascal"], "*.pp" => ["objectpascal", "puppet"],
"*.pq" => ["powerquery"], "*.pq" => ["powerquery"],
"*.pr" => ["promela"], "*.pr" => ["promela"],
"*.prm" => ["promela"], "*.prm" => ["promela"],
@ -1010,7 +1011,7 @@ module Tartrazine
"*.rst" => ["rst"], "*.rst" => ["rst"],
"*.rvt" => ["tcl"], "*.rvt" => ["tcl"],
"*.rx" => ["rexx"], "*.rx" => ["rexx"],
"*.s" => ["armasm", "r", "gas"], "*.s" => ["armasm", "gas", "r"],
"*.sage" => ["python"], "*.sage" => ["python"],
"*.sas" => ["sas"], "*.sas" => ["sas"],
"*.sass" => ["sass"], "*.sass" => ["sass"],
@ -1023,7 +1024,7 @@ module Tartrazine
"*.scope" => ["systemd"], "*.scope" => ["systemd"],
"*.scss" => ["scss"], "*.scss" => ["scss"],
"*.sed" => ["sed"], "*.sed" => ["sed"],
"*.service" => ["systemd", "ini"], "*.service" => ["ini", "systemd"],
"*.sh" => ["bash"], "*.sh" => ["bash"],
"*.sh-session" => ["bash_session"], "*.sh-session" => ["bash_session"],
"*.sieve" => ["sieve"], "*.sieve" => ["sieve"],
@ -1033,7 +1034,7 @@ module Tartrazine
"*.smali" => ["smali"], "*.smali" => ["smali"],
"*.sml" => ["standard_ml"], "*.sml" => ["standard_ml"],
"*.snobol" => ["snobol"], "*.snobol" => ["snobol"],
"*.socket" => ["systemd", "ini"], "*.socket" => ["ini", "systemd"],
"*.sol" => ["solidity"], "*.sol" => ["solidity"],
"*.sp" => ["sourcepawn"], "*.sp" => ["sourcepawn"],
"*.sparql" => ["sparql"], "*.sparql" => ["sparql"],
@ -1068,7 +1069,7 @@ module Tartrazine
"*.tpl" => ["smarty"], "*.tpl" => ["smarty"],
"*.tpp" => ["c++"], "*.tpp" => ["c++"],
"*.trig" => ["psl"], "*.trig" => ["psl"],
"*.ts" => ["typoscript", "typescript"], "*.ts" => ["typescript", "typoscript"],
"*.tst" => ["scilab"], "*.tst" => ["scilab"],
"*.tsx" => ["typescript"], "*.tsx" => ["typescript"],
"*.ttl" => ["turtle"], "*.ttl" => ["turtle"],
@ -1104,7 +1105,7 @@ module Tartrazine
"*.xml" => ["xml"], "*.xml" => ["xml"],
"*.xsd" => ["xml"], "*.xsd" => ["xml"],
"*.xsl" => ["xml"], "*.xsl" => ["xml"],
"*.xslt" => ["xml", "html"], "*.xslt" => ["html", "xml"],
"*.yaml" => ["yaml"], "*.yaml" => ["yaml"],
"*.yang" => ["yang"], "*.yang" => ["yang"],
"*.yml" => ["yaml"], "*.yml" => ["yaml"],

View File

@ -1,13 +1,12 @@
require "yaml" require "yaml"
# Use linguist's heuristics to disambiguate between languages # Use linguist's heuristics to disambiguate between languages
# This is *shamelessly* stolen from https://github.com/github-linguist/linguist # This is *shamelessly* stolen from https://github.com/github-linguist/linguist
# and ported to Crystal. Deepest thanks to the authors of Linguist # and ported to Crystal. Deepest thanks to the authors of Linguist
# for licensing it liberally. # for licensing it liberally.
# #
# Consider this code (c) 2017 GitHub, Inc. even if I wrote it. # Consider this code (c) 2017 GitHub, Inc. even if I wrote it.
module Linguist module Linguist
class Heuristic class Heuristic
include YAML::Serializable include YAML::Serializable
@ -80,7 +79,3 @@ require "yaml"
end end
end end
end end
h = Linguist::Heuristic.from_yaml(File.read("heuristics/heuristics.yml"))
fname = "/usr/include/sqlite3.h"
p! h.run(fname, File.read(fname))

View File

@ -36,12 +36,30 @@ module Tartrazine
when 1 when 1
lexer_file_name = candidates.first lexer_file_name = candidates.first
else else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}") lexer_file_name = self.lexer_by_content(filename)
begin
return self.lexer(lexer_file_name)
rescue ex : Exception
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}, heuristics suggest #{lexer_file_name} but there is no matching lexer.")
end
end end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end) Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end end
# Chooses a lexer name for the file `fname` by running the Linguist
# heuristics over the file's content.
#
# The heuristics definition is fetched through `LexerFiles` and parsed, and
# the file at `fname` is read from disk, on every call.
#
# Returns the heuristic's answer when it is a single String, or the first
# entry when it is a list of candidates.
#
# Raises an Exception ("No lexer found for ...") when the heuristics give
# no answer, i.e. either `nil` or an empty candidate list.
private def self.lexer_by_content(fname : String) : String?
  h = Linguist::Heuristic.from_yaml(LexerFiles.get("/heuristics.yml").gets_to_end)
  result = h.run(fname, File.read(fname))
  case result
  when Nil
    raise Exception.new "No lexer found for #{fname}"
  when String
    # Already narrowed to String by the `when`; no cast needed.
    result
  when Array(String)
    # An empty candidate list means "no match": report it with the same
    # error as the nil case rather than a generic empty-collection error.
    result.first? || raise Exception.new "No lexer found for #{fname}"
  end
end
private def self.create_delegating_lexer(name : String) : BaseLexer private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2) language, root = name.split("+", 2)
language_lexer = lexer(language) language_lexer = lexer(language)