Improve shebang parsing to detect correct interpreter

This commit is contained in:
Lauris BH 2021-09-25 19:17:49 +03:00
parent 7c24e3d5d2
commit 4686615d9e
2 changed files with 55 additions and 11 deletions

View File

@ -111,13 +111,6 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
return return
} }
// getLanguageBySpecificClassifier returns the most probable language for the given content using
// the given classifier to detect the language.
func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
return getFirstLanguageAndSafe(languages)
}
// GetLanguages applies a sequence of strategies based on the given filename and content // GetLanguages applies a sequence of strategies based on the given filename and content
// to find out the most probable languages to return. // to find out the most probable languages to return.
// //
@ -300,9 +293,11 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
var ( var (
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`) shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regex.MustCompile(`python\d\.\d+`) pythonVersion = regex.MustCompile(`python\d\.\d+`)
envOptArgs = regex.MustCompile(`-[i0uCSv]*|--\S+`)
envVarArgs = regex.MustCompile(`\S+=\S+`)
) )
func getInterpreter(data []byte) (interpreter string) { func getInterpreter(data []byte) string {
line := getFirstLine(data) line := getFirstLine(data)
if !hasShebang(line) { if !hasShebang(line) {
return "" return ""
@ -317,7 +312,7 @@ func getInterpreter(data []byte) (interpreter string) {
// Extract interpreter name from path. Use path.Base because // Extract interpreter name from path. Use path.Base because
// shebang on Cygwin/Windows still use a forward slash // shebang on Cygwin/Windows still use a forward slash
interpreter = path.Base(string(splitted[0])) interpreter := path.Base(string(splitted[0]))
// #!/usr/bin/env [...] // #!/usr/bin/env [...]
if interpreter == "env" { if interpreter == "env" {
@ -325,6 +320,13 @@ func getInterpreter(data []byte) (interpreter string) {
// /usr/bin/env with no arguments // /usr/bin/env with no arguments
return "" return ""
} }
for len(splitted) > 2 {
if envOptArgs.Match(splitted[1]) || envVarArgs.Match(splitted[1]) {
splitted = append(splitted[:1], splitted[2:]...)
continue
}
break
}
interpreter = path.Base(string(splitted[1])) interpreter = path.Base(string(splitted[1]))
} }
@ -342,7 +344,7 @@ func getInterpreter(data []byte) (interpreter string) {
interpreter = "" interpreter = ""
} }
return return interpreter
} }
func getFirstLines(content []byte, count int) []byte { func getFirstLines(content []byte, count int) []byte {

View File

@ -297,7 +297,49 @@ println("The shell script says ",vm.arglist.concat(" "));`
{name: "TestGetLanguagesByShebang_9", content: []byte(multilineExecHack), expected: []string{"Tcl"}}, {name: "TestGetLanguagesByShebang_9", content: []byte(multilineExecHack), expected: []string{"Tcl"}},
{name: "TestGetLanguagesByShebang_10", content: []byte(multilineNoExecHack), expected: []string{"Shell"}}, {name: "TestGetLanguagesByShebang_10", content: []byte(multilineNoExecHack), expected: []string{"Shell"}},
{name: "TestGetLanguagesByShebang_11", content: []byte(`#!/envinpath/python`), expected: []string{"Python"}}, {name: "TestGetLanguagesByShebang_11", content: []byte(`#!/envinpath/python`), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_12", content: []byte(`#!`), expected: nil},
{name: "TestGetLanguagesByShebang_12", content: []byte(""), expected: nil},
{name: "TestGetLanguagesByShebang_13", content: []byte("foo"), expected: nil},
{name: "TestGetLanguagesByShebang_14", content: []byte("#bar"), expected: nil},
{name: "TestGetLanguagesByShebang_15", content: []byte("#baz"), expected: nil},
{name: "TestGetLanguagesByShebang_16", content: []byte("///"), expected: nil},
{name: "TestGetLanguagesByShebang_17", content: []byte("\n\n\n\n\n"), expected: nil},
{name: "TestGetLanguagesByShebang_18", content: []byte(" #!/usr/sbin/ruby"), expected: nil},
{name: "TestGetLanguagesByShebang_19", content: []byte("\n#!/usr/sbin/ruby"), expected: nil},
{name: "TestGetLanguagesByShebang_20", content: []byte("#!"), expected: nil},
{name: "TestGetLanguagesByShebang_21", content: []byte("#! "), expected: nil},
{name: "TestGetLanguagesByShebang_22", content: []byte("#!/usr/bin/env"), expected: nil},
{name: "TestGetLanguagesByShebang_23", content: []byte("#!/usr/bin/env osascript -l JavaScript"), expected: nil},
{name: "TestGetLanguagesByShebang_24", content: []byte("#!/usr/bin/env osascript -l AppleScript"), expected: nil},
{name: "TestGetLanguagesByShebang_25", content: []byte("#!/usr/bin/env osascript -l foobar"), expected: nil},
{name: "TestGetLanguagesByShebang_26", content: []byte("#!/usr/bin/osascript -l JavaScript"), expected: nil},
{name: "TestGetLanguagesByShebang_27", content: []byte("#!/usr/bin/osascript -l foobar"), expected: nil},
{name: "TestGetLanguagesByShebang_28", content: []byte("#!/usr/sbin/ruby\n# bar"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_29", content: []byte("#!/usr/bin/ruby\n# foo"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_30", content: []byte("#!/usr/sbin/ruby"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_31", content: []byte("#!/usr/sbin/ruby foo bar baz\n"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_32", content: []byte("#!/usr/bin/env Rscript\n# example R script\n#\n"), expected: []string{"R"}},
{name: "TestGetLanguagesByShebang_33", content: []byte("#!/usr/bin/env ruby\n# baz"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_34", content: []byte("#!/usr/bin/bash\n"), expected: []string{"Shell"}},
{name: "TestGetLanguagesByShebang_35", content: []byte("#!/bin/sh"), expected: []string{"Shell"}},
{name: "TestGetLanguagesByShebang_36", content: []byte("#!/bin/python\n# foo\n# bar\n# baz"), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_37", content: []byte("#!/usr/bin/python2.7\n\n\n\n"), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_38", content: []byte("#!/usr/bin/python3\n\n\n\n"), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_39", content: []byte("#!/usr/bin/sbcl --script\n\n"), expected: []string{"Common Lisp"}},
{name: "TestGetLanguagesByShebang_40", content: []byte("#! perl"), expected: []string{"Perl", "Pod"}},
{name: "TestGetLanguagesByShebang_41", content: []byte("#!/bin/sh\n\n\nexec ruby $0 $@"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_42", content: []byte("#! /usr/bin/env A=003 B=149 C=150 D=xzd E=base64 F=tar G=gz H=head I=tail sh"), expected: []string{"Shell"}},
{name: "TestGetLanguagesByShebang_43", content: []byte("#!/usr/bin/env foo=bar bar=foo python -cos=__import__(\"os\");"), expected: []string{"Python"}},
{name: "TestGetLanguagesByShebang_44", content: []byte("#!/usr/bin/env osascript"), expected: []string{"AppleScript"}},
{name: "TestGetLanguagesByShebang_45", content: []byte("#!/usr/bin/osascript"), expected: []string{"AppleScript"}},
{name: "TestGetLanguagesByShebang_46", content: []byte("#!/usr/bin/env -vS ruby -wKU\nputs ?t+?e+?s+?t"), expected: []string{"Ruby"}},
{name: "TestGetLanguagesByShebang_47", content: []byte("#!/usr/bin/env --split-string sed -f\ny/a/A/"), expected: []string{"sed"}},
{name: "TestGetLanguagesByShebang_48", content: []byte("#!/usr/bin/env -S GH_TOKEN=ghp_*** deno run --allow-net\nconsole.log(1);"), expected: []string{"TypeScript"}},
} }
for _, test := range tests { for _, test := range tests {