std\sys\pal\windows/args.rs
1//! The Windows command line is just a string
2//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
3//!
4//! This module implements the parsing necessary to turn that string into a list of arguments.
5
6#[cfg(test)]
7mod tests;
8
9use super::ensure_no_nuls;
10use super::os::current_exe;
11use crate::ffi::{OsStr, OsString};
12use crate::num::NonZero;
13use crate::os::windows::prelude::*;
14use crate::path::{Path, PathBuf};
15use crate::sys::path::get_long_path;
16use crate::sys::{c, to_u16s};
17use crate::sys_common::AsInner;
18use crate::sys_common::wstr::WStrUnits;
19use crate::{fmt, io, iter, vec};
20
21pub fn args() -> Args {
22 // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
23 // string so it's safe for `WStrUnits` to use.
24 unsafe {
25 let lp_cmd_line = c::GetCommandLineW();
26 let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
27 current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
28 });
29
30 Args { parsed_args_list: parsed_args_list.into_iter() }
31 }
32}
33
34/// Implements the Windows command-line argument parsing algorithm.
35///
36/// Microsoft's documentation for the Windows CLI argument format can be found at
37/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
38///
39/// A more in-depth explanation is here:
40/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
41///
42/// Windows includes a function to do command line parsing in shell32.dll.
43/// However, this is not used for two reasons:
44///
45/// 1. Linking with that DLL causes the process to be registered as a GUI application.
46/// GUI applications add a bunch of overhead, even if no windows are drawn. See
47/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
48///
49/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
50///
51/// This function was tested for equivalence to the C/C++ parsing rules using an
52/// extensive test suite available at
53/// <https://github.com/ChrisDenton/winarg/tree/std>.
54fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
55 lp_cmd_line: Option<WStrUnits<'a>>,
56 exe_name: F,
57) -> Vec<OsString> {
58 const BACKSLASH: NonZero<u16> = NonZero::new(b'\\' as u16).unwrap();
59 const QUOTE: NonZero<u16> = NonZero::new(b'"' as u16).unwrap();
60 const TAB: NonZero<u16> = NonZero::new(b'\t' as u16).unwrap();
61 const SPACE: NonZero<u16> = NonZero::new(b' ' as u16).unwrap();
62
63 let mut ret_val = Vec::new();
64 // If the cmd line pointer is null or it points to an empty string then
65 // return the name of the executable as argv[0].
66 if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
67 ret_val.push(exe_name());
68 return ret_val;
69 }
70 let mut code_units = lp_cmd_line.unwrap();
71
72 // The executable name at the beginning is special.
73 let mut in_quotes = false;
74 let mut cur = Vec::new();
75 for w in &mut code_units {
76 match w {
77 // A quote mark always toggles `in_quotes` no matter what because
78 // there are no escape characters when parsing the executable name.
79 QUOTE => in_quotes = !in_quotes,
80 // If not `in_quotes` then whitespace ends argv[0].
81 SPACE | TAB if !in_quotes => break,
82 // In all other cases the code unit is taken literally.
83 _ => cur.push(w.get()),
84 }
85 }
86 // Skip whitespace.
87 code_units.advance_while(|w| w == SPACE || w == TAB);
88 ret_val.push(OsString::from_wide(&cur));
89
90 // Parse the arguments according to these rules:
91 // * All code units are taken literally except space, tab, quote and backslash.
92 // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
93 // treated as a single separator.
94 // * A space or tab `in_quotes` is taken literally.
95 // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
96 // * A quote can be escaped if preceded by an odd number of backslashes.
97 // * If any number of backslashes is immediately followed by a quote then the number of
98 // backslashes is halved (rounding down).
99 // * Backslashes not followed by a quote are all taken literally.
100 // * If `in_quotes` then a quote can also be escaped using another quote
101 // (i.e. two consecutive quotes become one literal quote).
102 let mut cur = Vec::new();
103 let mut in_quotes = false;
104 while let Some(w) = code_units.next() {
105 match w {
106 // If not `in_quotes`, a space or tab ends the argument.
107 SPACE | TAB if !in_quotes => {
108 ret_val.push(OsString::from_wide(&cur[..]));
109 cur.truncate(0);
110
111 // Skip whitespace.
112 code_units.advance_while(|w| w == SPACE || w == TAB);
113 }
114 // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
115 BACKSLASH => {
116 let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
117 if code_units.peek() == Some(QUOTE) {
118 cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
119 // The quote is escaped if there are an odd number of backslashes.
120 if backslash_count % 2 == 1 {
121 code_units.next();
122 cur.push(QUOTE.get());
123 }
124 } else {
125 // If there is no quote on the end then there is no escaping.
126 cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
127 }
128 }
129 // If `in_quotes` and not backslash escaped (see above) then a quote either
130 // unsets `in_quote` or is escaped by another quote.
131 QUOTE if in_quotes => match code_units.peek() {
132 // Two consecutive quotes when `in_quotes` produces one literal quote.
133 Some(QUOTE) => {
134 cur.push(QUOTE.get());
135 code_units.next();
136 }
137 // Otherwise set `in_quotes`.
138 Some(_) => in_quotes = false,
139 // The end of the command line.
140 // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
141 None => break,
142 },
143 // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
144 QUOTE => in_quotes = true,
145 // Everything else is always taken literally.
146 _ => cur.push(w.get()),
147 }
148 }
149 // Push the final argument, if any.
150 if !cur.is_empty() || in_quotes {
151 ret_val.push(OsString::from_wide(&cur[..]));
152 }
153 ret_val
154}
155
/// Iterator over the arguments of a process, as produced by `args`.
///
/// Every trait implementation below simply forwards to the wrapped
/// `vec::IntoIter`, so the arguments can be consumed from either end and the
/// remaining count is always known exactly.
pub struct Args {
    parsed_args_list: vec::IntoIter<OsString>,
}

impl fmt::Debug for Args {
    /// Formats the arguments that have not been consumed yet as a slice.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let remaining: &[OsString] = self.parsed_args_list.as_slice();
        fmt::Debug::fmt(remaining, f)
    }
}

impl Iterator for Args {
    type Item = OsString;

    fn next(&mut self) -> Option<OsString> {
        Iterator::next(&mut self.parsed_args_list)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        Iterator::size_hint(&self.parsed_args_list)
    }
}

impl DoubleEndedIterator for Args {
    fn next_back(&mut self) -> Option<OsString> {
        DoubleEndedIterator::next_back(&mut self.parsed_args_list)
    }
}

impl ExactSizeIterator for Args {
    fn len(&self) -> usize {
        ExactSizeIterator::len(&self.parsed_args_list)
    }
}
187
/// A single argument to be appended to a command line.
///
/// The variant decides how `append_arg` treats the contained string:
/// `Regular` arguments are escaped and quoted when necessary, `Raw`
/// arguments are copied through untouched.
#[derive(Debug)]
pub(crate) enum Arg {
    /// Add quotes (if needed)
    Regular(OsString),
    /// Append raw string without quoting
    Raw(OsString),
}
195
/// Quoting policy for one argument.
///
/// `append_arg` derives this from the `Arg` variant and its `force_quotes` flag.
enum Quote {
    /// Every arg is quoted
    Always,
    /// Whitespace and empty args are quoted
    Auto,
    /// Arg appended without any changes (#29494)
    Never,
}
204
205pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()> {
206 let (arg, quote) = match arg {
207 Arg::Regular(arg) => (arg, if force_quotes { Quote::Always } else { Quote::Auto }),
208 Arg::Raw(arg) => (arg, Quote::Never),
209 };
210
211 // If an argument has 0 characters then we need to quote it to ensure
212 // that it actually gets passed through on the command line or otherwise
213 // it will be dropped entirely when parsed on the other end.
214 ensure_no_nuls(arg)?;
215 let arg_bytes = arg.as_encoded_bytes();
216 let (quote, escape) = match quote {
217 Quote::Always => (true, true),
218 Quote::Auto => {
219 (arg_bytes.iter().any(|c| *c == b' ' || *c == b'\t') || arg_bytes.is_empty(), true)
220 }
221 Quote::Never => (false, false),
222 };
223 if quote {
224 cmd.push('"' as u16);
225 }
226
227 let mut backslashes: usize = 0;
228 for x in arg.encode_wide() {
229 if escape {
230 if x == '\\' as u16 {
231 backslashes += 1;
232 } else {
233 if x == '"' as u16 {
234 // Add n+1 backslashes to total 2n+1 before internal '"'.
235 cmd.extend((0..=backslashes).map(|_| '\\' as u16));
236 }
237 backslashes = 0;
238 }
239 }
240 cmd.push(x);
241 }
242
243 if quote {
244 // Add n backslashes to total 2n before ending '"'.
245 cmd.extend((0..backslashes).map(|_| '\\' as u16));
246 cmd.push('"' as u16);
247 }
248 Ok(())
249}
250
251fn append_bat_arg(cmd: &mut Vec<u16>, arg: &OsStr, mut quote: bool) -> io::Result<()> {
252 ensure_no_nuls(arg)?;
253 // If an argument has 0 characters then we need to quote it to ensure
254 // that it actually gets passed through on the command line or otherwise
255 // it will be dropped entirely when parsed on the other end.
256 //
257 // We also need to quote the argument if it ends with `\` to guard against
258 // bat usage such as `"%~2"` (i.e. force quote arguments) otherwise a
259 // trailing slash will escape the closing quote.
260 if arg.is_empty() || arg.as_encoded_bytes().last() == Some(&b'\\') {
261 quote = true;
262 }
263 for cp in arg.as_inner().inner.code_points() {
264 if let Some(cp) = cp.to_char() {
265 // Rather than trying to find every ascii symbol that must be quoted,
266 // we assume that all ascii symbols must be quoted unless they're known to be good.
267 // We also quote Unicode control blocks for good measure.
268 // Note an unquoted `\` is fine so long as the argument isn't otherwise quoted.
269 static UNQUOTED: &str = r"#$*+-./:?@\_";
270 let ascii_needs_quotes =
271 cp.is_ascii() && !(cp.is_ascii_alphanumeric() || UNQUOTED.contains(cp));
272 if ascii_needs_quotes || cp.is_control() {
273 quote = true;
274 }
275 }
276 }
277
278 if quote {
279 cmd.push('"' as u16);
280 }
281 // Loop through the string, escaping `\` only if followed by `"`.
282 // And escaping `"` by doubling them.
283 let mut backslashes: usize = 0;
284 for x in arg.encode_wide() {
285 if x == '\\' as u16 {
286 backslashes += 1;
287 } else {
288 if x == '"' as u16 {
289 // Add n backslashes to total 2n before internal `"`.
290 cmd.extend((0..backslashes).map(|_| '\\' as u16));
291 // Appending an additional double-quote acts as an escape.
292 cmd.push(b'"' as u16)
293 } else if x == '%' as u16 || x == '\r' as u16 {
294 // yt-dlp hack: replaces `%` with `%%cd:~,%` to stop %VAR% being expanded as an environment variable.
295 //
296 // # Explanation
297 //
298 // cmd supports extracting a substring from a variable using the following syntax:
299 // %variable:~start_index,end_index%
300 //
301 // In the above command `cd` is used as the variable and the start_index and end_index are left blank.
302 // `cd` is a built-in variable that dynamically expands to the current directory so it's always available.
303 // Explicitly omitting both the start and end index creates a zero-length substring.
304 //
305 // Therefore it all resolves to nothing. However, by doing this no-op we distract cmd.exe
306 // from potentially expanding %variables% in the argument.
307 cmd.extend_from_slice(&[
308 '%' as u16, '%' as u16, 'c' as u16, 'd' as u16, ':' as u16, '~' as u16,
309 ',' as u16,
310 ]);
311 }
312 backslashes = 0;
313 }
314 cmd.push(x);
315 }
316 if quote {
317 // Add n backslashes to total 2n before ending `"`.
318 cmd.extend((0..backslashes).map(|_| '\\' as u16));
319 cmd.push('"' as u16);
320 }
321 Ok(())
322}
323
324pub(crate) fn make_bat_command_line(
325 script: &[u16],
326 args: &[Arg],
327 force_quotes: bool,
328) -> io::Result<Vec<u16>> {
329 const INVALID_ARGUMENT_ERROR: io::Error =
330 io::const_error!(io::ErrorKind::InvalidInput, r#"batch file arguments are invalid"#);
331 // Set the start of the command line to `cmd.exe /c "`
332 // It is necessary to surround the command in an extra pair of quotes,
333 // hence the trailing quote here. It will be closed after all arguments
334 // have been added.
335 // Using /e:ON enables "command extensions" which is essential for the `%` hack to work.
336 let mut cmd: Vec<u16> = "cmd.exe /e:ON /v:OFF /d /c \"".encode_utf16().collect();
337
338 // Push the script name surrounded by its quote pair.
339 cmd.push(b'"' as u16);
340 // Windows file names cannot contain a `"` character or end with `\\`.
341 // If the script name does then return an error.
342 if script.contains(&(b'"' as u16)) || script.last() == Some(&(b'\\' as u16)) {
343 return Err(io::const_error!(
344 io::ErrorKind::InvalidInput,
345 "Windows file names may not contain `\"` or end with `\\`"
346 ));
347 }
348 cmd.extend_from_slice(script.strip_suffix(&[0]).unwrap_or(script));
349 cmd.push(b'"' as u16);
350
351 // Append the arguments.
352 // FIXME: This needs tests to ensure that the arguments are properly
353 // reconstructed by the batch script by default.
354 for arg in args {
355 cmd.push(' ' as u16);
356 match arg {
357 Arg::Regular(arg_os) => {
358 let arg_bytes = arg_os.as_encoded_bytes();
359 // Disallow \r and \n as they may truncate the arguments.
360 const DISALLOWED: &[u8] = b"\r\n";
361 if arg_bytes.iter().any(|c| DISALLOWED.contains(c)) {
362 return Err(INVALID_ARGUMENT_ERROR);
363 }
364 append_bat_arg(&mut cmd, arg_os, force_quotes)?;
365 }
366 _ => {
367 // Raw arguments are passed on as-is.
368 // It's the user's responsibility to properly handle arguments in this case.
369 append_arg(&mut cmd, arg, force_quotes)?;
370 }
371 };
372 }
373
374 // Close the quote we left opened earlier.
375 cmd.push(b'"' as u16);
376
377 Ok(cmd)
378}
379
380/// Takes a path and tries to return a non-verbatim path.
381///
382/// This is necessary because cmd.exe does not support verbatim paths.
383pub(crate) fn to_user_path(path: &Path) -> io::Result<Vec<u16>> {
384 from_wide_to_user_path(to_u16s(path)?)
385}
386pub(crate) fn from_wide_to_user_path(mut path: Vec<u16>) -> io::Result<Vec<u16>> {
387 use super::fill_utf16_buf;
388 use crate::ptr;
389
390 // UTF-16 encoded code points, used in parsing and building UTF-16 paths.
391 // All of these are in the ASCII range so they can be cast directly to `u16`.
392 const SEP: u16 = b'\\' as _;
393 const QUERY: u16 = b'?' as _;
394 const COLON: u16 = b':' as _;
395 const U: u16 = b'U' as _;
396 const N: u16 = b'N' as _;
397 const C: u16 = b'C' as _;
398
399 // Early return if the path is too long to remove the verbatim prefix.
400 const LEGACY_MAX_PATH: usize = 260;
401 if path.len() > LEGACY_MAX_PATH {
402 return Ok(path);
403 }
404
405 match &path[..] {
406 // `\\?\C:\...` => `C:\...`
407 [SEP, SEP, QUERY, SEP, _, COLON, SEP, ..] => unsafe {
408 let lpfilename = path[4..].as_ptr();
409 fill_utf16_buf(
410 |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
411 |full_path: &[u16]| {
412 if full_path == &path[4..path.len() - 1] {
413 let mut path: Vec<u16> = full_path.into();
414 path.push(0);
415 path
416 } else {
417 path
418 }
419 },
420 )
421 },
422 // `\\?\UNC\...` => `\\...`
423 [SEP, SEP, QUERY, SEP, U, N, C, SEP, ..] => unsafe {
424 // Change the `C` in `UNC\` to `\` so we can get a slice that starts with `\\`.
425 path[6] = b'\\' as u16;
426 let lpfilename = path[6..].as_ptr();
427 fill_utf16_buf(
428 |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
429 |full_path: &[u16]| {
430 if full_path == &path[6..path.len() - 1] {
431 let mut path: Vec<u16> = full_path.into();
432 path.push(0);
433 path
434 } else {
435 // Restore the 'C' in "UNC".
436 path[6] = b'C' as u16;
437 path
438 }
439 },
440 )
441 },
442 // For everything else, leave the path unchanged.
443 _ => get_long_path(path, false),
444 }
445}