shlex-1.3.0/.cargo_vcs_info.json0000644000000001360000000000100121600ustar { "git": { "sha1": "4a0724b0b62ef715467875b040a890ce75a8a829" }, "path_in_vcs": "" }shlex-1.3.0/.github/workflows/test.yml000064400000000000000000000013321046102023000160460ustar 00000000000000name: Rust on: pull_request: push: jobs: check: name: Check runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: ATiltedTree/setup-rust@v1 with: rust-version: stable - run: cargo check test: name: Test runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: ATiltedTree/setup-rust@v1 with: rust-version: stable - run: cargo test test_no_default_features: name: Test (no default features) runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: ATiltedTree/setup-rust@v1 with: rust-version: stable - run: cargo test --no-default-features shlex-1.3.0/.gitignore000064400000000000000000000001101046102023000127300ustar 00000000000000nocommit/ target/ artifacts/ corpus/ /Cargo.lock **/*.rs.bk .*.sw? .sw? shlex-1.3.0/CHANGELOG.md000064400000000000000000000007701046102023000125650ustar 00000000000000# 1.2.0 * Adds `bytes` module to support operating directly on byte strings. # 1.1.0 * Adds the `std` feature (enabled by default) * Disabling the `std` feature makes the crate work in `#![no_std]` mode, assuming presence of the `alloc` crate # 1.0.0 * Adds the `join` convenience function. * Fixes parsing of `'\\n'` to match the behavior of bash/Zsh/Python `shlex`. The result was previously `\n`, now it is `\\n`. # 0.1.1 * Adds handling of `#` comments. # 0.1.0 This is the initial release. shlex-1.3.0/Cargo.toml0000644000000020620000000000100101560ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. 
# # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] rust-version = "1.46.0" name = "shlex" version = "1.3.0" authors = [ "comex ", "Fenhl ", "Adrian Taylor ", "Alex Touchet ", "Daniel Parks ", "Garrett Berg ", ] description = "Split a string into shell words, like Python's shlex." readme = "README.md" categories = [ "command-line-interface", "parser-implementations", ] license = "MIT OR Apache-2.0" repository = "https://github.com/comex/rust-shlex" [features] default = ["std"] std = [] shlex-1.3.0/Cargo.toml.orig000064400000000000000000000011111046102023000136310ustar 00000000000000[package] name = "shlex" version = "1.3.0" authors = [ "comex ", "Fenhl ", "Adrian Taylor ", "Alex Touchet ", "Daniel Parks ", "Garrett Berg ", ] license = "MIT OR Apache-2.0" repository = "https://github.com/comex/rust-shlex" description = "Split a string into shell words, like Python's shlex." categories = [ "command-line-interface", "parser-implementations" ] rust-version = "1.46.0" [features] std = [] default = ["std"] shlex-1.3.0/LICENSE-APACHE000064400000000000000000000010661046102023000126770ustar 00000000000000Copyright 2015 Nicholas Allegra (comex). Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. shlex-1.3.0/LICENSE-MIT000064400000000000000000000021041046102023000124010ustar 00000000000000The MIT License (MIT) Copyright (c) 2015 Nicholas Allegra (comex). 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. shlex-1.3.0/README.md000064400000000000000000000036411046102023000122330ustar 00000000000000[![ci badge]][ci link] [![crates.io badge]][crates.io link] [![docs.rs badge]][docs.rs link] [crates.io badge]: https://img.shields.io/crates/v/shlex.svg?style=flat-square [crates.io link]: https://crates.io/crates/shlex [docs.rs badge]: https://img.shields.io/badge/docs-online-dddddd.svg?style=flat-square [docs.rs link]: https://docs.rs/shlex [ci badge]: https://img.shields.io/github/actions/workflow/status/comex/rust-shlex/test.yml?branch=master&style=flat-square [ci link]: https://github.com/comex/rust-shlex/actions Same idea as (but implementation not directly based on) the Python shlex module. However, this implementation does not support any of the Python module's customization because it makes parsing slower and is fairly useless. 
You only get the default settings of shlex.split, which mimic the POSIX shell: This implementation also deviates from the Python version in not treating \r specially, which I believe is more compliant. This crate can be used on either normal Rust strings, or on byte strings with the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so internally they all work on bytes directly as a micro-optimization. Disabling the `std` feature (which is enabled by default) will allow the crate to work in `no_std` environments, where the `alloc` crate, and a global allocator, are available. # LICENSE The source code in this repository is Licensed under either of - Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or https://www.apache.org/licenses/LICENSE-2.0) - MIT license ([LICENSE-MIT](LICENSE-MIT) or https://opensource.org/licenses/MIT) at your option. Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. shlex-1.3.0/src/bytes.rs000064400000000000000000000526371046102023000132500ustar 00000000000000// Copyright 2015 Nicholas Allegra (comex). // Licensed under the Apache License, Version 2.0 or // the MIT license , at your option. This file may not be // copied, modified, or distributed except according to those terms. //! [`Shlex`] and friends for byte strings. //! //! This is used internally by the [outer module](crate), and may be more //! convenient if you are working with byte slices (`[u8]`) or types that are //! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): //! //! ```rust //! #[cfg(unix)] { //! use shlex::bytes::quote; //! use std::ffi::OsStr; //! use std::os::unix::ffi::OsStrExt; //! //! // `\x80` is invalid in UTF-8. //! let os_str = OsStr::from_bytes(b"a\x80b c"); //! assert_eq!(quote(os_str.as_bytes()), &b"'a\x80b c'"[..]); //! } //! ``` //! //! 
(On Windows, `OsStr` uses 16 bit wide characters so this will not work.) extern crate alloc; use alloc::vec::Vec; use alloc::borrow::Cow; #[cfg(test)] use alloc::vec; #[cfg(test)] use alloc::borrow::ToOwned; #[cfg(all(doc, not(doctest)))] use crate::{self as shlex, quoting_warning}; use super::QuoteError; /// An iterator that takes an input byte string and splits it into the words using the same syntax as /// the POSIX shell. pub struct Shlex<'a> { in_iter: core::slice::Iter<'a, u8>, /// The number of newlines read so far, plus one. pub line_no: usize, /// An input string is erroneous if it ends while inside a quotation or right after an /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to /// true; best to check it after you're done iterating. pub had_error: bool, } impl<'a> Shlex<'a> { pub fn new(in_bytes: &'a [u8]) -> Self { Shlex { in_iter: in_bytes.iter(), line_no: 1, had_error: false, } } fn parse_word(&mut self, mut ch: u8) -> Option> { let mut result: Vec = Vec::new(); loop { match ch as char { '"' => if let Err(()) = self.parse_double(&mut result) { self.had_error = true; return None; }, '\'' => if let Err(()) = self.parse_single(&mut result) { self.had_error = true; return None; }, '\\' => if let Some(ch2) = self.next_char() { if ch2 != '\n' as u8 { result.push(ch2); } } else { self.had_error = true; return None; }, ' ' | '\t' | '\n' => { break; }, _ => { result.push(ch as u8); }, } if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } } Some(result) } fn parse_double(&mut self, result: &mut Vec) -> Result<(), ()> { loop { if let Some(ch2) = self.next_char() { match ch2 as char { '\\' => { if let Some(ch3) = self.next_char() { match ch3 as char { // \$ => $ '$' | '`' | '"' | '\\' => { result.push(ch3); }, // \ => nothing '\n' => {}, // \x => =x _ => { result.push('\\' as u8); result.push(ch3); } } } else { 
return Err(()); } }, '"' => { return Ok(()); }, _ => { result.push(ch2); }, } } else { return Err(()); } } } fn parse_single(&mut self, result: &mut Vec) -> Result<(), ()> { loop { if let Some(ch2) = self.next_char() { match ch2 as char { '\'' => { return Ok(()); }, _ => { result.push(ch2); }, } } else { return Err(()); } } } fn next_char(&mut self) -> Option { let res = self.in_iter.next().copied(); if res == Some(b'\n') { self.line_no += 1; } res } } impl<'a> Iterator for Shlex<'a> { type Item = Vec; fn next(&mut self) -> Option { if let Some(mut ch) = self.next_char() { // skip initial whitespace loop { match ch as char { ' ' | '\t' | '\n' => {}, '#' => { while let Some(ch2) = self.next_char() { if ch2 as char == '\n' { break; } } }, _ => { break; } } if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } } self.parse_word(ch) } else { // no initial character None } } } /// Convenience function that consumes the whole byte string at once. Returns None if the input was /// erroneous. pub fn split(in_bytes: &[u8]) -> Option>> { let mut shl = Shlex::new(in_bytes); let res = shl.by_ref().collect(); if shl.had_error { None } else { Some(res) } } /// A more configurable interface to quote strings. If you only want the default settings you can /// use the convenience functions [`try_quote`] and [`try_join`]. /// /// The string equivalent is [`shlex::Quoter`]. #[derive(Default, Debug, Clone)] pub struct Quoter { allow_nul: bool, // TODO: more options } impl Quoter { /// Create a new [`Quoter`] with default settings. #[inline] pub fn new() -> Self { Self::default() } /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not /// allowed and will result in an error of [`QuoteError::Nul`]. #[inline] pub fn allow_nul(mut self, allow: bool) -> Self { self.allow_nul = allow; self } /// Convenience function that consumes an iterable of words and turns it into a single byte string, /// quoting words when necessary. 
Consecutive words will be separated by a single space. pub fn join<'a, I: IntoIterator>(&self, words: I) -> Result, QuoteError> { Ok(words.into_iter() .map(|word| self.quote(word)) .collect::>, QuoteError>>()? .join(&b' ')) } /// Given a single word, return a byte string suitable to encode it as a shell argument. /// /// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only /// ever inserts valid ASCII characters before or after existing ASCII characters (or /// returns two single quotes if the input was an empty string). It will never modify a /// multibyte UTF-8 character. pub fn quote<'a>(&self, mut in_bytes: &'a [u8]) -> Result, QuoteError> { if in_bytes.is_empty() { // Empty string. Special case that isn't meaningful as only part of a word. return Ok(b"''"[..].into()); } if !self.allow_nul && in_bytes.iter().any(|&b| b == b'\0') { return Err(QuoteError::Nul); } let mut out: Vec = Vec::new(); while !in_bytes.is_empty() { // Pick a quoting strategy for some prefix of the input. Normally this will cover the // entire input, but in some case we might need to divide the input into multiple chunks // that are quoted differently. let (cur_len, strategy) = quoting_strategy(in_bytes); if cur_len == in_bytes.len() && strategy == QuotingStrategy::Unquoted && out.is_empty() { // Entire string can be represented unquoted. Reuse the allocation. return Ok(in_bytes.into()); } let (cur_chunk, rest) = in_bytes.split_at(cur_len); assert!(rest.len() < in_bytes.len()); // no infinite loop in_bytes = rest; append_quoted_chunk(&mut out, cur_chunk, strategy); } Ok(out.into()) } } #[derive(PartialEq)] enum QuotingStrategy { /// No quotes and no backslash escapes. (If backslash escapes would be necessary, we use a /// different strategy instead.) Unquoted, /// Single quoted. SingleQuoted, /// Double quotes, potentially with backslash escapes. DoubleQuoted, // TODO: add $'xxx' and "$(printf 'xxx')" styles } /// Is this ASCII byte okay to emit unquoted? 
const fn unquoted_ok(c: u8) -> bool { match c as char { // Allowed characters: '+' | '-' | '.' | '/' | ':' | '@' | ']' | '_' | '0'..='9' | 'A'..='Z' | 'a'..='z' => true, // Non-allowed characters: // From POSIX https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html // "The application shall quote the following characters if they are to represent themselves:" '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | '\n' | // "and the following may need to be quoted under certain circumstances[..]:" '*' | '?' | '[' | '#' | '~' | '=' | '%' | // Brace expansion. These ought to be in the POSIX list but aren't yet; // see: https://www.austingroupbugs.net/view.php?id=1193 '{' | '}' | // Also quote comma, just to be safe in the extremely odd case that the user of this crate // is intentionally placing a quoted string inside a brace expansion, e.g.: // format!("echo foo{{a,b,{}}}" | shlex::quote(some_str)) ',' | // '\r' is allowed in a word by all real shells I tested, but is treated as a word // separator by Python `shlex` | and might be translated to '\n' in interactive mode. '\r' | // '!' and '^' are treated specially in interactive mode; see quoting_warning. '!' | '^' | // Nul bytes and control characters. '\x00' ..= '\x1f' | '\x7f' => false, '\u{80}' ..= '\u{10ffff}' => { // This is unreachable since `unquoted_ok` is only called for 0..128. // Non-ASCII bytes are handled separately in `quoting_strategy`. // Can't call unreachable!() from `const fn` on old Rust, so... unquoted_ok(c) }, } // Note: The logic cited above for quoting comma might suggest that `..` should also be quoted, // it as a special case of brace expansion). But it's not necessary. There are three cases: // // 1. The user wants comma-based brace expansion, but the untrusted string being `quote`d // contains `..`, so they get something like `{foo,bar,3..5}`. // => That's safe; both Bash and Zsh expand this to `foo bar 3..5` rather than // `foo bar 3 4 5`. 
The presence of commas disables sequence expression expansion. // // 2. The user wants comma-based brace expansion where the contents of the braces are a // variable number of `quote`d strings and nothing else. There happens to be exactly // one string and it contains `..`, so they get something like `{3..5}`. // => Then this will expand as a sequence expression, which is unintended. But I don't mind, // because any such code is already buggy. Suppose the untrusted string *didn't* contain // `,` or `..`, resulting in shell input like `{foo}`. Then the shell would interpret it // as the literal string `{foo}` rather than brace-expanding it into `foo`. // // 3. The user wants a sequence expression and wants to supply an untrusted string as one of // the endpoints or the increment. // => Well, that's just silly, since the endpoints can only be numbers or single letters. } /// Optimized version of `unquoted_ok`. fn unquoted_ok_fast(c: u8) -> bool { const UNQUOTED_OK_MASK: u128 = { // Make a mask of all bytes in 0..<0x80 that pass. let mut c = 0u8; let mut mask = 0u128; while c < 0x80 { if unquoted_ok(c) { mask |= 1u128 << c; } c += 1; } mask }; ((UNQUOTED_OK_MASK >> c) & 1) != 0 } /// Is this ASCII byte okay to emit in single quotes? fn single_quoted_ok(c: u8) -> bool { match c { // No single quotes in single quotes. b'\'' => false, // To work around a Bash bug, ^ is only allowed right after an opening single quote; see // quoting_warning. b'^' => false, // Backslashes in single quotes are literal according to POSIX, but Fish treats them as an // escape character. Ban them. Fish doesn't aim to be POSIX-compatible, but we *can* // achieve Fish compatibility using double quotes, so we might as well. b'\\' => false, _ => true } } /// Is this ASCII byte okay to emit in double quotes? fn double_quoted_ok(c: u8) -> bool { match c { // Work around Python `shlex` bug where parsing "\`" and "\$" doesn't strip the // backslash, even though POSIX requires it. 
b'`' | b'$' => false, // '!' and '^' are treated specially in interactive mode; see quoting_warning. b'!' | b'^' => false, _ => true } } /// Given an input, return a quoting strategy that can cover some prefix of the string, along with /// the size of that prefix. /// /// Precondition: input size is nonzero. (Empty strings are handled by the caller.) /// Postcondition: returned size is nonzero. #[cfg_attr(manual_codegen_check, inline(never))] fn quoting_strategy(in_bytes: &[u8]) -> (usize, QuotingStrategy) { const UNQUOTED_OK: u8 = 1; const SINGLE_QUOTED_OK: u8 = 2; const DOUBLE_QUOTED_OK: u8 = 4; let mut prev_ok = SINGLE_QUOTED_OK | DOUBLE_QUOTED_OK | UNQUOTED_OK; let mut i = 0; if in_bytes[0] == b'^' { // To work around a Bash bug, ^ is only allowed right after an opening single quote; see // quoting_warning. prev_ok = SINGLE_QUOTED_OK; i = 1; } while i < in_bytes.len() { let c = in_bytes[i]; let mut cur_ok = prev_ok; if c >= 0x80 { // Normally, non-ASCII characters shouldn't require quoting, but see quoting_warning.md // about \xa0. For now, just treat all non-ASCII characters as requiring quotes. This // also ensures things are safe in the off-chance that you're in a legacy 8-bit locale that // has additional characters satisfying `isblank`. cur_ok &= !UNQUOTED_OK; } else { if !unquoted_ok_fast(c) { cur_ok &= !UNQUOTED_OK; } if !single_quoted_ok(c){ cur_ok &= !SINGLE_QUOTED_OK; } if !double_quoted_ok(c) { cur_ok &= !DOUBLE_QUOTED_OK; } } if cur_ok == 0 { // There are no quoting strategies that would work for both the previous characters and // this one. So we have to end the chunk before this character. The caller will call // `quoting_strategy` again to handle the rest of the string. break; } prev_ok = cur_ok; i += 1; } // Pick the best allowed strategy. 
let strategy = if prev_ok & UNQUOTED_OK != 0 { QuotingStrategy::Unquoted } else if prev_ok & SINGLE_QUOTED_OK != 0 { QuotingStrategy::SingleQuoted } else if prev_ok & DOUBLE_QUOTED_OK != 0 { QuotingStrategy::DoubleQuoted } else { unreachable!() }; debug_assert!(i > 0); (i, strategy) } fn append_quoted_chunk(out: &mut Vec, cur_chunk: &[u8], strategy: QuotingStrategy) { match strategy { QuotingStrategy::Unquoted => { out.extend_from_slice(cur_chunk); }, QuotingStrategy::SingleQuoted => { out.reserve(cur_chunk.len() + 2); out.push(b'\''); out.extend_from_slice(cur_chunk); out.push(b'\''); }, QuotingStrategy::DoubleQuoted => { out.reserve(cur_chunk.len() + 2); out.push(b'"'); for &c in cur_chunk.into_iter() { if let b'$' | b'`' | b'"' | b'\\' = c { // Add a preceding backslash. // Note: We shouldn't actually get here for $ and ` because they don't pass // `double_quoted_ok`. out.push(b'\\'); } // Add the character itself. out.push(c); } out.push(b'"'); }, } } /// Convenience function that consumes an iterable of words and turns it into a single byte string, /// quoting words when necessary. Consecutive words will be separated by a single space. /// /// Uses default settings except that nul bytes are passed through, which [may be /// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. /// /// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter). /// /// (That configuration never returns `Err`, so this function does not panic.) /// /// The string equivalent is [shlex::join]. #[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")] pub fn join<'a, I: IntoIterator>(words: I) -> Vec { Quoter::new().allow_nul(true).join(words).unwrap() } /// Convenience function that consumes an iterable of words and turns it into a single byte string, /// quoting words when necessary. Consecutive words will be separated by a single space. /// /// Uses default settings. 
The only error that can be returned is [`QuoteError::Nul`]. /// /// Equivalent to [`Quoter::new().join(words)`](Quoter). /// /// The string equivalent is [shlex::try_join]. pub fn try_join<'a, I: IntoIterator>(words: I) -> Result, QuoteError> { Quoter::new().join(words) } /// Given a single word, return a string suitable to encode it as a shell argument. /// /// Uses default settings except that nul bytes are passed through, which [may be /// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. /// /// Equivalent to [`Quoter::new().allow_nul(true).quote(in_bytes).unwrap()`](Quoter). /// /// (That configuration never returns `Err`, so this function does not panic.) /// /// The string equivalent is [shlex::quote]. #[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")] pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> { Quoter::new().allow_nul(true).quote(in_bytes).unwrap() } /// Given a single word, return a string suitable to encode it as a shell argument. /// /// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. /// /// Equivalent to [`Quoter::new().quote(in_bytes)`](Quoter). /// /// (That configuration never returns `Err`, so this function does not panic.) /// /// The string equivalent is [shlex::try_quote]. pub fn try_quote(in_bytes: &[u8]) -> Result, QuoteError> { Quoter::new().quote(in_bytes) } #[cfg(test)] const INVALID_UTF8: &[u8] = b"\xa1"; #[cfg(test)] const INVALID_UTF8_SINGLEQUOTED: &[u8] = b"'\xa1'"; #[test] #[allow(invalid_from_utf8)] fn test_invalid_utf8() { // Check that our test string is actually invalid UTF-8. 
assert!(core::str::from_utf8(INVALID_UTF8).is_err()); } #[cfg(test)] static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[ (b"foo$baz", Some(&[b"foo$baz"])), (b"foo baz", Some(&[b"foo", b"baz"])), (b"foo\"bar\"baz", Some(&[b"foobarbaz"])), (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])), (b" foo \nbar", Some(&[b"foo", b"bar"])), (b"foo\\\nbar", Some(&[b"foobar"])), (b"\"foo\\\nbar\"", Some(&[b"foobar"])), (b"'baz\\$b'", Some(&[b"baz\\$b"])), (b"'baz\\\''", None), (b"\\", None), (b"\"\\", None), (b"'\\", None), (b"\"", None), (b"'", None), (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])), (b"foo #bar", Some(&[b"foo"])), (b"foo#bar", Some(&[b"foo#bar"])), (b"foo\"#bar", None), (b"'\\n'", Some(&[b"\\n"])), (b"'\\\\n'", Some(&[b"\\\\n"])), (INVALID_UTF8, Some(&[INVALID_UTF8])), ]; #[test] fn test_split() { for &(input, output) in SPLIT_TEST_ITEMS { assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect())); } } #[test] fn test_lineno() { let mut sh = Shlex::new(b"\nfoo\nbar"); while let Some(word) = sh.next() { if word == b"bar" { assert_eq!(sh.line_no, 3); } } } #[test] #[allow(deprecated)] fn test_quote() { // Validate behavior with invalid UTF-8: assert_eq!(quote(INVALID_UTF8), INVALID_UTF8_SINGLEQUOTED); // Replicate a few tests from lib.rs. No need to replicate all of them. assert_eq!(quote(b""), &b"''"[..]); assert_eq!(quote(b"foobar"), &b"foobar"[..]); assert_eq!(quote(b"foo bar"), &b"'foo bar'"[..]); assert_eq!(quote(b"'\""), &b"\"'\\\"\""[..]); assert_eq!(quote(b""), &b"''"[..]); } #[test] #[allow(deprecated)] fn test_join() { // Validate behavior with invalid UTF-8: assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8_SINGLEQUOTED); // Replicate a few tests from lib.rs. No need to replicate all of them. 
assert_eq!(join(vec![]), &b""[..]); assert_eq!(join(vec![&b""[..]]), b"''"); } shlex-1.3.0/src/lib.rs000064400000000000000000000320321046102023000126530ustar 00000000000000// Copyright 2015 Nicholas Allegra (comex). // Licensed under the Apache License, Version 2.0 or // the MIT license , at your option. This file may not be // copied, modified, or distributed except according to those terms. //! Parse strings like, and escape strings for, POSIX shells. //! //! Same idea as (but implementation not directly based on) the Python shlex module. //! //! Disabling the `std` feature (which is enabled by default) will allow the crate to work in //! `no_std` environments, where the `alloc` crate, and a global allocator, are available. //! //! ## Warning //! //! The [`try_quote`]/[`try_join`] family of APIs does not quote control characters (because they //! cannot be quoted portably). //! //! This is fully safe in noninteractive contexts, like shell scripts and `sh -c` arguments (or //! even scripts `source`d from interactive shells). //! //! But if you are quoting for human consumption, you should keep in mind that ugly inputs produce //! ugly outputs (which may not be copy-pastable). //! //! And if by chance you are piping the output of [`try_quote`]/[`try_join`] directly to the stdin //! of an interactive shell, you should stop, because control characters can lead to arbitrary //! command injection. //! //! For more information, and for information about more minor issues, please see [quoting_warning]. //! //! ## Compatibility //! //! This crate's quoting functionality tries to be compatible with **any POSIX-compatible shell**; //! it's tested against `bash`, `zsh`, `dash`, Busybox `ash`, and `mksh`, plus `fish` (which is not //! POSIX-compatible but close enough). //! //! It also aims to be compatible with Python `shlex` and C `wordexp`. 
#![cfg_attr(not(feature = "std"), no_std)] extern crate alloc; use alloc::vec::Vec; use alloc::borrow::Cow; use alloc::string::String; #[cfg(test)] use alloc::vec; #[cfg(test)] use alloc::borrow::ToOwned; pub mod bytes; #[cfg(all(doc, not(doctest)))] #[path = "quoting_warning.md"] pub mod quoting_warning; /// An iterator that takes an input string and splits it into the words using the same syntax as /// the POSIX shell. /// /// See [`bytes::Shlex`]. pub struct Shlex<'a>(bytes::Shlex<'a>); impl<'a> Shlex<'a> { pub fn new(in_str: &'a str) -> Self { Self(bytes::Shlex::new(in_str.as_bytes())) } } impl<'a> Iterator for Shlex<'a> { type Item = String; fn next(&mut self) -> Option { self.0.next().map(|byte_word| { // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8. unsafe { String::from_utf8_unchecked(byte_word) } }) } } impl<'a> core::ops::Deref for Shlex<'a> { type Target = bytes::Shlex<'a>; fn deref(&self) -> &Self::Target { &self.0 } } impl<'a> core::ops::DerefMut for Shlex<'a> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } /// Convenience function that consumes the whole string at once. Returns None if the input was /// erroneous. pub fn split(in_str: &str) -> Option> { let mut shl = Shlex::new(in_str); let res = shl.by_ref().collect(); if shl.had_error { None } else { Some(res) } } /// Errors from [`Quoter::quote`], [`Quoter::join`], etc. (and their [`bytes`] counterparts). /// /// By default, the only error that can be returned is [`QuoteError::Nul`]. If you call /// `allow_nul(true)`, then no errors can be returned at all. Any error variants added in the /// future will not be enabled by default; they will be enabled through corresponding non-default /// [`Quoter`] options. /// /// ...In theory. 
In the unlikely event that additional classes of inputs are discovered that, /// like nul bytes, are fundamentally unsafe to quote even for non-interactive shells, the risk /// will be mitigated by adding corresponding [`QuoteError`] variants that *are* enabled by /// default. #[non_exhaustive] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum QuoteError { /// The input contained a nul byte. In most cases, shells fundamentally [cannot handle strings /// containing nul bytes](quoting_warning#nul-bytes), no matter how they are quoted. But if /// you're sure you can handle nul bytes, you can call `allow_nul(true)` on the `Quoter` to let /// them pass through. Nul, } impl core::fmt::Display for QuoteError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { QuoteError::Nul => f.write_str("cannot shell-quote string containing nul byte"), } } } #[cfg(feature = "std")] impl std::error::Error for QuoteError {} /// A more configurable interface to quote strings. If you only want the default settings you can /// use the convenience functions [`try_quote`] and [`try_join`]. /// /// The bytes equivalent is [`bytes::Quoter`]. #[derive(Default, Debug, Clone)] pub struct Quoter { inner: bytes::Quoter, } impl Quoter { /// Create a new [`Quoter`] with default settings. #[inline] pub fn new() -> Self { Self::default() } /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not /// allowed and will result in an error of [`QuoteError::Nul`]. #[inline] pub fn allow_nul(mut self, allow: bool) -> Self { self.inner = self.inner.allow_nul(allow); self } /// Convenience function that consumes an iterable of words and turns it into a single string, /// quoting words when necessary. Consecutive words will be separated by a single space. pub fn join<'a, I: IntoIterator>(&self, words: I) -> Result { // Safety: given valid UTF-8, bytes::join() will always return valid UTF-8. 
self.inner.join(words.into_iter().map(|s| s.as_bytes())) .map(|bytes| unsafe { String::from_utf8_unchecked(bytes) }) } /// Given a single word, return a string suitable to encode it as a shell argument. pub fn quote<'a>(&self, in_str: &'a str) -> Result, QuoteError> { Ok(match self.inner.quote(in_str.as_bytes())? { Cow::Borrowed(out) => { // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8. unsafe { core::str::from_utf8_unchecked(out) }.into() } Cow::Owned(out) => { // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8. unsafe { String::from_utf8_unchecked(out) }.into() } }) } } impl From for Quoter { fn from(inner: bytes::Quoter) -> Quoter { Quoter { inner } } } impl From for bytes::Quoter { fn from(quoter: Quoter) -> bytes::Quoter { quoter.inner } } /// Convenience function that consumes an iterable of words and turns it into a single string, /// quoting words when necessary. Consecutive words will be separated by a single space. /// /// Uses default settings except that nul bytes are passed through, which [may be /// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. /// /// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter). /// /// (That configuration never returns `Err`, so this function does not panic.) /// /// The bytes equivalent is [bytes::join]. #[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")] pub fn join<'a, I: IntoIterator>(words: I) -> String { Quoter::new().allow_nul(true).join(words).unwrap() } /// Convenience function that consumes an iterable of words and turns it into a single string, /// quoting words when necessary. Consecutive words will be separated by a single space. /// /// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. /// /// Equivalent to [`Quoter::new().join(words)`](Quoter). /// /// The bytes equivalent is [bytes::try_join]. 
pub fn try_join<'a, I: IntoIterator>(words: I) -> Result { Quoter::new().join(words) } /// Given a single word, return a string suitable to encode it as a shell argument. /// /// Uses default settings except that nul bytes are passed through, which [may be /// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. /// /// Equivalent to [`Quoter::new().allow_nul(true).quote(in_str).unwrap()`](Quoter). /// /// (That configuration never returns `Err`, so this function does not panic.) /// /// The bytes equivalent is [bytes::quote]. #[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")] pub fn quote(in_str: &str) -> Cow { Quoter::new().allow_nul(true).quote(in_str).unwrap() } /// Given a single word, return a string suitable to encode it as a shell argument. /// /// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. /// /// Equivalent to [`Quoter::new().quote(in_str)`](Quoter). /// /// (Unlike [`quote`], this function does not unwrap the result: the `Result` is handed back to /// the caller, so an input containing a nul byte yields `Err(QuoteError::Nul)`.) /// /// The bytes equivalent is [bytes::try_quote].
pub fn try_quote(in_str: &str) -> Result, QuoteError> { Quoter::new().quote(in_str) } #[cfg(test)] static SPLIT_TEST_ITEMS: &'static [(&'static str, Option<&'static [&'static str]>)] = &[ ("foo$baz", Some(&["foo$baz"])), ("foo baz", Some(&["foo", "baz"])), ("foo\"bar\"baz", Some(&["foobarbaz"])), ("foo \"bar\"baz", Some(&["foo", "barbaz"])), (" foo \nbar", Some(&["foo", "bar"])), ("foo\\\nbar", Some(&["foobar"])), ("\"foo\\\nbar\"", Some(&["foobar"])), ("'baz\\$b'", Some(&["baz\\$b"])), ("'baz\\\''", None), ("\\", None), ("\"\\", None), ("'\\", None), ("\"", None), ("'", None), ("foo #bar\nbaz", Some(&["foo", "baz"])), ("foo #bar", Some(&["foo"])), ("foo#bar", Some(&["foo#bar"])), ("foo\"#bar", None), ("'\\n'", Some(&["\\n"])), ("'\\\\n'", Some(&["\\\\n"])), ]; #[test] fn test_split() { for &(input, output) in SPLIT_TEST_ITEMS { assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect())); } } #[test] fn test_lineno() { let mut sh = Shlex::new("\nfoo\nbar"); while let Some(word) = sh.next() { if word == "bar" { assert_eq!(sh.line_no, 3); } } } #[test] #[cfg_attr(not(feature = "std"), allow(unreachable_code, unused_mut))] fn test_quote() { // This is a list of (unquoted, quoted) pairs. // But it's using a single long (raw) string literal with an ad-hoc format, just because it's // hard to read if we have to put the test strings through Rust escaping on top of the escaping // being tested. (Even raw string literals are noisy for short strings). // Ad-hoc: "NL" is replaced with a literal newline; no other escape sequences.
let tests = r#" <> => <''> => => <'foo bar'> <"foo bar'"> => <"\"foo bar'\""> <'foo bar'> => <"'foo bar'"> <"> => <'"'> <"'> => <"\"'"> => <'hello!world'> <'hello!world> => <"'hello"'!world'> <'hello!> => <"'hello"'!'> => <'hello ''^ world'> => => <'!world'"'"> <{a, b}> => <'{a, b}'> => <'NL'> <^> => <'^'> => => <'NLx''^'> => <'NL''^x'> => <'NL ''^x'> <{a,b}> => <'{a,b}'> => <'a,b'> <'$> => <"'"'$'> <"^> => <'"''^'> "#; let mut ok = true; for test in tests.trim().split('\n') { let parts: Vec = test .replace("NL", "\n") .split("=>") .map(|part| part.trim().trim_start_matches('<').trim_end_matches('>').to_owned()) .collect(); assert!(parts.len() == 2); let unquoted = &*parts[0]; let quoted_expected = &*parts[1]; let quoted_actual = try_quote(&parts[0]).unwrap(); if quoted_expected != quoted_actual { #[cfg(not(feature = "std"))] panic!("FAIL: for input <{}>, expected <{}>, got <{}>", unquoted, quoted_expected, quoted_actual); #[cfg(feature = "std")] println!("FAIL: for input <{}>, expected <{}>, got <{}>", unquoted, quoted_expected, quoted_actual); ok = false; } } assert!(ok); } #[test] #[allow(deprecated)] fn test_join() { assert_eq!(join(vec![]), ""); assert_eq!(join(vec![""]), "''"); assert_eq!(join(vec!["a", "b"]), "a b"); assert_eq!(join(vec!["foo bar", "baz"]), "'foo bar' baz"); } #[test] fn test_fallible() { assert_eq!(try_join(vec!["\0"]), Err(QuoteError::Nul)); assert_eq!(try_quote("\0"), Err(QuoteError::Nul)); } shlex-1.3.0/src/quoting_warning.md000064400000000000000000000404171046102023000153020ustar 00000000000000// vim: textwidth=99 /* Meta note: This file is loaded as a .rs file by rustdoc only. */ /*! A more detailed version of the [warning at the top level](super#warning) about the `quote`/`join` family of APIs. In general, passing the output of these APIs to a shell should recover the original string(s). This page lists cases where it fails to do so. In noninteractive contexts, there are only minor issues.
'Noninteractive' includes shell scripts and `sh -c` arguments, or even scripts `source`d from interactive shells. The issues are: - [Nul bytes](#nul-bytes) - [Overlong commands](#overlong-commands) If you are writing directly to the stdin of an interactive (`-i`) shell (i.e., if you are pretending to be a terminal), or if you are writing to a cooked-mode pty (even if the other end is noninteractive), then there is a **severe** security issue: - [Control characters](#control-characters-interactive-contexts-only) Finally, there are some [solved issues](#solved-issues). # List of issues ## Nul bytes For non-interactive shells, the most problematic input is nul bytes (bytes with value 0). The non-deprecated functions all default to returning [`QuoteError::Nul`] when encountering them, but the deprecated [`quote`] and [`join`] functions leave them as-is. In Unix, nul bytes can't appear in command arguments, environment variables, or filenames. It's not a question of proper quoting; they just can't be used at all. This is a consequence of Unix's system calls all being designed around nul-terminated C strings. Shells inherit that limitation. Most of them do not accept nul bytes in strings even internally. Even when they do, it's pretty much useless or even dangerous, since you can't pass them to external commands. In some cases, you might fail to pass the nul byte to the shell in the first place. For example, the following code uses [`join`] to tunnel a command over an SSH connection: ```rust std::process::Command::new("ssh") .arg("myhost") .arg("--") .arg(join(my_cmd_args)) ``` If any argument in `my_cmd_args` contains a nul byte, then `join(my_cmd_args)` will contain a nul byte. But `join(my_cmd_args)` is itself being passed as an argument to a command (the ssh command), and command arguments can't contain nul bytes! So this will simply result in the `Command` failing to launch. Still, there are other ways to smuggle nul bytes into a shell. 
How the shell reacts depends on the shell and the method of smuggling. For example, here is Bash 5.2.21 exhibiting three different behaviors: - With ANSI-C quoting, the string is truncated at the first nul byte: ```bash $ echo $'foo\0bar' | hexdump -C 00000000 66 6f 6f 0a |foo.| ``` - With command substitution, nul bytes are removed with a warning: ```bash $ echo $(printf 'foo\0bar') | hexdump -C bash: warning: command substitution: ignored null byte in input 00000000 66 6f 6f 62 61 72 0a |foobar.| ``` - When a nul byte appears directly in a shell script, it's removed with no warning: ```bash $ printf 'echo "foo\0bar"' | bash | hexdump -C 00000000 66 6f 6f 62 61 72 0a |foobar.| ``` Zsh, in contrast, actually allows nul bytes internally, in shell variables and even arguments to builtin commands. But if a variable is exported to the environment, or if an argument is used for an external command, then the child process will see it silently truncated at the first nul. This might actually be more dangerous, depending on the use case. ## Overlong commands If you pass a long string into a shell, several things might happen: - It might succeed, yet the shell might have trouble actually doing anything with it. For example: ```bash x=$(printf '%010000000d' 0); /bin/echo $x bash: /bin/echo: Argument list too long ``` - If you're using certain shells (e.g. Busybox Ash) *and* using a pty for communication, then the shell will impose a line length limit, ignoring all input past the limit. - If you're using a pty in cooked mode, then by default, if you write so many bytes as input that it fills the kernel's internal buffer, the kernel will simply drop those bytes, instead of blocking waiting for the shell to empty out the buffer. In other words, random bits of input can be lost, which is obviously insecure. Future versions of this crate may add an option to [`Quoter`] to check the length for you. 
## Control characters (*interactive contexts only*) Control characters are the bytes from `\x00` to `\x1f`, plus `\x7f`. `\x00` (the nul byte) is discussed [above](#nul-bytes), but what about the rest? Well, many of them correspond to terminal keyboard shortcuts. For example, when you press Ctrl-A at a shell prompt, your terminal sends the byte `\x01`. The shell sees that byte and (if not configured differently) takes the standard action for Ctrl-A, which is to move the cursor to the beginning of the line. This means that it's quite dangerous to pipe bytes to an interactive shell. For example, here is a program that tries to tell Bash to echo an arbitrary string, 'safely': ```rust use std::process::{Command, Stdio}; use std::io::Write; let evil_string = "\x01do_something_evil; "; let quoted = shlex::try_quote(evil_string).unwrap(); println!("quoted string is {:?}", quoted); let mut bash = Command::new("bash") .arg("-i") // force interactive mode .stdin(Stdio::piped()) .spawn() .unwrap(); let stdin = bash.stdin.as_mut().unwrap(); write!(stdin, "echo {}\n", quoted).unwrap(); ``` Here's the output of the program (with irrelevant bits removed): ```text quoted string is "'\u{1}do_something_evil; '" /tmp comex$ do_something_evil; 'echo ' bash: do_something_evil: command not found bash: echo : command not found ``` Even though we quoted it, Bash still ran an arbitrary command! This is not because the quoting was insufficient, per se. In single quotes, all input is supposed to be treated as raw data until the closing single quote. And in fact, this would work fine without the `"-i"` argument. But line input is a separate stage from shell syntax parsing. After all, if you type a single quote on the keyboard, you wouldn't expect it to disable all your keyboard shortcuts. So a control character always has its designated effect, no matter if it's quoted or backslash-escaped. 
Also, some control characters are interpreted by the kernel tty layer instead, like CTRL-C to send SIGINT. These can be an issue even with noninteractive shells, but only if using a pty for communication, as opposed to a pipe. To be safe, you just have to avoid sending them. ### Why not just use hex escapes? In any normal programming language, this would be no big deal. Any normal language has a way to escape arbitrary characters in strings by writing out their numeric values. For example, Rust lets you write them in hexadecimal, like `"\x4f"` (or `"\u{1d546}"` for Unicode). In this way, arbitrary strings can be represented using only 'nice' simple characters. Any remotely suspicious character can be replaced with a numeric escape sequence, where the escape sequence itself consists only of alphanumeric characters and some punctuation. The result may not be the most readable[^choices], but it's quite safe from being misinterpreted or corrupted in transit. Shell is not normal. It has no numeric escape sequences. There are a few different ways to quote characters (unquoted, unquoted-with-backslash, single quotes, double quotes), but all of them involve writing the character itself. If the input contains a control character, the output must contain that same character. ### Mitigation: terminal filters In practice, automating interactive shells like in the above example is pretty uncommon these days. In most cases, the only way for a programmatically generated string to make its way to the input of an interactive shell is if a human copies and pastes it into their terminal. And many terminals detect when you paste a string containing control characters. iTerm2 strips them out; gnome-terminal replaces them with alternate characters[^gr]; Kitty outright prompts for confirmation. This mitigates the risk. But it's not perfect. Some other terminals don't implement this check or implement it incorrectly.
Also, these checks tend to not filter the tab character, which could trigger tab completion. In most cases that's a non-issue, because most shells support paste bracketing, which disables tab and some other control characters[^bracketing] within pasted text. But in some cases paste bracketing gets disabled. ### Future possibility: ANSI-C quoting I said that shell syntax has no numeric escapes, but that only applies to *portable* shell syntax. Bash and Zsh support an obscure alternate quoting style with the syntax `$'foo'`. It's called ["ANSI-C quoting"][ansic], and inside it you can use all the escape sequences supported by C, including hex escapes: ```bash $ echo $'\x41\n\x42' A B ``` But other shells don't support it — including Dash, a popular choice for `/bin/sh`, and Busybox's Ash, frequently seen on stripped-down embedded systems. This crate's quoting functionality [tries to be compatible](crate#compatibility) with those shells, plus all other POSIX-compatible shells. That makes ANSI-C quoting a no-go. Still, future versions of this crate may provide an option to enable ANSI-C quoting, at the cost of reduced portability. ### Future possibility: printf Another option would be to invoke the `printf` command, which is required by POSIX to support octal escapes. For example, you could 'escape' the Rust string `"\x01"` into the shell syntax `"$(printf '\001')"`. The shell will execute the command `printf` with the first argument being literally a backslash followed by three digits; `printf` will output the actual byte with value 1; and the shell will substitute that back into the original command. The problem is that 'escaping' a string into a command substitution just feels too surprising. If nothing else, it only works with an actual shell; [other languages' shell parsing routines](crate#compatibility) wouldn't understand it. Neither would this crate's own parser, though that could be fixed. 
Future versions of this crate may provide an option to use `printf` for quoting. ### Special note: newlines Did you know that `\r` and `\n` are control characters? They aren't as dangerous as other control characters (if quoted properly). But there's still an issue with them in interactive contexts. Namely, in some cases, interactive shells and/or the tty layer will 'helpfully' translate between different line ending conventions. The possibilities include replacing `\r` with `\n`, replacing `\n` with `\r\n`, and others. This can't result in command injection, but it's still a lossy transformation which can result in a failure to round-trip (i.e. the shell sees a different string from what was originally passed to `quote`). Numeric escapes would solve this as well. # Solved issues ## Solved: Past vulnerability (GHSA-r7qv-8r2h-pg27 / RUSTSEC-2024-XXX) Versions of this crate before 1.3.0 did not quote `{`, `}`, and `\xa0`. See: - <https://github.com/advisories/GHSA-r7qv-8r2h-pg27> - (TODO: Add Rustsec link) ## Solved: `!` and `^` There are two non-control characters which have a special meaning in interactive contexts only: `!` and `^`. Luckily, these can be escaped adequately. The `!` character triggers [history expansion][he]; the `^` character can trigger a variant of history expansion known as [Quick Substitution][qs]. Both of these characters get expanded even inside of double-quoted strings\! If we're in a double-quoted string, then we can't just escape these characters with a backslash. Only a specific set of characters can be backslash-escaped inside double quotes; the set of supported characters depends on the shell, but it often doesn't include `!` and `^`.[^escbs] Trying to backslash-escape an unsupported character produces a literal backslash: ```bash $ echo "\!" \! ``` However, these characters don't get expanded in single-quoted strings, so this crate just single-quotes them.
But there's a Bash bug where `^` actually does get partially expanded in single-quoted strings: ```bash $ echo ' > ^a^b > ' !!:s^a^b ``` To work around that, this crate forces `^` to appear right after an opening single quote. For example, the string `"^` is quoted into `'"''^'` instead of `'"^'`. This restriction is overkill, since `^` is only meaningful right after a newline, but it's a sufficient restriction (after all, a `^` character can't be preceded by a newline if it's forced to be preceded by a single quote), and for now it simplifies things. ## Solved: `\xa0` The byte `\xa0` may be treated as a shell word separator, specifically on Bash on macOS when using the default UTF-8 locale, only when the input is invalid UTF-8. This crate handles the issue by always using quotes for arguments containing this byte. In fact, this crate always uses quotes for arguments containing any non-ASCII bytes. This may be changed in the future, since it's a bit unfriendly to non-English users. But for now it minimizes risk, especially considering the large number of different legacy single-byte locales someone might hypothetically be running their shell in. ### Demonstration ```bash $ echo -e 'ls a\xa0b' | bash ls: a: No such file or directory ls: b: No such file or directory ``` The normal behavior would be to output a single line, e.g.: ```bash $ echo -e 'ls a\xa0b' | bash ls: cannot access 'a'$'\240''b': No such file or directory ``` (The specific quoting in the error doesn't matter.) ### Cause Just for fun, here's why this behavior occurs: Bash decides which bytes serve as word separators based on the libc function [`isblank`][isblank]. On macOS on UTF-8 locales, this passes for `\xa0`, corresponding to U+00A0 NO-BREAK SPACE. This is doubly unique compared to the other systems I tested (Linux/glibc, Linux/musl, and Windows/MSVC). 
First, the other systems don't allow bytes in the range [0x80, 0xFF] to pass isfoo functions in UTF-8 locales, even if the corresponding Unicode codepoint does pass, as determined by the wide-character equivalent function, iswfoo. Second, the other systems don't treat U+00A0 as blank (even using `iswblank`). Meanwhile, Bash checks for multi-byte sequences and forbids them from being treated as special characters, so the proper UTF-8 encoding of U+00A0, `b"\xc2\xa0"`, is not treated as a word separator. Treatment as a word separator only happens for `b"\xa0"` alone, which is illegal UTF-8. [ansic]: https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html [he]: https://www.gnu.org/software/bash/manual/html_node/History-Interaction.html [qs]: https://www.gnu.org/software/bash/manual/html_node/Event-Designators.html [isblank]: https://man7.org/linux/man-pages/man3/isblank.3p.html [nul]: #nul-bytes [^choices]: This can lead to tough choices over which characters to escape and which to leave as-is, especially when Unicode gets involved and you have to balance the risk of confusion with the benefit of properly supporting non-English languages.

We don't have the luxury of those choices. [^gr]: For example, backspace (in Unicode lingo, U+0008 BACKSPACE) turns into U+2408 SYMBOL FOR BACKSPACE. [^bracketing]: It typically disables almost all handling of control characters by the shell proper, but one necessary exception is the end-of-paste sequence itself (which starts with the control character `\x1b`). In addition, paste bracketing does not suppress handling of control characters by the kernel tty layer, such as `\x03` sending SIGINT (which typically clears the currently typed command, making it dangerous in a similar way to `\x01`). [^escbs]: For example, Dash doesn't remove the backslash from `"\!"` because it simply doesn't know anything about `!` as a special character: it doesn't support history expansion. On the other end of the spectrum, Zsh supports history expansion and does remove the backslash — though only in interactive mode. Bash's behavior is weirder. It supports history expansion, and if you write `"\!"`, the backslash does prevent history expansion from occurring — but it doesn't get removed! */ // `use` declarations to make auto links work: use ::{quote, join, Shlex, Quoter, QuoteError}; // TODO: add more about copy-paste and human readability.