From 56dc953617198ce6f36956ca5776da10fe7087b5 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Mon, 27 Mar 2023 23:26:41 -0500 Subject: [PATCH 1/2] doc(lex): Clarify safety of unsafe --- clap_lex/src/ext.rs | 60 +++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/clap_lex/src/ext.rs b/clap_lex/src/ext.rs index 6d319988210..4cd098c6a70 100644 --- a/clap_lex/src/ext.rs +++ b/clap_lex/src/ext.rs @@ -212,7 +212,8 @@ pub trait OsStrExt: private::Sealed { impl OsStrExt for OsStr { fn try_str(&self) -> Result<&str, std::str::Utf8Error> { - let bytes = to_bytes(self); + // SAFETY: Only interacting with `OsStr` as `&str + let bytes = unsafe { to_bytes(self) }; std::str::from_utf8(bytes) } @@ -221,17 +222,24 @@ impl OsStrExt for OsStr { } fn find(&self, needle: &str) -> Option { + // SAFETY: Only interacting with `OsStr` as `&str + let bytes = unsafe { to_bytes(self) }; (0..=self.len().checked_sub(needle.len())?) - .find(|&x| to_bytes(self)[x..].starts_with(needle.as_bytes())) + .find(|&x| bytes[x..].starts_with(needle.as_bytes())) } fn strip_prefix(&self, prefix: &str) -> Option<&OsStr> { - to_bytes(self) - .strip_prefix(prefix.as_bytes()) - .map(to_os_str) + // SAFETY: Only interacting with `OsStr` as `&str + let bytes = unsafe { to_bytes(self) }; + bytes.strip_prefix(prefix.as_bytes()).map(|s| { + // SAFETY: Only interacting with `OsStr` as `&str + unsafe { to_os_str(s) } + }) } fn starts_with(&self, prefix: &str) -> bool { - to_bytes(self).starts_with(prefix.as_bytes()) + // SAFETY: Only interacting with `OsStr` as `&str + let bytes = unsafe { to_bytes(self) }; + bytes.starts_with(prefix.as_bytes()) } fn split<'s, 'n>(&'s self, needle: &'n str) -> Split<'s, 'n> { @@ -243,17 +251,24 @@ impl OsStrExt for OsStr { } fn split_at(&self, index: usize) -> (&OsStr, &OsStr) { - let (first, second) = to_bytes(self).split_at(index); - (to_os_str(first), to_os_str(second)) + // BUG: This is unsafe + unsafe { + let bytes = to_bytes(self); + let (first, second) = bytes.split_at(index); + (to_os_str(first), to_os_str(second)) + } } fn split_once(&self, needle: &'_ str) -> Option<(&OsStr, &OsStr)> { let start = self.find(needle)?; let end = start + needle.len(); - let haystack = to_bytes(self); - let first = &haystack[0..start]; - let second = &haystack[end..]; - Some((to_os_str(first), to_os_str(second))) + // SAFETY: Only interacting with `OsStr` as `&str + unsafe { + let haystack = to_bytes(self); + let first = &haystack[0..start]; + let second = &haystack[end..]; + Some((to_os_str(first), to_os_str(second))) + } } } @@ -265,14 +280,15 @@ mod private { /// Allow access to raw bytes /// -/// **Note:** the bytes only make sense when compared with ASCII or `&str` +/// # Safety /// -/// **Note:** This must never be serialized as there is no guarantee at how invalid UTF-8 will be -/// encoded, even within the same version of this crate (since its dependent on rustc version) -fn to_bytes(s: &OsStr) -> &[u8] { +/// - The bytes only make sense when compared with ASCII or `&str` +/// - This must never be serialized as there is no guarantee at how invalid UTF-8 will be +/// encoded, even within the same version of this crate (since its dependent on rustc version) +unsafe fn to_bytes(s: &OsStr) -> &[u8] { // SAFETY: // - Lifetimes are the same - // - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`) + // - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`) // - The primary contract is that the encoding for invalid surrogate code points is not // guaranteed which isn't a problem here // @@ -282,10 +298,16 @@ fn to_bytes(s: &OsStr) -> &[u8] { } /// Restore raw bytes as `OsStr` -fn to_os_str(s: &[u8]) -> &OsStr { +/// +/// # Safety +/// +/// - The bytes only make sense when compared with ASCII or `&str` +/// - This must never be serialized as there is no guarantee at how invalid UTF-8 will be +/// encoded, even within the same version of this crate (since its dependent on rustc version) +unsafe fn to_os_str(s: &[u8]) -> &OsStr { // SAFETY: // - Lifetimes are the same - // - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`) + // - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`) // - The primary contract is that the encoding for invalid surrogate code points is not // guaranteed which isn't a problem here // From 48dc66f65291060b995fd86e6d30a093ee8bed77 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Mon, 27 Mar 2023 23:32:30 -0500 Subject: [PATCH 2/2] fix(clap_lex): Deprecate unsound OsStrExt::split_at --- clap_lex/src/ext.rs | 18 +++++++++++++++--- clap_lex/src/lib.rs | 7 +++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/clap_lex/src/ext.rs b/clap_lex/src/ext.rs index 4cd098c6a70..3186580424b 100644 --- a/clap_lex/src/ext.rs +++ b/clap_lex/src/ext.rs @@ -193,6 +193,7 @@ pub trait OsStrExt: private::Sealed { /// assert_eq!("Per", first); /// assert_eq!(" Martin-Löf", last); /// ``` + #[deprecated(since = "4.1.0", note = "This is not sound for all `index`")] fn split_at(&self, index: usize) -> (&OsStr, &OsStr); /// Splits the string on the first occurrence of the specified delimiter and /// returns prefix before delimiter and suffix after delimiter. @@ -251,7 +252,7 @@ impl OsStrExt for OsStr { } fn split_at(&self, index: usize) -> (&OsStr, &OsStr) { - // BUG: This is unsafe + // BUG: This is unsafe and has been deprecated unsafe { let bytes = to_bytes(self); let (first, second) = bytes.split_at(index); @@ -294,7 +295,7 @@ unsafe fn to_bytes(s: &OsStr) -> &[u8] { // // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290) // but its in limbo - unsafe { std::mem::transmute(s) } + std::mem::transmute(s) } /// Restore raw bytes as `OsStr` @@ -313,7 +314,7 @@ unsafe fn to_os_str(s: &[u8]) -> &OsStr { // // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290) // but its in limbo - unsafe { std::mem::transmute(s) } + std::mem::transmute(s) } pub struct Split<'s, 'n> { @@ -341,3 +342,14 @@ impl<'s, 'n> Iterator for Split<'s, 'n> { } } } + +/// Split an `OsStr` +/// +/// # Safety +/// +/// `index` must be at a valid UTF-8 boundary +pub(crate) unsafe fn split_at(os: &OsStr, index: usize) -> (&OsStr, &OsStr) { + let bytes = to_bytes(os); + let (first, second) = bytes.split_at(index); + (to_os_str(first), to_os_str(second)) +} diff --git a/clap_lex/src/lib.rs b/clap_lex/src/lib.rs index ea13e4baf86..b349fba51d0 100644 --- a/clap_lex/src/lib.rs +++ b/clap_lex/src/lib.rs @@ -433,7 +433,9 @@ impl<'s> ShortFlags<'s> { if let Some((index, _)) = self.utf8_prefix.next() { self.utf8_prefix = "".char_indices(); self.invalid_suffix = None; - return Some(self.inner.split_at(index).1); + // SAFETY: `char_indices` ensures `index` is at a valid UTF-8 boundary + let remainder = unsafe { ext::split_at(self.inner, index).1 }; + return Some(remainder); } if let Some(suffix) = self.invalid_suffix { @@ -457,7 +459,8 @@ fn split_nonutf8_once(b: &OsStr) -> (&str, Option<&OsStr>) { match b.try_str() { Ok(s) => (s, None), Err(err) => { - let (valid, after_valid) = b.split_at(err.valid_up_to()); + // SAFETY: `char_indices` ensures `index` is at a valid UTF-8 boundary + let (valid, after_valid) = unsafe { ext::split_at(b, err.valid_up_to()) }; let valid = valid.try_str().unwrap(); (valid, Some(after_valid)) }