From 6419a0d937921fee1c8700f4bfcee397520e1ef7 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 25 Mar 2023 01:59:24 -0500 Subject: [PATCH] perf!(lex): Build faster by removing `os_str_bytes` We are doing direct transmutes between `OsStr` and `[u8]`. https://github.com/rust-lang/rust/pull/95290 would make this natively supported but I got tired of waiting for it. This only saves about 1/4s off of `cargo build`. This took 2.9 KiB off of `cargo bloat --release --example git` --- Cargo.lock | 4 - clap_builder/src/builder/debug_asserts.rs | 14 +- clap_builder/src/parser/parser.rs | 56 ++-- clap_complete/Cargo.toml | 3 +- clap_complete/src/dynamic.rs | 58 ++-- clap_lex/Cargo.toml | 3 - clap_lex/src/ext.rs | 321 ++++++++++++++++++++++ clap_lex/src/lib.rs | 121 ++++---- clap_lex/tests/parsed.rs | 12 +- clap_lex/tests/shorts.rs | 4 +- 10 files changed, 432 insertions(+), 164 deletions(-) create mode 100644 clap_lex/src/ext.rs diff --git a/Cargo.lock b/Cargo.lock index ef0ac63d092..645f7560e5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -230,7 +230,6 @@ dependencies = [ "clap 4.1.13", "clap_lex 0.3.3", "is_executable", - "os_str_bytes", "pathdiff", "shlex", "snapbox", @@ -269,9 +268,6 @@ dependencies = [ [[package]] name = "clap_lex" version = "0.3.3" -dependencies = [ - "os_str_bytes", -] [[package]] name = "clap_mangen" diff --git a/clap_builder/src/builder/debug_asserts.rs b/clap_builder/src/builder/debug_asserts.rs index dbd8bbbc674..7a7fd6ae183 100644 --- a/clap_builder/src/builder/debug_asserts.rs +++ b/clap_builder/src/builder/debug_asserts.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; -use clap_lex::RawOsStr; +use clap_lex::OsStrExt as _; use crate::builder::OsStr; use crate::builder::ValueRange; @@ -841,16 +841,16 @@ fn assert_defaults<'d>( for default_os in defaults { let value_parser = arg.get_value_parser(); let assert_cmd = Command::new("assert"); - if let Some(delim) = arg.get_value_delimiter() { - let default_os = RawOsStr::new(default_os); - for part in 
default_os.split(delim) { - if let Err(err) = value_parser.parse_ref(&assert_cmd, Some(arg), &part.to_os_str()) - { + if let Some(val_delim) = arg.get_value_delimiter() { + let mut val_delim_buffer = [0; 4]; + let val_delim = val_delim.encode_utf8(&mut val_delim_buffer); + for part in default_os.split(val_delim) { + if let Err(err) = value_parser.parse_ref(&assert_cmd, Some(arg), part) { panic!( "Argument `{}`'s {}={:?} failed validation: {}", arg.get_id(), field, - part.to_str_lossy(), + part.to_string_lossy(), err ); } diff --git a/clap_builder/src/parser/parser.rs b/clap_builder/src/parser/parser.rs index 25790343181..79111015f9b 100644 --- a/clap_builder/src/parser/parser.rs +++ b/clap_builder/src/parser/parser.rs @@ -4,9 +4,7 @@ use std::{ ffi::{OsStr, OsString}, }; -// Third Party -use clap_lex::RawOsStr; -use clap_lex::RawOsString; +use clap_lex::OsStrExt as _; // Internal use crate::builder::{Arg, Command}; @@ -93,9 +91,8 @@ impl<'cmd> Parser<'cmd> { } debug!( - "Parser::get_matches_with: Begin parsing '{:?}' ({:?})", + "Parser::get_matches_with: Begin parsing '{:?}'", arg_os.to_value_os(), - arg_os.to_value_os().as_raw_bytes() ); // Has the user already passed '--'? 
Meaning only positional args follow @@ -291,7 +288,7 @@ impl<'cmd> Parser<'cmd> { } else { let trailing_values = false; let arg_values = matcher.pending_values_mut(id, None, trailing_values); - arg_values.push(arg_os.to_value_os().to_os_str().into_owned()); + arg_values.push(arg_os.to_value_os().to_owned()); if matcher.needs_more_vals(arg) { ParseResult::Opt(arg.get_id().clone()) } else { @@ -411,7 +408,7 @@ impl<'cmd> Parser<'cmd> { Some(Identifier::Index), trailing_values, ); - arg_values.push(arg_os.to_value_os().to_os_str().into_owned()); + arg_values.push(arg_os.to_value_os().to_owned()); } // Only increment the positional counter if it doesn't allow multiples @@ -548,7 +545,7 @@ impl<'cmd> Parser<'cmd> { // Checks if the arg matches a subcommand name, or any of its aliases (if defined) fn possible_subcommand( &self, - arg: Result<&str, &RawOsStr>, + arg: Result<&str, &OsStr>, valid_arg_found: bool, ) -> Option<&str> { debug!("Parser::possible_subcommand: arg={:?}", arg); @@ -723,8 +720,8 @@ impl<'cmd> Parser<'cmd> { fn parse_long_arg( &mut self, matcher: &mut ArgMatcher, - long_arg: Result<&str, &RawOsStr>, - long_value: Option<&RawOsStr>, + long_arg: &str, + long_value: Option<&OsStr>, parse_state: &ParseState, pos_counter: usize, valid_arg_found: &mut bool, @@ -741,14 +738,6 @@ impl<'cmd> Parser<'cmd> { } debug!("Parser::parse_long_arg: Does it contain '='..."); - let long_arg = match long_arg { - Ok(long_arg) => long_arg, - Err(long_arg) => { - return Ok(ParseResult::NoMatchingArg { - arg: long_arg.to_str_lossy().into_owned(), - }); - } - }; if long_arg.is_empty() { debug_assert!( long_value.is_some(), @@ -805,7 +794,7 @@ impl<'cmd> Parser<'cmd> { used.push(arg.get_id().clone()); Ok(ParseResult::UnneededAttachedValue { - rest: rest.to_str_lossy().into_owned(), + rest: rest.to_string_lossy().into_owned(), used, arg: arg.to_string(), }) @@ -902,7 +891,7 @@ impl<'cmd> Parser<'cmd> { Ok(c) => c, Err(rest) => { return Ok(ParseResult::NoMatchingArg { - arg: 
format!("-{}", rest.to_str_lossy()), + arg: format!("-{}", rest.to_string_lossy()), }); } }; @@ -938,8 +927,8 @@ impl<'cmd> Parser<'cmd> { // Cloning the iterator, so we rollback if it isn't there. let val = short_arg.clone().next_value_os().unwrap_or_default(); debug!( - "Parser::parse_short_arg:iter:{}: val={:?} (bytes), val={:?} (ascii), short_arg={:?}", - c, val, val.as_raw_bytes(), short_arg + "Parser::parse_short_arg:iter:{}: val={:?}, short_arg={:?}", + c, val, short_arg ); let val = Some(val).filter(|v| !v.is_empty()); @@ -950,7 +939,7 @@ impl<'cmd> Parser<'cmd> { // // e.g. `-xvf`, when require_equals && x.min_vals == 0, we don't // consume the `vf`, even if it's provided as value. - let (val, has_eq) = if let Some(val) = val.and_then(|v| v.strip_prefix('=')) { + let (val, has_eq) = if let Some(val) = val.and_then(|v| v.strip_prefix("=")) { (Some(val), true) } else { (val, false) @@ -991,7 +980,7 @@ impl<'cmd> Parser<'cmd> { fn parse_opt_value( &self, ident: Identifier, - attached_value: Option<&RawOsStr>, + attached_value: Option<&OsStr>, arg: &Arg, matcher: &mut ArgMatcher, has_eq: bool, @@ -1032,7 +1021,7 @@ impl<'cmd> Parser<'cmd> { }) } } else if let Some(v) = attached_value { - let arg_values = vec![v.to_os_str().into_owned()]; + let arg_values = vec![v.to_owned()]; let trailing_idx = None; let react_result = ok!(self.react( Some(ident), @@ -1054,13 +1043,8 @@ impl<'cmd> Parser<'cmd> { } } - fn check_terminator(&self, arg: &Arg, val: &RawOsStr) -> Option { - if Some(val) - == arg - .terminator - .as_ref() - .map(|s| RawOsStr::from_str(s.as_str())) - { + fn check_terminator(&self, arg: &Arg, val: &OsStr) -> Option { + if Some(val) == arg.terminator.as_ref().map(|s| OsStr::new(s.as_str())) { debug!("Parser::check_terminator: terminator={:?}", arg.terminator); Some(ParseResult::ValuesDone) } else { @@ -1156,17 +1140,17 @@ impl<'cmd> Parser<'cmd> { if self.cmd.is_dont_delimit_trailing_values_set() && trailing_idx == Some(0) { // Nothing to do } else { + 
let mut val_delim_buffer = [0; 4]; + let val_delim = val_delim.encode_utf8(&mut val_delim_buffer); let mut split_raw_vals = Vec::with_capacity(raw_vals.len()); for (i, raw_val) in raw_vals.into_iter().enumerate() { - let raw_val = RawOsString::new(raw_val); if !raw_val.contains(val_delim) || (self.cmd.is_dont_delimit_trailing_values_set() && trailing_idx == Some(i)) { - split_raw_vals.push(raw_val.into_os_string()); + split_raw_vals.push(raw_val); } else { - split_raw_vals - .extend(raw_val.split(val_delim).map(|x| x.to_os_str().into_owned())); + split_raw_vals.extend(raw_val.split(val_delim).map(|x| x.to_owned())); } } raw_vals = split_raw_vals diff --git a/clap_complete/Cargo.toml b/clap_complete/Cargo.toml index a8c41616bb4..d60e4984c09 100644 --- a/clap_complete/Cargo.toml +++ b/clap_complete/Cargo.toml @@ -35,7 +35,6 @@ bench = false clap = { path = "../", version = "4.1.0", default-features = false, features = ["std"] } clap_lex = { path = "../clap_lex", version = "0.3.0", optional = true } is_executable = { version = "1.0.1", optional = true } -os_str_bytes = { version = "6.0.0", default-features = false, features = ["raw_os_str"], optional = true } pathdiff = { version = "0.2.1", optional = true } shlex = { version = "1.1.0", optional = true } unicode-xid = { version = "0.2.2", optional = true } @@ -52,5 +51,5 @@ required-features = ["unstable-dynamic"] [features] default = [] -unstable-dynamic = ["dep:clap_lex", "dep:shlex", "dep:unicode-xid", "dep:os_str_bytes", "clap/derive", "dep:is_executable", "dep:pathdiff"] +unstable-dynamic = ["dep:clap_lex", "dep:shlex", "dep:unicode-xid", "clap/derive", "dep:is_executable", "dep:pathdiff"] debug = ["clap/debug"] diff --git a/clap_complete/src/dynamic.rs b/clap_complete/src/dynamic.rs index 929841ec8aa..6c0881c3557 100644 --- a/clap_complete/src/dynamic.rs +++ b/clap_complete/src/dynamic.rs @@ -2,9 +2,11 @@ /// Complete commands within bash pub mod bash { + use std::ffi::OsStr; use std::ffi::OsString; use 
std::io::Write; + use clap_lex::OsStrExt as _; use unicode_xid::UnicodeXID; #[derive(clap::Subcommand)] @@ -320,11 +322,7 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES return complete_arg(&arg, current_cmd, current_dir, pos_index, is_escaped); } - debug!( - "complete::next: Begin parsing '{:?}' ({:?})", - arg.to_value_os(), - arg.to_value_os().as_raw_bytes() - ); + debug!("complete::next: Begin parsing '{:?}'", arg.to_value_os(),); if let Ok(value) = arg.to_value() { if let Some(next_cmd) = current_cmd.find_subcommand(value) { @@ -370,28 +368,23 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES if !is_escaped { if let Some((flag, value)) = arg.to_long() { - if let Ok(flag) = flag { - if let Some(value) = value { - if let Some(arg) = cmd.get_arguments().find(|a| a.get_long() == Some(flag)) - { - completions.extend( - complete_arg_value(value.to_str().ok_or(value), arg, current_dir) - .into_iter() - .map(|os| { - // HACK: Need better `OsStr` manipulation - format!("--{}={}", flag, os.to_string_lossy()).into() - }), - ) - } - } else { + if let Some(value) = value { + if let Some(arg) = cmd.get_arguments().find(|a| a.get_long() == Some(flag)) { completions.extend( - crate::generator::utils::longs_and_visible_aliases(cmd) + complete_arg_value(value.to_str().ok_or(value), arg, current_dir) .into_iter() - .filter_map(|f| { - f.starts_with(flag).then(|| format!("--{}", f).into()) + .map(|os| { + // HACK: Need better `OsStr` manipulation + format!("--{}={}", flag, os.to_string_lossy()).into() }), - ); + ) } + } else { + completions.extend( + crate::generator::utils::longs_and_visible_aliases(cmd) + .into_iter() + .filter_map(|f| f.starts_with(flag).then(|| format!("--{}", f).into())), + ); } } else if arg.is_escape() || arg.is_stdio() || arg.is_empty() { // HACK: Assuming knowledge of is_escape / is_stdio @@ -408,7 +401,7 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES crate::generator::utils::shorts_and_visible_aliases(cmd) .into_iter() // HACK: Need 
better `OsStr` manipulation - .map(|f| format!("{}{}", arg.to_value_os().to_str_lossy(), f).into()), + .map(|f| format!("{}{}", arg.to_value_os().to_string_lossy(), f).into()), ); } } @@ -428,7 +421,7 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES } fn complete_arg_value( - value: Result<&str, &clap_lex::RawOsStr>, + value: Result<&str, &OsStr>, arg: &clap::Arg, current_dir: Option<&std::path::Path>, ) -> Vec { @@ -444,7 +437,7 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES } } else { let value_os = match value { - Ok(value) => clap_lex::RawOsStr::from_str(value), + Ok(value) => OsStr::new(value), Err(value_os) => value_os, }; match arg.get_value_hint() { @@ -485,7 +478,7 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES } fn complete_path( - value_os: &clap_lex::RawOsStr, + value_os: &OsStr, current_dir: Option<&std::path::Path>, is_wanted: impl Fn(&std::path::Path) -> bool, ) -> Vec { @@ -499,10 +492,11 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES } }; let (existing, prefix) = value_os - .split_once('\\') - .unwrap_or((clap_lex::RawOsStr::from_str(""), value_os)); - let root = current_dir.join(existing.to_os_str()); + .split_once("\\") + .unwrap_or((OsStr::new(""), value_os)); + let root = current_dir.join(existing); debug!("complete_path: root={:?}, prefix={:?}", root, prefix); + let prefix = prefix.to_string_lossy(); for entry in std::fs::read_dir(&root) .ok() @@ -510,8 +504,8 @@ complete OPTIONS -F _clap_complete_NAME EXECUTABLES .flatten() .filter_map(Result::ok) { - let raw_file_name = clap_lex::RawOsString::new(entry.file_name()); - if !raw_file_name.starts_with_os(prefix) { + let raw_file_name = OsString::from(entry.file_name()); + if !raw_file_name.starts_with(&prefix) { continue; } diff --git a/clap_lex/Cargo.toml b/clap_lex/Cargo.toml index da2d1e7351f..3729e459b1b 100644 --- a/clap_lex/Cargo.toml +++ b/clap_lex/Cargo.toml @@ -28,6 +28,3 @@ pre-release-replacements = [ [lib] bench = false - -[dependencies] -os_str_bytes = 
{ version = "6.0.0", default-features = false, features = ["raw_os_str"] } diff --git a/clap_lex/src/ext.rs b/clap_lex/src/ext.rs new file mode 100644 index 00000000000..6d319988210 --- /dev/null +++ b/clap_lex/src/ext.rs @@ -0,0 +1,321 @@ +use std::ffi::OsStr; + +pub trait OsStrExt: private::Sealed { + /// Converts to a string slice. + fn try_str(&self) -> Result<&str, std::str::Utf8Error>; + /// Returns `true` if the given pattern matches a sub-slice of + /// this string slice. + /// + /// Returns `false` if it does not. + /// + /// # Examples + /// + /// ```rust + /// use clap_lex::OsStrExt as _; + /// let bananas = std::ffi::OsStr::new("bananas"); + /// + /// assert!(bananas.contains("nana")); + /// assert!(!bananas.contains("apples")); + /// ``` + fn contains(&self, needle: &str) -> bool; + /// Returns the byte index of the first character of this string slice that + /// matches the pattern. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// # Examples + /// + /// ```rust + /// use clap_lex::OsStrExt as _; + /// let s = std::ffi::OsStr::new("Löwe 老虎 Léopard Gepardi"); + /// + /// assert_eq!(s.find("L"), Some(0)); + /// assert_eq!(s.find("é"), Some(14)); + /// assert_eq!(s.find("par"), Some(17)); + /// ``` + /// + /// Not finding the pattern: + /// + /// ```rust + /// use clap_lex::OsStrExt as _; + /// let s = std::ffi::OsStr::new("Löwe 老虎 Léopard"); + /// + /// assert_eq!(s.find("1"), None); + /// ``` + fn find(&self, needle: &str) -> Option; + /// Returns a string slice with the prefix removed. + /// + /// If the string starts with the pattern `prefix`, returns substring after the prefix, wrapped + /// in `Some`. + /// + /// If the string does not start with `prefix`, returns `None`. 
+ /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// assert_eq!(OsStr::new("foo:bar").strip_prefix("foo:"), Some(OsStr::new("bar"))); + /// assert_eq!(OsStr::new("foo:bar").strip_prefix("bar"), None); + /// assert_eq!(OsStr::new("foofoo").strip_prefix("foo"), Some(OsStr::new("foo"))); + /// ``` + fn strip_prefix(&self, prefix: &str) -> Option<&OsStr>; + /// Returns `true` if the given pattern matches a prefix of this + /// string slice. + /// + /// Returns `false` if it does not. + /// + /// # Examples + /// + /// ``` + /// use clap_lex::OsStrExt as _; + /// let bananas = std::ffi::OsStr::new("bananas"); + /// + /// assert!(bananas.starts_with("bana")); + /// assert!(!bananas.starts_with("nana")); + /// ``` + fn starts_with(&self, prefix: &str) -> bool; + /// An iterator over substrings of this string slice, separated by + /// characters matched by a pattern. + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// let v: Vec<_> = OsStr::new("Mary had a little lamb").split(" ").collect(); + /// assert_eq!(v, [OsStr::new("Mary"), OsStr::new("had"), OsStr::new("a"), OsStr::new("little"), OsStr::new("lamb")]); + /// + /// let v: Vec<_> = OsStr::new("").split("X").collect(); + /// assert_eq!(v, [OsStr::new("")]); + /// + /// let v: Vec<_> = OsStr::new("lionXXtigerXleopard").split("X").collect(); + /// assert_eq!(v, [OsStr::new("lion"), OsStr::new(""), OsStr::new("tiger"), OsStr::new("leopard")]); + /// + /// let v: Vec<_> = OsStr::new("lion::tiger::leopard").split("::").collect(); + /// assert_eq!(v, [OsStr::new("lion"), OsStr::new("tiger"), OsStr::new("leopard")]); + /// ``` + /// + /// If a string contains multiple contiguous separators, you will end up + /// with empty strings in the output: + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// let x = OsStr::new("||||a||b|c"); + /// let d: Vec<_> = 
x.split("|").collect(); + /// + /// assert_eq!(d, &[OsStr::new(""), OsStr::new(""), OsStr::new(""), OsStr::new(""), OsStr::new("a"), OsStr::new(""), OsStr::new("b"), OsStr::new("c")]); + /// ``` + /// + /// Contiguous separators are separated by the empty string. + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// let x = OsStr::new("(///)"); + /// let d: Vec<_> = x.split("/").collect(); + /// + /// assert_eq!(d, &[OsStr::new("("), OsStr::new(""), OsStr::new(""), OsStr::new(")")]); + /// ``` + /// + /// Separators at the start or end of a string are neighbored + /// by empty strings. + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// let d: Vec<_> = OsStr::new("010").split("0").collect(); + /// assert_eq!(d, &[OsStr::new(""), OsStr::new("1"), OsStr::new("")]); + /// ``` + /// + /// When the empty string is used as a separator, it panics + /// + /// ```should_panic + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// let f: Vec<_> = OsStr::new("rust").split("").collect(); + /// assert_eq!(f, &[OsStr::new(""), OsStr::new("r"), OsStr::new("u"), OsStr::new("s"), OsStr::new("t"), OsStr::new("")]); + /// ``` + /// + /// Contiguous separators can lead to possibly surprising behavior + /// when whitespace is used as the separator. This code is correct: + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// let x = OsStr::new(" a b c"); + /// let d: Vec<_> = x.split(" ").collect(); + /// + /// assert_eq!(d, &[OsStr::new(""), OsStr::new(""), OsStr::new(""), OsStr::new(""), OsStr::new("a"), OsStr::new(""), OsStr::new("b"), OsStr::new("c")]); + /// ``` + /// + /// It does _not_ give you: + /// + /// ```,ignore + /// assert_eq!(d, &[OsStr::new("a"), OsStr::new("b"), OsStr::new("c")]); + /// ``` + /// + /// Use [`split_whitespace`] for this behavior. 
+ /// + /// [`split_whitespace`]: str::split_whitespace + fn split<'s, 'n>(&'s self, needle: &'n str) -> Split<'s, 'n>; + /// Divide one string slice into two at an index. + /// + /// The argument, `mid`, should be a byte offset from the start of the + /// string. It must also be on the boundary of a UTF-8 code point. + /// + /// The two slices returned go from the start of the string slice to `mid`, + /// and from `mid` to the end of the string slice. + /// + /// To get mutable string slices instead, see the [`split_at_mut`] + /// method. + /// + /// [`split_at_mut`]: str::split_at_mut + /// + /// # Panics + /// + /// Panics if `mid` is not on a UTF-8 code point boundary, or if it is + /// past the end of the last code point of the string slice. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use clap_lex::OsStrExt as _; + /// let s = std::ffi::OsStr::new("Per Martin-Löf"); + /// + /// let (first, last) = s.split_at(3); + /// + /// assert_eq!("Per", first); + /// assert_eq!(" Martin-Löf", last); + /// ``` + fn split_at(&self, index: usize) -> (&OsStr, &OsStr); + /// Splits the string on the first occurrence of the specified delimiter and + /// returns prefix before delimiter and suffix after delimiter. 
+ /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// use clap_lex::OsStrExt as _; + /// assert_eq!(OsStr::new("cfg").split_once("="), None); + /// assert_eq!(OsStr::new("cfg=").split_once("="), Some((OsStr::new("cfg"), OsStr::new("")))); + /// assert_eq!(OsStr::new("cfg=foo").split_once("="), Some((OsStr::new("cfg"), OsStr::new("foo")))); + /// assert_eq!(OsStr::new("cfg=foo=bar").split_once("="), Some((OsStr::new("cfg"), OsStr::new("foo=bar")))); + /// ``` + fn split_once(&self, needle: &'_ str) -> Option<(&OsStr, &OsStr)>; +} + +impl OsStrExt for OsStr { + fn try_str(&self) -> Result<&str, std::str::Utf8Error> { + let bytes = to_bytes(self); + std::str::from_utf8(bytes) + } + + fn contains(&self, needle: &str) -> bool { + self.find(needle).is_some() + } + + fn find(&self, needle: &str) -> Option { + (0..=self.len().checked_sub(needle.len())?) + .find(|&x| to_bytes(self)[x..].starts_with(needle.as_bytes())) + } + + fn strip_prefix(&self, prefix: &str) -> Option<&OsStr> { + to_bytes(self) + .strip_prefix(prefix.as_bytes()) + .map(to_os_str) + } + fn starts_with(&self, prefix: &str) -> bool { + to_bytes(self).starts_with(prefix.as_bytes()) + } + + fn split<'s, 'n>(&'s self, needle: &'n str) -> Split<'s, 'n> { + assert_ne!(needle, ""); + Split { + haystack: Some(self), + needle, + } + } + + fn split_at(&self, index: usize) -> (&OsStr, &OsStr) { + let (first, second) = to_bytes(self).split_at(index); + (to_os_str(first), to_os_str(second)) + } + + fn split_once(&self, needle: &'_ str) -> Option<(&OsStr, &OsStr)> { + let start = self.find(needle)?; + let end = start + needle.len(); + let haystack = to_bytes(self); + let first = &haystack[0..start]; + let second = &haystack[end..]; + Some((to_os_str(first), to_os_str(second))) + } +} + +mod private { + pub trait Sealed {} + + impl Sealed for std::ffi::OsStr {} +} + +/// Allow access to raw bytes +/// +/// **Note:** the bytes only make sense when compared with ASCII or `&str` +/// +/// **Note:** This 
must never be serialized as there is no guarantee at how invalid UTF-8 will be +/// encoded, even within the same version of this crate (since its dependent on rustc version) +fn to_bytes(s: &OsStr) -> &[u8] { + // SAFETY: + // - Lifetimes are the same + // - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`) + // - The primary contract is that the encoding for invalid surrogate code points is not + // guaranteed which isn't a problem here + // + // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290) + // but its in limbo + unsafe { std::mem::transmute(s) } +} + +/// Restore raw bytes as `OsStr` +fn to_os_str(s: &[u8]) -> &OsStr { + // SAFETY: + // - Lifetimes are the same + // - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`) + // - The primary contract is that the encoding for invalid surrogate code points is not + // guaranteed which isn't a problem here + // + // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290) + // but its in limbo + unsafe { std::mem::transmute(s) } +} + +pub struct Split<'s, 'n> { + haystack: Option<&'s OsStr>, + needle: &'n str, +} + +impl<'s, 'n> Iterator for Split<'s, 'n> { + type Item = &'s OsStr; + + fn next(&mut self) -> Option { + let haystack = self.haystack?; + match haystack.split_once(self.needle) { + Some((first, second)) => { + if !haystack.is_empty() { + debug_assert_ne!(haystack, second); + } + self.haystack = Some(second); + Some(first) + } + None => { + self.haystack = None; + Some(haystack) + } + } + } +} diff --git a/clap_lex/src/lib.rs b/clap_lex/src/lib.rs index bc9720fc570..333a99aa511 100644 --- a/clap_lex/src/lib.rs +++ b/clap_lex/src/lib.rs @@ -8,6 +8,7 @@ //! //! ```rust //! use std::path::PathBuf; +//! use std::ffi::OsStr; //! //! type BoxedError = Box; //! @@ -26,7 +27,7 @@ //! } //! //! impl Color { -//! fn parse(s: Option<&clap_lex::RawOsStr>) -> Result { +//! 
fn parse(s: Option<&OsStr>) -> Result { //! let s = s.map(|s| s.to_str().ok_or(s)); //! match s { //! Some(Ok("always")) | Some(Ok("")) | None => { @@ -64,13 +65,13 @@ //! args.paths.push(PathBuf::from("-")); //! } else if let Some((long, value)) = arg.to_long() { //! match long { -//! Ok("verbose") => { +//! "verbose" => { //! if let Some(value) = value { //! return Err(format!("`--verbose` does not take a value, got `{:?}`", value).into()); //! } //! args.verbosity += 1; //! } -//! Ok("color") => { +//! "color" => { //! args.color = Color::parse(value)?; //! } //! _ => { @@ -93,12 +94,12 @@ //! return Err(format!("Unexpected flag: -{}", c).into()); //! } //! Err(e) => { -//! return Err(format!("Unexpected flag: -{}", e.to_str_lossy()).into()); +//! return Err(format!("Unexpected flag: -{}", e.to_string_lossy()).into()); //! } //! } //! } //! } else { -//! args.paths.push(PathBuf::from(arg.to_value_os().to_os_str().into_owned())); +//! args.paths.push(PathBuf::from(arg.to_value_os().to_owned())); //! } //! } //! @@ -109,13 +110,14 @@ //! println!("{:?}", args); //! 
``` +mod ext; + use std::ffi::OsStr; use std::ffi::OsString; pub use std::io::SeekFrom; -pub use os_str_bytes::RawOsStr; -pub use os_str_bytes::RawOsString; +pub use ext::OsStrExt; /// Command-line arguments #[derive(Default, Clone, Debug, PartialEq, Eq)] @@ -275,30 +277,27 @@ impl ArgCursor { /// Command-line Argument #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct ParsedArg<'s> { - inner: std::borrow::Cow<'s, RawOsStr>, - utf8: Option<&'s str>, + inner: &'s OsStr, } impl<'s> ParsedArg<'s> { fn new(inner: &'s OsStr) -> Self { - let utf8 = inner.to_str(); - let inner = RawOsStr::new(inner); - Self { inner, utf8 } + Self { inner } } /// Argument is length of 0 pub fn is_empty(&self) -> bool { - self.inner.as_ref().is_empty() + self.inner.is_empty() } /// Does the argument look like a stdio argument (`-`) pub fn is_stdio(&self) -> bool { - self.inner.as_ref() == "-" + self.inner == "-" } /// Does the argument look like an argument escape (`--`) pub fn is_escape(&self) -> bool { - self.inner.as_ref() == "--" + self.inner == "--" } /// Does the argument look like a number @@ -309,56 +308,38 @@ impl<'s> ParsedArg<'s> { } /// Treat as a long-flag - pub fn to_long(&self) -> Option<(Result<&str, &RawOsStr>, Option<&RawOsStr>)> { - if let Some(raw) = self.utf8 { - let remainder = raw.strip_prefix("--")?; - if remainder.is_empty() { - debug_assert!(self.is_escape()); - return None; - } + pub fn to_long(&self) -> Option<(&str, Option<&OsStr>)> { + let raw = self.inner; + let remainder = raw.strip_prefix("--")?; + if remainder.is_empty() { + debug_assert!(self.is_escape()); + return None; + } - let (flag, value) = if let Some((p0, p1)) = remainder.split_once('=') { - (p0, Some(p1)) - } else { - (remainder, None) - }; - let flag = Ok(flag); - let value = value.map(RawOsStr::from_str); - Some((flag, value)) + let (flag, value) = if let Some((p0, p1)) = remainder.split_once("=") { + (p0, Some(p1)) } else { - let raw = self.inner.as_ref(); - let remainder 
= raw.strip_prefix("--")?; - if remainder.is_empty() { - debug_assert!(self.is_escape()); - return None; - } - - let (flag, value) = if let Some((p0, p1)) = remainder.split_once('=') { - (p0, Some(p1)) - } else { - (remainder, None) - }; - let flag = flag.to_str().ok_or(flag); - Some((flag, value)) - } + (remainder, None) + }; + let flag = flag.to_str()?; + Some((flag, value)) } /// Can treat as a long-flag pub fn is_long(&self) -> bool { - self.inner.as_ref().starts_with("--") && !self.is_escape() + self.inner.starts_with("--") && !self.is_escape() } /// Treat as a short-flag pub fn to_short(&self) -> Option> { - if let Some(remainder_os) = self.inner.as_ref().strip_prefix('-') { - if remainder_os.starts_with('-') { + if let Some(remainder_os) = self.inner.strip_prefix("-") { + if remainder_os.starts_with("-") { None } else if remainder_os.is_empty() { debug_assert!(self.is_stdio()); None } else { - let remainder = self.utf8.map(|s| &s[1..]); - Some(ShortFlags::new(remainder_os, remainder)) + Some(ShortFlags::new(remainder_os)) } } else { None @@ -367,48 +348,42 @@ impl<'s> ParsedArg<'s> { /// Can treat as a short-flag pub fn is_short(&self) -> bool { - self.inner.as_ref().starts_with('-') - && !self.is_stdio() - && !self.inner.as_ref().starts_with("--") + self.inner.starts_with("-") && !self.is_stdio() && !self.inner.starts_with("--") } /// Treat as a value /// /// **NOTE:** May return a flag or an escape. - pub fn to_value_os(&self) -> &RawOsStr { - self.inner.as_ref() + pub fn to_value_os(&self) -> &OsStr { + self.inner } /// Treat as a value /// /// **NOTE:** May return a flag or an escape. - pub fn to_value(&self) -> Result<&str, &RawOsStr> { - self.utf8.ok_or_else(|| self.inner.as_ref()) + pub fn to_value(&self) -> Result<&str, &OsStr> { + self.inner.to_str().ok_or(self.inner) } /// Safely print an argument that may contain non-UTF8 content /// /// This may perform lossy conversion, depending on the platform. 
If you would like an implementation which escapes the path please use Debug instead. pub fn display(&self) -> impl std::fmt::Display + '_ { - self.inner.to_str_lossy() + self.inner.to_string_lossy() } } /// Walk through short flags within a [`ParsedArg`] #[derive(Clone, Debug)] pub struct ShortFlags<'s> { - inner: &'s RawOsStr, + inner: &'s OsStr, utf8_prefix: std::str::CharIndices<'s>, - invalid_suffix: Option<&'s RawOsStr>, + invalid_suffix: Option<&'s OsStr>, } impl<'s> ShortFlags<'s> { - fn new(inner: &'s RawOsStr, utf8: Option<&'s str>) -> Self { - let (utf8_prefix, invalid_suffix) = if let Some(utf8) = utf8 { - (utf8, None) - } else { - split_nonutf8_once(inner) - }; + fn new(inner: &'s OsStr) -> Self { + let (utf8_prefix, invalid_suffix) = split_nonutf8_once(inner); let utf8_prefix = utf8_prefix.char_indices(); Self { inner, @@ -440,7 +415,7 @@ impl<'s> ShortFlags<'s> { /// Advance the iterator, returning the next short flag on success /// /// On error, returns the invalid-UTF8 value - pub fn next_flag(&mut self) -> Option> { + pub fn next_flag(&mut self) -> Option> { if let Some((_, flag)) = self.utf8_prefix.next() { return Some(Ok(flag)); } @@ -454,11 +429,11 @@ impl<'s> ShortFlags<'s> { } /// Advance the iterator, returning everything left as a value - pub fn next_value_os(&mut self) -> Option<&'s RawOsStr> { + pub fn next_value_os(&mut self) -> Option<&'s OsStr> { if let Some((index, _)) = self.utf8_prefix.next() { self.utf8_prefix = "".char_indices(); self.invalid_suffix = None; - return Some(&self.inner[index..]); + return Some(self.inner.split_at(index).1); } if let Some(suffix) = self.invalid_suffix { @@ -471,19 +446,19 @@ impl<'s> ShortFlags<'s> { } impl<'s> Iterator for ShortFlags<'s> { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { self.next_flag() } } -fn split_nonutf8_once(b: &RawOsStr) -> (&str, Option<&RawOsStr>) { - match std::str::from_utf8(b.as_raw_bytes()) { +fn split_nonutf8_once(b: &OsStr) -> (&str, 
Option<&OsStr>) { + match b.try_str() { Ok(s) => (s, None), Err(err) => { let (valid, after_valid) = b.split_at(err.valid_up_to()); - let valid = std::str::from_utf8(valid.as_raw_bytes()).unwrap(); + let valid = valid.try_str().unwrap(); (valid, Some(after_valid)) } } diff --git a/clap_lex/tests/parsed.rs b/clap_lex/tests/parsed.rs index 77a1dfb9c61..f1f36b38041 100644 --- a/clap_lex/tests/parsed.rs +++ b/clap_lex/tests/parsed.rs @@ -1,3 +1,5 @@ +use std::ffi::OsStr; + // Despite our design philosophy being to support completion generation, we aren't considering `-` // the start of a long because there is no valid value to return. #[test] @@ -34,7 +36,7 @@ fn to_long_no_value() { assert!(next.is_long()); let (key, value) = next.to_long().unwrap(); - assert_eq!(key, Ok("long")); + assert_eq!(key, "long"); assert_eq!(value, None); } @@ -48,8 +50,8 @@ fn to_long_with_empty_value() { assert!(next.is_long()); let (key, value) = next.to_long().unwrap(); - assert_eq!(key, Ok("long")); - assert_eq!(value, Some(clap_lex::RawOsStr::from_str(""))); + assert_eq!(key, "long"); + assert_eq!(value, Some(OsStr::new(""))); } #[test] @@ -62,8 +64,8 @@ fn to_long_with_value() { assert!(next.is_long()); let (key, value) = next.to_long().unwrap(); - assert_eq!(key, Ok("long")); - assert_eq!(value, Some(clap_lex::RawOsStr::from_str("hello"))); + assert_eq!(key, "long"); + assert_eq!(value, Some(OsStr::new("hello"))); } #[test] diff --git a/clap_lex/tests/shorts.rs b/clap_lex/tests/shorts.rs index 3adc4e3dedc..538afa460e5 100644 --- a/clap_lex/tests/shorts.rs +++ b/clap_lex/tests/shorts.rs @@ -37,7 +37,7 @@ fn next_value_os() { let next = raw.next(&mut cursor).unwrap(); let mut shorts = next.to_short().unwrap(); - let actual = shorts.next_value_os().unwrap().to_str_lossy(); + let actual = shorts.next_value_os().unwrap().to_string_lossy(); assert_eq!(actual, "short"); } @@ -51,7 +51,7 @@ fn next_flag_with_value() { let mut shorts = next.to_short().unwrap(); 
assert_eq!(shorts.next_flag().unwrap().unwrap(), 's'); - let actual = shorts.next_value_os().unwrap().to_str_lossy(); + let actual = shorts.next_value_os().unwrap().to_string_lossy(); assert_eq!(actual, "hort"); }