From 24e9ecd599e7940eebbcef019d513afd583b2eca Mon Sep 17 00:00:00 2001 From: JMA Date: Fri, 28 Oct 2022 13:27:17 -0500 Subject: [PATCH 1/2] Add split_inclusive --- src/re_bytes.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++ src/re_unicode.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index d71969257..24917c810 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -318,6 +318,31 @@ impl Regex { Split { finder: self.find_iter(text), last: 0 } } + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Each element of the iterator will include the + /// delimiting match if it appears at the beginning of the element. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by fruit and include the fruit: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(apple|banana|pear)").unwrap(); + /// let fields: Vec<&[u8]> = re.split_inclusive(b"apples: 3 bananas: 2 pears: 4").collect(); + /// assert_eq!(fields, vec![&b""[..], &b"apples: 3 "[..], &b"bananas: 2 "[..], &b"pears: 4"[..]]); + /// # } + /// ``` + pub fn split_inclusive<'r, 't>( + &'r self, + text: &'t [u8], + ) -> SplitInclusive<'r, 't> { + SplitInclusive { finder: self.find_iter(text), last: 0 } + } + /// Returns an iterator of at most `limit` substrings of `text` delimited /// by a match of the regular expression. (A `limit` of `0` will return no /// substrings.) Namely, each element of the iterator corresponds to text @@ -767,6 +792,43 @@ impl<'r, 't> Iterator for Split<'r, 't> { impl<'r, 't> FusedIterator for Split<'r, 't> {} +/// Yields all substrings delimited by a regular expression match inclusive of +/// the match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. 
+#[derive(Debug)] +pub struct SplitInclusive<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for SplitInclusive<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.start(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for SplitInclusive<'r, 't> {} + /// Yields at most `N` substrings delimited by a regular expression match. /// /// The last substring will be whatever remains after splitting. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 60d81a7d9..4797ce595 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -371,6 +371,34 @@ impl Regex { Split { finder: self.find_iter(text), last: 0 } } + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Each element of the iterator will include the + /// delimiting match if it appears at the beginning of the element. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by fruit and include the fruit: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(apple|banana|pear)").unwrap(); + /// let fields: Vec<&str> = re + /// .split_inclusive("apples: 3 bananas: 2 pears: 4") + /// .map(|s| s.trim()) + /// .collect(); + /// assert_eq!(fields, vec!["", "apples: 3", "bananas: 2", "pears: 4"]); + /// # } + /// ``` + pub fn split_inclusive<'r, 't>( + &'r self, + text: &'t str, + ) -> SplitInclusive<'r, 't> { + SplitInclusive { finder: self.find_iter(text), last: 0 } + } + /// Returns an iterator of at most `limit` substrings of `text` delimited /// by a match of the regular expression. 
(A `limit` of `0` will return no /// substrings.) Namely, each element of the iterator corresponds to text @@ -809,6 +837,43 @@ impl<'r, 't> Iterator for Split<'r, 't> { impl<'r, 't> FusedIterator for Split<'r, 't> {} +/// Yields all substrings delimited by a regular expression match inclusive of +/// the match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. +#[derive(Debug)] +pub struct SplitInclusive<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for SplitInclusive<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.start(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for SplitInclusive<'r, 't> {} + /// Yields at most `N` substrings delimited by a regular expression match. /// /// The last substring will be whatever remains after splitting. From 857e063725a23f0aa34b9da4cd77bebabbd68274 Mon Sep 17 00:00:00 2001 From: JMA Date: Fri, 28 Oct 2022 14:11:10 -0500 Subject: [PATCH 2/2] Update documentation, fix terminator This changeset adjusts the documentation to more closely match that found in std for split_inclusive. It also changes the behavior such that the matched substring appears at the end of each element as a terminator rather than at the head of each element. Sorry; I never actually *read* the split_inclusive docs in std.
--- src/re_bytes.rs | 24 ++++++++++++++---------- src/re_unicode.rs | 23 ++++++++++------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 24917c810..9e10efbae 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -318,22 +318,26 @@ impl Regex { Split { finder: self.find_iter(text), last: 0 } } - /// Returns an iterator of substrings of `text` delimited by a match of the - /// regular expression. Each element of the iterator will include the - /// delimiting match if it appears at the beginning of the element. + /// Returns an iterator of substrings of `text` separated by a match of the + /// regular expression. Differs from the iterator produced by split in that + /// split_inclusive leaves the matched part as the terminator of the + /// substring. /// /// This method will *not* copy the text given. /// /// # Example /// - /// To split a string delimited by fruit and include the fruit: - /// /// ```rust /// # use regex::bytes::Regex; /// # fn main() { - /// let re = Regex::new(r"(apple|banana|pear)").unwrap(); - /// let fields: Vec<&[u8]> = re.split_inclusive(b"apples: 3 bananas: 2 pears: 4").collect(); - /// assert_eq!(fields, vec![&b""[..], &b"apples: 3 "[..], &b"bananas: 2 "[..], &b"pears: 4"[..]]); + /// let re = Regex::new(r"\r?\n").unwrap(); + /// let text = b"Mary had a little lamb\nlittle lamb\r\nlittle lamb."; + /// let v: Vec<&[u8]> = re.split_inclusive(text).collect(); + /// assert_eq!(v, [ + /// &b"Mary had a little lamb\n"[..], + /// &b"little lamb\r\n"[..], + /// &b"little lamb."[..]
+ /// ]); /// # } /// ``` pub fn split_inclusive<'r, 't>( @@ -819,8 +823,8 @@ impl<'r, 't> Iterator for SplitInclusive<'r, 't> { } } Some(m) => { - let matched = &text[self.last..m.start()]; - self.last = m.start(); + let matched = &text[self.last..m.end()]; + self.last = m.end(); Some(matched) } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 4797ce595..667e032f5 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -371,25 +371,22 @@ impl Regex { Split { finder: self.find_iter(text), last: 0 } } - /// Returns an iterator of substrings of `text` delimited by a match of the - /// regular expression. Each element of the iterator will include the - /// delimiting match if it appears at the beginning of the element. + /// Returns an iterator of substrings of `text` separated by a match of the + /// regular expression. Differs from the iterator produced by split in that + /// split_inclusive leaves the matched part as the terminator of the + /// substring. /// /// This method will *not* copy the text given. /// /// # Example /// - /// To split a string delimited by fruit and include the fruit: - /// /// ```rust /// # use regex::Regex; /// # fn main() { - /// let re = Regex::new(r"(apple|banana|pear)").unwrap(); - /// let fields: Vec<&str> = re - /// .split_inclusive("apples: 3 bananas: 2 pears: 4") - /// .map(|s| s.trim()) - /// .collect(); - /// assert_eq!(fields, vec!["", "apples: 3", "bananas: 2", "pears: 4"]); + /// let re = Regex::new(r"\r?\n").unwrap(); + /// let text = "Mary had a little lamb\nlittle lamb\r\nlittle lamb."; + /// let v: Vec<&str> = re.split_inclusive(text).collect(); + /// assert_eq!(v, ["Mary had a little lamb\n", "little lamb\r\n", "little lamb."]); /// # } /// ``` pub fn split_inclusive<'r, 't>( @@ -864,8 +861,8 @@ impl<'r, 't> Iterator for SplitInclusive<'r, 't> { } } Some(m) => { - let matched = &text[self.last..m.start()]; - self.last = m.start(); + let matched = &text[self.last..m.end()]; + self.last = m.end(); Some(matched) } }