diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index ce3bae0fa..93fe6c0bb 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -892,6 +892,45 @@ impl Regex {
     ) -> SplitN<'r, 'h> {
         SplitN { splits: self.split(input), limit }
     }
+
+    /// Returns an iterator of spans that covers the whole haystack by
+    /// alternating between the text around each match and the match itself.
+    ///
+    /// For every match, the iterator first yields the span running from the
+    /// end of the previous match (or offset `0`) up to the start of the
+    /// match, and then yields the span of the match. Once matches are
+    /// exhausted, one final span running to the end of the haystack is
+    /// yielded. Spans of the text between matches may be empty, e.g. when
+    /// the haystack starts or ends with a match or when two matches are
+    /// adjacent.
+    ///
+    /// Unlike `str::split_inclusive` in the standard library, a match is not
+    /// appended to the piece preceding it; it is yielded as its own span.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{meta::Regex, Span};
+    ///
+    /// let re = Regex::new(r"x")?;
+    /// let spans: Vec<Span> = re.split_inclusive("1x2").collect();
+    /// assert_eq!(
+    ///     spans,
+    ///     vec![Span::from(0..1), Span::from(1..2), Span::from(2..3)],
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn split_inclusive<'r, 'h, I: Into<Input<'h>>>(
+        &'r self,
+        input: I,
+    ) -> SplitInclusive<'r, 'h> {
+        SplitInclusive {
+            finder: self.find_iter(input),
+            last: 0,
+            span_to_yield: None,
+        }
+    }
 }
 
 /// Lower level search routines that give more control.
@@ -2278,6 +2317,75 @@ impl<'r, 'h> Iterator for SplitN<'r, 'h> {
 
 impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
 
+/// An iterator over the spans produced by splitting a haystack around every
+/// match of a regex, with the spans of the matches themselves included.
+///
+/// Each yielded [`Span`] is either a stretch of the haystack lying between
+/// matches or the span of a match; the two kinds alternate, starting with
+/// the (possibly empty) text before the first match.
+///
+/// This iterator is created by [`Regex::split_inclusive`].
+#[derive(Debug)]
+pub struct SplitInclusive<'r, 'h> {
+    /// The underlying iterator over successive matches.
+    finder: FindMatches<'r, 'h>,
+    /// The offset at which the next non-match span begins. It is pushed
+    /// past the end of the haystack once the final span has been yielded.
+    last: usize,
+    /// The span of the most recent match, held back so that it is yielded
+    /// right after the text preceding it.
+    span_to_yield: Option<Span>,
+}
+
+impl<'r, 'h> SplitInclusive<'r, 'h> {
+    /// Returns the current `Input` associated with this iterator.
+    ///
+    /// The `start` position on the given `Input` may change during iteration,
+    /// but all other values are guaranteed to remain invariant.
+    #[inline]
+    pub fn input<'s>(&'s self) -> &'s Input<'h> {
+        self.finder.input()
+    }
+}
+
+impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> {
+    type Item = Span;
+
+    fn next(&mut self) -> Option<Span> {
+        // A match recorded by the previous call is yielded before the
+        // search is advanced any further.
+        if let Some(span) = self.span_to_yield.take() {
+            return Some(span);
+        }
+
+        match self.finder.next() {
+            None => {
+                let len = self.input().haystack().len();
+                if self.last > len {
+                    None
+                } else {
+                    // Yield whatever follows the last match (possibly
+                    // nothing), then move `last` past the end of the
+                    // haystack so the check above stops later calls.
+                    let span = Span::from(self.last..len);
+                    self.last = len + 1;
+                    Some(span)
+                }
+            }
+            Some(m) => {
+                // Yield the text preceding the match now and stash the
+                // match itself for the next call.
+                let span = Span::from(self.last..m.start());
+                self.span_to_yield = Some(Span::from(m.start()..m.end()));
+                self.last = m.end();
+                Some(span)
+            }
+        }
+    }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for SplitInclusive<'r, 'h> {}
+
 /// Represents mutable scratch space used by regex engines during a search.
 ///
 /// Most of the regex engines in this crate require some kind of
@@ -3646,4 +3754,51 @@ mod tests {
         let re = Regex::new(r"[a-zA-Z]+ing").unwrap();
         assert_eq!(1, re.find_iter("tingling").count());
     }
+
+    #[test]
+    fn split_inclusive() {
+        let arr = [
+            (r"(x)", "1x2", vec!["1", "x", "2"]),
+            (r"([^0-9]+)", "1-2", vec!["1", "-", "2"]),
+            (
+                r"([\r\n\t\f\v ]+)", // equivalent to r"(\s+)"
+                "this is a \n\ntest",
+                vec!["this", " ", "is", " ", "a", " \n\n", "test"],
+            ),
+            (
+                r"([^0-9a-zA-Z_])",
+                " C# is great! (not actually) :)",
+                vec![
+                    "", " ", "C", "#", "", " ", "is", " ", "great", "!", "",
+                    " ", "", "(", "not", " ", "actually", ")", "", " ", "",
+                    ":", "", ")", "",
+                ],
+            ),
+            (
+                r"([a-zA-Z](?:[a-zA-Z']*[a-zA-Z])?)",
+                r#"He said, "I'd like to eat cake!""#,
+                vec![
+                    "", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ",
+                    "to", " ", "eat", " ", "cake", r#"!""#,
+                ],
+            ),
+        ];
+        for (pattern, text, expected_output) in arr {
+            let out: Vec<_> = Regex::new(pattern)
+                .unwrap()
+                .split_inclusive(text)
+                .map(|sp| &text[sp])
+                .collect();
+
+            assert_eq!(
+                out, expected_output,
+                "Regex: {}, Input: {}",
+                pattern, text
+            );
+
+            // Concatenating every piece, in order, must reproduce the
+            // original text.
+            assert_eq!(out.join(""), text);
+        }
+    }
 }