From 2b1b06b169ed186714ff71935ea418e2397c5de2 Mon Sep 17 00:00:00 2001 From: Shneor <770elmo@gmail.com> Date: Fri, 6 Oct 2023 01:19:09 +0200 Subject: [PATCH 1/5] Added split_inclusive() --- regex-automata/src/meta/regex.rs | 103 +++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index ce3bae0fa..4a04fa948 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -892,6 +892,14 @@ impl Regex { ) -> SplitN<'r, 'h> { SplitN { splits: self.split(input), limit } } + + /// TODO: add docs + pub fn split_inclusive<'r, 'h, I: Into>>( + &'r self, + input: I, + ) -> SplitInclusive<'r, 'h> { + SplitInclusive { finder: self.find_iter(input), last: 0, span_to_yield: None } + } } /// Lower level search routines that give more control. @@ -2278,6 +2286,58 @@ impl<'r, 'h> Iterator for SplitN<'r, 'h> { impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} +/// TODO: add docs +#[derive(Debug)] +pub struct SplitInclusive<'r, 'h> { + finder: FindMatches<'r, 'h>, + last: usize, + span_to_yield: Option, +} + +impl<'r, 'h> SplitInclusive<'r, 'h> { + /// Returns the current `Input` associated with this iterator. + /// + /// The `start` position on the given `Input` may change during iteration, + /// but all other values are guaranteed to remain invariant. + #[inline] + pub fn input<'s>(&'s self) -> &'s Input<'h> { + self.finder.input() + } +} + +impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> { + type Item = Span; + + fn next(&mut self) -> Option { + if let Some(span) = self.span_to_yield { + self.span_to_yield = None; + return Some(span) + } + + match self.finder.next() { + None => { + let len = self.input().haystack().len(); + if self.last > len { + None + } else { + let span = Span::from(self.last..len); + self.last = len + 1; // Next call will return None + Some(span) + } + }, + Some(m) => { + let span = Span::from(self.last..m.start()); // return this right now + self.span_to_yield = Some(Span::from(m.start()..m.end())); // next call will return this + + self.last = m.end(); + Some(span) + } + } + } +} + +impl<'r, 'h> core::iter::FusedIterator for SplitInclusive<'r, 'h> {} + /// Represents mutable scratch space used by regex engines during a search. /// /// Most of the regex engines in this crate require some kind of @@ -3646,4 +3706,47 @@ mod tests { let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); } + + #[test] + fn split_inclusive() { + let arr = [ + ( + r"(x)", + "1x2", + vec!["1", "x", "2"], + ), + ( + r"([^\d]+)", + "1-2", + vec!["1", "-", "2"], + ), + ( + r"(\s+)", + "this is a \n\ntest", + vec!["this", " ", "is", " ", "a", " \n\n", "test"], + ), + ( + r"([^0-9a-zA-Z_])", + " C# is great! (not actually) :)", + vec!["", " ", "C", "#", "", " ", "is", " ", "great", "!", "", " ", "", "(", "not", " ", "actually", ")", "", " ", "", ":", "", ")", ""], + ), + ( + r"([a-zA-Z](?:[a-zA-Z']*[a-zA-Z])?)", + r#"He said, "I'd like to eat cake!""#, + vec!["", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ", "to", " ", "eat", " ", "cake", r#"!""#], + ), + ]; + for (pattern, text, expected_output) in arr { + let mut out: Vec<_> = Regex::new(pattern).unwrap() + .split_inclusive(text) + .map(|span| &text[span]) + .collect(); + + assert_eq!(out, expected_output, "Regex: {}, Input: {}", pattern, text); + + // make sure that we can get the original string, from the splitted string by + // concatenating it + assert_eq!(out.join(""), text); + } + } } From 1d00c121fd32ae83d772627c195ccdf06f1045a9 Mon Sep 17 00:00:00 2001 From: Shneor <770elmo@gmail.com> Date: Fri, 6 Oct 2023 02:24:13 +0200 Subject: [PATCH 2/5] Fixed 'mut' warning --- regex-automata/src/meta/regex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 4a04fa948..bfca1e27b 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3737,9 +3737,9 @@ mod tests { ), ]; for (pattern, text, expected_output) in arr { - let mut out: Vec<_> = Regex::new(pattern).unwrap() + let out: Vec<_> = Regex::new(pattern).unwrap() .split_inclusive(text) - .map(|span| &text[span]) + .map(|sp| &text[sp]) .collect(); assert_eq!(out, expected_output, "Regex: {}, Input: {}", pattern, text); From 075c62c104faa1e9ab25be307e50d8dd6794d91b Mon Sep 17 00:00:00 2001 From: Shneor <770elmo@gmail.com> Date: Fri, 6 Oct 2023 11:44:48 +0200 Subject: [PATCH 3/5] cargo fmt --- regex-automata/src/meta/regex.rs | 46 +++++++++++++++++++------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index bfca1e27b..4a49b0328 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -898,7 +898,11 @@ impl Regex { &'r self, input: I, ) -> SplitInclusive<'r, 'h> { - SplitInclusive { finder: self.find_iter(input), last: 0, span_to_yield: None } + SplitInclusive { + finder: self.find_iter(input), + last: 0, + span_to_yield: None, + } } } @@ -2311,7 +2315,7 @@ impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> { fn next(&mut self) -> Option { if let Some(span) = self.span_to_yield { self.span_to_yield = None; - return Some(span) + return Some(span); } match self.finder.next() { @@ -2324,10 +2328,10 @@ impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> { self.last = len + 1; // Next call will return None Some(span) } - }, + } Some(m) => { - let span = Span::from(self.last..m.start()); // return this right now - self.span_to_yield = Some(Span::from(m.start()..m.end())); // next call will return this + let span = Span::from(self.last..m.start()); // return this right now + self.span_to_yield = Some(Span::from(m.start()..m.end())); // next call will return this self.last = m.end(); Some(span) @@ -3710,16 +3714,8 @@ mod tests { #[test] fn split_inclusive() { let arr = [ - ( - r"(x)", - "1x2", - vec!["1", "x", "2"], - ), - ( - r"([^\d]+)", - "1-2", - vec!["1", "-", "2"], - ), + (r"(x)", "1x2", vec!["1", "x", "2"]), + (r"([^\d]+)", "1-2", vec!["1", "-", "2"]), ( r"(\s+)", "this is a \n\ntest", @@ -3728,21 +3724,33 @@ mod tests { ( r"([^0-9a-zA-Z_])", " C# is great! (not actually) :)", - vec!["", " ", "C", "#", "", " ", "is", " ", "great", "!", "", " ", "", "(", "not", " ", "actually", ")", "", " ", "", ":", "", ")", ""], + vec![ + "", " ", "C", "#", "", " ", "is", " ", "great", "!", "", + " ", "", "(", "not", " ", "actually", ")", "", " ", "", + ":", "", ")", "", + ], ), ( r"([a-zA-Z](?:[a-zA-Z']*[a-zA-Z])?)", r#"He said, "I'd like to eat cake!""#, - vec!["", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ", "to", " ", "eat", " ", "cake", r#"!""#], + vec![ + "", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ", + "to", " ", "eat", " ", "cake", r#"!""#, + ], ), ]; for (pattern, text, expected_output) in arr { - let out: Vec<_> = Regex::new(pattern).unwrap() + let out: Vec<_> = Regex::new(pattern) + .unwrap() .split_inclusive(text) .map(|sp| &text[sp]) .collect(); - assert_eq!(out, expected_output, "Regex: {}, Input: {}", pattern, text); + assert_eq!( + out, expected_output, + "Regex: {}, Input: {}", + pattern, text + ); // make sure that we can get the original string, from the splitted string by // concatenating it From 755a48bdf755ea27724172facda769845386d3df Mon Sep 17 00:00:00 2001 From: Shneor <770elmo@gmail.com> Date: Fri, 6 Oct 2023 11:46:45 +0200 Subject: [PATCH 4/5] Replaced \d with 0-9 in regex --- regex-automata/src/meta/regex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 4a49b0328..c6e9b8b96 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3715,7 +3715,7 @@ mod tests { fn split_inclusive() { let arr = [ (r"(x)", "1x2", vec!["1", "x", "2"]), - (r"([^\d]+)", "1-2", vec!["1", "-", "2"]), + (r"([^0-9]+)", "1-2", vec!["1", "-", "2"]), ( r"(\s+)", "this is a \n\ntest", From 77371e597d46b91073f94644a15eb1fda3e0c383 Mon Sep 17 00:00:00 2001 From: Shneor <770elmo@gmail.com> Date: Fri, 6 Oct 2023 12:25:32 +0200 Subject: [PATCH 5/5] Fixed '\s' regex --- regex-automata/src/meta/regex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index c6e9b8b96..93fe6c0bb 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3717,7 +3717,7 @@ mod tests { (r"(x)", "1x2", vec!["1", "x", "2"]), (r"([^0-9]+)", "1-2", vec!["1", "-", "2"]), ( - r"(\s+)", + r"([\r\n\t\f\v ]+)", // equivalent to r"(\s+)" "this is a \n\ntest", vec!["this", " ", "is", " ", "a", " \n\n", "test"], ),