From 243851cd359723e3706fddbb69bf3d64dadc0623 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 21:56:20 +0900 Subject: [PATCH 01/10] Update nfa.rs --- regex-automata/src/nfa/thompson/nfa.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 1f57f8ebd..86dd323c1 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1471,13 +1471,13 @@ impl fmt::Debug for Inner { } let pattern_len = self.start_pattern.len(); if pattern_len > 1 { - writeln!(f, "")?; + writeln!(f)?; for pid in 0..pattern_len { let sid = self.start_pattern[pid]; writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?; } } - writeln!(f, "")?; + writeln!(f)?; writeln!( f, "transition equivalence classes: {:?}", @@ -1819,7 +1819,7 @@ impl SparseTransitions { &self, unit: alphabet::Unit, ) -> Option { - unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + unit.as_u8().and_then(|byte| self.matches_byte(byte)) } /// This follows the matching transition for a particular byte. @@ -1909,7 +1909,7 @@ impl DenseTransitions { &self, unit: alphabet::Unit, ) -> Option { - unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + unit.as_u8().and_then(|byte| self.matches_byte(byte)) } /// This follows the matching transition for a particular byte. From ca585b9a818e56623d90054714c7ed1423966ead Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 21:57:20 +0900 Subject: [PATCH 02/10] one liners across many modules --- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/pikevm.rs | 4 ++-- regex-automata/src/util/determinize/mod.rs | 2 +- regex-automata/src/util/escape.rs | 2 +- regex-automata/src/util/interpolate.rs | 9 +++------ regex-automata/src/util/pool.rs | 4 ++-- regex-automata/src/util/prefilter/mod.rs | 6 +++--- regex-automata/src/util/sparse_set.rs | 2 +- 8 files changed, 14 insertions(+), 17 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2d2172957..6e02fd349 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1663,7 +1663,7 @@ impl Compiler { capture_index: u32, name: Option<&str>, ) -> Result { - let name = name.map(|n| Arc::from(n)); + let name = name.map(Arc::from); self.builder.borrow_mut().add_capture_start( StateID::ZERO, capture_index, diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 0128c151a..20934e8dd 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1290,7 +1290,7 @@ impl PikeVM { // the only thing in 'curr'. So we might as well just skip // ahead until we find something that we know might advance us // forward. - if let Some(ref pre) = pre { + if let Some(pre) = pre { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, @@ -1344,7 +1344,7 @@ impl PikeVM { // search. If we re-computed it at every position, we would be // simulating an unanchored search when we were tasked to perform // an anchored search. - if (!hm.is_some() || allmatches) + if (hm.is_none() || allmatches) && (!anchored || at == input.start()) { // Since we are adding to the 'curr' active states and since diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index ba32991d0..09bd3a123 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -131,7 +131,7 @@ pub(crate) fn next( if !state.look_need().is_empty() { // Add look-ahead assertions that are now true based on the current // input unit. - let mut look_have = state.look_have().clone(); + let mut look_have = state.look_have(); match unit.as_u8() { Some(b'\r') => { if !rev || !state.is_half_crlf() { diff --git a/regex-automata/src/util/escape.rs b/regex-automata/src/util/escape.rs index 7f6aa15f5..94506e5ae 100644 --- a/regex-automata/src/util/escape.rs +++ b/regex-automata/src/util/escape.rs @@ -31,7 +31,7 @@ impl core::fmt::Debug for DebugByte { let mut len = 0; for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { // capitalize \xab to \xAB - if i >= 2 && b'a' <= b && b <= b'f' { + if i >= 2 && (b'a'..=b'f').contains(&b) { b -= 32; } bytes[len] = b; diff --git a/regex-automata/src/util/interpolate.rs b/regex-automata/src/util/interpolate.rs index f274629df..951646910 100644 --- a/regex-automata/src/util/interpolate.rs +++ b/regex-automata/src/util/interpolate.rs @@ -107,7 +107,7 @@ pub fn string( } // Handle escaping of '$'. if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { - dst.push_str("$"); + dst.push('$'); replacement = &replacement[2..]; continue; } @@ -115,7 +115,7 @@ pub fn string( let cap_ref = match find_cap_ref(replacement.as_bytes()) { Some(cap_ref) => cap_ref, None => { - dst.push_str("$"); + dst.push('$'); replacement = &replacement[1..]; continue; } @@ -321,10 +321,7 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { /// Returns true if and only if the given byte is allowed in a capture name /// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { - match b { - b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, - _ => false, - } + matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_') } #[cfg(test)] diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index d90d4ecff..cbfc18ccf 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -678,7 +678,7 @@ mod inner { #[inline] pub(super) fn value(&self) -> &T { match self.value { - Ok(ref v) => &**v, + Ok(ref v) => v, // SAFETY: This is safe because the only way a PoolGuard gets // created for self.value=Err is when the current thread // corresponds to the owning thread, of which there can only @@ -703,7 +703,7 @@ mod inner { #[inline] pub(super) fn value_mut(&mut self) -> &mut T { match self.value { - Ok(ref mut v) => &mut **v, + Ok(ref mut v) => v, // SAFETY: This is safe because the only way a PoolGuard gets // created for self.value=None is when the current thread // corresponds to the owning thread, of which there can only diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index 51fc92233..4c4a26b8e 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -478,17 +478,17 @@ pub(crate) trait PrefilterI: impl PrefilterI for Arc

{ #[cfg_attr(feature = "perf-inline", inline(always))] fn find(&self, haystack: &[u8], span: Span) -> Option { - (&**self).find(haystack, span) + (**self).find(haystack, span) } #[cfg_attr(feature = "perf-inline", inline(always))] fn prefix(&self, haystack: &[u8], span: Span) -> Option { - (&**self).prefix(haystack, span) + (**self).prefix(haystack, span) } #[cfg_attr(feature = "perf-inline", inline(always))] fn memory_usage(&self) -> usize { - (&**self).memory_usage() + (**self).memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] diff --git a/regex-automata/src/util/sparse_set.rs b/regex-automata/src/util/sparse_set.rs index cbaa0b6f4..e9dfde1dc 100644 --- a/regex-automata/src/util/sparse_set.rs +++ b/regex-automata/src/util/sparse_set.rs @@ -234,6 +234,6 @@ impl<'a> Iterator for SparseSetIter<'a> { #[cfg_attr(feature = "perf-inline", inline(always))] fn next(&mut self) -> Option { - self.0.next().map(|&id| id) + self.0.next().copied() } } From f1afe0e9524e615e5f30b0a7269b5c4e002a7f54 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 21:57:35 +0900 Subject: [PATCH 03/10] Update range_trie.rs --- regex-automata/src/nfa/thompson/range_trie.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index cd77cc150..b258ab04c 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -235,7 +235,7 @@ impl RangeTrie { /// Clear this range trie such that it is empty. Clearing a range trie /// and reusing it can beneficial because this may reuse allocations. pub fn clear(&mut self) { - self.free.extend(self.states.drain(..)); + self.free.append(&mut self.states); self.add_empty(); // final self.add_empty(); // root } @@ -296,7 +296,7 @@ impl RangeTrie { assert!(!ranges.is_empty()); assert!(ranges.len() <= 4); - let mut stack = mem::replace(&mut self.insert_stack, vec![]); + let mut stack = std::mem::take(&mut self.insert_stack); stack.clear(); stack.push(NextInsert::new(ROOT, ranges)); @@ -866,7 +866,7 @@ impl Split { impl fmt::Debug for RangeTrie { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "")?; + writeln!(f)?; for (i, state) in self.states.iter().enumerate() { let status = if i == FINAL.as_usize() { '*' } else { ' ' }; writeln!(f, "{}{:06}: {:?}", status, i, state)?; From c140536f5a28505b158e88cbcdd4cc67c799b422 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 21:57:42 +0900 Subject: [PATCH 04/10] Update state.rs --- regex-automata/src/util/determinize/state.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index effa6f44d..44d59ec0d 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -115,7 +115,7 @@ pub(crate) struct State(Arc<[u8]>); /// without having to convert it into a State first. impl core::borrow::Borrow<[u8]> for State { fn borrow(&self) -> &[u8] { - &*self.0 + &self.0 } } @@ -177,7 +177,7 @@ impl State { } fn repr(&self) -> Repr<'_> { - Repr(&*self.0) + Repr(&self.0) } } @@ -460,12 +460,10 @@ impl<'a> Repr<'a> { /// /// If this state is not a match state, then this always returns 0. fn match_len(&self) -> usize { - if !self.is_match() { - return 0; - } else if !self.has_pattern_ids() { - 1 - } else { - self.encoded_pattern_len() + match (self.is_match(), self.has_pattern_ids()) { + (false, _) => 0, + (true, false) => 1, + (true, true) => self.encoded_pattern_len(), } } From 12c0edb2d12b42aa582d184550e86e3148283bf9 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 21:57:46 +0900 Subject: [PATCH 05/10] Update search.rs --- regex-automata/src/util/search.rs | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index 39aec522b..72e05f14a 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -1365,16 +1365,17 @@ impl<'a> Iterator for PatternSetIter<'a> { type Item = PatternID; fn next(&mut self) -> Option { - while let Some((index, &yes)) = self.it.next() { + // Only valid 'PatternID' values can be inserted into the set + // and construction of the set panics if the capacity would + // permit storing invalid pattern IDs. Thus, 'yes' is only true + // precisely when 'index' corresponds to a valid 'PatternID'. + self.it.by_ref().find_map(|(index, &yes)| { if yes { - // Only valid 'PatternID' values can be inserted into the set - // and construction of the set panics if the capacity would - // permit storing invalid pattern IDs. Thus, 'yes' is only true - // precisely when 'index' corresponds to a valid 'PatternID'. - return Some(PatternID::new_unchecked(index)); + Some(PatternID::new_unchecked(index)) + } else { + None } - } - None + }) } fn size_hint(&self) -> (usize, Option) { @@ -1689,13 +1690,14 @@ impl Anchored { /// # Ok::<(), Box>(()) /// ``` #[non_exhaustive] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Default, Debug, Eq, PartialEq)] pub enum MatchKind { /// Report all possible matches. All, /// Report only the leftmost matches. When multiple leftmost matches exist, /// report the match corresponding to the part of the regex that appears /// first in the syntax. + #[default] LeftmostFirst, // There is prior art in RE2 that shows that we should be able to add // LeftmostLongest too. The tricky part of it is supporting ungreedy @@ -1721,12 +1723,6 @@ impl MatchKind { } } -impl Default for MatchKind { - fn default() -> MatchKind { - MatchKind::LeftmostFirst - } -} - /// An error indicating that a search stopped before reporting whether a /// match exists or not. /// From 094d3166e2bcfc2e44e9bd159c11e8f34d4ac184 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 21:57:49 +0900 Subject: [PATCH 06/10] Update utf8.rs --- regex-automata/src/util/utf8.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/regex-automata/src/util/utf8.rs b/regex-automata/src/util/utf8.rs index 91b27efe0..6c86e8d5f 100644 --- a/regex-automata/src/util/utf8.rs +++ b/regex-automata/src/util/utf8.rs @@ -99,18 +99,13 @@ pub(crate) fn decode_last(bytes: &[u8]) -> Option> { /// `None`. #[cfg_attr(feature = "perf-inline", inline(always))] fn len(byte: u8) -> Option { - if byte <= 0x7F { - return Some(1); - } else if byte & 0b1100_0000 == 0b1000_0000 { - return None; - } else if byte <= 0b1101_1111 { - Some(2) - } else if byte <= 0b1110_1111 { - Some(3) - } else if byte <= 0b1111_0111 { - Some(4) - } else { - None + match byte { + 0b0000_0000..=0b0111_1111 => Some(1), + 0b1000_0000..=0b1011_1111 => None, + 0b1100_0000..=0b1101_1111 => Some(2), + 0b1110_0000..=0b1110_1111 => Some(3), + 0b1111_0000..=0b1111_0111 => Some(4), + _ => None, } } From 9b6b427e730ed8d378749b494407180dbf5da163 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 22:02:58 +0900 Subject: [PATCH 07/10] Update wire.rs --- regex-automata/src/util/wire.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/util/wire.rs b/regex-automata/src/util/wire.rs index ecf4fd8c0..2af072881 100644 --- a/regex-automata/src/util/wire.rs +++ b/regex-automata/src/util/wire.rs @@ -482,12 +482,8 @@ pub(crate) fn write_label( /// is longer than 255 bytes. (The size restriction exists so that searching /// for a label during deserialization can be done in small bounded space.) pub(crate) fn write_label_len(label: &str) -> usize { - if label.len() > 255 { - panic!("label must not be longer than 255 bytes"); - } - if label.as_bytes().iter().position(|&b| b == 0).is_some() { - panic!("label must not contain NUL bytes"); - } + assert!(label.len() <= 255, "label must not be longer than 255 bytes"); + assert!(label.bytes().all(|b| b != 0), "label must not contain NUL bytes"); let label_len = label.len() + 1; // +1 for the NUL terminator label_len + padding_len(label_len) } From dfcc1623ba52c42865ebfe1fa6a414d445771b56 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 23:11:04 +0900 Subject: [PATCH 08/10] revert state.rs --- regex-automata/src/util/determinize/state.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index 44d59ec0d..f4bee35ad 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -460,10 +460,12 @@ impl<'a> Repr<'a> { /// /// If this state is not a match state, then this always returns 0. fn match_len(&self) -> usize { - match (self.is_match(), self.has_pattern_ids()) { - (false, _) => 0, - (true, false) => 1, - (true, true) => self.encoded_pattern_len(), + if !self.is_match() { + 0 + } else if !self.has_pattern_ids() { + 1 + } else { + self.encoded_pattern_len() } } From a66f467b678f8073f986e243be3272df2f1b59a1 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 23:11:42 +0900 Subject: [PATCH 09/10] Update range_trie.rs --- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index b258ab04c..a2f0994ed 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -296,7 +296,7 @@ impl RangeTrie { assert!(!ranges.is_empty()); assert!(ranges.len() <= 4); - let mut stack = std::mem::take(&mut self.insert_stack); + let mut stack = core::mem::replace(&mut self.insert_stack, vec![]); stack.clear(); stack.push(NextInsert::new(ROOT, ranges)); From ebfadbe143fe5992a58015c06343bc30e51e18f6 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 23:13:16 +0900 Subject: [PATCH 10/10] Update escape.rs --- regex-automata/src/util/escape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/util/escape.rs b/regex-automata/src/util/escape.rs index 94506e5ae..7f6aa15f5 100644 --- a/regex-automata/src/util/escape.rs +++ b/regex-automata/src/util/escape.rs @@ -31,7 +31,7 @@ impl core::fmt::Debug for DebugByte { let mut len = 0; for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { // capitalize \xab to \xAB - if i >= 2 && (b'a'..=b'f').contains(&b) { + if i >= 2 && b'a' <= b && b <= b'f' { b -= 32; } bytes[len] = b;