From 3527ee0353a9d103fe9945cf9d2cd7dcba061313 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 10:41:24 -0400 Subject: [PATCH] automata: add 'is_match' as its own path to meta regex internals I originally prided myself on not having a dedicated `is_match` routine on the meta regex engine's internal `Strategy` trait, and actually spent a fair amount of attention ensuring that `is_match` and `find` always returned the same results. That is, `is_match` returns true if and only if `find` returns a match. But the fix in the previous commits for #1059 means that a `PikeVM` and a `BoundedBacktracker` can be used to run a search with an NFA that has no capture states. Since both engines are implemented to only track offsets via those capture states, it follows that the only thing that can be returned in such cases is whether a match occurs (and if so, which pattern matched). That in turn means that `is_match` can return `true` while `find` can return `None` for the same search. This is because the latter returns `None` even when a match is found but there are no capture states to record the offsets of the match. This in theory could be resolved by adding APIs to the `PikeVM` and the `BoundedBacktracker` that return a `HalfMatch` without depending on any capture states at all. Then `is_match` could be implemented in terms of those APIs. That is probably the right path, but it's pretty gnarly to do without breaking changes and I don't want to do any breaking changes right now. So instead, we just add a special path to the meta regex engine for `is_match` and permit some cases to have different results between `is_match` and `find`. Sigh. 
--- regex-automata/src/meta/regex.rs | 9 ++- regex-automata/src/meta/strategy.rs | 120 +++++++++++++++++++++++++++- regex-automata/src/meta/wrappers.rs | 32 ++++++++ 3 files changed, 158 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 0d40eaa40..3a04b14d8 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -529,7 +529,14 @@ impl Regex { #[inline] pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { let input = input.into().earliest(true); - self.search_half(&input).is_some() + if self.imp.info.is_impossible(&input) { + return false; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.is_match(&mut guard, &input); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result } /// Executes a leftmost search and returns the first match that is found, diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 86610fbea..ea6c6ab57 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -58,6 +58,8 @@ pub(super) trait Strategy: input: &Input<'_>, ) -> Option<HalfMatch>; + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; + fn search_slots( &self, cache: &mut Cache, @@ -399,6 +401,10 @@ impl<P: PrefilterI> Strategy for Pre<P>

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + fn search_slots( &self, cache: &mut Cache, @@ -623,6 +629,29 @@ impl Core { } } + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } @@ -703,7 +732,7 @@ impl Strategy for Core { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. 
- return if let Some(e) = self.dfa.get(input) { + if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, @@ -723,7 +752,38 @@ impl Strategy for Core { } } else { self.search_half_nofail(cache, input) - }; + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -983,6 +1043,21 @@ impl Strategy for ReverseAnchored { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1335,6 +1410,28 @@ impl Strategy for ReverseSuffix { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return 
self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + ); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1717,6 +1814,25 @@ impl Strategy for ReverseInner { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 8f58363a1..08110d9bb 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -87,6 +87,15 @@ impl PikeVMEngine { Ok(PikeVMEngine(engine)) } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, @@ -212,6 +221,29 @@ impl BoundedBacktrackerEngine { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: 
&mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self,