From 9cf4a42a9361f42d9aa6afd1245c0e37dc0c8771 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 4 Mar 2024 07:30:16 -0500 Subject: [PATCH] automata: fix bug where reverse NFA lacked an unanchored prefix Previously, when compiling a Thompson NFA, we were omitting an unanchored prefix when the HIR contained a `^` in its prefix. We did this because unanchored prefix in that case would never match because of the requirement imposed by `^`. The problem with that is it's incorrect when compiling a reverse automaton. For example, in the case of building a reverse NFA for `^Qu`, we should sitll include an unanchored prefix because the `^` in that case has no conflict with it. It would be like if we omitted an unanchored prefix for `Qu$` in a forward NFA, which is obviously wrong. The fix here is pretty simple: in the reverse case, check for `$` in the suffix of the HIR rather than a `^` in the prefix. Fixes #1169 --- regex-automata/src/nfa/thompson/compiler.rs | 89 ++++++++++++++++++++- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index e6b1c9122..668bca87c 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -961,10 +961,12 @@ impl Compiler { // for all matches. When an unanchored prefix is not added, then the // NFA's anchored and unanchored start states are equivalent. let all_anchored = exprs.iter().all(|e| { - e.borrow() - .properties() - .look_set_prefix() - .contains(hir::Look::Start) + let props = e.borrow().properties(); + if self.config.get_reverse() { + props.look_set_suffix().contains(hir::Look::End) + } else { + props.look_set_prefix().contains(hir::Look::Start) + } }); let anchored = !self.config.get_unanchored_prefix() || all_anchored; let unanchored_prefix = if anchored { @@ -1928,6 +1930,11 @@ mod tests { State::Sparse(SparseTransitions { transitions }) } + fn s_look(look: Look, next: usize) -> State { + let next = sid(next); + State::Look { look, next } + } + fn s_bin_union(alt1: usize, alt2: usize) -> State { State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } } @@ -1978,6 +1985,80 @@ mod tests { ); } + #[test] + fn compile_no_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)] + ); + } + + #[test] + fn compile_yes_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_look(Look::End, 4), + s_match(0), + ] + ); + } + + #[test] + fn compile_yes_reverse_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + // Anchors get flipped in a reverse automaton. + s_look(Look::End, 4), + s_match(0), + ], + ); + } + + #[test] + fn compile_no_reverse_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + // Anchors get flipped in a reverse automaton. + s_look(Look::Start, 1), + s_byte(b'a', 2), + s_match(0), + ], + ); + } + #[test] fn compile_empty() { assert_eq!(build("").states(), &[s_match(0),]);