From 956d6b2d98ee228a4b31f26b2196d30e9bfae388 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 5 Aug 2023 17:25:31 -0400
Subject: [PATCH] automata: fix incorrect offsets reported by reverse inner
 optimization

Sadly it seems that my days of squashing optimization bugs are still
before me. In this particular case, the reverse inner literal
optimization (which is a new optimization introduced in regex 1.9)
resulted in reporting incorrect match offsets in some cases. The
offending case here is:

    $ regex-cli find match meta --no-table -p '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})' -y '888:77:66'
    0:1:9:888:77:66

The above reports a match at 1..9, but the correct match is 0..9. The
problem here is that the reverse inner literal optimization is being
applied, which splits the regex into three (conceptual) pieces:

1. `(?:(\d+)[:.])?(\d{1,2})`
2. `[:.]`
3. `(\d{2})`

The reverse inner optimization works by looking for occurrences of (2)
first, then matching (1) in reverse to find the start position of the
match and then searching for (3) in the forward direction to find the
end of the match.

The problem in this particular case is that (2) matches at position `3`
in the `888:77:66` haystack. Since the first section of numbers is
optional, the reverse inner optimization believes a match exists at
offset `1` by virtue of matching (1) in reverse. That is, the
`(\d{1,2})` matches at 1..3 while the `(?:(\d+)[:.])?` doesn't match at
all. The reverse search here is correct in isolation, but it leads to an
overall incorrect result by stopping the search early. The issue is that
the true leftmost match requires (2) to match at 6..7, but since it
matched at 3..4 first, it is considered first and leads to an incorrect
overall match.

To fix this, we add another "trip wire" to the reverse inner
optimization (of which there are already several) that tries to detect
cases where it cannot prove that the match it found is actually the
leftmost match. Namely, if it reports a match offset greater than the
start of the search and otherwise *could* have kept searching, then we
don't know whether we have the true leftmost match. In that case, we
bail on the optimization and let a slower path take over.

This is yet another example of how the nature of regex searching, and in
particular leftmost searching, inhibits the composition of different
regex strategies. Or at least, makes them incredibly subtle.

Fixes #1060
---
 regex-automata/src/meta/limited.rs | 47 ++++++++++++++++++++++++++++++
 testdata/regression.toml           | 17 +++++++++++
 2 files changed, 64 insertions(+)

diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs
index 005878acd..192a2625e 100644
--- a/regex-automata/src/meta/limited.rs
+++ b/regex-automata/src/meta/limited.rs
@@ -88,7 +88,41 @@ pub(crate) fn dfa_try_search_half_rev(
             return Err(RetryError::Quadratic(RetryQuadraticError::new()));
         }
     }
+    let was_dead = dfa.is_dead_state(sid);
     dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
+    // If we reach the beginning of the search and we could otherwise still
+    // potentially keep matching if there was more to match, then we actually
+    // return an error to indicate giving up on this optimization. Why? Because
+    // we can't prove that the real match begins at where we would report it.
+    //
+    // This only happens when all of the following are true:
+    //
+    // 1) We reach the starting point of our search span.
+    // 2) The match we found has an offset after the starting point.
+    // 3) The FSM reports we could possibly find a longer match.
+    //
+    // We need (1) because otherwise the search stopped before the starting
+    // point and there is no possible way to find a more leftmost position.
+    //
+    // We need (2) because if the match found has an offset equal to the minimum
+    // possible offset, then there is no possible more leftmost match.
+    //
+    // We need (3) because if the FSM couldn't continue anyway (i.e., it's in
+    // a dead state), then we know we couldn't find anything more leftmost
+    // than what we have. (We have to check the state we were in prior to the
+    // EOI transition since the EOI transition will usually bring us to a dead
+    // state by virtue of it representing the end-of-input.)
+    if at == input.start()
+        && mat.map_or(false, |m| m.offset() > input.start())
+        && !was_dead
+    {
+        trace!(
+            "reached beginning of search at offset {} without hitting \
+             a dead state, quitting to avoid potential false positive match",
+            at,
+        );
+        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+    }
     Ok(mat)
 }
 
@@ -140,7 +174,20 @@ pub(crate) fn hybrid_try_search_half_rev(
             return Err(RetryError::Quadratic(RetryQuadraticError::new()));
         }
     }
+    let was_dead = sid.is_dead();
     hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+    // See the comments in the full DFA routine above for why we need this.
+    if at == input.start()
+        && mat.map_or(false, |m| m.offset() > input.start())
+        && !was_dead
+    {
+        trace!(
+            "reached beginning of search at offset {} without hitting \
+             a dead state, quitting to avoid potential false positive match",
+            at,
+        );
+        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+    }
     Ok(mat)
 }
 
diff --git a/testdata/regression.toml b/testdata/regression.toml
index bb5e4fd46..a2efa2ad3 100644
--- a/testdata/regression.toml
+++ b/testdata/regression.toml
@@ -739,3 +739,20 @@ matches = [[0, 9]]
 utf8 = false
 match-kind = "all"
 search-kind = "overlapping"
+
+# See: https://github.com/rust-lang/regex/issues/1060
+[[test]]
+name = "reverse-inner-plus-shorter-than-expected"
+regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})'
+haystack = '102:12:39'
+matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
+
+# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex
+# to demonstrate the extent of the rot. Sigh.
+#
+# See: https://github.com/rust-lang/regex/issues/1060
+[[test]]
+name = "reverse-inner-short"
+regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
+haystack = '102:12:39'
+matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]