From 43ba6b810f2e3f60d7a57f4bcc8e1831b78a011c Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 11 Jul 2023 03:10:29 +0200 Subject: [PATCH 001/136] syntax: improve literal extraction from certain repetitions When repetitions didn't have an explicit max value, like in `(ab){2,}` the literal extractor was producing sub-optimal literals, like `"ab"` instead of `"abab"`. Close #1032 --- regex-syntax/src/hir/literal.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 9461db989..afcd506e0 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -477,7 +477,7 @@ impl Extractor { } seq } - hir::Repetition { min, max: Some(max), .. } if min < max => { + hir::Repetition { min, .. } => { assert!(min > 0); // handled above let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); @@ -491,10 +491,6 @@ impl Extractor { seq.make_inexact(); seq } - hir::Repetition { .. } => { - subseq.make_inexact(); - subseq - } } } @@ -2655,6 +2651,12 @@ mod tests { ]), e(r"(ab|cd)(ef|gh)(ij|kl)") ); + + assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); } #[test] From 5e8eaf1f7ab92b68bfabaa004561ccb1269ecb9f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 10 Jul 2023 21:10:57 -0400 Subject: [PATCH 002/136] regex-syntax-0.7.4 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e6d7965be..b7a149c23 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.3" #:version +version = "0.7.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From bbb285b81fd1108536eedc52990a95f30ca6bdf5 Mon Sep 17 00:00:00 2001 From: CosmicHorror Date: Mon, 10 Jul 2023 20:53:21 -0600 Subject: [PATCH 003/136] regex-cli: update installation instructions PR #1035 --- regex-cli/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/regex-cli/README.md b/regex-cli/README.md index 36dc50e77..376d89091 100644 --- a/regex-cli/README.md +++ b/regex-cli/README.md @@ -7,11 +7,10 @@ various regex development tasks such as generating tests. ### Installation -Currently `regex-cli` is not on crates.io and should be installed from this -git repository: +Simply use `cargo` to install from crates.io. ``` -$ cargo install --git https://github.com/rust-lang/regex regex-cli +$ cargo install regex-cli ``` From 40585afe940294bc50aad7fc563588668f860f51 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 09:05:44 -0400 Subject: [PATCH 004/136] pikevm: fix anchored search bug This fixes a bug where one could ask the PikeVM to perform an anchored search, but in some cases it could return a match where the start of the match is greater than the start of the search. For example, an anchored search of the pattern '.c' on the haystack 'abc' starting at '0' would report a match at '1..3'. No other engine (other than the meta engine, which we'll address in a subsequent commit) had this bug. The issue in the pikevm was our simulation of the '(?s-u:.)*?' prefix for implementing unanchored searches. 
Namely, instead of using the NFA itself to implement the unanchored search (it has both unanchored and anchored start states), the PikeVM simulates it in code for performance reasons. This simulation was actually incorrect for the anchored case, because we were re-computing the epsilon closure for every step in the search. Effectively, we were simulating an unanchored search unconditionally. Now the reason why this bug wasn't caught is because the PikeVM only gets things half wrong. Namely, the regex '[b-z]c' does not match 'abc' when starting the search at offset '0' and that's correct. The reason is that the '[b-z]' doesn't match 'a', whereas '.' in the aforementioned regex does. Since the PikeVM doesn't match there, its current list of states becomes empty, and *this* case is anchor-aware and knows not to continue the search in this case. In other words, the PikeVM only half-implemented the unanchored search simulation. It gets it right in some cases, but not all. We fix the bug by requiring that we only do the epsilon closure when the search is unanchored, or if it's anchored, that the current position is at the start of the search. We add a regression test from #1036 as well. Partially resolves #1036 --- regex-automata/src/nfa/thompson/pikevm.rs | 10 +++++++++- testdata/anchored.toml | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d737fb71e..79ce3c60d 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1356,7 +1356,15 @@ impl PikeVM { // matches their behavior. (Generally, 'allmatches' is useful for // overlapping searches or leftmost anchored searches to find the // longest possible match by ignoring match priority.) - if !pid.is_some() || allmatches { + // + // Additionally, when we're running an anchored search, this + // epsilon closure should only be computed at the beginning of the + // search. If we re-computed it at every position, we would be + // simulating an unanchored search when we were tasked to perform + // an anchored search. + if (!pid.is_some() || allmatches) + && (!anchored || at == input.start()) + { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is // guaranteed to have the right length but where every element diff --git a/testdata/anchored.toml b/testdata/anchored.toml index cca561de1..7023335ec 100644 --- a/testdata/anchored.toml +++ b/testdata/anchored.toml @@ -69,3 +69,13 @@ haystack = 'abcβ' matches = [[0, 3]] anchored = true unicode = false + +# Tests that '.c' doesn't match 'abc' when performing an anchored search from +# the beginning of the haystack. This test found two different bugs in the +# PikeVM and the meta engine. +[[test]] +name = "no-match-at-start" +regex = '.c' +haystack = 'abc' +matches = [] +anchored = true From 70c7f575a24c1f465fcd51b3f2d0e25ba935da6e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 09:49:33 -0400 Subject: [PATCH 005/136] meta: fix anchored search bugs It turns out that all three of the "reverse" optimizations in the meta regex engine did not support anchored searches correctly. This was intended, and in particular, none of these optimizations are active when the regex is anchored at the beginning. However, a caller can still request an anchored search even when the regex itself isn't anchored.
In this case, the general best approach is to just do a standard forward regex search. Namely, the reverse suffix and reverse inner optimizations are generally throughput optimizations, and anchored searches tend to be more heavily dominated by latency. Now it is plausible that we will want to do some optimizations in the anchored case. For example, we might want to confirm that a required literal is in the haystack before running a standard forward regex search. But I think that's future work and will probably benefit from being a distinct strategy. It's also somewhat tricky to do because while it will make performance in the "no match" case much better, it will likely regress performance in the "always match" case. Anyway, we add more regression tests covering all of these cases and fix the bug. We fix it by just checking whether the caller requested an anchored search, and if so, fall back to the core engine. Fixes #1036 --- regex-automata/src/meta/strategy.rs | 49 +++++++++++++++++++++++++++++ testdata/anchored.toml | 46 +++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 2de2c385e..aa1d61ef3 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -845,6 +845,14 @@ impl ReverseAnchored { ); return Err(core); } + // Note that the caller can still request an anchored search even when + // the regex isn't anchored at the start. We detect that case in the + // search routines below and just fallback to the core engine. This + // is fine because both searches are anchored. It's just a matter of + // picking one. Falling back to the core engine is a little simpler, + // since if we used the reverse anchored approach, we'd have to add an + // extra check to ensure the match reported starts at the place where + // the caller requested the search to start. if core.info.is_always_anchored_start() { debug!( "skipping reverse anchored optimization because \ @@ -930,6 +938,9 @@ impl Strategy for ReverseAnchored { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -948,6 +959,9 @@ impl Strategy for ReverseAnchored { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -973,6 +987,9 @@ impl Strategy for ReverseAnchored { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -1034,6 +1051,13 @@ impl ReverseSuffix { // requires a reverse scan after a literal match to confirm or reject // the match. (Although, in the case of confirmation, it then needs to // do another forward scan to find the end position.) + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. 
Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. if core.info.is_always_anchored_start() { debug!( "skipping reverse suffix optimization because \ @@ -1211,6 +1235,9 @@ impl Strategy for ReverseSuffix { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix optimization failed: {}", _err); @@ -1255,6 +1282,9 @@ impl Strategy for ReverseSuffix { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix half optimization failed: {}", _err); @@ -1309,6 +1339,9 @@ impl Strategy for ReverseSuffix { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; @@ -1396,6 +1429,13 @@ impl ReverseInner { // or when the literal scan matches. If it matches, then confirming the // match requires a reverse scan followed by a forward scan to confirm // or reject, which is a fair bit of work. + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. if core.info.is_always_anchored_start() { debug!( "skipping reverse inner optimization because \ @@ -1635,6 +1675,9 @@ impl Strategy for ReverseInner { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner optimization failed: {}", _err); @@ -1654,6 +1697,9 @@ impl Strategy for ReverseInner { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner half optimization failed: {}", _err); @@ -1675,6 +1721,9 @@ impl Strategy for ReverseInner { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; diff --git a/testdata/anchored.toml b/testdata/anchored.toml index 7023335ec..0f2248d09 100644 --- a/testdata/anchored.toml +++ b/testdata/anchored.toml @@ -79,3 +79,49 @@ regex = '.c' haystack = 'abc' matches = [] anchored = true + +# Like above, but at a non-zero start offset. 
+[[test]] +name = "no-match-at-start-bounds" +regex = '.c' +haystack = 'aabc' +bounds = [1, 4] +matches = [] +anchored = true + +# This is like no-match-at-start, but hits the "reverse inner" optimization +# inside the meta engine. (no-match-at-start hits the "reverse suffix" +# optimization.) +[[test]] +name = "no-match-at-start-reverse-inner" +regex = '.c[a-z]' +haystack = 'abcz' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-reverse-inner-bounds" +regex = '.c[a-z]' +haystack = 'aabcz' +bounds = [1, 5] +matches = [] +anchored = true + +# Same as no-match-at-start, but applies to the meta engine's "reverse +# anchored" optimization. +[[test]] +name = "no-match-at-start-reverse-anchored" +regex = '.c[a-z]$' +haystack = 'abcz' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-reverse-anchored-bounds" +regex = '.c[a-z]$' +haystack = 'aabcz' +bounds = [1, 5] +matches = [] +anchored = true From 961a882e9408b794eb8a9294c04e0aa20a32d95f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 10:14:34 -0400 Subject: [PATCH 006/136] regex-automata-0.3.3 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 86eb7d8f5..1936cf783 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.2" #:version +version = "0.3.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 48daadc0dc5865ced38495258cfffcd3951682e4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 11:10:50 -0400 Subject: [PATCH 007/136] regex-automata/test: ignore some tests in 32-bit targets One of these tests (the captures one) is very specific to 64-bit since it uses a numeric literal that is bigger than what can be fit into 32 bits. The other two tests, for determinize_size_limit, are not specific to 64-bit targets but do somewhat depend on the specific memory usages in play. We could probably find some limits that work for both 32-bit and 64-bit, but since 'cross' doesn't run doc tests, doing this is pretty annoying. So just ignore the tests. Fixes #1039 --- regex-automata/src/dfa/dense.rs | 2 ++ regex-automata/src/util/captures.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 35f037ca6..75ca85e6e 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -879,6 +879,7 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// /// // 600KB isn't enough! 
@@ -912,6 +913,7 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{ /// dfa::{dense, Automaton, StartKind}, /// Anchored, Input, diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index 60b6df7e2..c6517348d 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -433,6 +433,7 @@ impl Captures { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; From 855c5c4d984e0cfcfd8557ddacdf55eb05828bf2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 13 Jul 2023 09:44:28 -0400 Subject: [PATCH 008/136] fuzz: add all fuzzers to OSS-fuzz I forgot to do this step, and as a result, OSS-fuzz hasn't been running any of the new fuzzers. Hopefully this is enough. Ref #1037 --- fuzz/Cargo.toml | 3 +++ fuzz/oss-fuzz-build.sh | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 8688e73e0..a7eec2c81 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -32,6 +32,9 @@ features = ["atty", "humantime", "termcolor"] [workspace] members = ["."] +# NOTE: If you add a new fuzzer below, please make sure to add it to the +# oss-fuzz-build.sh script, otherwise it won't get run in OSS-fuzz. + [[bin]] name = "fuzz_regex_match" path = "fuzz_targets/fuzz_regex_match.rs" diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index 38750250b..f96474739 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -1,4 +1,18 @@ #!/bin/bash -eu + cd $SRC/regex -cargo fuzz build -O --debug-assertions -cp fuzz/target/x86_64-unknown-linux-gnu/release/fuzz_regex_match $OUT/ +cargo fuzz build -O --debug-assertions + +targets=( + fuzz_regex_match + fuzz_regex_lite_match + fuzz_regex_automata_deserialize_dense_dfa + fuzz_regex_automata_deserialize_sparse_dfa + ast_roundtrip + ast_fuzz_match + ast_fuzz_regex + ast_fuzz_match_bytes +) +for target in "${targets[@]}"; do + cp fuzz/target/x86_64-unknown-linux-gnu/release/$target $OUT/ +done From e55e96ce3aa2744cb2ca2bbd2d41f49d4171fb4f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Jul 2023 08:07:13 -0400 Subject: [PATCH 009/136] doc: clarify ambiguous wording Fixes #1050 --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 191aa2e1a..cd98be103 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -665,8 +665,8 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). Any named character class may appear inside a bracketed `[...]` character -class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII -digit. `[\p{Greek}&&\pL]` matches Greek letters. +class. For example, `[\p{Greek}[:digit:]]` matches any ASCII digit or any +codepoint in the `Greek` script. `[\p{Greek}&&\pL]` matches Greek letters. 
Precedence in character classes, from most binding to least: From 7bc8f884257ecd53e0599e9f1ae97a3ed751d99c Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Fri, 21 Jul 2023 20:10:13 +0800 Subject: [PATCH 010/136] doc: update the old UTF-8 automata algorithm in comment regex-cli went through a few iterations before its initial release, but this means some comments in the code that reference it are now probably stale. This fixes one of them. PR #1049 --- regex-automata/src/nfa/thompson/compiler.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 497fc62b4..2021d93ea 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1319,7 +1319,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // - // regex-cli find nfa thompson pikevm -b @$smallishru '(?m)^\w{20}' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes @@ -1338,7 +1338,7 @@ impl Compiler { .map(|rng| self.c_range(rng.start, rng.end)); self.c_concat(it) }); - self.c_alt(it) + self.c_alt_iter(it) */ } From 87f7f3f5125c98eb8c831b3f6ac3688526d5b331 Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Sun, 23 Jul 2023 20:05:00 +0800 Subject: [PATCH 011/136] automata/doc: fix typo when describing implicit unanchored prefix PR #1052 --- regex-automata/src/util/search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index b7bf934ea..39aec522b 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -246,7 +246,7 @@ impl<'h> Input<'h> { /// When a search is anchored (so that's [`Anchored::Yes`] or /// [`Anchored::Pattern`]), a match must begin at the start of a search. /// When a search is not anchored (that's [`Anchored::No`]), regex engines - /// will behave as if the pattern started with a `(?:s-u.)*?`. This prefix + /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix /// permits a match to appear anywhere. /// /// By default, the anchored mode is [`Anchored::No`]. 
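The anchored-search fixes in patches 004 and 005, and the unanchored prefix notation corrected in patch 011, all surface through the same API. A minimal sketch of the regression from #1036, assuming the regex-automata 0.3 `Input`/`Anchored` API:

```rust
use regex_automata::{nfa::thompson::pikevm::PikeVM, Anchored, Input};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new(r".c")?;
    let mut cache = re.create_cache();
    // An anchored search must begin its match at the search start
    // position, so '.c' must not match 'abc' here. Reporting a match
    // at 1..3 was the bug fixed in patches 004 and 005.
    let input = Input::new("abc").anchored(Anchored::Yes);
    assert_eq!(None, re.find(&mut cache, input));
    // An unanchored search behaves as if the pattern were prefixed
    // with '(?s-u:.)*?' and so finds 'bc' at 1..3.
    let input = Input::new("abc").anchored(Anchored::No);
    assert!(re.find(&mut cache, input).is_some());
    Ok(())
}
```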
From 9a8720f6b5d946cdb2d2e9be92986e595cda60c5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 28 Jul 2023 19:42:33 -0400 Subject: [PATCH 012/136] automata: bump regex-syntax to latest version Fixes #1056 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 1936cf783..c64df5efc 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -85,7 +85,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.5.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.7.0", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.7.4", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" From a1910244f873e003efbe6e80ad9302c8ea949430 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 28 Jul 2023 19:43:04 -0400 Subject: [PATCH 013/136] regex-automata-0.3.4 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index c64df5efc..1f423c605 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.3" #:version +version = "0.3.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From e10c9d7b56d3f33f48abf487a3d353f64f67897b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 3 Aug 2023 11:11:15 -0400 Subject: [PATCH 014/136] automata: add new 'WhichCaptures' config This is the first step in fixing a regression in memory usage. The underlying problem is that regex-automata now natively supports multi-pattern regexes *with* capturing support. Unfortunately though, this overall doesn't work too well with the current design of the PikeVM, because the amount of memory used is `len(captures) * len(states)`. So basically, as the regex and number of captures increases, the amount of memory used gets quite high. This is new functionality that we hope to improve upon over time, so it's not too big of a deal on its own. But it turns out this impacts previous uses of RegexSet that have capture groups. The old implementation just ignored these capture groups because they weren't supported in a RegexSet, and thus there were no memory problems. But in the new implementation, nothing tells it that it's okay to ignore the capture groups. So it winds up allocating space for them even though the RegexSet APIs don't provide any of that functionality. So my plan to fix this is to introduce a new configuration knob for controlling more granularly which capture states are compiled into the NFA. Previously we only supported "all of them" or "none of them." This commit adds a new (backwards compatible) knob that also permits "just implicit groups." That is, one capture group per pattern. This hopefully leads to less memory usage overall. (Well, it will certainly be less, but hopefully it's a big reduction.) We don't actually change anything here. We just add a new `Config::which_captures` knob, implement the existing `Config::captures` in terms of `Config::which_captures` and deprecate `Config::captures`.
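As a minimal sketch of how the new knob is meant to be used (names as introduced in this patch; the compiler only starts honoring `Implicit` in a follow-up patch):

```rust
use regex_automata::nfa::thompson::{NFA, WhichCaptures};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The deprecated `captures(true)`/`captures(false)` now forward to
    // `WhichCaptures::All`/`WhichCaptures::None`. `Implicit` is the new
    // middle ground: one capture group per pattern, just enough to
    // report overall match offsets.
    let nfa = NFA::compiler()
        .configure(NFA::config().which_captures(WhichCaptures::Implicit))
        .build_many(&[r"(a)(b)", r"(x)(y)"])?;
    // Only the implicit (whole-match) group of each pattern remains.
    assert_eq!(2, nfa.group_info().all_group_len());
    Ok(())
}
```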
If this winds up not being sufficient, then we may need to adapt the PikeVM to work without any capture groups at all and instead just report which patterns match. Which is... probably fine? --- regex-automata/src/dfa/dense.rs | 5 +- regex-automata/src/hybrid/dfa.rs | 5 +- regex-automata/src/meta/strategy.rs | 11 +- regex-automata/src/nfa/thompson/compiler.rs | 174 +++++++++++++++++--- regex-automata/src/nfa/thompson/mod.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 8 +- regex-automata/src/util/captures.rs | 16 +- 7 files changed, 182 insertions(+), 39 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 75ca85e6e..6da865f97 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1170,7 +1170,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(&nfa) diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 86963248f..67261c1a3 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3973,7 +3973,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(nfa) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index aa1d61ef3..52a501bf6 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -13,7 +13,7 @@ use crate::{ regex::{Cache, RegexInfo}, reverse_inner, wrappers, }, - nfa::thompson::{self, NFA}, + nfa::thompson::{self, WhichCaptures, NFA}, util::{ captures::{Captures, GroupInfo}, look::LookMatcher, @@ -452,7 +452,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .captures(true) + .which_captures(WhichCaptures::All) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) @@ -499,7 +499,10 @@ impl Core { // useful with capturing groups in reverse. And of course, // the lazy DFA ignores capturing groups in all cases. 
.configure( - thompson_config.clone().captures(false).reverse(true), + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), ) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; @@ -1480,7 +1483,7 @@ impl ReverseInner { .utf8(core.info.config().get_utf8_empty()) .nfa_size_limit(core.info.config().get_nfa_size_limit()) .shrink(false) - .captures(false) + .which_captures(WhichCaptures::None) .look_matcher(lookm); let result = thompson::Compiler::new() .configure(thompson_config) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2021d93ea..6cc79822a 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -30,7 +30,7 @@ pub struct Config { reverse: Option, nfa_size_limit: Option>, shrink: Option, - captures: Option, + which_captures: Option, look_matcher: Option, #[cfg(test)] unanchored_prefix: Option, @@ -178,12 +178,15 @@ impl Config { /// ``` /// use regex_automata::{ /// dfa::{self, Automaton}, - /// nfa::thompson::NFA, + /// nfa::thompson::{NFA, WhichCaptures}, /// HalfMatch, Input, /// }; /// /// let dfa = dfa::dense::Builder::new() - /// .thompson(NFA::config().captures(false).reverse(true)) + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) /// .build("baz[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!( @@ -277,10 +280,12 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Currently we have to disable captures when enabling reverse NFA. - /// let config = NFA::config().captures(false).reverse(true); + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); /// let not_shrunk = NFA::compiler() /// .configure(config.clone().shrink(false)) /// .build(r"\w")?; @@ -314,18 +319,70 @@ impl Config { /// require capturing groups to be present in the NFA. Building a Pike VM /// with an NFA without capturing groups will result in an error. /// + /// (Note that since this method is deprecated, the example below uses + /// [`Config::which_captures`] to disable capture states.) + /// /// ``` - /// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA}; + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; /// assert!(PikeVM::new_from_nfa(nfa).is_err()); /// /// # Ok::<(), Box>(()) /// ``` - pub fn captures(mut self, yes: bool) -> Config { - self.captures = Some(yes); + #[deprecated(since = "0.3.5", note = "use which_captures instead")] + pub fn captures(self, yes: bool) -> Config { + self.which_captures(if yes { + WhichCaptures::All + } else { + WhichCaptures::None + }) + } + + /// Configures what kinds of capture groups are compiled into + /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a + /// Thompson NFA. + /// + /// Currently, using any option except for [`WhichCaptures::None`] requires + /// disabling the [`reverse`](Config::reverse) setting. If both are + /// enabled, then the compiler will return an error. It is expected that + /// this limitation will be lifted in the future. 
+ /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the + /// overhead of capture states for explicit groups. Usually this occurs + /// when one wants to use the `PikeVM` only for determining the overall + /// match. Otherwise, the `PikeVM` could use much more memory than is + /// necessary. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing groups to be present in the NFA. Building a Pike VM + /// with an NFA without capturing groups will result in an error. + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } @@ -405,8 +462,14 @@ impl Config { } /// Return whether NFA compilation is configured to produce capture states. + #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] pub fn get_captures(&self) -> bool { - self.captures.unwrap_or(true) + self.get_which_captures().is_any() + } + + /// Return what kinds of capture states will be compiled into an NFA. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) } /// Return the look-around matcher for this NFA. @@ -439,7 +502,7 @@ impl Config { reverse: o.reverse.or(self.reverse), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), shrink: o.shrink.or(self.shrink), - captures: o.captures.or(self.captures), + which_captures: o.which_captures.or(self.which_captures), look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), #[cfg(test)] unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), } } } +/// A configuration indicating which kinds of +/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. +/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA).
+ None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + /* This compiler below uses Thompson's construction algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph @@ -800,7 +914,9 @@ impl Compiler { if exprs.len() > PatternID::LIMIT { return Err(BuildError::too_many_patterns(exprs.len())); } - if self.config.get_reverse() && self.config.get_captures() { + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { return Err(BuildError::unsupported_captures()); } @@ -978,7 +1094,7 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if !self.config.get_captures() { + if self.config.get_which_captures().is_none() { return self.c(expr); } @@ -1728,9 +1844,15 @@ mod tests { util::primitives::{PatternID, StateID}, }; + use super::*; + fn build(pattern: &str) -> NFA { NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build(pattern) .unwrap() } @@ -1794,7 +1916,7 @@ mod tests { #[test] fn compile_unanchored_prefix() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false)) + .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( @@ -1827,7 +1949,11 @@ mod tests { // Check that non-UTF-8 literals work. 
let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); @@ -1937,7 +2063,7 @@ mod tests { let nfa = NFA::compiler() .configure( NFA::config() - .captures(false) + .which_captures(WhichCaptures::None) .reverse(true) .shrink(false) .unanchored_prefix(false), @@ -1965,7 +2091,11 @@ mod tests { #[test] fn compile_many_start_pattern() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( @@ -1993,7 +2123,9 @@ mod tests { use regex_syntax::hir::{Class, ClassBytes, Hir}; let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); @@ -2005,7 +2137,9 @@ mod tests { use regex_syntax::hir::{Class, ClassUnicode, Hir}; let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); diff --git a/regex-automata/src/nfa/thompson/mod.rs b/regex-automata/src/nfa/thompson/mod.rs index 3581d738c..cf426736d 100644 --- a/regex-automata/src/nfa/thompson/mod.rs +++ b/regex-automata/src/nfa/thompson/mod.rs @@ -78,4 +78,4 @@ pub use self::{ }, }; #[cfg(feature = "syntax")] -pub use compiler::{Compiler, Config}; +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 86131406c..2108fa338 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -453,10 +453,10 @@ impl NFA { /// predict the anchored starting state. /// /// ``` - /// use regex_automata::nfa::thompson::{NFA, State}; + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("a")?; /// let state = nfa.state(nfa.start_anchored()); /// match *state { @@ -711,7 +711,7 @@ impl NFA { /// or not. /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Obviously has capture states. /// let nfa = NFA::new("(a)")?; @@ -733,7 +733,7 @@ impl NFA { /// // Notice that 'has_capture' is false here even when we have an /// // explicit capture group in the pattern. 
/// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("(a)")?; /// assert!(!nfa.has_capture()); /// diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index c6517348d..cd3a5f8f7 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -1810,10 +1810,10 @@ impl GroupInfo { /// panic even if captures aren't enabled on this NFA: /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build_many(&[ /// r"(?Pa)", /// r"a", @@ -1958,7 +1958,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -1970,13 +1970,13 @@ impl GroupInfo { /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. @@ -2000,7 +2000,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -2017,13 +2017,13 @@ impl GroupInfo { /// assert_eq!(5, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. From a2ec566c7fc0f878effa7e4d36e8cbd4c51dcf71 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:12:36 -0400 Subject: [PATCH 015/136] automata: respect new 'which_captures' option The NFA compiler now implements the 'All', 'Implicit' and 'None' options. We also add some targeted unit tests to confirm basic behavior. 
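The unit tests added below pin down the exact NFA state layouts; condensed into a sketch, the observable effect on group counts looks like this (assuming the `WhichCaptures` API from the previous patch):

```rust
use regex_automata::nfa::thompson::{NFA, WhichCaptures};

fn group_len(which: WhichCaptures) -> usize {
    let nfa = NFA::compiler()
        .configure(NFA::config().which_captures(which))
        .build("a(b)c")
        .unwrap();
    nfa.group_info().all_group_len()
}

fn main() {
    assert_eq!(2, group_len(WhichCaptures::All)); // implicit group + (b)
    assert_eq!(1, group_len(WhichCaptures::Implicit)); // whole match only
    assert_eq!(0, group_len(WhichCaptures::None));
}
```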
--- regex-automata/src/nfa/thompson/compiler.rs | 90 ++++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 6cc79822a..fc3e57710 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1094,8 +1094,13 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if self.config.get_which_captures().is_none() { - return self.c(expr); + match self.config.get_which_captures() { + // No capture states means we always skip them. + WhichCaptures::None => return self.c(expr), + // Implicit captures states means we only add when index==0 since + // index==0 implies the group is implicit. + WhichCaptures::Implicit if index > 0 => return self.c(expr), + _ => {} } let start = self.add_capture_start(index, name)?; @@ -1841,7 +1846,7 @@ mod tests { use crate::{ nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, StateID}, + util::primitives::{PatternID, SmallIndex, StateID}, }; use super::*; @@ -1903,6 +1908,15 @@ mod tests { } } + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + fn s_fail() -> State { State::Fail } @@ -2144,4 +2158,74 @@ mod tests { NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } + + #[test] + fn compile_captures_all() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::All), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_cap(3, 0, 1, 2), + s_byte(b'b', 4), + s_cap(5, 0, 1, 3), + s_byte(b'c', 6), + s_cap(7, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(2, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_implicit() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::Implicit), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_byte(b'b', 3), + s_byte(b'c', 4), + s_cap(5, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(1, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_none() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::None), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] + ); + let ginfo = nfa.group_info(); + assert_eq!(0, ginfo.all_group_len()); + } } From 04b11b6e190dc98a7de14fccc4f50c08fcd31237 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:50:23 -0400 Subject: [PATCH 016/136] automata: add 'which_captures' knob to meta::Regex This propagates the new Thompson NFA compiler option to the meta regex config API. 
--- regex-automata/src/meta/regex.rs | 80 +++++++++++++++++++++++++++++ regex-automata/src/meta/strategy.rs | 2 +- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 6e16ceedb..bc043793d 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -16,6 +16,7 @@ use crate::{ strategy::{self, Strategy}, wrappers, }, + nfa::thompson::WhichCaptures, util::{ captures::{Captures, GroupInfo}, iter, @@ -2429,6 +2430,7 @@ pub struct Config { utf8_empty: Option<bool>, autopre: Option<bool>, pre: Option<Option<Prefilter>>, + which_captures: Option<WhichCaptures>, nfa_size_limit: Option<Option<usize>>, onepass_size_limit: Option<Option<usize>>, hybrid_cache_capacity: Option<usize>, @@ -2619,6 +2621,75 @@ impl Config { Config { pre: Some(pre), ..self } } + /// Configures what kinds of groups are compiled as "capturing" in the + /// underlying regex engine. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the + /// overhead of capture states for explicit groups. + /// + /// Note that another approach to avoiding the overhead of capture groups + /// is by using non-capturing groups in the regex pattern. That is, + /// `(?:a)` instead of `(a)`. This option is useful when you can't control + /// the concrete syntax but know that you don't need the underlying capture + /// states. For example, using `WhichCaptures::Implicit` will behave as if + /// all explicit capturing groups in the pattern were non-capturing. + /// + /// Setting this to `WhichCaptures::None` may result in an error when + /// building a meta regex. + /// + /// # Example + /// + /// This example demonstrates how the results of capture groups can change + /// based on this option. First we show the default (all capture groups in + /// the pattern are capturing): + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, Span}; + /// + /// let re = Regex::new(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And now we show the behavior when we only include implicit capture + /// groups. In this case, we can only find the overall match span, but the + /// spans of any other explicit group don't exist because they are treated + /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, + /// there is no real point in using [`Regex::captures`] since it will never + /// be able to report more information than [`Regex::find`].) + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(None, caps.get_group(1)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } /// Sets the size limit, in bytes, to enforce on the construction of every /// NFA built by the meta regex engine.
/// @@ -2983,6 +3054,14 @@ impl Config { self.pre.as_ref().unwrap_or(&None).as_ref() } + /// Returns the capture configuration, as set by + /// [`Config::which_captures`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. @@ -3126,6 +3205,7 @@ impl Config { utf8_empty: o.utf8_empty.or(self.utf8_empty), autopre: o.autopre.or(self.autopre), pre: o.pre.or_else(|| self.pre.clone()), + which_captures: o.which_captures.or(self.which_captures), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), onepass_size_limit: o .onepass_size_limit diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 52a501bf6..86610fbea 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -452,7 +452,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .which_captures(WhichCaptures::All) + .which_captures(info.config().get_which_captures()) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) From 3127e3b57d5cfe491c86fcc8a2a451ab666beb8f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:51:36 -0400 Subject: [PATCH 017/136] regex: use new 'which_captures' knob for RegexSet While this reduces memory usage by half, unfortunately, it's still quite a bit more than memory usage prior to regex 1.9. This is because we are still allocating room to store two offsets per regex for a rather large regex. --- src/builders.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/builders.rs b/src/builders.rs index d19a0ffe2..a0f9b28b5 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -28,7 +28,9 @@ use alloc::{ vec::Vec, }; -use regex_automata::{meta, util::syntax, MatchKind}; +use regex_automata::{ + meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, +}; use crate::error::Error; @@ -100,8 +102,12 @@ impl Builder { } fn build_many_string(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(true); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(true) + .which_captures(WhichCaptures::Implicit); let syntaxc = self.syntaxc.clone().utf8(true); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -113,8 +119,12 @@ impl Builder { } fn build_many_bytes(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(false); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(false) + .which_captures(WhichCaptures::Implicit); let syntaxc = self.syntaxc.clone().utf8(false); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() From e29b915c3878fadfe63041f29ddb5f4a5bfc4f8d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 13:54:56 -0400 Subject: [PATCH 018/136] automata: make PikeVM and backtracker work without capture states Previously, construction of these engines checked to make sure the NFA given had some capture states in it. If the NFA didn't, construction failed with an error. 
To support the case where the NFA has no capture states at all (to avoid gratuitous memory allocation), we remove this restriction and tweak the engine implementations to stop assuming that the NFA has capture states. This turned out to not be too hard, as we only assumed as much in a few places. The main reason why this restriction existed in the first place was semantics. Namely, it's important that the PikeVM remain infallible. But what happens when you ask for match offsets in a search with an NFA that has no capture states? The PikeVM just doesn't support that. Previously it would panic (and thus the reason construction would fail). But now instead it will just report "no match." It's a little hokey, but we justify it to ourselves because "simplicity" and "avoids footguns" are non-goals of this crate. --- regex-automata/src/meta/regex.rs | 6 ++- regex-automata/src/nfa/thompson/backtrack.rs | 29 ++++++------ regex-automata/src/nfa/thompson/compiler.rs | 46 +++++++++++++++----- regex-automata/src/nfa/thompson/error.rs | 12 ----- regex-automata/src/nfa/thompson/pikevm.rs | 40 +++++++---------- 5 files changed, 72 insertions(+), 61 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index bc043793d..0d40eaa40 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -2635,8 +2635,10 @@ impl Config { /// states. For example, using `WhichCaptures::Implicit` will behave as if /// all explicit capturing groups in the pattern were non-capturing. /// - /// Setting this to `WhichCaptures::None` may result in an error when - /// building a meta regex. + /// Setting this to `WhichCaptures::None` is usually not the right thing to + /// do. When no capture states are compiled, some regex engines (such as + /// the `PikeVM`) won't be able to report match offsets. This will manifest + /// as no match being found. /// /// # Example /// diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 75b6c096b..c68f9fa42 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -300,15 +300,6 @@ impl Builder { &self, nfa: NFA, ) -> Result { - // If the NFA has no captures, then the backtracker doesn't work since - // it relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the backtracker to work with regexes with zero patterns. 
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -954,8 +945,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; return Ok(Some(Match::new(pid, Span { start, end }))); } let ginfo = self.get_nfa().group_info(); @@ -965,8 +962,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; Ok(Some(Match::new(pid, Span { start, end }))) } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index fc3e57710..065e9ef27 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -316,8 +316,8 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. /// /// (Note that since this method is deprecated, the example below uses /// [`Config::which_captures`] to disable capture states.) @@ -329,10 +329,13 @@ impl Config { /// WhichCaptures, /// }; /// - /// let nfa = NFA::compiler() - /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; - /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); /// /// # Ok::<(), Box>(()) /// ``` @@ -364,8 +367,8 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. 
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     pikevm::PikeVM,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
-    /// let nfa = NFA::compiler()
-    ///     .configure(NFA::config().which_captures(WhichCaptures::None))
+    /// let re = PikeVM::builder()
+    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
+    ///     .build(r"[a-z]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert!(re.is_match(&mut cache, "abc"));
+    /// assert_eq!(None, re.find(&mut cache, "abc"));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// The same applies to the bounded backtracker:
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::{
+    ///     backtrack::BoundedBacktracker,
+    ///     NFA,
+    ///     WhichCaptures,
+    /// };
+    ///
+    /// let re = BoundedBacktracker::builder()
+    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
     ///     .build(r"[a-z]+")?;
-    /// assert!(PikeVM::new_from_nfa(nfa).is_err());
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert!(re.try_is_match(&mut cache, "abc")?);
+    /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
     ///
     /// # Ok::<(), Box<dyn std::error::Error>>(())
     /// ```
diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs
index 82648813b..3c2fa8a21 100644
--- a/regex-automata/src/nfa/thompson/error.rs
+++ b/regex-automata/src/nfa/thompson/error.rs
@@ -68,9 +68,6 @@ enum BuildErrorKind {
         /// The invalid index that was given.
         index: u32,
     },
-    /// An error that occurs when one tries to build an NFA simulation (such as
-    /// the PikeVM) without any capturing groups.
-    MissingCaptures,
     /// An error that occurs when one tries to build a reverse NFA with
     /// captures enabled. Currently, this isn't supported, but we probably
     /// should support it at some point.
@@ -126,10 +123,6 @@ impl BuildError {
         BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } }
     }
 
-    pub(crate) fn missing_captures() -> BuildError {
-        BuildError { kind: BuildErrorKind::MissingCaptures }
-    }
-
     #[cfg(feature = "syntax")]
     pub(crate) fn unsupported_captures() -> BuildError {
         BuildError { kind: BuildErrorKind::UnsupportedCaptures }
@@ -181,11 +174,6 @@ impl core::fmt::Display for BuildError {
                 "capture group index {} is invalid (too big or discontinuous)",
                 index,
             ),
-            BuildErrorKind::MissingCaptures => write!(
-                f,
-                "operation requires the NFA to have capturing groups, \
-                 but the NFA given contains none",
-            ),
             #[cfg(feature = "syntax")]
             BuildErrorKind::UnsupportedCaptures => write!(
                 f,
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
index 79ce3c60d..f5c0b200e 100644
--- a/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -275,15 +275,6 @@ impl Builder {
     /// construction of the NFA itself will of course be ignored, since the NFA
     /// given here is already built.
     pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
-        // If the NFA has no captures, then the PikeVM doesn't work since it
-        // relies on them in order to report match locations. However, in
-        // the special case of an NFA with no patterns, it is allowed, since
-        // no matches can ever be produced. And importantly, an NFA with no
-        // patterns has no capturing groups anyway, so this is necessary to
-        // permit the PikeVM to work with regexes with zero patterns.
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } @@ -828,16 +819,16 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = slots[0]?.get(); + let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } @@ -1123,15 +1114,15 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger - // than `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); return got; } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger than - // `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); got } @@ -2108,15 +2099,16 @@ impl SlotTable { // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. - self.slots_for_captures = nfa.group_info().slot_len(); + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); let len = nfa .states() .len() - // We add 1 so that our last row is always empty. We use it as - // "scratch" space for computing the epsilon closure off of the - // starting state. - .checked_add(1) - .and_then(|x| x.checked_mul(self.slots_per_state)) + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. Should we // somehow convert this to an error? What about something similar @@ -2170,7 +2162,7 @@ impl SlotTable { /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. 
    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
-        let i = self.table.len() - self.slots_per_state;
+        let i = self.table.len() - self.slots_for_captures;
         &mut self.table[i..i + self.slots_for_captures]
     }
 }

From 930770bb8b4811b80a9cfbd0237d1f225e7c7c20 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 4 Aug 2023 13:58:30 -0400
Subject: [PATCH 019/136] regex: switch RegexSet to use WhichCaptures::None

And this finally resolves the memory usage problem, as the PikeVM cache
used by the RegexSet in #1059 no longer allocates MBs of memory because
of the existence of impossible-to-use capturing groups.

Fixes #1059
---
 src/builders.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/builders.rs b/src/builders.rs
index a0f9b28b5..46c4824c5 100644
--- a/src/builders.rs
+++ b/src/builders.rs
@@ -107,7 +107,7 @@ impl Builder {
                 .clone()
                 .match_kind(MatchKind::All)
                 .utf8_empty(true)
-                .which_captures(WhichCaptures::Implicit);
+                .which_captures(WhichCaptures::None);
             let syntaxc = self.syntaxc.clone().utf8(true);
             let patterns = Arc::from(self.pats.as_slice());
             meta::Builder::new()
@@ -124,7 +124,7 @@ impl Builder {
                 .clone()
                 .match_kind(MatchKind::All)
                 .utf8_empty(false)
-                .which_captures(WhichCaptures::Implicit);
+                .which_captures(WhichCaptures::None);
             let syntaxc = self.syntaxc.clone().utf8(false);
             let patterns = Arc::from(self.pats.as_slice());
             meta::Builder::new()

From e003cae98254d0ad3ff47b0919143531ddb58689 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 5 Aug 2023 10:41:24 -0400
Subject: [PATCH 020/136] automata: add 'is_match' as its own path to meta
 regex internals

I originally prided myself on not having a dedicated `is_match` routine
on the meta regex engine's internal `Strategy` trait, and actually paid
a fair amount of attention to ensuring that `is_match` and `find`
always returned the same results. That is, `is_match` returns true if
and only if `find` returns a match.

But the fix in the previous commits for #1059 means that a `PikeVM` and
a `BoundedBacktracker` can be used to run a search with an NFA that has
no capture states. Since both engines are implemented to only track
offsets via those capture states, it follows that the only thing that
can be returned in such cases is whether a match occurs (and if so,
which pattern matched). That in turn means that `is_match` can return
`true` while `find` can return `None` for the same search. This is
because the latter returns `None` even when a match is found but there
are no capture states to record the offsets of the match.

This in theory could be resolved by adding APIs to the `PikeVM` and the
`BoundedBacktracker` that return a `HalfMatch` without depending on any
capture states at all. Then `is_match` could be implemented in terms of
those APIs. That is probably the right path, but it's pretty gnarly to
do without breaking changes and I don't want to do any breaking changes
right now.

So instead, we just add a special path to the meta regex engine for
`is_match` and permit some cases to have different results between
`is_match` and `find`. Sigh.
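To make the divergence concrete at the engine level, the following
sketch mirrors the doc examples updated earlier in this series. It is
illustrative only and not part of this patch's diff:

    use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA, WhichCaptures};

    let re = PikeVM::builder()
        .thompson(NFA::config().which_captures(WhichCaptures::None))
        .build(r"[a-z]+")
        .unwrap();
    let mut cache = re.create_cache();

    // A match is detected...
    assert!(re.is_match(&mut cache, "abc"));
    // ...but with no capture states there are no offsets to report,
    // so 'find' reports no match at all.
    assert_eq!(None, re.find(&mut cache, "abc"));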
---
 regex-automata/src/meta/regex.rs    |   9 ++-
 regex-automata/src/meta/strategy.rs | 120 +++++++++++++++++++++++++++-
 regex-automata/src/meta/wrappers.rs |  32 ++++++++
 3 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index 0d40eaa40..3a04b14d8 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -529,7 +529,14 @@ impl Regex {
     #[inline]
     pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
         let input = input.into().earliest(true);
-        self.search_half(&input).is_some()
+        if self.imp.info.is_impossible(&input) {
+            return false;
+        }
+        let mut guard = self.pool.get();
+        let result = self.imp.strat.is_match(&mut guard, &input);
+        // See 'Regex::search' for why we put the guard back explicitly.
+        PoolGuard::put(guard);
+        result
     }
 
     /// Executes a leftmost search and returns the first match that is found,
diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index 86610fbea..ea6c6ab57 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -58,6 +58,8 @@ pub(super) trait Strategy:
         input: &Input<'_>,
     ) -> Option<HalfMatch>;
 
+    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;
+
     fn search_slots(
         &self,
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
     ) -> Option<PatternID>;
 
@@ -399,6 +401,10 @@ impl<P: PrefilterI> Strategy for Pre<P> {

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + fn search_slots( &self, cache: &mut Cache, @@ -623,6 +629,29 @@ impl Core { } } + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } @@ -703,7 +732,7 @@ impl Strategy for Core { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. - return if let Some(e) = self.dfa.get(input) { + if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, @@ -723,7 +752,38 @@ impl Strategy for Core { } } else { self.search_half_nofail(cache, input) - }; + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -983,6 +1043,21 @@ impl Strategy for ReverseAnchored { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1335,6 +1410,28 @@ impl Strategy for ReverseSuffix { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + 
); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1717,6 +1814,25 @@ impl Strategy for ReverseInner { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 8f58363a1..08110d9bb 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -87,6 +87,15 @@ impl PikeVMEngine { Ok(PikeVMEngine(engine)) } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, @@ -212,6 +221,29 @@ impl BoundedBacktrackerEngine { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, From d93ddbefd77f61a771a9a71ac345e117c0c43054 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 13:52:37 -0400 Subject: [PATCH 021/136] automata: add internal HalfMatch APIs for NFA engines Welp, okay, turns out we do need to know at least the end offset of a match even when the NFA has no capture states. This is necessary for correctly handling the case where a regex can match the empty string but the caller has asked that matches not split a codepoint. If we don't know the end offset of a match, then we can't correctly determine whether a match exists or not and are forced to return no match even when a match exists. We can get away with this I think for `find`-style APIs where the caller has specifically requested match offsets while simultaneously configuring the NFA to not track offsets, but with `is_match`-style APIs, we really should be able to handle it correctly. We should eventually just expose the `HalfMatch` APIs on `PikeVM` and `BoundedBacktracker`, but for now we keep them private. 
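As a concrete sketch of why the offset is needed, consider the UTF-8
empty-match rule that `skip_splits_fwd` enforces. The snippet below
uses the public `PikeVM` API and follows from the documented UTF-8 mode
behavior; it is not part of this patch's diff:

    use regex_automata::nfa::thompson::pikevm::PikeVM;

    // An empty regex matches at every position, but in UTF-8 mode a
    // match may never split a codepoint.
    let re = PikeVM::new(r"").unwrap();
    let mut cache = re.create_cache();
    // '☃' is three UTF-8 code units, so the only valid empty matches
    // in "☃" are at offsets 0 and 3. Deciding that offsets 1 and 2 must
    // be skipped requires knowing the match offset, which is why a
    // HalfMatch (and not just a PatternID) has to flow out of the
    // search even when no capture states exist.
    let matches: Vec<(usize, usize)> = re
        .find_iter(&mut cache, "☃")
        .map(|m| (m.start(), m.end()))
        .collect();
    assert_eq!(vec![(0, 0), (3, 3)], matches);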
---
 regex-automata/src/nfa/thompson/backtrack.rs | 59 ++++++++-----------
 regex-automata/src/nfa/thompson/pikevm.rs    | 60 +++++++++-----------
 2 files changed, 49 insertions(+), 70 deletions(-)

diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs
index c68f9fa42..eba037c1d 100644
--- a/regex-automata/src/nfa/thompson/backtrack.rs
+++ b/regex-automata/src/nfa/thompson/backtrack.rs
@@ -19,7 +19,7 @@ use crate::{
         empty, iter,
         prefilter::Prefilter,
         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
-        search::{Anchored, Input, Match, MatchError, Span},
+        search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
     },
 };
 
@@ -1295,12 +1295,14 @@ impl BoundedBacktracker {
     ) -> Result<Option<PatternID>, MatchError> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
         if !utf8empty {
-            return self.try_search_slots_imp(cache, input, slots);
+            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+            return Ok(maybe_hm.map(|hm| hm.pattern()));
         }
         // See PikeVM::try_search_slots for why we do this.
         let min = self.get_nfa().group_info().implicit_slot_len();
         if slots.len() >= min {
-            return self.try_search_slots_imp(cache, input, slots);
+            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+            return Ok(maybe_hm.map(|hm| hm.pattern()));
         }
         if self.get_nfa().pattern_len() == 1 {
             let mut enough = [None, None];
             let got = self.try_search_slots_imp(cache, input, &mut enough)?;
             // This is OK because we know `enough_slots` is strictly bigger
             // than `slots`, otherwise this special case isn't reached.
             slots.copy_from_slice(&enough[..slots.len()]);
-            return Ok(got);
+            return Ok(got.map(|hm| hm.pattern()));
         }
         let mut enough = vec![None; min];
         let got = self.try_search_slots_imp(cache, input, &mut enough)?;
         // This is OK because we know `enough_slots` is strictly bigger than
         // `slots`, otherwise this special case isn't reached.
         slots.copy_from_slice(&enough[..slots.len()]);
-        Ok(got)
+        Ok(got.map(|hm| hm.pattern()))
     }
 
     /// This is the actual implementation of `try_search_slots_imp` that
@@ -1328,30 +1330,17 @@ impl BoundedBacktracker {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Result<Option<PatternID>, MatchError> {
+    ) -> Result<Option<HalfMatch>, MatchError> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
-        let (pid, end) = match self.search_imp(cache, input, slots)? {
+        let hm = match self.search_imp(cache, input, slots)? {
             None => return Ok(None),
-            Some(pid) if !utf8empty => return Ok(Some(pid)),
-            Some(pid) => {
-                let slot_start = pid.as_usize() * 2;
-                let slot_end = slot_start + 1;
-                // OK because we know we have a match and we know our caller
-                // provided slots are big enough (which we make true above if
-                // the caller didn't). Namely, we're only here when 'utf8empty'
-                // is true, and when that's true, we require slots for every
-                // pattern.
-                (pid, slots[slot_end].unwrap().get())
-            }
+            Some(hm) if !utf8empty => return Ok(Some(hm)),
+            Some(hm) => hm,
         };
-        empty::skip_splits_fwd(input, pid, end, |input| {
-            let pid = match self.search_imp(cache, input, slots)? {
-                None => return Ok(None),
-                Some(pid) => pid,
-            };
-            let slot_start = pid.as_usize() * 2;
-            let slot_end = slot_start + 1;
-            Ok(Some((pid, slots[slot_end].unwrap().get())))
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)?
+                .map(|hm| (hm, hm.offset())))
         })
     }
 
@@ -1367,7 +1356,7 @@ impl BoundedBacktracker {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Result<Option<PatternID>, MatchError> {
+    ) -> Result<Option<HalfMatch>, MatchError> {
         // Unlike in the PikeVM, we write our capturing group spans directly
         // into the caller's captures groups. So we have to make sure we're
         // starting with a blank slate first. In the PikeVM, we avoid this
@@ -1414,10 +1403,9 @@ impl BoundedBacktracker {
                     Some(ref span) => at = span.start,
                 }
             }
-            if let Some(pid) =
-                self.backtrack(cache, input, at, start_id, slots)
+            if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
             {
-                return Ok(Some(pid));
+                return Ok(Some(hm));
             }
             at += 1;
         }
@@ -1438,14 +1426,13 @@ impl BoundedBacktracker {
         at: usize,
         start_id: StateID,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         cache.stack.push(Frame::Step { sid: start_id, at });
         while let Some(frame) = cache.stack.pop() {
             match frame {
                 Frame::Step { sid, at } => {
-                    if let Some(pid) = self.step(cache, input, sid, at, slots)
-                    {
-                        return Some(pid);
+                    if let Some(hm) = self.step(cache, input, sid, at, slots) {
+                        return Some(hm);
                     }
                 }
                 Frame::RestoreCapture { slot, offset } => {
@@ -1475,7 +1462,7 @@ impl BoundedBacktracker {
         mut sid: StateID,
         mut at: usize,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         loop {
             if !cache.visited.insert(sid, at - input.start()) {
                 return None;
             }
@@ -1558,7 +1545,7 @@ impl BoundedBacktracker {
                 }
                 State::Fail => return None,
                 State::Match { pattern_id } => {
-                    return Some(pattern_id);
+                    return Some(HalfMatch::new(pattern_id, at));
                 }
             }
         }
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
index f5c0b200e..0128c151a 100644
--- a/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -17,7 +17,9 @@ use crate::{
         empty, iter,
         prefilter::Prefilter,
         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
-        search::{Anchored, Input, Match, MatchKind, PatternSet, Span},
+        search::{
+            Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
+        },
         sparse_set::SparseSet,
     },
 };
@@ -1094,7 +1096,8 @@ impl PikeVM {
    ) -> Option<PatternID> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        if !utf8empty {
-            return self.search_slots_imp(cache, input, slots);
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
        }
        // There is an unfortunate special case where if the regex can
        // match the empty string and UTF-8 mode is enabled, the search
@@ -1109,7 +1112,8 @@ impl PikeVM {
        // this case.
        let min = self.get_nfa().group_info().implicit_slot_len();
        if slots.len() >= min {
-            return self.search_slots_imp(cache, input, slots);
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
        }
        if self.get_nfa().pattern_len() == 1 {
            let mut enough = [None, None];
@@ -1117,14 +1121,14 @@ impl PikeVM {
            // This is OK because we know `enough` is strictly bigger than
            // `slots`, otherwise this special case isn't reached.
            slots.copy_from_slice(&enough[..slots.len()]);
-            return got;
+            return got.map(|hm| hm.pattern());
        }
        let mut enough = vec![None; min];
        let got = self.search_slots_imp(cache, input, &mut enough);
        // This is OK because we know `enough` is strictly bigger than `slots`,
        // otherwise this special case isn't reached.
        slots.copy_from_slice(&enough[..slots.len()]);
-        got
+        got.map(|hm| hm.pattern())
    }
 
    /// This is the actual implementation of `search_slots_imp` that
@@ -1137,30 +1141,17 @@ impl PikeVM {
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
-        let (pid, end) = match self.search_imp(cache, input, slots) {
+        let hm = match self.search_imp(cache, input, slots) {
            None => return None,
-            Some(pid) if !utf8empty => return Some(pid),
-            Some(pid) => {
-                let slot_start = pid.as_usize() * 2;
-                let slot_end = slot_start + 1;
-                // OK because we know we have a match and we know our caller
-                // provided slots are big enough (which we make true above if
-                // the caller didn't). Namely, we're only here when 'utf8empty'
-                // is true, and when that's true, we require slots for every
-                // pattern.
-                (pid, slots[slot_end].unwrap().get())
-            }
+            Some(hm) if !utf8empty => return Some(hm),
+            Some(hm) => hm,
        };
-        empty::skip_splits_fwd(input, pid, end, |input| {
-            let pid = match self.search_imp(cache, input, slots) {
-                None => return Ok(None),
-                Some(pid) => pid,
-            };
-            let slot_start = pid.as_usize() * 2;
-            let slot_end = slot_start + 1;
-            Ok(Some((pid, slots[slot_end].unwrap().get())))
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)
+                .map(|hm| (hm, hm.offset())))
        })
        // OK because the PikeVM never errors.
        .unwrap()
    }
 
    /// The implementation for the 'which_overlapping_matches' API. Basically,
@@ -1235,7 +1226,7 @@ impl PikeVM {
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
        cache.setup_search(slots.len());
        if input.is_done() {
            return None;
        }
@@ -1264,7 +1255,7 @@ impl PikeVM {
        let pre =
            if anchored { None } else { self.get_config().get_prefilter() };
        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
-        let mut pid = None;
+        let mut hm = None;
        // Yes, our search doesn't end at input.end(), but includes it. This
        // is necessary because matches are delayed by one byte, just like
        // how the DFA engines work. The delay is used to handle look-behind
@@ -1283,7 +1274,7 @@ impl PikeVM {
            if curr.set.is_empty() {
                // We have a match and we haven't been instructed to continue
                // on even after finding a match, so we can quit.
-                if pid.is_some() && !allmatches {
+                if hm.is_some() && !allmatches {
                    break;
                }
                // If we're running an anchored search and we've advanced
@@ -1353,7 +1344,7 @@ impl PikeVM {
            // search. If we re-computed it at every position, we would be
            // simulating an unanchored search when we were tasked to perform
            // an anchored search.
-            if (!pid.is_some() || allmatches)
+            if (!hm.is_some() || allmatches)
                && (!anchored || at == input.start())
            {
                // Since we are adding to the 'curr' active states and since
@@ -1372,14 +1363,15 @@ impl PikeVM {
                let slots = next.slot_table.all_absent();
                self.epsilon_closure(stack, slots, curr, input, at, start_id);
            }
-            if let Some(x) = self.nexts(stack, curr, next, input, at, slots) {
-                pid = Some(x);
+            if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
+            {
+                hm = Some(HalfMatch::new(pid, at));
            }
            // Unless the caller asked us to return early, we need to mush on
            // to see if we can extend our match. (But note that 'nexts' will
            // quit right after seeing a match when match_kind==LeftmostFirst,
            // as is consistent with leftmost-first match priority.)
- if input.get_earliest() && pid.is_some() { + if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); @@ -1387,7 +1379,7 @@ impl PikeVM { at += 1; } instrument!(|c| c.eprint(&self.nfa)); - pid + hm } /// The implementation for the 'which_overlapping_matches' API. Basically, From 4f3390c55e98ae2d09312bc27e47216c7a19490c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:31:40 -0400 Subject: [PATCH 022/136] changelog: 1.9.2 --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6a2bcb41..06383f641 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +1.9.2 (2023-08-05) +================== +This is a patch release that fixes another memory usage regression. This +particular regression occurred only when using a `RegexSet`. In some cases, +much more heap memory (by one or two orders of magnitude) was allocated than in +versions prior to 1.9.0. + +Bug fixes: + +* [BUG #1059](https://github.com/rust-lang/regex/issues/1059): +Fix a memory usage regression when using a `RegexSet`. + + 1.9.1 (2023-07-07) ================== This is a patch release which fixes a memory usage regression. In the regex From 2f5bdb07974e037fdd61883fac83942c68b60512 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:31:50 -0400 Subject: [PATCH 023/136] regex-automata-0.3.5 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 1f423c605..b403d8250 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.4" #:version +version = "0.3.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 8c01708a042399d14638dda5112469235c75f40a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:32:15 -0400 Subject: [PATCH 024/136] deps: bump regex-automata to 0.3.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index bfd6aea61..1119eca99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,7 +173,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.1" +version = "0.3.5" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From bbf0b38df618734b92d7b92acc8a8bf31b6d0046 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:32:22 -0400 Subject: [PATCH 025/136] 1.9.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1119eca99..54b0e206e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.1" #:version +version = "1.9.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 73f7889021542ea80937b3adacefa5825eaa97fe Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 17:25:31 -0400 Subject: [PATCH 026/136] automata: fix incorrect offsets reported by reverse inner optimization Sadly it seems that my days of squashing optimization bugs are still before me. 
In this particular case, the reverse inner literal optimization (which
is a new optimization introduced in regex 1.9) resulted in reporting
incorrect match offsets in some cases. The offending case here is:

    $ regex-cli find match meta --no-table -p '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})' -y '888:77:66'
    0:1:9:888:77:66

The above reports a match at 1..9, but the correct match is 0..9.

The problem here is that the reverse inner literal optimization is
being applied, which splits the regex into three (conceptual) pieces:

1. `(?:(\d+)[:.])?(\d{1,2})`
2. `[:.]`
3. `(\d{2})`

The reverse inner optimization works by looking for occurrences of (2)
first, then matching (1) in reverse to find the start position of the
match and then searching for (3) in the forward direction to find the
end of the match.

The problem in this particular case is that (2) matches at position `3`
in the `888:77:66` haystack. Since the first section of numbers is
optional, the reverse inner optimization believes a match exists at
offset `1` by virtue of matching (1) in reverse. That is, the
`(\d{1,2})` matches at 1..3 while the `(?:(\d+)[:.])?` doesn't match at
all. The reverse search here is correct in isolation, but it leads to
an overall incorrect result by stopping the search early. The issue is
that the true leftmost match requires (2) to match at 6..7, but since
it matched at 3..4 first, it is considered first and leads to an
incorrect overall match.

To fix this, we add another "trip wire" to the reverse inner
optimization (of which there are already several) that tries to detect
cases where it cannot prove that the match it found is actually the
leftmost match. Namely, if it reports a match offset greater than the
start of the search and otherwise *could* have kept searching, then we
don't know whether we have the true leftmost match. In that case, we
bail on the optimization and let a slower path take over.

This is yet another example of how the nature of regex searching, and
in particular leftmost searching, inhibits the composition of different
regex strategies. Or at least, makes them incredibly subtle.

Fixes #1060
---
 regex-automata/src/meta/limited.rs | 47 ++++++++++++++++++++++++++++++
 testdata/regression.toml           | 17 +++++++++++
 2 files changed, 64 insertions(+)

diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs
index 005878acd..192a2625e 100644
--- a/regex-automata/src/meta/limited.rs
+++ b/regex-automata/src/meta/limited.rs
@@ -88,7 +88,41 @@ pub(crate) fn dfa_try_search_half_rev(
             return Err(RetryError::Quadratic(RetryQuadraticError::new()));
         }
     }
+    let was_dead = dfa.is_dead_state(sid);
     dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
+    // If we reach the beginning of the search and we could otherwise still
+    // potentially keep matching if there was more to match, then we actually
+    // return an error to indicate giving up on this optimization. Why?
+    // Because we can't prove that the real match begins where we would
+    // report it.
+    //
+    // This only happens when all of the following are true:
+    //
+    // 1) We reach the starting point of our search span.
+    // 2) The match we found has an offset greater than the starting point.
+    // 3) The FSM reports we could possibly find a longer match.
+    //
+    // We need (1) because otherwise the search stopped before the starting
+    // point and there is no possible way to find a more leftmost position.
+    //
+    // We need (2) because if the match found has an offset equal to the
+    // minimum possible offset, then no more leftmost match is possible.
+    //
+    // We need (3) because if the FSM couldn't continue anyway (i.e., it's in
+    // a dead state), then we know we couldn't find anything more leftmost
+    // than what we have. (We have to check the state we were in prior to the
+    // EOI transition since the EOI transition will usually bring us to a dead
+    // state by virtue of it representing the end-of-input.)
+    if at == input.start()
+        && mat.map_or(false, |m| m.offset() > input.start())
+        && !was_dead
+    {
+        trace!(
+            "reached beginning of search at offset {} without hitting \
+             a dead state, quitting to avoid potential false positive match",
+            at,
+        );
+        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+    }
     Ok(mat)
 }
 
@@ -140,7 +174,20 @@ pub(crate) fn hybrid_try_search_half_rev(
             return Err(RetryError::Quadratic(RetryQuadraticError::new()));
         }
     }
+    let was_dead = sid.is_dead();
     hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+    // See the comments in the full DFA routine above for why we need this.
+    if at == input.start()
+        && mat.map_or(false, |m| m.offset() > input.start())
+        && !was_dead
+    {
+        trace!(
+            "reached beginning of search at offset {} without hitting \
+             a dead state, quitting to avoid potential false positive match",
+            at,
+        );
+        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+    }
     Ok(mat)
 }
 
diff --git a/testdata/regression.toml b/testdata/regression.toml
index bb5e4fd46..a2efa2ad3 100644
--- a/testdata/regression.toml
+++ b/testdata/regression.toml
@@ -739,3 +739,20 @@ matches = [[0, 9]]
 utf8 = false
 match-kind = "all"
 search-kind = "overlapping"
+
+# See: https://github.com/rust-lang/regex/issues/1060
+[[test]]
+name = "reverse-inner-plus-shorter-than-expected"
+regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})'
+haystack = '102:12:39'
+matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
+
+# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex
+# to demonstrate the extent of the rot. Sigh.
+#
+# See: https://github.com/rust-lang/regex/issues/1060
+[[test]]
+name = "reverse-inner-short"
+regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
+haystack = '102:12:39'
+matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]

From 3d21492e399d77415ebcd2eee4432e2feab87893 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 5 Aug 2023 18:18:48 -0400
Subject: [PATCH 027/136] changelog: 1.9.3

---
 CHANGELOG.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06383f641..764bb11b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,18 @@
+1.9.3 (2023-08-05)
+==================
+This is a patch release that fixes a bug where some searches could result in
+incorrect match offsets being reported. It is difficult to characterize the
+types of regexes susceptible to this bug. They generally involve patterns
+that contain no prefix or suffix literals, but have an inner literal along with
+a regex prefix that can conditionally match.
+
+Bug fixes:
+
+* [BUG #1060](https://github.com/rust-lang/regex/issues/1060):
+Fix a bug with the reverse inner literal optimization reporting incorrect match
+offsets.
+
+
 1.9.2 (2023-08-05)
 ==================
 This is a patch release that fixes another memory usage regression.
This From c892d08c7c9ccdef4278ebbe30b5a83f0a145780 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 18:19:39 -0400 Subject: [PATCH 028/136] regex-automata-0.3.6 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index b403d8250..22af1d9a3 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.5" #:version +version = "0.3.6" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 446ecd6154854274c70b015e6c2718cdf2f48c57 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 18:20:04 -0400 Subject: [PATCH 029/136] deps: bump regex-automata to 0.3.6 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 54b0e206e..e056c5bb7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,7 +173,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.5" +version = "0.3.6" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 72f889ef3cca59ebac6a026f3646e8d92f056d88 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 18:20:06 -0400 Subject: [PATCH 030/136] 1.9.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e056c5bb7..cd2e30a5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.2" #:version +version = "1.9.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 431c4e4867e1eb33eb39b23ed47c9934b2672f8f Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Mon, 7 Aug 2023 14:48:08 +0200 Subject: [PATCH 031/136] doc: fix typo in captures_read PR #1064 --- regex-lite/src/string.rs | 4 ++-- src/regex/bytes.rs | 4 ++-- src/regex/string.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 91b81d008..1c6eb4ab9 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -1186,8 +1186,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Panics /// diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 6522ee7e3..03982544b 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1162,8 +1162,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. 
/// /// # Example /// diff --git a/src/regex/string.rs b/src/regex/string.rs index 65a76740e..b9a3c3390 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1153,8 +1153,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Panics /// From 10faa44da9134c053c28a55857068909ca29a452 Mon Sep 17 00:00:00 2001 From: Gold Edem Hogan Date: Wed, 23 Aug 2023 11:29:59 +0000 Subject: [PATCH 032/136] doc: fix a couple typos PR #1068 --- src/regex/bytes.rs | 2 +- src/regex/string.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 03982544b..cc53482cb 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1154,7 +1154,7 @@ impl Regex { /// /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], /// but does *not* store a reference to the haystack. This makes its API - /// a bit lower level and less convenience. But in exchange, callers + /// a bit lower level and less convenient. But in exchange, callers /// may allocate their own `CaptureLocations` and reuse it for multiple /// searches. This may be helpful if allocating a `Captures` shows up in a /// profile as too costly. diff --git a/src/regex/string.rs b/src/regex/string.rs index b9a3c3390..d5908ae0d 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1145,7 +1145,7 @@ impl Regex { /// /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], /// but does *not* store a reference to the haystack. This makes its API - /// a bit lower level and less convenience. But in exchange, callers + /// a bit lower level and less convenient. But in exchange, callers /// may allocate their own `CaptureLocations` and reuse it for multiple /// searches. This may be helpful if allocating a `Captures` shows up in a /// profile as too costly. From 81e328a29f8c57cb3622930104177e8606270230 Mon Sep 17 00:00:00 2001 From: Xy Qian <102588769+qianxyz@users.noreply.github.com> Date: Wed, 23 Aug 2023 19:40:27 -0700 Subject: [PATCH 033/136] doc: fix typo in module-level doc PR #1069 --- regex-lite/src/lib.rs | 2 +- src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index d8e901678..8008b9e59 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -107,7 +107,7 @@ fn main() { } ``` -Foruth, run it with `cargo run`: +Fourth, run it with `cargo run`: ```text $ cargo run diff --git a/src/lib.rs b/src/lib.rs index cd98be103..1e191b692 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -109,7 +109,7 @@ fn main() { } ``` -Foruth, run it with `cargo run`: +Fourth, run it with `cargo run`: ```text $ cargo run From 7536e055840f74f1f7bda8ffecf851cb3e500147 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 08:14:50 -0400 Subject: [PATCH 034/136] syntax: remove superfluous `borrow` Best guess is that the parser used to use something other than a `&str`, but I can't remember. 
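For the record, the call was an identity conversion either way. A
minimal sketch of why (the `ParserLike` type here is hypothetical, not
the real parser type):

    use std::borrow::Borrow;

    struct ParserLike<'s> {
        pattern: &'s str,
    }

    impl<'s> ParserLike<'s> {
        fn pattern(&self) -> &str {
            // `&str` implements `Borrow<str>` as an identity-style
            // conversion, so this is equivalent to returning
            // `self.pattern` directly.
            self.pattern.borrow()
        }
    }

    let p = ParserLike { pattern: "a|b" };
    assert_eq!("a|b", p.pattern());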
---
 regex-syntax/src/ast/parse.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index 9cf64e9ec..47ea2586b 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -383,7 +383,7 @@ impl<'s, P: Borrow<str>> ParserI<'s, P> {
 
     /// Return a reference to the pattern being parsed.
     fn pattern(&self) -> &str {
-        self.pattern.borrow()
+        self.pattern
     }
 
     /// Create a new error with the given span and error type.

From de0339959b491ae0a26e6f96c0b0dc1635bc0f94 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 26 Aug 2023 08:48:26 -0400
Subject: [PATCH 035/136] automata: fix incorrect use of Aho-Corasick's
 "standard" semantics

This fixes a bug in how prefilters were applied for multi-regexes
compiled with "all" semantics. It turns out that this corresponds to
the regex crate's RegexSet API, but only its `is_match` routine.

See the comment on the regression test added in this PR for an
explanation of what happened. Basically, it came down to incorrectly
using Aho-Corasick's "standard" semantics, which doesn't necessarily
report leftmost matches. Since the regex crate is really all about
leftmost matching, this can lead to skipping over parts of the haystack
and thus lead to missing matches.

Fixes #1070
---
 .../src/util/prefilter/aho_corasick.rs     | 13 ++++++++--
 regex-automata/src/util/prefilter/mod.rs   |  9 -------
 regex-automata/src/util/prefilter/teddy.rs |  9 +++++--
 testdata/regression.toml                   | 26 +++++++++++++++++++
 4 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/regex-automata/src/util/prefilter/aho_corasick.rs b/regex-automata/src/util/prefilter/aho_corasick.rs
index a7474d29a..50cce827e 100644
--- a/regex-automata/src/util/prefilter/aho_corasick.rs
+++ b/regex-automata/src/util/prefilter/aho_corasick.rs
@@ -22,11 +22,20 @@ impl AhoCorasick {
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
+            // We used to use `aho_corasick::MatchKind::Standard` here when
+            // `kind` was `MatchKind::All`, but this is not correct. The
+            // "standard" Aho-Corasick match semantics are to report a match
+            // immediately as soon as it is seen, but `All` isn't like that.
+            // In particular, with "standard" semantics, given the needles
+            // "abc" and "b" and the haystack "abc," it would report a match
+            // at offset 1 before a match at offset 0. This is never what we
+            // want in the context of the regex engine, regardless of whether
+            // we have leftmost-first or 'all' semantics. Namely, we always
+            // want the leftmost match.
             let ac_match_kind = match kind {
-                MatchKind::LeftmostFirst => {
+                MatchKind::LeftmostFirst | MatchKind::All => {
                     aho_corasick::MatchKind::LeftmostFirst
                 }
-                MatchKind::All => aho_corasick::MatchKind::Standard,
             };
             // This is kind of just an arbitrary number, but basically, if we
             // have a small enough set of literals, then we try to use the VERY
diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs
index ea3eb73d8..51fc92233 100644
--- a/regex-automata/src/util/prefilter/mod.rs
+++ b/regex-automata/src/util/prefilter/mod.rs
@@ -195,15 +195,6 @@ impl Prefilter {
     ///     Some(Span::from(6..9)),
     ///     pre.find(hay.as_bytes(), Span::from(0..hay.len())),
     /// );
-    /// // Now we put 'samwise' back before 'sam', but change the match
-    /// // semantics to 'All'. In this case, there is no preference
-    /// // order semantics and the first match detected is returned.
-    /// let pre = Prefilter::new(MatchKind::All, &["samwise", "sam"])
-    ///     .expect("a prefilter");
-    /// assert_eq!(
-    ///     Some(Span::from(6..9)),
-    ///     pre.find(hay.as_bytes(), Span::from(0..hay.len())),
-    /// );
     ///
     /// # Ok::<(), Box<dyn std::error::Error>>(())
     /// ```
diff --git a/regex-automata/src/util/prefilter/teddy.rs b/regex-automata/src/util/prefilter/teddy.rs
index 02210a5ec..fc79f2b2f 100644
--- a/regex-automata/src/util/prefilter/teddy.rs
+++ b/regex-automata/src/util/prefilter/teddy.rs
@@ -50,12 +50,17 @@ impl Teddy {
         // theory we could at least support leftmost-longest, as the
         // aho-corasick crate does, but regex-automata doesn't know about
         // leftmost-longest currently.
+        //
+        // And like the aho-corasick prefilter, if we're using `All`
+        // semantics, then we can still use leftmost semantics for a
+        // prefilter. (This might be a suspicious choice for the literal
+        // engine, which uses a prefilter as a regex engine directly, but
+        // that only happens when using leftmost-first semantics.)
         let (packed_match_kind, ac_match_kind) = match kind {
-            MatchKind::LeftmostFirst => (
+            MatchKind::LeftmostFirst | MatchKind::All => (
                 aho_corasick::packed::MatchKind::LeftmostFirst,
                 aho_corasick::MatchKind::LeftmostFirst,
             ),
-            _ => return None,
         };
         let minimum_len =
             needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
diff --git a/testdata/regression.toml b/testdata/regression.toml
index a2efa2ad3..03b15d6d5 100644
--- a/testdata/regression.toml
+++ b/testdata/regression.toml
@@ -756,3 +756,29 @@ name = "reverse-inner-short"
 regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
 haystack = '102:12:39'
 matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
+
+# This regression test was found via the RegexSet APIs. It triggered a
+# particular code path where a regex was compiled with 'All' match semantics
+# (to support overlapping search), but got funneled down into a standard
+# leftmost search when calling 'is_match'. This is fine on its own, but the
+# leftmost search will use a prefilter and that's where this went awry.
+#
+# Namely, since 'All' semantics were used, the aho-corasick prefilter was
+# incorrectly compiled with 'Standard' semantics. This was wrong because
+# 'Standard' immediately attempts to report a match at every position, even if
+# that would mean reporting a match past the leftmost match before reporting
+# the leftmost match. This breaks the prefilter contract of never having false
+# negatives and leads overall to the engine not finding a match.
+#
+# See: https://github.com/rust-lang/regex/issues/1070
+[[test]]
+name = "prefilter-with-aho-corasick-standard-semantics"
+regex = '(?m)^ *v [0-9]'
+haystack = 'v 0'
+matches = [
+  { id = 0, spans = [[0, 3]] },
+]
+match-kind = "all"
+search-kind = "overlapping"
+unicode = true
+utf8 = true

From c788378d6fe407f4774df98a78436cea5d98525b Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 26 Aug 2023 09:18:30 -0400
Subject: [PATCH 036/136] ci: drop mips, add powerpc and s390x

The main reason we used mips before was to get test coverage on a big
endian target. Now that mips no longer seems to work[1], I wanted to
add at least one other big endian target. From the tier 2 supported
platforms[2], the only big endian targets I could find were powerpc and
s390x. So we just add both here.
[1]: https://github.com/rust-lang/compiler-team/issues/648 [2]: https://doc.rust-lang.org/nightly/rustc/platform-support.html#tier-2-with-host-tools --- .github/workflows/ci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25df2b301..2035178a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,10 +54,14 @@ jobs: os: ubuntu-latest rust: stable target: i686-unknown-linux-gnu - - build: stable-mips + - build: stable-powerpc64 os: ubuntu-latest rust: stable - target: mips64-unknown-linux-gnuabi64 + target: powerpc64-unknown-linux-gnu + - build: stable-s390x + os: ubuntu-latest + rust: stable + target: s390x-unknown-linux-gnu - build: beta os: ubuntu-latest rust: beta From e008f83090c1fa858cd602da94e9ffdf38fd7317 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:50:22 -0400 Subject: [PATCH 037/136] changelog: 1.9.4 --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 764bb11b8..a5f218010 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +1.9.4 (2023-08-26) +================== +This is a patch release that fixes a bug where `RegexSet::is_match(..)` could +incorrectly return false (even when `RegexSet::matches(..).matched_any()` +returns true). + +Bug fixes: + +* [BUG #1070](https://github.com/rust-lang/regex/issues/1070): +Fix a bug where a prefilter was incorrectly configured for a `RegexSet`. + + 1.9.3 (2023-08-05) ================== This is a patch release that fixes a bug where some searches could result in From 990979bbdc28fa841e3ad55934ee445cd710d110 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:50:33 -0400 Subject: [PATCH 038/136] regex-syntax-0.7.5 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index b7a149c23..aaceeee7f 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.4" #:version +version = "0.7.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 89b452af302a00458a129f8f40f3b65daf7a278a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:50:44 -0400 Subject: [PATCH 039/136] regex-automata-0.3.7 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 22af1d9a3..d069b176e 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.6" #:version +version = "0.3.7" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 3f15f1cf355577fe369c15ce60e1d225a163bf29 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:51:18 -0400 Subject: [PATCH 040/136] deps: bump regex-syntax and regex-automata versions --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cd2e30a5d..e8e1608ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,14 +173,14 @@ optional = true # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.3.6" +version = "0.3.7" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.7.3" +version = "0.7.5" default-features = false [dev-dependencies] From f39ab4d1b7229924f0cf310c9f3e19822fa19b8a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:51:37 -0400 Subject: [PATCH 041/136] 1.9.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e8e1608ec..0675337d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.3" #:version +version = "1.9.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 2637e11b9fb9f3dc8bbfc3cbc625fd454c091d04 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 14:32:11 -0400 Subject: [PATCH 042/136] ci: remove stale comment --- .github/workflows/ci.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2035178a4..1efa31f07 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -96,12 +96,6 @@ jobs: cd "$dir" curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" tar xf cross-x86_64-unknown-linux-musl.tar.gz - - # We used to install 'cross' from master, but it kept failing. So now - # we build from a known-good version until 'cross' becomes more stable - # or we find an alternative. Notably, between v0.2.1 and current - # master (2022-06-14), the number of Cross's dependencies has doubled. - # cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1 echo "CARGO=cross" >> $GITHUB_ENV echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV - name: Show command used for Cargo From 329c6a32451434fc2f229ad8d3c934c70148ae45 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 14:40:25 -0400 Subject: [PATCH 043/136] ci: use dtolnay@master instead of @v1 I believe dtolnay corrected me on this a while ago, but either the change got reverted or it was for some other project. In any case, we should use @master so we get the latest updates. 
--- .github/workflows/ci.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1efa31f07..c2a38d6d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,7 +81,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} - name: Install and configure Cross @@ -139,7 +139,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: 1.60.0 - name: Basic build @@ -160,7 +160,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -173,7 +173,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -186,7 +186,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -199,7 +199,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -214,7 +214,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: # We use nightly here so that we can use miri I guess? # It caught me by surprise that miri seems to only be @@ -231,7 +231,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable components: rustfmt From 15cdc64869ea5508d96a0e7667c44c7c459986a1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 27 Aug 2023 16:38:25 -0400 Subject: [PATCH 044/136] cli: remove use of deprecated API I deprecated this API a couple releases ago. Update the `regex-cli` tool to be in line with that. --- regex-cli/args/thompson.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 6e7b4afd8..151fc6a0b 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -28,7 +28,11 @@ impl Config { pub fn reversed(&self) -> Config { // Reverse DFAs require that captures are disabled. In practice, there // is no current use case for a reverse NFA with capture groups. 
-        let thompson = self.thompson.clone().reverse(true).captures(false);
+        let thompson = self
+            .thompson
+            .clone()
+            .reverse(true)
+            .which_captures(thompson::WhichCaptures::None);
         Config { thompson }
     }
 
@@ -67,7 +71,10 @@ impl Configurable for Config {
                 self.thompson = self.thompson.clone().shrink(true);
             }
             Arg::Long("no-captures") => {
-                self.thompson = self.thompson.clone().captures(false);
+                self.thompson = self
+                    .thompson
+                    .clone()
+                    .which_captures(thompson::WhichCaptures::None);
             }
             Arg::Long("line-terminator") => {
                 let byte: flags::OneByte =

From 9a505a1804f8f89e3448a2a2c5c70573dc6362e5 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 27 Aug 2023 16:39:04 -0400
Subject: [PATCH 045/136] deps: bump to memchr 2.6

This bumps the minimum memchr version to 2.6, which brings in massive
improvements to aarch64 for single substring search [1]. We also can now
enable the new `alloc` feature in `memchr` when `alloc` is enabled for
`regex` and `regex-automata`.

We also squash some warnings.

[1]: https://github.com/BurntSushi/memchr/pull/129
---
 Cargo.toml                                | 3 ++-
 fuzz/fuzz_targets/ast_fuzz_match.rs       | 9 +++++----
 fuzz/fuzz_targets/ast_fuzz_match_bytes.rs | 9 +++++----
 regex-automata/Cargo.toml                 | 4 ++--
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0675337d7..4cc42b6cd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -52,6 +52,7 @@ std = [
 # to actually emit the log messages somewhere.
 logging = [
   "aho-corasick?/logging",
+  "memchr?/logging",
   "regex-automata/logging",
 ]
 # The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
@@ -167,7 +168,7 @@ optional = true
 
 # For skipping along search text quickly when a leading byte is known.
 [dependencies.memchr]
-version = "2.5.0"
+version = "2.6.0"
 optional = true
 
 # For the actual regex engines.
diff --git a/fuzz/fuzz_targets/ast_fuzz_match.rs b/fuzz/fuzz_targets/ast_fuzz_match.rs index 58a8ebbf8..9ccb407dc 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match.rs @@ -25,11 +25,12 @@ fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); - let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { + let Ok(re) = RegexBuilder::new(&pattern).size_limit(1 << 20).build() + else { return Corpus::Reject; }; - re.is_match(&data.haystack); - re.find(&data.haystack); - re.captures(&data.haystack).map_or(0, |c| c.len()); + let _ = re.is_match(&data.haystack); + let _ = re.find(&data.haystack); + let _ = re.captures(&data.haystack).map_or(0, |c| c.len()); Corpus::Keep }); diff --git a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs index a4fa0bd73..045c1fb18 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs @@ -25,11 +25,12 @@ fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); - let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { + let Ok(re) = RegexBuilder::new(&pattern).size_limit(1 << 20).build() + else { return Corpus::Reject; }; - re.is_match(&data.haystack); - re.find(&data.haystack); - re.captures(&data.haystack).map_or(0, |c| c.len()); + let _ = re.is_match(&data.haystack); + let _ = re.find(&data.haystack); + let _ = re.captures(&data.haystack).map_or(0, |c| c.len()); Corpus::Keep }); diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index d069b176e..3cd9965b0 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -21,7 +21,7 @@ bench = false default = ["std", "syntax", "perf", "unicode", "meta", "nfa", "dfa", "hybrid"] std = ["regex-syntax?/std", "memchr?/std", "aho-corasick?/std", "alloc"] alloc = [] -logging = ["dep:log", "aho-corasick?/logging"] +logging = ["dep:log", "aho-corasick?/logging", "memchr?/logging"] syntax = ["dep:regex-syntax", "alloc"] @@ -84,7 +84,7 @@ internal-instrument-pikevm = ["logging", "std"] [dependencies] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } -memchr = { version = "2.5.0", optional = true, default-features = false } +memchr = { version = "2.6.0", optional = true, default-features = false } regex-syntax = { path = "../regex-syntax", version = "0.7.4", optional = true, default-features = false } [dev-dependencies] From f578d74ff42f3df408378ff52d3bdf4433423532 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 30 Aug 2023 18:28:06 -0400 Subject: [PATCH 046/136] automata: reduce regex contention somewhat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > **Context:** A `Regex` uses internal mutable space (called a `Cache`) > while executing a search. Since a `Regex` really wants to be easily > shared across multiple threads simultaneously, it follows that a > `Regex` either needs to provide search functions that accept a `&mut > Cache` (thereby pushing synchronization to a problem for the caller > to solve) or it needs to do synchronization itself. While there are > lower level APIs in `regex-automata` that do the former, they are > less convenient. The higher level APIs, especially in the `regex` > crate proper, need to do some kind of synchronization to give a > search the mutable `Cache` that it needs. 
>
> The current approach to that synchronization essentially uses a
> `Mutex<Vec<T>>` with an optimization for the "owning" thread
> that lets it bypass the `Mutex`. The owning thread optimization
> makes it so the single threaded use case essentially doesn't pay for
> any synchronization overhead, and that all works fine. But once the
> `Regex` is shared across multiple threads, that `Mutex<Vec<T>>`
> gets hit. And if you're doing a lot of regex searches on short
> haystacks in parallel, that `Mutex` comes under extremely heavy
> contention. To the point that a program can slow down by enormous
> amounts.
>
> This PR attempts to address that problem.
>
> Note that it's worth pointing out that this issue can be worked
> around.
>
> The simplest work-around is to clone a `Regex` and send it to other
> threads instead of sharing a single `Regex`. This won't use any
> additional memory (a `Regex` is reference counted internally),
> but it will force each thread to use the "owner" optimization
> described above. This does mean, for example, that you can't
> share a `Regex` across multiple threads conveniently with a
> `lazy_static`/`OnceCell`/`OnceLock`/whatever.
>
> The other work-around is to use the lower level search APIs on a
> `meta::Regex` in the `regex-automata` crate. Those APIs accept a
> `&mut Cache` explicitly. In that case, you can use the `thread_local`
> crate or even an actual `thread_local!` or something else entirely.

I wish I could say this PR was a home run that fixed the contention
issues with `Regex` once and for all, but it's not. It just makes
things a fair bit better by switching from one stack to eight stacks
for the pool, plus a couple other heuristics. The stack is chosen
by doing `self.stacks[thread_id % 8]`. It's a pretty dumb strategy,
but it limits extra memory usage while at least reducing contention.
Obviously, it works a lot better for the 8-16 thread case, and while
it helps with the 64-128 thread case too, things are still pretty slow
there.

A benchmark for this problem is described in #934. We compare 8 and
16 threads, and for each thread count, we compare a `cloned` and
`shared` approach. The `cloned` approach clones the regex before
sending it to each thread, whereas the `shared` approach shares a
single regex across multiple threads. The `cloned` approach is
expected to be fast (and it is) because it forces each thread into the
owner optimization. The `shared` approach, however, hits the shared
stack behind a mutex and suffers majorly from contention.

Here's what that benchmark looks like before this PR for 64 threads
(on a 24-core CPU).
``` $ hyperfine "REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro" "REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./tmp/repro-master" Benchmark 1: REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro Time (mean ± σ): 9.0 ms ± 0.6 ms [User: 128.3 ms, System: 5.7 ms] Range (min … max): 7.7 ms … 11.1 ms 278 runs Benchmark 2: REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./tmp/repro-master Time (mean ± σ): 1.938 s ± 0.036 s [User: 4.827 s, System: 41.401 s] Range (min … max): 1.885 s … 1.992 s 10 runs Summary 'REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro' ran 215.02 ± 15.45 times faster than 'REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./tmp/repro-master' ``` And here's what it looks like after this PR: ``` $ hyperfine "REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro" "REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./target/release/repro" Benchmark 1: REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro Time (mean ± σ): 9.0 ms ± 0.6 ms [User: 127.6 ms, System: 6.2 ms] Range (min … max): 7.9 ms … 11.7 ms 287 runs Benchmark 2: REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./target/release/repro Time (mean ± σ): 55.0 ms ± 5.1 ms [User: 1050.4 ms, System: 12.0 ms] Range (min … max): 46.1 ms … 67.3 ms 57 runs Summary 'REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro' ran 6.09 ± 0.71 times faster than 'REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./target/release/repro' ``` So instead of things getting over 215x slower in the 64 thread case, it "only" gets 6x slower. Closes #934 --- regex-automata/src/util/pool.rs | 187 ++++++++++++++++++++++++++++---- 1 file changed, 168 insertions(+), 19 deletions(-) diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index 7f4a1c21e..c03d7b013 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -268,6 +268,64 @@ mod inner { /// do. static THREAD_ID_DROPPED: usize = 2; + /// The number of stacks we use inside of the pool. These are only used for + /// non-owners. That is, these represent the "slow" path. + /// + /// In the original implementation of this pool, we only used a single + /// stack. While this might be okay for a couple threads, the prevalence of + /// 32, 64 and even 128 core CPUs has made it untenable. The contention + /// such an environment introduces when threads are doing a lot of searches + /// on short haystacks (a not uncommon use case) is palpable and leads to + /// huge slowdowns. + /// + /// This constant reflects a change from using one stack to the number of + /// stacks that this constant is set to. The stack for a particular thread + /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind + /// this setup is that there should be a good chance that accesses to the + /// pool will be distributed over several stacks instead of all of them + /// converging to one. + /// + /// This is not a particularly smart or dynamic strategy. Fixing this to a + /// specific number has at least two downsides. First is that it will help, + /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially, + /// it will still help the 128 core case.) Second is that this may wind + /// up being a little wasteful with respect to memory usage. 
Namely, if a
+    /// regex is used on one thread and then moved to another thread, then it
+    /// could result in creating a new copy of the data in the pool even though
+    /// only one is actually needed.
+    ///
+    /// And that memory usage bit is why this is set to 8 and not, say, 64.
+    /// Keeping it at 8 limits, to an extent, how much unnecessary memory can
+    /// be allocated.
+    ///
+    /// In an ideal world, we'd be able to have something like this:
+    ///
+    /// * Grow the number of stacks as the number of concurrent callers
+    /// increases. I spent a little time trying this, but even just adding an
+    /// atomic addition/subtraction for each pop/push for tracking concurrent
+    /// callers led to a big perf hit. Since even more work would seemingly be
+    /// required than just an addition/subtraction, I abandoned this approach.
+    /// * The maximum amount of memory used should scale with respect to the
+    /// number of concurrent callers and *not* the total number of existing
+    /// threads. This is primarily why the `thread_local` crate isn't used,
+    /// as some environments spin up a lot of threads. This led to multiple
+    /// reports of extremely high memory usage (often described as memory
+    /// leaks).
+    /// * Even more ideally, the pool should contract in size. That is, it
+    /// should grow with bursts and then shrink. But this is a pretty thorny
+    /// issue to tackle and it might be better to just not.
+    /// * It would be nice to explore the use of, say, a lock-free stack
+    /// instead of using a mutex to guard a `Vec` that is ultimately just
+    /// treated as a stack. The main thing preventing me from exploring this
+    /// is the ABA problem. The `crossbeam` crate has tools for dealing with
+    /// this sort of problem (via its epoch based memory reclamation strategy),
+    /// but I can't justify bringing in all of `crossbeam` as a dependency of
+    /// `regex` for this.
+    ///
+    /// See this issue for more context and discussion:
+    /// https://github.com/rust-lang/regex/issues/934
+    const MAX_POOL_STACKS: usize = 8;
+
     thread_local!(
         /// A thread local used to assign an ID to a thread.
         static THREAD_ID: usize = {
@@ -291,6 +349,17 @@ mod inner {
         };
     );
 
+    /// This puts each stack in the pool below into its own cache line. This
+    /// is an absolutely critical optimization that tends to have the most
+    /// impact in high contention workloads. Without forcing each
+    /// mutex-protected stack into its own cache line, high contention
+    /// exacerbates the performance problem by causing "false sharing." By
+    /// putting each mutex in its own cache-line, we avoid the false sharing
+    /// problem and the effects of contention are greatly reduced.
+    #[derive(Debug)]
+    #[repr(C, align(64))]
+    struct CacheLine<T>(T);
+
     /// A thread safe pool utilizing std-only features.
     ///
     /// The main difference between this and the simplistic alloc-only pool is
@@ -299,12 +368,16 @@ mod inner {
     /// This makes the common case of running a regex within a single thread
     /// faster by avoiding mutex unlocking.
     pub(super) struct Pool<T, F> {
-        /// A stack of T values to hand out. These are used when a Pool is
-        /// accessed by a thread that didn't create it.
-        stack: Mutex<Vec<Box<T>>>,
         /// A function to create more T values when stack is empty and a caller
         /// has requested a T.
         create: F,
+        /// Multiple stacks of T values to hand out. These are used when a Pool
+        /// is accessed by a thread that didn't create it.
+        ///
+        /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make
+        /// it scale better under high contention work-loads. We index into
+        /// this sequence via `thread_id % stacks.len()`.
+        stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>,
         /// The ID of the thread that owns this pool. The owner is the thread
         /// that makes the first call to 'get'. When the owner calls 'get', it
         /// gets 'owner_val' directly instead of returning a T from 'stack'.
@@ -354,9 +427,17 @@ unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {}
 
     // If T is UnwindSafe, then since we provide exclusive access to any
-    // particular value in the pool, it should therefore also be considered
-    // RefUnwindSafe. Also, since we use std::sync::Mutex, we get poisoning
-    // from it if another thread panics while the lock is held.
+    // particular value in the pool, the pool should therefore also be
+    // considered UnwindSafe.
+    //
+    // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+    // point on demand, so it needs to be unwind safe on both dimensions for
+    // the entire Pool to be unwind safe.
+    impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {}
+
+    // If T is UnwindSafe, then since we provide exclusive access to any
+    // particular value in the pool, the pool should therefore also be
+    // considered RefUnwindSafe.
     //
     // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
     // point on demand, so it needs to be unwind safe on both dimensions for
@@ -375,9 +456,13 @@
             // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
             // is already 'const', so that should "just work" too.) The only
             // thing we're waiting for is Mutex::new to be const.
+            let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
+            for _ in 0..stacks.capacity() {
+                stacks.push(CacheLine(Mutex::new(vec![])));
+            }
             let owner = AtomicUsize::new(THREAD_ID_UNOWNED);
             let owner_val = UnsafeCell::new(None); // init'd on first access
-            Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
+            Pool { create, stacks, owner, owner_val }
         }
     }
@@ -401,6 +486,9 @@
             let caller = THREAD_ID.with(|id| *id);
             let owner = self.owner.load(Ordering::Acquire);
             if caller == owner {
+                // N.B. We could also do a CAS here instead of a load/store,
+                // but ad hoc benchmarking suggests it is slower. And a lot
+                // slower in the case where `get_slow` is common.
                 self.owner.store(THREAD_ID_INUSE, Ordering::Release);
                 return self.guard_owned(caller);
             }
@@ -444,37 +532,82 @@
                     return self.guard_owned(caller);
                 }
             }
-            let mut stack = self.stack.lock().unwrap();
-            let value = match stack.pop() {
-                None => Box::new((self.create)()),
-                Some(value) => value,
-            };
-            self.guard_stack(value)
+            let stack_id = caller % self.stacks.len();
+            // We try to acquire exclusive access to this thread's stack, and
+            // if so, grab a value from it if we can. We put this in a loop so
+            // that it's easy to tweak and experiment with a different number
+            // of tries. In the end, I couldn't see anything obviously better
+            // than one attempt in ad hoc testing.
+            for _ in 0..1 {
+                let mut stack = match self.stacks[stack_id].0.try_lock() {
+                    Err(_) => continue,
+                    Ok(stack) => stack,
+                };
+                if let Some(value) = stack.pop() {
+                    return self.guard_stack(value);
+                }
+                // Unlock the mutex guarding the stack before creating a fresh
+                // value since we no longer need the stack.
+                drop(stack);
+                let value = Box::new((self.create)());
+                return self.guard_stack(value);
+            }
+            // We're only here if we couldn't get access to our stack, so just
+            // create a new value. This seems like it could be wasteful, but
+            // waiting for exclusive access to a stack when there's high
+            // contention is brutal for perf.
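+            // Note: the "transient" guard created below will drop its value
+            // when the guard is dropped instead of putting it back into the
+            // pool, so a burst of contention doesn't permanently grow the
+            // pool's memory usage.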
+ self.guard_stack_transient(Box::new((self.create)())) } /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. fn put_value(&self, value: Box) { - let mut stack = self.stack.lock().unwrap(); - stack.push(value); + let caller = THREAD_ID.with(|id| *id); + let stack_id = caller % self.stacks.len(); + // As with trying to pop a value from this thread's stack, we + // merely attempt to get access to push this value back on the + // stack. If there's too much contention, we just give up and throw + // the value away. + // + // Interestingly, in ad hoc benchmarking, it is beneficial to + // attempt to push the value back more than once, unlike when + // popping the value. I don't have a good theory for why this is. + // I guess if we drop too many values then that winds up forcing + // the pop operation to create new fresh values and thus leads to + // less reuse. There's definitely a balancing act here. + for _ in 0..10 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + stack.push(value); + return; + } } /// Create a guard that represents the special owned T. fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { - PoolGuard { pool: self, value: Err(caller) } + PoolGuard { pool: self, value: Err(caller), discard: false } } /// Create a guard that contains a value from the pool's stack. fn guard_stack(&self, value: Box) -> PoolGuard<'_, T, F> { - PoolGuard { pool: self, value: Ok(value) } + PoolGuard { pool: self, value: Ok(value), discard: false } + } + + /// Create a guard that contains a value from the pool's stack with an + /// instruction to throw away the value instead of putting it back + /// into the pool. + fn guard_stack_transient(&self, value: Box) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: true } } } impl core::fmt::Debug for Pool { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Pool") - .field("stack", &self.stack) + .field("stacks", &self.stacks) .field("owner", &self.owner) .field("owner_val", &self.owner_val) .finish() @@ -490,6 +623,12 @@ mod inner { /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the /// guard has been put back into the pool and should no longer be used. value: Result, usize>, + /// When true, the value should be discarded instead of being pushed + /// back into the pool. We tend to use this under high contention, and + /// this allows us to avoid inflating the size of the pool. (Because + /// under contention, we tend to create more values instead of waiting + /// for access to a stack of existing values.) + discard: bool, } impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { @@ -557,7 +696,17 @@ mod inner { #[inline(always)] fn put_imp(&mut self) { match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) { - Ok(value) => self.pool.put_value(value), + Ok(value) => { + // If we were told to discard this value then don't bother + // trying to put it back into the pool. This occurs when + // the pop operation failed to acquire a lock and we + // decided to create a new value in lieu of contending for + // the lock. + if self.discard { + return; + } + self.pool.put_value(value); + } // If this guard has a value "owned" by the thread, then // the Pool guarantees that this is the ONLY such guard. 
                // Therefore, in order to place it back into the pool and make

From 135e11ba9c54b383072ae98043c31dfe1066886a Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 2 Sep 2023 11:12:49 -0400
Subject: [PATCH 047/136] changelog: 1.9.5

---
 CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5f218010..885bb9bd7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,42 @@
+1.9.5 (2023-09-02)
+==================
+This is a patch release that hopefully mostly fixes a performance bug that
+occurs when sharing a regex across multiple threads.
+
+Issue [#934](https://github.com/rust-lang/regex/issues/934)
+explains this in more detail. It is [also noted in the crate
+documentation](https://docs.rs/regex/latest/regex/#sharing-a-regex-across-threads-can-result-in-contention).
+The bug can appear when sharing a regex across multiple threads simultaneously,
+as might be the case when using a regex from a `OnceLock`, `lazy_static` or
+similar primitive. Usually high contention only results when using many threads
+to execute searches on small haystacks.
+
+One can avoid the contention problem entirely through one of two methods.
+The first is to use lower level APIs from `regex-automata` that require passing
+state explicitly, such as [`meta::Regex::search_with`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html#method.search_with).
+The second is to clone a regex and send it to other threads explicitly. This
+will not use any additional memory compared to sharing the regex. The only
+downside of this approach is that it may be less convenient, for example, it
+won't work with things like `OnceLock` or `lazy_static` or `once_cell`.
+
+With that said, as of this release, the contention performance problems have
+been greatly reduced. This was achieved by changing the free-list so that it
+is sharded across threads, and by ensuring that each sharded mutex occupies a
+single cache line to mitigate false sharing. So while contention may still
+impact performance in some cases, it should be a lot better now.
+
+Because of the changes to how the free-list works, please report any issues you
+find with this release. That not only includes search time regressions but also
+significant regressions in memory usage. Reporting improvements is welcome as
+well! If possible, provide a reproduction.
+
+Bug fixes:
+
+* [BUG #934](https://github.com/rust-lang/regex/issues/934):
+Fix a performance bug where high contention on a single regex led to massive
+slow downs.
+
+
 1.9.4 (2023-08-26)
 ==================
 This is a patch release that fixes a bug where `RegexSet::is_match(..)` could

From 894dcbe11e45d08b23db24f877574e06f3a69a35 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 2 Sep 2023 11:12:58 -0400
Subject: [PATCH 048/136] regex-automata-0.3.8

---
 regex-automata/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml
index 3cd9965b0..c7e949c4c 100644
--- a/regex-automata/Cargo.toml
+++ b/regex-automata/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "regex-automata"
-version = "0.3.7"  #:version
+version = "0.3.8"  #:version
 authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"]
 description = "Automata construction and matching using regular expressions."
documentation = "https://docs.rs/regex-automata" From 48e09a85e46d2b8cc379cd0b69cd98467639f7ff Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 2 Sep 2023 11:13:24 -0400 Subject: [PATCH 049/136] deps: bump regex-automata to 0.3.8 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4cc42b6cd..7afdfdc35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.7" +version = "0.3.8" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 554469b8c1116322f3c0a054ceeb610224f8ac65 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 2 Sep 2023 11:13:28 -0400 Subject: [PATCH 050/136] 1.9.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7afdfdc35..c78ed045f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.4" #:version +version = "1.9.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 4aaf3896ef1147000a5e63f174fa49bfa5d18d65 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 3 Sep 2023 11:09:56 -0400 Subject: [PATCH 051/136] ci: pin to memchr 2.6.2 for MSRV CI job I botched the memchr 2.6 MSRV because it actually requires Rust 1.61 and not Rust 1.60. This crate's MSRV is Rust 1.60, so pin memchr to a version that works on Rust 1.60 (for x86-64 at least). --- .github/workflows/ci.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2a38d6d4..08cc60d9a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,6 +142,25 @@ jobs: uses: dtolnay/rust-toolchain@master with: toolchain: 1.60.0 + # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it + # turned out that on aarch64, it was using something that wasn't stabilized + # until Rust 1.61[1]. (This was an oversight on my part. I had previously + # thought everything I needed was on Rust 1.60.) To resolve that, I just + # bumped memchr's MSRV to 1.61. Since it was so soon after the memchr 2.6 + # release, I treated this as a bugfix. + # + # But the regex crate's MSRV is at Rust 1.60, and it now depends on at + # least memchr 2.6 (to make use of its `alloc` feature). So we can't set + # a lower minimal version. And I can't just bump the MSRV in a patch + # release as a bug fix because regex 1.9 was released quite some time ago. + # I could just release regex 1.10 and bump the MSRV there, but eh, I don't + # want to put out another minor version release just for this. + # + # So... pin memchr to 2.6.2, which at least works on x86-64 on Rust 1.60. 
+      #
+      # [1]: https://github.com/BurntSushi/memchr/issues/136
+    - name: Pin memchr to 2.6.2
+      run: cargo update -p memchr --precise 2.6.2
     - name: Basic build
       run: cargo build --verbose
     - name: Build docs

From cdc0dbd3547462aedb6235197c2b743ec4ea75e5 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 3 Sep 2023 11:22:41 -0400
Subject: [PATCH 052/136] readme: add section about performance and benchmarks

---
 README.md | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/README.md b/README.md
index a9d6fcd37..51188654d 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,77 @@
 The full set of features one can disable are
 [in the "Crate features" section of the documentation](https://docs.rs/regex/1.*/#crate-features).
 
 
+### Performance
+
+One of the goals of this crate is for the regex engine to be "fast." While that
+is a somewhat nebulous goal, it is usually interpreted in one of two ways.
+First, it means that all searches take worst case `O(m * n)` time, where
+`m` is proportional to `len(regex)` and `n` is proportional to `len(haystack)`.
+Second, it means that even aside from the time complexity constraint, regex
+searches are "fast" in practice.
+
+While the first interpretation is pretty unambiguous, the second one remains
+nebulous. While nebulous, it guides this crate's architecture and the sorts of
+trade offs it makes. For example, here are some general architectural
+statements that follow as a result of the goal to be "fast":
+
+* When given the choice between faster regex searches and faster Rust compile
+times, this crate will generally choose faster regex searches.
+* When given the choice between faster regex searches and faster regex compile
+times, this crate will generally choose faster regex searches. That is, it is
+generally acceptable for `Regex::new` to get a little slower if it means that
+searches get faster. (This is a somewhat delicate balance to strike, because
+the speed of `Regex::new` needs to remain somewhat reasonable. But this is why
+one should avoid re-compiling the same regex over and over again.)
+* When given the choice between faster regex searches and simpler API
+design, this crate will generally choose faster regex searches. For example,
+if one didn't care about performance, we could likely get rid of both of
+the `Regex::is_match` and `Regex::find` APIs and instead just rely on
+`Regex::captures`.
+
+There are perhaps more ways that being "fast" influences things.
+
+While this repository used to provide its own benchmark suite, it has since
+been moved to [rebar](https://github.com/BurntSushi/rebar). The benchmarks are
+quite extensive, and there are many more than what is shown in rebar's README
+(which is just limited to a "curated" set meant to compare performance between
+regex engines).
To run all of this crate's benchmarks, first start by cloning +and installing `rebar`: + +```text +$ git clone https://github.com/BurntSushi/rebar +$ cd rebar +$ cargo install --path ./ +``` + +Then build the benchmark harness for just this crate: + +```text +$ rebar build -e '^rust/regex$' +``` + +Run all benchmarks for this crate as tests (each benchmark is executed once to +ensure it works): + +```text +$ rebar measure -e '^rust/regex$' -t +``` + +Record measurements for all benchmarks and save them to a CSV file: + +```text +$ rebar measure -e '^rust/regex$' | tee results.csv +``` + +Explore benchmark timings: + +```text +$ rebar cmp results.csv +``` + +See the `rebar` documentation for more details on how it works and how to +compare results with other regex engines. + ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. From 8275c1b3bef014a4393d9975285757f89d7e4592 Mon Sep 17 00:00:00 2001 From: Yutaka Kamei Date: Thu, 7 Sep 2023 23:00:49 +0900 Subject: [PATCH 053/136] doc: fix a few typos PR #1085 --- regex-automata/src/meta/regex.rs | 2 +- regex-automata/src/nfa/thompson/builder.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 3a04b14d8..ce3bae0fa 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -2706,7 +2706,7 @@ impl Config { /// you're compiling untrusted patterns. /// /// Note that this limit is applied to _each_ NFA built, and if any of - /// them excceed the limit, then construction will fail. This limit does + /// them exceed the limit, then construction will fail. This limit does /// _not_ correspond to the total memory used by all NFAs in the meta regex /// engine. /// diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index b57e5bc0f..6b69e8784 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -61,7 +61,7 @@ enum State { Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureStart { /// The ID of the pattern that this capture was defined. @@ -77,7 +77,7 @@ enum State { }, /// An empty state that records the end of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureEnd { /// The ID of the pattern that this capture was defined. @@ -128,7 +128,7 @@ enum State { } impl State { - /// If this state is an unconditional espilon transition, then this returns + /// If this state is an unconditional epsilon transition, then this returns /// the target of the transition. fn goto(&self) -> Option { match *self { From 061ee815ef2c44101dba7b0b124600fcb03c1912 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:02:09 -0400 Subject: [PATCH 054/136] readme: visually emphasize performance criteria difference There was only a slight wording difference between these two points, and it was easy to gloss over. 
So we emphasize that wording difference to make it a bit easier to notice. PR #1082 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 51188654d..7454c166d 100644 --- a/README.md +++ b/README.md @@ -233,10 +233,10 @@ nebulous. While nebulous, it guides this crate's architecture and the sorts of the trade offs it makes. For example, here are some general architectural statements that follow as a result of the goal to be "fast": -* When given the choice between faster regex searches and faster Rust compile -times, this crate will generally choose faster regex searches. -* When given the choice between faster regex searches and faster regex compile -times, this crate will generally choose faster regex searches. That is, it is +* When given the choice between faster regex searches and faster _Rust compile +times_, this crate will generally choose faster regex searches. +* When given the choice between faster regex searches and faster _regex compile +times_, this crate will generally choose faster regex searches. That is, it is generally acceptable for `Regex::new` to get a little slower if it means that searches get faster. (This is a somewhat delicate balance to strike, because the speed of `Regex::new` needs to remain somewhat reasonable. But this is why From 27a25385c0bd1228716271668febc88bd8c74932 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 28 Sep 2023 13:03:27 -0400 Subject: [PATCH 055/136] automata: add some #[inline] annotations This hopefully ensures these functions can be inlined across crate boundaries. (Although I think they likely already can be due to generics?) --- regex-automata/src/util/pool.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index c03d7b013..95afa4a0d 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -177,6 +177,7 @@ impl T> Pool { /// the value to go back into the pool) and then calling get again is /// *not* guaranteed to return the same value received in the first `get` /// call. + #[inline] pub fn get(&self) -> PoolGuard<'_, T, F> { PoolGuard(self.0.get()) } @@ -200,6 +201,7 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// This circumvents the guard's `Drop` implementation. This can be useful /// in circumstances where the automatic `Drop` results in poorer codegen, /// such as calling non-inlined functions. + #[inline] pub fn put(this: PoolGuard<'_, T, F>) { inner::PoolGuard::put(this.0); } @@ -208,12 +210,14 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { type Target = T; + #[inline] fn deref(&self) -> &T { self.0.value() } } impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + #[inline] fn deref_mut(&mut self) -> &mut T { self.0.value_mut() } @@ -469,6 +473,7 @@ mod inner { impl T> Pool { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. + #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { // Our fast path checks if the caller is the thread that "owns" // this pool. Or stated differently, whether it is the first thread @@ -562,6 +567,7 @@ mod inner { /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. 
+ #[inline] fn put_value(&self, value: Box) { let caller = THREAD_ID.with(|id| *id); let stack_id = caller % self.stacks.len(); @@ -587,11 +593,13 @@ mod inner { } /// Create a guard that represents the special owned T. + #[inline] fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Err(caller), discard: false } } /// Create a guard that contains a value from the pool's stack. + #[inline] fn guard_stack(&self, value: Box) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Ok(value), discard: false } } @@ -599,6 +607,7 @@ mod inner { /// Create a guard that contains a value from the pool's stack with an /// instruction to throw away the value instead of putting it back /// into the pool. + #[inline] fn guard_stack_transient(&self, value: Box) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Ok(value), discard: true } } @@ -633,6 +642,7 @@ mod inner { impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. + #[inline] pub(super) fn value(&self) -> &T { match self.value { Ok(ref v) => &**v, @@ -657,6 +667,7 @@ mod inner { } /// Return the underlying value as a mutable borrow. + #[inline] pub(super) fn value_mut(&mut self) -> &mut T { match self.value { Ok(ref mut v) => &mut **v, @@ -681,6 +692,7 @@ mod inner { } /// Consumes this guard and puts it back into the pool. + #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop @@ -729,6 +741,7 @@ mod inner { } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + #[inline] fn drop(&mut self) { self.put_imp(); } @@ -806,6 +819,7 @@ mod inner { impl T> Pool { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. + #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { let mut stack = self.stack.lock(); let value = match stack.pop() { @@ -815,6 +829,7 @@ mod inner { PoolGuard { pool: self, value: Some(value) } } + #[inline] fn put(&self, guard: PoolGuard<'_, T, F>) { let mut guard = core::mem::ManuallyDrop::new(guard); if let Some(value) = guard.value.take() { @@ -825,6 +840,7 @@ mod inner { /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. + #[inline] fn put_value(&self, value: Box) { let mut stack = self.stack.lock(); stack.push(value); @@ -847,16 +863,19 @@ mod inner { impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. + #[inline] pub(super) fn value(&self) -> &T { self.value.as_deref().unwrap() } /// Return the underlying value as a mutable borrow. + #[inline] pub(super) fn value_mut(&mut self) -> &mut T { self.value.as_deref_mut().unwrap() } /// Consumes this guard and puts it back into the pool. + #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop @@ -878,6 +897,7 @@ mod inner { } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + #[inline] fn drop(&mut self) { self.put_imp(); } @@ -931,6 +951,7 @@ mod inner { /// Lock this mutex and return a guard providing exclusive access to /// `T`. This blocks if some other thread has already locked this /// mutex. 
+ #[inline] fn lock(&self) -> MutexGuard<'_, T> { while self .locked @@ -963,18 +984,21 @@ mod inner { impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { type Target = T; + #[inline] fn deref(&self) -> &T { self.data } } impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { + #[inline] fn deref_mut(&mut self) -> &mut T { self.data } } impl<'a, T> Drop for MutexGuard<'a, T> { + #[inline] fn drop(&mut self) { // Drop means 'data' is no longer accessible, so we can unlock // the mutex. From aa4e4c7120b0090ce0624e3c42a2ed06dd8b918a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 07:24:05 -0400 Subject: [PATCH 056/136] automata: fix unintended panic in max_haystack_len This fixes a bug where the bounded backtracker's `max_haystack_len` could panic if its bitset capacity ended up being smaller than the total number of NFA states. Under a default configuration this seems unlikely to happen due to the default limits on the size of a compiled regex. But if the compiled regex size limit is increased to a large number, then the likelihood of this panicking increases. Of course, one can provoke this even easier by just setting the visited capacity to a small number. Indeed, this is how we provoke it in a regression test. --- regex-automata/src/meta/wrappers.rs | 5 +++- regex-automata/src/nfa/thompson/backtrack.rs | 28 ++++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 08110d9bb..6cb19ba0d 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -212,7 +212,10 @@ impl BoundedBacktrackerEngine { .configure(backtrack_config) .build_from_nfa(nfa.clone()) .map_err(BuildError::nfa)?; - debug!("BoundedBacktracker built"); + debug!( + "BoundedBacktracker built (max haystack length: {:?})", + engine.max_haystack_len() + ); Ok(Some(BoundedBacktrackerEngine(engine))) } #[cfg(not(feature = "nfa-backtrack"))] diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index eba037c1d..df99e456d 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -820,8 +820,11 @@ impl BoundedBacktracker { // bytes to the capacity in bits. let capacity = 8 * self.get_config().get_visited_capacity(); let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); - let real_capacity = blocks * Visited::BLOCK_SIZE; - (real_capacity / self.nfa.states().len()) - 1 + let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE); + // It's possible for `real_capacity` to be smaller than the number of + // NFA states for particularly large regexes, so we saturate towards + // zero. + (real_capacity / self.nfa.states().len()).saturating_sub(1) } } @@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize { (lhs / rhs) + 1 } } + +#[cfg(test)] +mod tests { + use super::*; + + // This is a regression test for the maximum haystack length computation. + // Previously, it assumed that the total capacity of the backtracker's + // bitset would always be greater than the number of NFA states. But there + // is of course no guarantee that this is true. This regression test + // ensures that not only does `max_haystack_len` not panic, but that it + // should return `0`. 
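+    // (For a sense of the arithmetic: `visited_capacity(10)` allows about
+    // 80 bits, rounded up to whole bitset blocks, while this pattern
+    // compiles to well over 100 NFA states. That makes `real_capacity`
+    // divided by the state count at most 1, and the final
+    // `saturating_sub(1)` yields 0.)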
+ #[cfg(feature = "syntax")] + #[test] + fn max_haystack_len_overflow() { + let re = BoundedBacktracker::builder() + .configure(BoundedBacktracker::config().visited_capacity(10)) + .build(r"[0-9A-Za-z]{100}") + .unwrap(); + assert_eq!(0, re.max_haystack_len()); + } +} From e4674083346283cdf24fdc211dc44a4a6f6846b1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:31:27 -0400 Subject: [PATCH 057/136] changelog: 1.9.6 --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 885bb9bd7..a50b811dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +1.9.6 (2023-09-30) +================== +This is a patch release that fixes a panic that can occur when the default +regex size limit is increased to a large number. + +* [BUG aa4e4c71](https://github.com/rust-lang/regex/commit/aa4e4c7120b0090ce0624e3c42a2ed06dd8b918a): +Fix a bug where computing the maximum haystack length for the bounded +backtracker could result underflow and thus provoke a panic later in a search +due to a broken invariant. + + 1.9.5 (2023-09-02) ================== This is a patch release that hopefully mostly fixes a performance bug that From 03f00bd756d85ee21714136e46836c4a5ad1b99c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:31:34 -0400 Subject: [PATCH 058/136] regex-automata-0.3.9 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index c7e949c4c..7d47140b0 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.8" #:version +version = "0.3.9" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 3dda4255e11ddb9257f6b75135bb2f3f8a554acb Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:31:59 -0400 Subject: [PATCH 059/136] deps: bump regex-automata to 0.3.9 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c78ed045f..2d3b8076d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.3.8" +version = "0.3.9" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 11b44439786499014f61afe6e294650fb01550be Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:32:02 -0400 Subject: [PATCH 060/136] 1.9.6 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 2d3b8076d..46664f669 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.5" #:version +version = "1.9.6" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 17284451f10aa06c6c42e622e3529b98513901a8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 09:39:24 -0400 Subject: [PATCH 061/136] syntax: fix Markdown for ASCII word class rendering --- regex-syntax/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 47d818a17..a552099c6 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -381,7 +381,7 @@ pub fn try_is_word_character( /// Returns true if and only if the given character is an ASCII word character. /// /// An ASCII word character is defined by the following character class: -/// `[_0-9a-zA-Z]'. +/// `[_0-9a-zA-Z]`. pub fn is_word_byte(c: u8) -> bool { match c { b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, From e598c4db2c5ed4d71ff611350becf42cb6faf1db Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:24:51 -0400 Subject: [PATCH 062/136] automata: clean up regression test The name was quite vague, so add a little specificity. --- regex-automata/src/meta/regex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index ce3bae0fa..a06d2bb48 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3640,8 +3640,8 @@ mod tests { // I found this in the course of building out the benchmark suite for // rebar. #[test] - fn regression() { - env_logger::init(); + fn regression_suffix_literal_count() { + let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); From c5e9de9d6e07786eb1ff7f88d7871e0f0ef28c32 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:25:31 -0400 Subject: [PATCH 063/136] automata: fix line wrapping Breaking lines in the middle of backticks appears to be bad juju for some Markdown renderers. --- regex-automata/src/util/look.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index aee31b34e..a34ea1d75 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -184,8 +184,8 @@ impl Look { pub struct LookSet { /// The underlying representation this set is exposed to make it possible /// to store it somewhere efficiently. The representation is that - /// of a bitset, where each assertion occupies bit `i` where `i = - /// Look::as_repr()`. + /// of a bitset, where each assertion occupies bit `i` where + /// `i = Look::as_repr()`. /// /// Note that users of this internal representation must permit the full /// range of `u16` values to be represented. 
For example, even if the From f15f3dcbc340eb98b40e60cc8b797263963d1e97 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:27:11 -0400 Subject: [PATCH 064/136] automata: fix word boundary bug This fixes a bug that can occur when: 1. The regex has a Unicode word boundary. 2. The haystack contains some non-ASCII Unicode scalar value. 3. An inner or suffix literal optimization is in play. Specifically, this provokes a case where a match is detected in one of the meta engine's ad hoc DFA search routines, but before the match reaches its correct endpoint, a quit state is entered. (Because DFAs can't deal with Unicode word boundaries on non-ASCII haystacks.) The correct thing to do is to return a quit error and let the higher level logic divert to a different engine, but it was returning the match that it had found up until that point instead. The match returned is not technically incorrect in the sense that a match does indeed exist, but the offsets it reports may be shorter than what the true match actually is. So... if a quit state is entered, return an error regardless of whether a match has been found. Fixes #1046 --- CHANGELOG.md | 8 ++++++++ regex-automata/src/meta/limited.rs | 12 ------------ regex-automata/src/meta/stopat.rs | 12 ------------ testdata/regression.toml | 18 ++++++++++++++++++ 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a50b811dd..4a474af1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +TBD +=== + +* [BUG #1046](https://github.com/rust-lang/regex/issues/1046): +Fix a bug that could result in incorrect match spans when using a Unicode word +boundary and searching non-ASCII strings. + + 1.9.6 (2023-09-30) ================== This is a patch release that fixes a panic that can occur when the default diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs index 192a2625e..5653adc9a 100644 --- a/regex-automata/src/meta/limited.rs +++ b/regex-automata/src/meta/limited.rs @@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev( } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -155,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev( } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -209,9 +203,6 @@ fn dfa_eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { @@ -246,9 +237,6 @@ fn hybrid_eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/meta/stopat.rs b/regex-automata/src/meta/stopat.rs index e8d716689..c4dcd797a 100644 --- a/regex-automata/src/meta/stopat.rs +++ b/regex-automata/src/meta/stopat.rs @@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd( } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't 
use a DFA that specialized start states @@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd( } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from @@ -162,9 +156,6 @@ fn dfa_eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -201,9 +192,6 @@ fn hybrid_eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } diff --git a/testdata/regression.toml b/testdata/regression.toml index 03b15d6d5..09b2b1d1c 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -782,3 +782,21 @@ match-kind = "all" search-kind = "overlapping" unicode = true utf8 = true + +# This tests that the PikeVM and the meta regex agree on a particular regex. +# This test previously failed when the ad hoc engines inside the meta engine +# did not handle quit states correctly. Namely, the Unicode word boundary here +# combined with a non-ASCII codepoint provokes the quit state. The ad hoc +# engines were previously returning a match even after entering the quit state +# if a match had been previously detected, but this is incorrect. The reason +# is that if a quit state is found, then the search must give up *immediately* +# because it prevents the search from finding the "proper" leftmost-first +# match. If it instead returns a match that has been found, it risks reporting +# an improper match, as it did in this case. +# +# See: https://github.com/rust-lang/regex/issues/1046 +[[test]] +name = "non-prefix-literal-quit-state" +regex = '.+\b\n' +haystack = "β77\n" +matches = [[0, 5]] From b8c2066b6b6b424de95230ff1d63217a7d9e79c4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 22:45:13 -0400 Subject: [PATCH 065/136] automata/onepass: future proof bit packing This was previously using the raw representation of a `LookSet`, which is fine, but would have errantly overwritten bits unrelated to look-around assertions if they were set in a `LookSet`. This can't happen today because we don't have more than 10 assertions. And the one-pass DFA constructor specifically errors if more assertions exist and are in the pattern. But still, it seems like good form to mask out only the bits we care about. --- regex-automata/src/dfa/onepass.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 44691d0c8..353bb1e17 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2581,10 +2581,11 @@ impl Cache { /// Represents a single transition in a one-pass DFA. /// -/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds -/// to the transition epsilons, which contains the slots that should be saved -/// when this transition is followed and the conditional epsilon transitions -/// that must be satisfied in order to follow this transition. +/// The high 21 bits corresponds to the state ID. The bit following corresponds +/// to the special "match wins" flag. 
The remaining low 42 bits corresponds to +/// the transition epsilons, which contains the slots that should be saved when +/// this transition is followed and the conditional epsilon transitions that +/// must be satisfied in order to follow this transition. #[derive(Clone, Copy, Eq, PartialEq)] struct Transition(u64); @@ -2741,7 +2742,7 @@ impl PatternEpsilons { fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { PatternEpsilons( (self.0 & PatternEpsilons::PATTERN_ID_MASK) - | u64::from(epsilons.0), + | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK), ) } } @@ -2819,7 +2820,10 @@ impl Epsilons { /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { - Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + Epsilons( + (self.0 & Epsilons::SLOT_MASK) + | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), + ) } } From 0ead12869417434fb39ae4b876c4fb97543cbbd8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 15:16:21 -0400 Subject: [PATCH 066/136] syntax: make Ast the size of a pointer This puts every Ast value behind a box to conserve space. It makes things like Vec<Ast> quite a bit smaller than what they would be otherwise, which is especially beneficial for the representation of concatenations and alternations. This doesn't quite solve the memory usage problems though, since an AstKind is still quite big (over 200 bytes). The next step will be boxing each of the variants of an AstKind which should hopefully resolve the issue. Ref #1090 --- regex-syntax/src/ast/mod.rs | 180 ++++++++++------ regex-syntax/src/ast/parse.rs | 328 +++++++++++++++--------------- regex-syntax/src/ast/print.rs | 34 ++-- regex-syntax/src/ast/visitor.rs | 18 +- regex-syntax/src/hir/translate.rs | 44 ++-- 5 files changed, 332 insertions(+), 272 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e4284fee..6a6b58237 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -429,9 +429,19 @@ pub struct Comment { /// /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the `Ast`. +/// +/// This type boxes the actual kind of the AST element so that an `Ast` value +/// itself has a very small size. This in turn makes things like `Vec<Ast>` use +/// a lot less memory than it might otherwise, which is particularly beneficial +/// for representing long concatenations or alternations. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Ast(pub Box<AstKind>); + +/// The kind of an abstract syntax element. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Ast { +pub enum AstKind { /// An empty regex that matches everything. Empty(Span), /// A set of flags, e.g., `(?is)`. @@ -456,26 +466,76 @@ pub enum Ast { } impl Ast { + /// Create an "empty" AST item. + pub fn empty(span: Span) -> Ast { + Ast(Box::new(AstKind::Empty(span))) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast(Box::new(AstKind::Flags(e))) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast(Box::new(AstKind::Literal(e))) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast(Box::new(AstKind::Dot(span))) + } + + /// Create an "assertion" AST item.
+ pub fn assertion(e: Assertion) -> Ast { + Ast(Box::new(AstKind::Assertion(e))) + } + + /// Create a "class" AST item. + pub fn class(e: Class) -> Ast { + Ast(Box::new(AstKind::Class(e))) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast(Box::new(AstKind::Repetition(e))) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast(Box::new(AstKind::Group(e))) + } + + /// Create an "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast(Box::new(AstKind::Alternation(e))) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast(Box::new(AstKind::Concat(e))) + } + /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { - match *self { - Ast::Empty(ref span) => span, - Ast::Flags(ref x) => &x.span, - Ast::Literal(ref x) => &x.span, - Ast::Dot(ref span) => span, - Ast::Assertion(ref x) => &x.span, - Ast::Class(ref x) => x.span(), - Ast::Repetition(ref x) => &x.span, - Ast::Group(ref x) => &x.span, - Ast::Alternation(ref x) => &x.span, - Ast::Concat(ref x) => &x.span, + match *self.0 { + AstKind::Empty(ref span) => span, + AstKind::Flags(ref x) => &x.span, + AstKind::Literal(ref x) => &x.span, + AstKind::Dot(ref span) => span, + AstKind::Assertion(ref x) => &x.span, + AstKind::Class(ref x) => x.span(), + AstKind::Repetition(ref x) => &x.span, + AstKind::Group(ref x) => &x.span, + AstKind::Alternation(ref x) => &x.span, + AstKind::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { - match *self { - Ast::Empty(_) => true, + match *self.0 { + AstKind::Empty(_) => true, _ => false, } } @@ -483,17 +543,17 @@ impl Ast { /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. fn has_subexprs(&self) -> bool { - match *self { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) => false, - Ast::Class(_) - | Ast::Repetition(_) - | Ast::Group(_) - | Ast::Alternation(_) - | Ast::Concat(_) => true, + match *self.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) => false, + AstKind::Class(_) + | AstKind::Repetition(_) + | AstKind::Group(_) + | AstKind::Alternation(_) + | AstKind::Concat(_) => true, } } } @@ -526,14 +586,14 @@ pub struct Alternation { impl Alternation { /// Return this alternation as an AST. /// - /// If this alternation contains zero ASTs, then Ast::Empty is - /// returned. If this alternation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Alternation(self), + _ => Ast::alternation(self), } } } @@ -551,14 +611,14 @@ pub struct Concat { impl Concat { /// Return this concatenation as an AST. /// - /// If this concatenation contains zero ASTs, then Ast::Empty is - /// returned. If this concatenation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + /// If this concatenation contains zero ASTs, then `Ast::empty` is returned.
+ /// If this concatenation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Concat(self), + _ => Ast::concat(self), } } } @@ -1544,43 +1604,43 @@ impl Drop for Ast { fn drop(&mut self) { use core::mem; - match *self { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) + match *self.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => return, - Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, - Ast::Group(ref x) if !x.ast.has_subexprs() => return, - Ast::Alternation(ref x) if x.asts.is_empty() => return, - Ast::Concat(ref x) if x.asts.is_empty() => return, + | AstKind::Class(_) => return, + AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, + AstKind::Group(ref x) if !x.ast.has_subexprs() => return, + AstKind::Alternation(ref x) if x.asts.is_empty() => return, + AstKind::Concat(ref x) if x.asts.is_empty() => return, _ => {} } let empty_span = || Span::splat(Position::new(0, 0, 0)); - let empty_ast = || Ast::Empty(empty_span()); + let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { - match ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) + match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => {} - Ast::Repetition(ref mut x) => { + | AstKind::Class(_) => {} + AstKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - Ast::Group(ref mut x) => { + AstKind::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - Ast::Alternation(ref mut x) => { + AstKind::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } - Ast::Concat(ref mut x) => { + AstKind::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } @@ -1663,9 +1723,9 @@ mod tests { let run = || { let span = || Span::splat(Position::new(0, 0, 0)); - let mut ast = Ast::Empty(span()); + let mut ast = Ast::empty(span()); for i in 0..200 { - ast = Ast::Group(Group { + ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 47ea2586b..b3f04bfdc 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, Position, Span}, + ast::{self, Ast, AstKind, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -53,11 +53,11 @@ impl Primitive { /// Convert this primitive into a proper AST.
fn into_ast(self) -> Ast { match self { - Primitive::Literal(lit) => Ast::Literal(lit), - Primitive::Assertion(assert) => Ast::Assertion(assert), - Primitive::Dot(span) => Ast::Dot(span), - Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), - Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class(ast::Class::Perl(cls)), + Primitive::Unicode(cls) => Ast::class(ast::Class::Unicode(cls)), } } @@ -691,7 +691,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(v); } - concat.asts.push(Ast::Flags(set)); + concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { @@ -764,7 +764,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { group.ast = Box::new(group_concat.into_ast()); } } - prior_concat.asts.push(Ast::Group(group)); + prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } @@ -783,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); - Ok(Ast::Alternation(alt)) + Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( @@ -976,7 +976,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; - concat.asts.push(Ast::Class(class)); + concat.asts.push(Ast::class(class)); } '?' => { concat = self.parse_uncounted_repetition( @@ -1044,8 +1044,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { ) } }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { + match *ast.0 { + AstKind::Empty(_) | AstKind::Flags(_) => { return Err( self.error(self.span(), ast::ErrorKind::RepetitionMissing) ) @@ -1057,7 +1057,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { greedy = false; self.bump(); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: Span::new(op_start, self.pos()), @@ -1096,8 +1096,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { ) } }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { + match *ast.0 { + AstKind::Empty(_) | AstKind::Flags(_) => { return Err( self.error(self.span(), ast::ErrorKind::RepetitionMissing) ) @@ -1159,7 +1159,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) ); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: op_span, @@ -1212,7 +1212,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } else if self.bump_if("?") { if self.is_eof() { @@ -1241,7 +1241,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } else { @@ -1249,7 +1249,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } @@ -2183,43 +2183,43 @@ impl<'p, 's, P: Borrow> ast::Visitor for 
NestLimiter<'p, 's, P> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - let span = match *ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + let span = match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) + | AstKind::Class(ast::Class::Unicode(_)) + | AstKind::Class(ast::Class::Perl(_)) => { // These are all base cases, so we don't increment depth. return Ok(()); } - Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, - Ast::Repetition(ref x) => &x.span, - Ast::Group(ref x) => &x.span, - Ast::Alternation(ref x) => &x.span, - Ast::Concat(ref x) => &x.span, + AstKind::Class(ast::Class::Bracketed(ref x)) => &x.span, + AstKind::Repetition(ref x) => &x.span, + AstKind::Group(ref x) => &x.span, + AstKind::Alternation(ref x) => &x.span, + AstKind::Concat(ref x) => &x.span, }; self.increment_depth(span) } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) + | AstKind::Class(ast::Class::Unicode(_)) + | AstKind::Class(ast::Class::Perl(_)) => { // These are all base cases, so we don't decrement depth. Ok(()) } - Ast::Class(ast::Class::Bracketed(_)) - | Ast::Repetition(_) - | Ast::Group(_) - | Ast::Alternation(_) - | Ast::Concat(_) => { + AstKind::Class(ast::Class::Bracketed(_)) + | AstKind::Repetition(_) + | AstKind::Group(_) + | AstKind::Alternation(_) + | AstKind::Concat(_) => { self.decrement_depth(); Ok(()) } @@ -2426,12 +2426,12 @@ mod tests { /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, @@ -2445,17 +2445,17 @@ mod tests { /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { - Ast::Concat(ast::Concat { span, asts }) + Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { - Ast::Alternation(ast::Alternation { span: span(range), asts }) + Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), @@ -2488,7 +2488,7 @@ mod tests { }, ); } - Ast::Flags(ast::SetFlags { + Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), @@ -2502,7 +2502,7 @@ mod tests { // A nest limit of 0 still allows some types of regexes.
assert_eq!( parser_nest_limit("", 0).parse(), - Ok(Ast::Empty(span(0..0))) + Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); @@ -2516,7 +2516,7 @@ mod tests { ); assert_eq!( parser_nest_limit("a+", 1).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2542,14 +2542,14 @@ mod tests { ); assert_eq!( parser_nest_limit("a+*", 2).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, - ast: Box::new(Ast::Repetition(ast::Repetition { + ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2606,7 +2606,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2776,7 +2776,7 @@ bar vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), @@ -2803,7 +2803,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -2825,7 +2825,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), @@ -2840,7 +2840,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), @@ -2858,7 +2858,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X @@ -2877,7 +2877,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', @@ -2895,9 +2895,9 @@ bar Ok(concat_with( span_range(pat, 0..3), vec![ - Ast::Dot(span_range(pat, 0..1)), + Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), - Ast::Dot(span_range(pat, 2..3)), + Ast::dot(span_range(pat, 2..3)), ] )) ); @@ -2933,7 +2933,7 @@ bar fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2945,7 +2945,7 @@ bar ); assert_eq!( parser(r"a+").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2958,7 +2958,7 @@ bar assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + 
Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2970,7 +2970,7 @@ bar ); assert_eq!( parser(r"a??").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -2982,7 +2982,7 @@ bar ); assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2997,7 +2997,7 @@ bar Ok(concat( 0..3, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -3015,7 +3015,7 @@ bar Ok(concat( 0..4, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -3034,7 +3034,7 @@ bar 0..3, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3048,7 +3048,7 @@ bar ); assert_eq!( parser(r"(ab)?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), @@ -3067,8 +3067,8 @@ bar Ok(alt( 0..3, vec![ - Ast::Empty(span(0..0)), - Ast::Repetition(ast::Repetition { + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3157,7 +3157,7 @@ bar fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), @@ -3171,7 +3171,7 @@ bar ); assert_eq!( parser(r"a{5,}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3185,7 +3185,7 @@ bar ); assert_eq!( parser(r"a{5,9}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3199,7 +3199,7 @@ bar ); assert_eq!( parser(r"a{5}?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3217,7 +3217,7 @@ bar 0..5, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3237,7 +3237,7 @@ bar 0..6, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3255,7 +3255,7 @@ bar assert_eq!( parser(r"a{ 5 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3269,7 +3269,7 @@ bar ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), @@ -3283,7 +3283,7 @@ bar ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), @@ -3414,7 +3414,7 @@ bar fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), 
lit('b', 2)], })) @@ -3424,7 +3424,7 @@ bar Ok(group( 0..5, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) @@ -3433,14 +3433,14 @@ bar assert_eq!( parser(r"a|b|c").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), @@ -3454,7 +3454,7 @@ bar Ok(group( 0..10, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), @@ -3503,7 +3503,7 @@ bar parser(r"|").parse(), Ok(alt( 0..1, - vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( @@ -3511,19 +3511,19 @@ bar Ok(alt( 0..2, vec![ - Ast::Empty(span(0..0)), - Ast::Empty(span(1..1)), - Ast::Empty(span(2..2)), + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( @@ -3533,7 +3533,7 @@ bar 1, alt( 1..2, - vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); @@ -3542,7 +3542,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( @@ -3550,7 +3550,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); @@ -3606,7 +3606,7 @@ bar fn parse_group() { assert_eq!( parser("(?i)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..4), flags: ast::Flags { span: span(2..3), @@ -3621,7 +3621,7 @@ bar ); assert_eq!( parser("(?iU)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..5), flags: ast::Flags { span: span(2..4), @@ -3644,7 +3644,7 @@ bar ); assert_eq!( parser("(?i-U)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..6), flags: ast::Flags { span: span(2..5), @@ -3672,15 +3672,15 @@ bar assert_eq!( parser("()").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..2), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Empty(span(1..1))), + ast: Box::new(Ast::empty(span(1..1))), })) ); assert_eq!( parser("(a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..3), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit('a', 1)), @@ -3688,20 +3688,20 @@ bar ); assert_eq!( parser("(())").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..4), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Group(ast::Group { + ast: Box::new(Ast::group(ast::Group { span: span(1..3), kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::Empty(span(2..2))), + ast: Box::new(Ast::empty(span(2..2))), })), })) ); assert_eq!( parser("(?:a)").parse(), - Ok(Ast::Group(ast::Group { + 
Ok(Ast::group(ast::Group { span: span(0..5), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..2), @@ -3713,7 +3713,7 @@ bar assert_eq!( parser("(?i:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..6), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..3), @@ -3729,7 +3729,7 @@ bar ); assert_eq!( parser("(?i-U:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..5), @@ -3818,7 +3818,7 @@ bar fn parse_capture_name() { assert_eq!( parser("(?z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, @@ -3833,7 +3833,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3848,7 +3848,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3864,7 +3864,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3880,7 +3880,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3896,7 +3896,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3912,7 +3912,7 @@ bar assert_eq!( parser("(?P)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), @@ -3928,7 +3928,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), @@ -3936,7 +3936,7 @@ bar ); assert_eq!( parser("(?P<名字>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), @@ -3952,7 +3952,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), @@ -4494,15 +4494,15 @@ bar ); assert_eq!( parser_octal(r"\778").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', @@ -4512,15 +4512,15 @@ bar ); assert_eq!( parser_octal(r"\7777").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', @@ -4965,7 +4965,7 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), 
false))), @@ -4973,7 +4973,7 @@ bar ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4985,7 +4985,7 @@ bar ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4997,7 +4997,7 @@ bar ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5009,7 +5009,7 @@ bar ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5022,7 +5022,7 @@ bar assert_eq!( parser("[a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), @@ -5030,7 +5030,7 @@ bar ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5048,7 +5048,7 @@ bar ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5067,7 +5067,7 @@ bar ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5078,7 +5078,7 @@ bar ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5089,7 +5089,7 @@ bar ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5100,7 +5100,7 @@ bar ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5112,7 +5112,7 @@ bar ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5124,7 +5124,7 @@ bar ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5144,7 +5144,7 @@ bar assert_eq!( parser("[a-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), @@ -5152,7 +5152,7 @@ bar ); assert_eq!( 
parser("[a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5166,7 +5166,7 @@ bar ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5188,7 +5188,7 @@ bar ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5210,7 +5210,7 @@ bar ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5226,7 +5226,7 @@ bar ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5242,7 +5242,7 @@ bar ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5258,7 +5258,7 @@ bar ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5274,7 +5274,7 @@ bar ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( @@ -5292,7 +5292,7 @@ bar let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5313,7 +5313,7 @@ bar assert_eq!( parser(r"[]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), @@ -5321,7 +5321,7 @@ bar ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5342,7 +5342,7 @@ bar Ok(concat( 0..5, vec![ - Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5353,7 +5353,7 @@ bar } )), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', @@ -5914,15 +5914,15 @@ bar assert_eq!( parser(r"\pNz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class(ast::Class::Unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, 
c: 'z', @@ -5932,15 +5932,15 @@ bar ); assert_eq!( parser(r"\p{Greek}z").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class(ast::Class::Unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -6017,7 +6017,7 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class(ast::Class::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, @@ -6025,15 +6025,15 @@ bar ); assert_eq!( parser(r"\dz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ast::class(ast::Class::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 86a87e143..daf6776f2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -7,7 +7,7 @@ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, - Ast, + Ast, AstKind, }; /// A builder for constructing a printer. @@ -78,9 +78,9 @@ impl Visitor for Writer { } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { - match *ast { - Ast::Group(ref x) => self.fmt_group_pre(x), - Ast::Class(ast::Class::Bracketed(ref x)) => { + match *ast.0 { + AstKind::Group(ref x) => self.fmt_group_pre(x), + AstKind::Class(ast::Class::Bracketed(ref x)) => { self.fmt_class_bracketed_pre(x) } _ => Ok(()), @@ -90,21 +90,21 @@ impl Visitor for Writer { fn visit_post(&mut self, ast: &Ast) -> fmt::Result { use crate::ast::Class; - match *ast { - Ast::Empty(_) => Ok(()), - Ast::Flags(ref x) => self.fmt_set_flags(x), - Ast::Literal(ref x) => self.fmt_literal(x), - Ast::Dot(_) => self.wtr.write_str("."), - Ast::Assertion(ref x) => self.fmt_assertion(x), - Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - Ast::Class(Class::Bracketed(ref x)) => { + match *ast.0 { + AstKind::Empty(_) => Ok(()), + AstKind::Flags(ref x) => self.fmt_set_flags(x), + AstKind::Literal(ref x) => self.fmt_literal(x), + AstKind::Dot(_) => self.wtr.write_str("."), + AstKind::Assertion(ref x) => self.fmt_assertion(x), + AstKind::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + AstKind::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + AstKind::Class(Class::Bracketed(ref x)) => { self.fmt_class_bracketed_post(x) } - Ast::Repetition(ref x) => self.fmt_repetition(x), - Ast::Group(ref x) => self.fmt_group_post(x), - Ast::Alternation(_) => Ok(()), - Ast::Concat(_) => Ok(()), + AstKind::Repetition(ref x) => self.fmt_repetition(x), + AstKind::Group(ref x) => self.fmt_group_post(x), + AstKind::Alternation(_) => Ok(()), + AstKind::Concat(_) => Ok(()), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 03d12a14d..05fdac89c 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,6 +1,6 @@ use alloc::{vec, vec::Vec}; -use crate::ast::{self, Ast}; +use crate::ast::{self, Ast, AstKind}; /// A trait for visiting an abstract syntax tree (AST) in depth 
first order. /// @@ -263,19 +263,19 @@ impl<'a> HeapVisitor<'a> { ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { - Ok(match *ast { - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ok(match *ast.0 { + AstKind::Class(ast::Class::Bracketed(ref x)) => { self.visit_class(x, visitor)?; None } - Ast::Repetition(ref x) => Some(Frame::Repetition(x)), - Ast::Group(ref x) => Some(Frame::Group(x)), - Ast::Concat(ref x) if x.asts.is_empty() => None, - Ast::Concat(ref x) => { + AstKind::Repetition(ref x) => Some(Frame::Repetition(x)), + AstKind::Group(ref x) => Some(Frame::Group(x)), + AstKind::Concat(ref x) if x.asts.is_empty() => None, + AstKind::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) } - Ast::Alternation(ref x) if x.asts.is_empty() => None, - Ast::Alternation(ref x) => Some(Frame::Alternation { + AstKind::Alternation(ref x) if x.asts.is_empty() => None, + AstKind::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 5430b51b2..743218df4 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -7,7 +7,7 @@ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ - ast::{self, Ast, Span, Visitor}, + ast::{self, Ast, AstKind, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, @@ -336,8 +336,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Class(ast::Class::Bracketed(_)) => { + match *ast.0 { + AstKind::Class(ast::Class::Bracketed(_)) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -346,20 +346,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } - Ast::Repetition(_) => self.push(HirFrame::Repetition), - Ast::Group(ref x) => { + AstKind::Repetition(_) => self.push(HirFrame::Repetition), + AstKind::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} - Ast::Concat(_) => { + AstKind::Concat(ref x) if x.asts.is_empty() => {} + AstKind::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + AstKind::Alternation(ref x) if x.asts.is_empty() => {} + AstKind::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } @@ -369,11 +369,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Empty(_) => { + match *ast.0 { + AstKind::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } - Ast::Flags(ref x) => { + AstKind::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in @@ -386,7 +386,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - Ast::Literal(ref x) => { + AstKind::Literal(ref x) => { match self.ast_literal_to_scalar(x)? 
{ Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { @@ -402,13 +402,13 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } // self.push(HirFrame::Expr(self.hir_literal(x)?)); } - Ast::Dot(span) => { + AstKind::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); } - Ast::Assertion(ref x) => { + AstKind::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - Ast::Class(ast::Class::Perl(ref x)) => { + AstKind::Class(ast::Class::Perl(ref x)) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +419,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - Ast::Class(ast::Class::Unicode(ref x)) => { + AstKind::Class(ast::Class::Unicode(ref x)) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - Ast::Class(ast::Class::Bracketed(ref ast)) => { + AstKind::Class(ast::Class::Bracketed(ref ast)) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -444,18 +444,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(expr)); } } - Ast::Repetition(ref x) => { + AstKind::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } - Ast::Group(ref x) => { + AstKind::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - Ast::Concat(_) => { + AstKind::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { @@ -465,7 +465,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } - Ast::Alternation(_) => { + AstKind::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); From 31b4398390e02767fd387c43ed53548413f53dcc Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 16:01:43 -0400 Subject: [PATCH 067/136] syntax: box each AstKind variant This does reduce memory, but not as much as it is reduced if we don't box the Ast. --- regex-syntax/src/ast/mod.rs | 149 ++++++++++++++++++----------- regex-syntax/src/ast/parse.rs | 152 +++++++++++++++--------------- regex-syntax/src/ast/print.rs | 14 +-- regex-syntax/src/ast/visitor.rs | 2 +- regex-syntax/src/hir/translate.rs | 38 ++++---- 5 files changed, 192 insertions(+), 163 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6a6b58237..c346abcb6 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -443,77 +443,92 @@ pub struct Ast(pub Box<AstKind>); #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum AstKind { /// An empty regex that matches everything. - Empty(Span), + Empty(Box<Span>), /// A set of flags, e.g., `(?is)`. - Flags(SetFlags), + Flags(Box<SetFlags>), /// A single character literal, which includes escape sequences. - Literal(Literal), + Literal(Box<Literal>), /// The "any character" class. - Dot(Span), + Dot(Box<Span>), /// A single zero-width assertion. - Assertion(Assertion), - /// A single character class. This includes all forms of character classes - /// except for `.`.
e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. - Class(Class), + Assertion(Box<Assertion>), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box<ClassUnicode>), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box<ClassPerl>), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box<ClassBracketed>), /// A repetition operator applied to an arbitrary regular expression. - Repetition(Repetition), + Repetition(Box<Repetition>), /// A grouped regular expression. - Group(Group), + Group(Box<Group>), /// An alternation of regular expressions. - Alternation(Alternation), + Alternation(Box<Alternation>), /// A concatenation of regular expressions. - Concat(Concat), + Concat(Box<Concat>), } impl Ast { /// Create an "empty" AST item. pub fn empty(span: Span) -> Ast { - Ast(Box::new(AstKind::Empty(span))) + Ast(Box::new(AstKind::Empty(Box::new(span)))) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { - Ast(Box::new(AstKind::Flags(e))) + Ast(Box::new(AstKind::Flags(Box::new(e)))) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { - Ast(Box::new(AstKind::Literal(e))) + Ast(Box::new(AstKind::Literal(Box::new(e)))) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { - Ast(Box::new(AstKind::Dot(span))) + Ast(Box::new(AstKind::Dot(Box::new(span)))) } /// Create an "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { - Ast(Box::new(AstKind::Assertion(e))) + Ast(Box::new(AstKind::Assertion(Box::new(e)))) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast(Box::new(AstKind::ClassUnicode(Box::new(e)))) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast(Box::new(AstKind::ClassPerl(Box::new(e)))) } - /// Create a "class" AST item. - pub fn class(e: Class) -> Ast { - Ast(Box::new(AstKind::Class(e))) + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast(Box::new(AstKind::ClassBracketed(Box::new(e)))) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { - Ast(Box::new(AstKind::Repetition(e))) + Ast(Box::new(AstKind::Repetition(Box::new(e)))) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { - Ast(Box::new(AstKind::Group(e))) + Ast(Box::new(AstKind::Group(Box::new(e)))) } /// Create an "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { - Ast(Box::new(AstKind::Alternation(e))) + Ast(Box::new(AstKind::Alternation(Box::new(e)))) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { - Ast(Box::new(AstKind::Concat(e))) + Ast(Box::new(AstKind::Concat(Box::new(e)))) } /// Return the span of this abstract syntax tree.
@@ -524,7 +539,9 @@ impl Ast { AstKind::Literal(ref x) => &x.span, AstKind::Dot(ref span) => span, AstKind::Assertion(ref x) => &x.span, - AstKind::Class(ref x) => x.span(), + AstKind::ClassUnicode(ref x) => &x.span, + AstKind::ClassPerl(ref x) => &x.span, + AstKind::ClassBracketed(ref x) => &x.span, AstKind::Repetition(ref x) => &x.span, AstKind::Group(ref x) => &x.span, AstKind::Alternation(ref x) => &x.span, @@ -548,8 +565,10 @@ impl Ast { | AstKind::Flags(_) | AstKind::Literal(_) | AstKind::Dot(_) - | AstKind::Assertion(_) => false, - AstKind::Class(_) + | AstKind::Assertion(_) + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) => false, + AstKind::ClassBracketed(_) | AstKind::Repetition(_) | AstKind::Group(_) | AstKind::Alternation(_) @@ -735,31 +754,6 @@ impl HexLiteralKind { } } -/// A single character class expression. -#[derive(Clone, Debug, Eq, PartialEq)] -#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Class { - /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. - Unicode(ClassUnicode), - /// A perl character class, e.g., `\d` or `\W`. - Perl(ClassPerl), - /// A bracketed character class set, which may contain zero or more - /// character ranges and/or zero or more nested classes. e.g., - /// `[a-zA-Z\pL]`. - Bracketed(ClassBracketed), -} - -impl Class { - /// Return the span of this character class. - pub fn span(&self) -> &Span { - match *self { - Class::Perl(ref x) => &x.span, - Class::Unicode(ref x) => &x.span, - Class::Bracketed(ref x) => &x.span, - } - } -} - /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] @@ -1610,8 +1604,10 @@ impl Drop for Ast { | AstKind::Literal(_) | AstKind::Dot(_) | AstKind::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | AstKind::Class(_) => return, + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | AstKind::ClassBracketed(_) => return, AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, AstKind::Group(ref x) if !x.ast.has_subexprs() => return, AstKind::Alternation(ref x) if x.asts.is_empty() => return, @@ -1629,8 +1625,11 @@ impl Drop for Ast { | AstKind::Literal(_) | AstKind::Dot(_) | AstKind::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | AstKind::Class(_) => {} + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | AstKind::ClassBracketed(_) => {} AstKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } @@ -1754,4 +1753,42 @@ mod tests { .join() .unwrap(); } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. 
+    #[test]
+    fn ast_size() {
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+
+        let max = core::mem::size_of::<usize>();
+        let size = core::mem::size_of::<Ast>();
+        assert!(
+            size <= max,
+            "Ast size of {} bytes is bigger than suggested max {}",
+            size,
+            max
+        );
+
+        let max = 2 * core::mem::size_of::<usize>();
+        let size = core::mem::size_of::<AstKind>();
+        assert!(
+            size <= max,
+            "AstKind size of {} bytes is bigger than suggested max {}",
+            size,
+            max
+        );
+    }
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index b3f04bfdc..a87be0e02 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -56,8 +56,8 @@ impl Primitive {
             Primitive::Literal(lit) => Ast::literal(lit),
             Primitive::Assertion(assert) => Ast::assertion(assert),
             Primitive::Dot(span) => Ast::dot(span),
-            Primitive::Perl(cls) => Ast::class(ast::Class::Perl(cls)),
-            Primitive::Unicode(cls) => Ast::class(ast::Class::Unicode(cls)),
+            Primitive::Perl(cls) => Ast::class_perl(cls),
+            Primitive::Unicode(cls) => Ast::class_unicode(cls),
         }
     }
 
@@ -850,7 +850,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
     fn pop_class(
         &self,
         nested_union: ast::ClassSetUnion,
-    ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
+    ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> {
         assert_eq!(self.char(), ']');
 
         let item = ast::ClassSet::Item(nested_union.into_item());
@@ -882,7 +882,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
             set.span.end = self.pos();
             set.kind = prevset;
             if stack.is_empty() {
-                Ok(Either::Right(ast::Class::Bracketed(set)))
+                Ok(Either::Right(set))
             } else {
                 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
                 Ok(Either::Left(union))
             }
@@ -976,7 +976,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 '|' => concat = self.push_alternate(concat)?,
                 '[' => {
                     let class = self.parse_set_class()?;
-                    concat.asts.push(Ast::class(class));
+                    concat.asts.push(Ast::class_bracketed(class));
                 }
                 '?' => {
                     concat = self.parse_uncounted_repetition(
@@ -1743,7 +1743,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
     /// is successful, then the parser is advanced to the position immediately
     /// following the closing `]`.
     #[inline(never)]
-    fn parse_set_class(&self) -> Result<ast::Class> {
+    fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
         assert_eq!(self.char(), '[');
 
         let mut union =
@@ -2189,12 +2189,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
         let span = match *ast.0 {
             AstKind::Empty(_)
             | AstKind::Flags(_)
             | AstKind::Literal(_)
             | AstKind::Dot(_)
             | AstKind::Assertion(_)
-            | AstKind::Class(ast::Class::Unicode(_))
-            | AstKind::Class(ast::Class::Perl(_)) => {
+            | AstKind::ClassUnicode(_)
+            | AstKind::ClassPerl(_) => {
                 // These are all base cases, so we don't increment depth.
                 return Ok(());
             }
-            AstKind::Class(ast::Class::Bracketed(ref x)) => &x.span,
+            AstKind::ClassBracketed(ref x) => &x.span,
             AstKind::Repetition(ref x) => &x.span,
             AstKind::Group(ref x) => &x.span,
             AstKind::Alternation(ref x) => &x.span,
             AstKind::Concat(ref x) => &x.span,
         };
         self.increment_depth(span)
     }
 
     fn visit_post(&mut self, ast: &Ast) -> Result<()> {
         match *ast.0 {
             AstKind::Empty(_)
             | AstKind::Flags(_)
             | AstKind::Literal(_)
             | AstKind::Dot(_)
             | AstKind::Assertion(_)
-            | AstKind::Class(ast::Class::Unicode(_))
-            | AstKind::Class(ast::Class::Perl(_)) => {
+            | AstKind::ClassUnicode(_)
+            | AstKind::ClassPerl(_) => {
                 // These are all base cases, so we don't decrement depth.
Ok(()) } - AstKind::Class(ast::Class::Bracketed(_)) + AstKind::ClassBracketed(_) | AstKind::Repetition(_) | AstKind::Group(_) | AstKind::Alternation(_) @@ -2606,7 +2606,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2616,7 +2616,7 @@ mod tests { c: 'a', } )), - }))) + })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), @@ -4965,15 +4965,15 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), - }))) + })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4981,11 +4981,11 @@ bar negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), - }))) + })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4993,11 +4993,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5005,11 +5005,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5017,20 +5017,20 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[a]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), - }))) + })) ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5044,11 +5044,11 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5063,44 +5063,44 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), - }))) + })) ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), - 
}))) + })) ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), - }))) + })) ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5108,11 +5108,11 @@ bar negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), - }))) + })) ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5120,11 +5120,11 @@ bar kind: ast::ClassPerlKind::Word, negated: false, })), - }))) + })) ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5139,20 +5139,20 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[a-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), - }))) + })) ); assert_eq!( parser("[a-cx-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5162,11 +5162,11 @@ bar range(span(4..7), 'x', 'z'), ] ), - }))) + })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5184,11 +5184,11 @@ bar ] ), ), - }))) + })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5206,11 +5206,11 @@ bar negated: false, })), ), - }))) + })) ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5222,11 +5222,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5238,11 +5238,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5254,11 +5254,11 @@ bar })), itemset(lit(span(5..6), '^')), ), - }))) + })) ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5270,11 +5270,11 @@ bar })), itemset(lit(span(5..6), '&')), ), - }))) + })) ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: 
span(0..6), negated: false, kind: intersection( @@ -5286,13 +5286,13 @@ bar ), itemset(empty(span(5..5))), ), - }))) + })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5308,20 +5308,20 @@ bar c: '⛄', }, })), - }))) + })) ); assert_eq!( parser(r"[]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), - }))) + })) ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5335,14 +5335,14 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ - Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5352,7 +5352,7 @@ bar c: '[', } )), - })), + }), Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, @@ -5917,11 +5917,11 @@ bar Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), - })), + }), Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, @@ -5935,11 +5935,11 @@ bar Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), - })), + }), Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, @@ -6017,22 +6017,22 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - }))) + })) ); assert_eq!( parser(r"\dz").parse(), Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::class(ast::Class::Perl(ast::ClassPerl { + Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - })), + }), Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index daf6776f2..10ee56c2c 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,27 +80,21 @@ impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast.0 { AstKind::Group(ref x) => self.fmt_group_pre(x), - AstKind::Class(ast::Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_pre(x) - } + AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - use crate::ast::Class; - match *ast.0 { AstKind::Empty(_) => Ok(()), AstKind::Flags(ref x) => self.fmt_set_flags(x), AstKind::Literal(ref x) => self.fmt_literal(x), AstKind::Dot(_) => self.wtr.write_str("."), AstKind::Assertion(ref x) => self.fmt_assertion(x), - AstKind::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - AstKind::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - 
AstKind::Class(Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_post(x) - } + AstKind::ClassPerl(ref x) => self.fmt_class_perl(x), + AstKind::ClassUnicode(ref x) => self.fmt_class_unicode(x), + AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), AstKind::Repetition(ref x) => self.fmt_repetition(x), AstKind::Group(ref x) => self.fmt_group_post(x), AstKind::Alternation(_) => Ok(()), diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 05fdac89c..2bd4b1956 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -264,7 +264,7 @@ impl<'a> HeapVisitor<'a> { visitor: &mut V, ) -> Result>, V::Err> { Ok(match *ast.0 { - AstKind::Class(ast::Class::Bracketed(ref x)) => { + AstKind::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 743218df4..ab3aa93d7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -337,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast.0 { - AstKind::Class(ast::Class::Bracketed(_)) => { + AstKind::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -386,29 +386,27 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Literal(ref x) => { - match self.ast_literal_to_scalar(x)? { - Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err(self - .error(x.span, ErrorKind::UnicodeNotAllowed)); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } + AstKind::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err( + self.error(x.span, ErrorKind::UnicodeNotAllowed) + ); + } + match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), } } - // self.push(HirFrame::Expr(self.hir_literal(x)?)); - } - AstKind::Dot(span) => { - self.push(HirFrame::Expr(self.hir_dot(span)?)); + }, + AstKind::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); } AstKind::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - AstKind::Class(ast::Class::Perl(ref x)) => { + AstKind::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - AstKind::Class(ast::Class::Unicode(ref x)) => { + AstKind::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - AstKind::Class(ast::Class::Bracketed(ref ast)) => { + AstKind::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( From 17d9c1c6c4368fb0a18f88d0698482063931a361 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 16:09:12 -0400 Subject: [PATCH 068/136] syntax: unbox Ast and remove AstKind The AstKind experiment proved unfruitful. 
I think the issue here is that the savings on Vec<Ast> didn't prove to be
enough to offset the extra heap allocation that resulted from the
indirection. This seems to be a sweet spot. It would be nice to get Ast
down below 16 bytes, but it's not clear how to do that (without much
larger changes that I don't feel inclined to pursue).

Fixes #1090
---
 fuzz/fuzz_targets/ast_roundtrip.rs |  21 ++--
 regex-cli/cmd/generate/fowler.rs   |   4 +-
 regex-syntax/src/ast/mod.rs        | 168 ++++++++++++-----------
 regex-syntax/src/ast/parse.rs      |  62 +++++------
 regex-syntax/src/ast/print.rs      |  34 +++---
 regex-syntax/src/ast/visitor.rs    |  18 ++--
 regex-syntax/src/hir/translate.rs  |  44 ++++----
 7 files changed, 161 insertions(+), 190 deletions(-)

diff --git a/fuzz/fuzz_targets/ast_roundtrip.rs b/fuzz/fuzz_targets/ast_roundtrip.rs
index 040b59d63..c35ac962e 100644
--- a/fuzz/fuzz_targets/ast_roundtrip.rs
+++ b/fuzz/fuzz_targets/ast_roundtrip.rs
@@ -3,7 +3,7 @@
 use {
     libfuzzer_sys::{fuzz_target, Corpus},
     regex_syntax::ast::{
-        parse::Parser, visit, Ast, Flag, Group, GroupKind, SetFlags, Visitor,
+        parse::Parser, visit, Ast, Flag, Flags, GroupKind, Visitor,
     },
 };
 
@@ -32,16 +32,17 @@ impl Visitor for VerboseVisitor {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<Self::Output, Self::Err> {
+        let reject_flags = |flags: &Flags| {
+            flags.flag_state(Flag::IgnoreWhitespace).unwrap_or(false)
+        };
         match ast {
-            Ast::Flags(SetFlags { flags, .. })
-            | Ast::Group(Group {
-                kind: GroupKind::NonCapturing(flags), ..
-            }) if flags
-                .flag_state(Flag::IgnoreWhitespace)
-                .unwrap_or(false) =>
-            {
-                Err(())
-            }
+            Ast::Flags(x) if reject_flags(&x.flags) => return Err(()),
+            Ast::Group(x) => match x.kind {
+                GroupKind::NonCapturing(ref flags) if reject_flags(flags) => {
+                    return Err(())
+                }
+                _ => Ok(()),
+            },
             _ => Ok(()),
         }
     }
diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs
index c0ab1b361..c287f6f52 100644
--- a/regex-cli/cmd/generate/fowler.rs
+++ b/regex-cli/cmd/generate/fowler.rs
@@ -404,7 +404,9 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize {
         | Ast::Literal(_)
         | Ast::Dot(_)
         | Ast::Assertion(_)
-        | Ast::Class(_) => 0,
+        | Ast::ClassUnicode(_)
+        | Ast::ClassPerl(_)
+        | Ast::ClassBracketed(_) => 0,
         Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast),
         Ast::Group(ref group) => {
             let this = if group.is_capturing() { 1 } else { 0 };
diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index c346abcb6..9e0f92606 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -429,19 +429,9 @@ pub struct Comment {
 ///
 /// This type defines its own destructor that uses constant stack space and
 /// heap space proportional to the size of the `Ast`.
-///
-/// This type boxes the actual kind of the AST element so that an `Ast` value
-/// itself has a very small size. This in turn makes things like `Vec<Ast>` use
-/// a lot less memory than it might otherwise, which is particularly beneficial
-/// for representing long concatenations or alternations.
-#[derive(Clone, Debug, Eq, PartialEq)]
-#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub struct Ast(pub Box<AstKind>);
-
-/// The kind of an abstract syntax element.
 #[derive(Clone, Debug, Eq, PartialEq)]
 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub enum AstKind {
+pub enum Ast {
     /// An empty regex that matches everything.
     Empty(Box<Span>),
     /// A set of flags, e.g., `(?is)`.
     Flags(Box<SetFlags>),
     /// A single character literal, which includes escape sequences.
     Literal(Box<Literal>),
     /// The "any character" class.
     Dot(Box<Span>),
     /// A single zero-width assertion.
     Assertion(Box<Assertion>),
@@ -473,86 +463,86 @@ impl Ast {
     /// Create an "empty" AST item.
pub fn empty(span: Span) -> Ast { - Ast(Box::new(AstKind::Empty(Box::new(span)))) + Ast::Empty(Box::new(span)) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { - Ast(Box::new(AstKind::Flags(Box::new(e)))) + Ast::Flags(Box::new(e)) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { - Ast(Box::new(AstKind::Literal(Box::new(e)))) + Ast::Literal(Box::new(e)) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { - Ast(Box::new(AstKind::Dot(Box::new(span)))) + Ast::Dot(Box::new(span)) } /// Create a "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { - Ast(Box::new(AstKind::Assertion(Box::new(e)))) + Ast::Assertion(Box::new(e)) } /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { - Ast(Box::new(AstKind::ClassUnicode(Box::new(e)))) + Ast::ClassUnicode(Box::new(e)) } /// Create a "Perl class" AST item. pub fn class_perl(e: ClassPerl) -> Ast { - Ast(Box::new(AstKind::ClassPerl(Box::new(e)))) + Ast::ClassPerl(Box::new(e)) } /// Create a "bracketed class" AST item. pub fn class_bracketed(e: ClassBracketed) -> Ast { - Ast(Box::new(AstKind::ClassBracketed(Box::new(e)))) + Ast::ClassBracketed(Box::new(e)) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { - Ast(Box::new(AstKind::Repetition(Box::new(e)))) + Ast::Repetition(Box::new(e)) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { - Ast(Box::new(AstKind::Group(Box::new(e)))) + Ast::Group(Box::new(e)) } /// Create a "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { - Ast(Box::new(AstKind::Alternation(Box::new(e)))) + Ast::Alternation(Box::new(e)) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { - Ast(Box::new(AstKind::Concat(Box::new(e)))) + Ast::Concat(Box::new(e)) } /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { - match *self.0 { - AstKind::Empty(ref span) => span, - AstKind::Flags(ref x) => &x.span, - AstKind::Literal(ref x) => &x.span, - AstKind::Dot(ref span) => span, - AstKind::Assertion(ref x) => &x.span, - AstKind::ClassUnicode(ref x) => &x.span, - AstKind::ClassPerl(ref x) => &x.span, - AstKind::ClassBracketed(ref x) => &x.span, - AstKind::Repetition(ref x) => &x.span, - AstKind::Group(ref x) => &x.span, - AstKind::Alternation(ref x) => &x.span, - AstKind::Concat(ref x) => &x.span, + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { - match *self.0 { - AstKind::Empty(_) => true, + match *self { + Ast::Empty(_) => true, _ => false, } } @@ -560,19 +550,19 @@ impl Ast { /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. 
fn has_subexprs(&self) -> bool { - match *self.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) => false, - AstKind::ClassBracketed(_) - | AstKind::Repetition(_) - | AstKind::Group(_) - | AstKind::Alternation(_) - | AstKind::Concat(_) => true, + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, } } } @@ -1598,20 +1588,20 @@ impl Drop for Ast { fn drop(&mut self) { use core::mem; - match *self.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) // Bracketed classes are recursive, they get their own Drop impl. - | AstKind::ClassBracketed(_) => return, - AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, - AstKind::Group(ref x) if !x.ast.has_subexprs() => return, - AstKind::Alternation(ref x) if x.asts.is_empty() => return, - AstKind::Concat(ref x) if x.asts.is_empty() => return, + | Ast::ClassBracketed(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} } @@ -1619,27 +1609,27 @@ impl Drop for Ast { let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { - match *ast.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) // Bracketed classes are recursive, so they get their own Drop // impl. - | AstKind::ClassBracketed(_) => {} - AstKind::Repetition(ref mut x) => { + | Ast::ClassBracketed(_) => {} + Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - AstKind::Group(ref mut x) => { + Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - AstKind::Alternation(ref mut x) => { + Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } - AstKind::Concat(ref mut x) => { + Ast::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } @@ -1760,20 +1750,7 @@ mod tests { // 64-bit target. Wow. 
     #[test]
     fn ast_size() {
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-
-        let max = core::mem::size_of::<usize>();
+        let max = 2 * core::mem::size_of::<usize>();
         let size = core::mem::size_of::<Ast>();
         assert!(
             size <= max,
             "Ast size of {} bytes is bigger than suggested max {}",
             size,
             max
         );
-
-        let max = 2 * core::mem::size_of::<usize>();
-        let size = core::mem::size_of::<AstKind>();
-        assert!(
-            size <= max,
-            "AstKind size of {} bytes is bigger than suggested max {}",
-            size,
-            max
-        );
     }
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index a87be0e02..f7bae7759 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -16,7 +16,7 @@ use alloc::{
     vec,
     vec::Vec,
 };
 
 use crate::{
-    ast::{self, Ast, AstKind, Position, Span},
+    ast::{self, Ast, Position, Span},
     either::Either,
     is_escapeable_character, is_meta_character,
 };
@@ -1044,8 +1044,8 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 )
             }
         };
-        match *ast.0 {
-            AstKind::Empty(_) | AstKind::Flags(_) => {
+        match ast {
+            Ast::Empty(_) | Ast::Flags(_) => {
                 return Err(
                     self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                 )
             }
@@ -1096,8 +1096,8 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 )
             }
         };
-        match *ast.0 {
-            AstKind::Empty(_) | AstKind::Flags(_) => {
+        match ast {
+            Ast::Empty(_) | Ast::Flags(_) => {
                 return Err(
                     self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                 )
            }
@@ -2183,43 +2183,43 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
-        let span = match *ast.0 {
-            AstKind::Empty(_)
-            | AstKind::Flags(_)
-            | AstKind::Literal(_)
-            | AstKind::Dot(_)
-            | AstKind::Assertion(_)
-            | AstKind::ClassUnicode(_)
-            | AstKind::ClassPerl(_) => {
+        let span = match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {
                 // These are all base cases, so we don't increment depth.
                 return Ok(());
             }
-            AstKind::ClassBracketed(ref x) => &x.span,
-            AstKind::Repetition(ref x) => &x.span,
-            AstKind::Group(ref x) => &x.span,
-            AstKind::Alternation(ref x) => &x.span,
-            AstKind::Concat(ref x) => &x.span,
+            Ast::ClassBracketed(ref x) => &x.span,
+            Ast::Repetition(ref x) => &x.span,
+            Ast::Group(ref x) => &x.span,
+            Ast::Alternation(ref x) => &x.span,
+            Ast::Concat(ref x) => &x.span,
         };
         self.increment_depth(span)
     }
 
     fn visit_post(&mut self, ast: &Ast) -> Result<()> {
-        match *ast.0 {
-            AstKind::Empty(_)
-            | AstKind::Flags(_)
-            | AstKind::Literal(_)
-            | AstKind::Dot(_)
-            | AstKind::Assertion(_)
-            | AstKind::ClassUnicode(_)
-            | AstKind::ClassPerl(_) => {
+        match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {
                 // These are all base cases, so we don't decrement depth.
Ok(()) } - AstKind::ClassBracketed(_) - | AstKind::Repetition(_) - | AstKind::Group(_) - | AstKind::Alternation(_) - | AstKind::Concat(_) => { + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => { self.decrement_depth(); Ok(()) } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 10ee56c2c..7dedf7f48 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -7,7 +7,7 @@ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, - Ast, AstKind, + Ast, }; /// A builder for constructing a printer. @@ -78,27 +78,27 @@ impl Visitor for Writer { } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { - match *ast.0 { - AstKind::Group(ref x) => self.fmt_group_pre(x), - AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - match *ast.0 { - AstKind::Empty(_) => Ok(()), - AstKind::Flags(ref x) => self.fmt_set_flags(x), - AstKind::Literal(ref x) => self.fmt_literal(x), - AstKind::Dot(_) => self.wtr.write_str("."), - AstKind::Assertion(ref x) => self.fmt_assertion(x), - AstKind::ClassPerl(ref x) => self.fmt_class_perl(x), - AstKind::ClassUnicode(ref x) => self.fmt_class_unicode(x), - AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), - AstKind::Repetition(ref x) => self.fmt_repetition(x), - AstKind::Group(ref x) => self.fmt_group_post(x), - AstKind::Alternation(_) => Ok(()), - AstKind::Concat(_) => Ok(()), + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 2bd4b1956..c1bb24d97 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,6 +1,6 @@ use alloc::{vec, vec::Vec}; -use crate::ast::{self, Ast, AstKind}; +use crate::ast::{self, Ast}; /// A trait for visiting an abstract syntax tree (AST) in depth first order. /// @@ -263,19 +263,19 @@ impl<'a> HeapVisitor<'a> { ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { - Ok(match *ast.0 { - AstKind::ClassBracketed(ref x) => { + Ok(match *ast { + Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } - AstKind::Repetition(ref x) => Some(Frame::Repetition(x)), - AstKind::Group(ref x) => Some(Frame::Group(x)), - AstKind::Concat(ref x) if x.asts.is_empty() => None, - AstKind::Concat(ref x) => { + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] 
}) } - AstKind::Alternation(ref x) if x.asts.is_empty() => None, - AstKind::Alternation(ref x) => Some(Frame::Alternation { + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index ab3aa93d7..56d261aa1 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -7,7 +7,7 @@ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ - ast::{self, Ast, AstKind, Span, Visitor}, + ast::{self, Ast, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, @@ -336,8 +336,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - match *ast.0 { - AstKind::ClassBracketed(_) => { + match *ast { + Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -346,20 +346,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } - AstKind::Repetition(_) => self.push(HirFrame::Repetition), - AstKind::Group(ref x) => { + Ast::Repetition(_) => self.push(HirFrame::Repetition), + Ast::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - AstKind::Concat(ref x) if x.asts.is_empty() => {} - AstKind::Concat(_) => { + Ast::Concat(ref x) if x.asts.is_empty() => {} + Ast::Concat(_) => { self.push(HirFrame::Concat); } - AstKind::Alternation(ref x) if x.asts.is_empty() => {} - AstKind::Alternation(_) => { + Ast::Alternation(ref x) if x.asts.is_empty() => {} + Ast::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } @@ -369,11 +369,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast.0 { - AstKind::Empty(_) => { + match *ast { + Ast::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Flags(ref x) => { + Ast::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in @@ -386,7 +386,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? 
{ Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { if !self.flags().unicode() && ch.len_utf8() > 1 { @@ -400,13 +400,13 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } } }, - AstKind::Dot(ref span) => { + Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); } - AstKind::Assertion(ref x) => { + Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - AstKind::ClassPerl(ref x) => { + Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -417,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - AstKind::ClassUnicode(ref x) => { + Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - AstKind::ClassBracketed(ref ast) => { + Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -442,18 +442,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(expr)); } } - AstKind::Repetition(ref x) => { + Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } - AstKind::Group(ref x) => { + Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - AstKind::Concat(_) => { + Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { @@ -463,7 +463,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } - AstKind::Alternation(_) => { + Ast::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); From 536cf701ade853afd2a7a541684485f24491be91 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:34:42 -0400 Subject: [PATCH 069/136] syntax: remove guarantees in the HIR related to 'u' flag Basically, we never should have guaranteed that a particular HIR would (or wouldn't) be used if the 'u' flag was present (or absent). Such a guarantee generally results in too little flexibility, particularly when it comes to HIR's smart constructors. We could probably uphold that guarantee, but it's somewhat gnarly to do and would require rejiggering some of the HIR types. For example, we would probably need a literal that is an enum of `&str` or `&[u8]` that correctly preserves the Unicode flag. This in turn comes with a bigger complexity cost in various rewriting rules. In general, it's much simpler to require the caller to be prepared for any kind of HIR regardless of what the flags are. I feel somewhat justified in this position due to the fact that part of the point of the HIR is to erase all of the regex flags so that callers no longer need to worry about them. That is, the erasure is the point that provides a simplification for everyone downstream. 
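To make the point concrete, here is a small sketch of what callers
should now be prepared for. (This example is not part of the patch
itself; it assumes the top-level `regex_syntax::parse` convenience
function.)

```
use regex_syntax::parse;

fn main() {
    // Unicode mode is disabled in the pattern, but the resulting HIR
    // may still use the Unicode class variant (e.g., something
    // equivalent to [a\u00A0]). No particular variant is guaranteed,
    // so downstream code must handle both.
    let hir = parse(r"(?-u:a|\xc2\xa0)").unwrap();
    println!("{:?}", hir);
}
```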
Closes #1088
---
 CHANGELOG.md                |  3 +++
 regex-syntax/src/hir/mod.rs | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a474af1b..5b88d9e80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@ TBD
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
 Fix a bug that could result in incorrect match spans when using a Unicode word
 boundary and searching non-ASCII strings.
+* [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088):
+Remove guarantees in the API that connect the `u` flag with a specific HIR
+representation.
 
 
 1.9.6 (2023-09-30)
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 6c1d2745e..f8a3d4a9e 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -797,13 +797,18 @@ impl core::fmt::Debug for Literal {
 /// The high-level intermediate representation of a character class.
 ///
 /// A character class corresponds to a set of characters. A character is either
-/// defined by a Unicode scalar value or a byte. Unicode characters are used
-/// by default, while bytes are used when Unicode mode (via the `u` flag) is
-/// disabled.
+/// defined by a Unicode scalar value or a byte.
 ///
 /// A character class, regardless of its character type, is represented by a
 /// sequence of non-overlapping non-adjacent ranges of characters.
 ///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variant is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
 /// Note that `Bytes` variant may be produced even when it exclusively matches
 /// valid UTF-8. This is because a `Bytes` variant represents an intention by
 /// the author of the regular expression to disable Unicode mode, which in turn
@@ -1326,8 +1331,9 @@ impl ClassUnicodeRange {
     }
 }
 
-/// A set of characters represented by arbitrary bytes (where one byte
-/// corresponds to one character).
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct ClassBytes {
     set: IntervalSet<ClassBytesRange>,

From 7a7ce8348f9cb74aac7c0a8f9836e76620ef035e Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 6 Oct 2023 11:39:14 -0400
Subject: [PATCH 070/136] automata: rejigger DFA start state computation

It turns out that requiring callers to provide an `Input` (and thus a
`&[u8]` haystack) is a bit onerous for all cases. Namely, part of the
point of `regex-automata` was to expose enough guts to make it
tractable to write a streaming regex engine. A streaming regex engine,
especially one that does a byte-at-a-time loop, is somewhat
antithetical to having a haystack in a single `&[u8]` slice. This made
computing start states possible but very awkward and quite unclear in
terms of what the implementation would actually do with the haystack.

This commit fixes that by exposing a lower level `start_state` method
on both of the DFAs that can be called without materializing an
`Input`. Instead, callers must create a new `start::Config` value which
provides all of the information necessary for the DFA to compute the
correct start state. This in turn also exposes the `crate::util::start`
module.
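For example, a byte-at-a-time streaming search might compute its start
state roughly like this. (This is a sketch against the new API rather
than code from this patch, and it assumes the `syntax` and `dfa-build`
features for `dense::DFA::new`.)

```
use regex_automata::{
    dfa::{dense, Automaton},
    util::start,
    Anchored,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new(r"(?m)^[a-z]+$")?;
    // An unanchored search where the byte immediately preceding the
    // current position is a line terminator. A streaming caller can
    // track this one byte of look-behind without keeping the whole
    // haystack in memory.
    let config = start::Config::new()
        .anchored(Anchored::No)
        .look_behind(Some(b'\n'));
    let sid = dfa.start_state(&config)?;
    assert!(!dfa.is_dead_state(sid));
    Ok(())
}
```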
This is ultimately a breaking change because it adds a new required method to the `Automaton` trait. It also makes `start_state_forward` and `start_state_reverse` optional. It isn't really expected for callers to implement the `Automaton` trait themselves (and perhaps I will seal it so we can do such changes in the future without it being breaking), but still, this is technically breaking. Callers using `start_state_forward` or `start_state_reverse` with either DFA remain unchanged and unaffected. Closes #1031 --- CHANGELOG.md | 7 + regex-automata/src/dfa/automaton.rs | 188 ++++++++++++++++++--- regex-automata/src/dfa/dense.rs | 95 +++++------ regex-automata/src/dfa/mod.rs | 2 +- regex-automata/src/dfa/sparse.rs | 59 +++---- regex-automata/src/hybrid/dfa.rs | 179 +++++++++++--------- regex-automata/src/hybrid/error.rs | 115 ++++++++++++- regex-automata/src/hybrid/mod.rs | 2 +- regex-automata/src/util/mod.rs | 2 +- regex-automata/src/util/start.rs | 243 ++++++++++++++++++++++++---- 10 files changed, 662 insertions(+), 230 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b88d9e80..265f5cd48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ TBD === +New features: + +* [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031): +DFAs now have a `start_state` method that doesn't use an `Input`. + +Bug fixes: + * [BUG #1046](https://github.com/rust-lang/regex/issues/1046): Fix a bug that could result in incorrect match spans when using a Unicode word boundary and searching non-ASCII strings. diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a15..cd597947e 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -7,6 +7,7 @@ use crate::{ prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, + start, }, }; @@ -226,8 +227,8 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this DFA for the given starting + /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -235,12 +236,41 @@ pub unsafe trait Automaton { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. 
+    /// Although, as a convenience, if you have an [`Input`], then it may
+    /// be more succinct to use [`Automaton::start_state_forward`] or
+    /// [`Automaton::start_state_reverse`]. Note, for example, that the
+    /// convenience routines return a [`MatchError`] on failure, whereas this
+    /// routine returns a [`StartError`].
+    ///
+    /// # Errors
+    ///
+    /// This may return a [`StartError`] if the search needs to give up when
+    /// determining the start state (for example, if it sees a "quit" byte).
+    /// This can also return an error if the given configuration contains an
+    /// unsupported [`Anchored`] configuration.
+    fn start_state(
+        &self,
+        config: &start::Config,
+    ) -> Result<StateID, StartError>;
+
+    /// Return the ID of the start state for this DFA when executing a forward
+    /// search.
+    ///
+    /// This is a convenience routine for calling [`Automaton::start_state`]
+    /// that converts the given [`Input`] to a [start
+    /// configuration](start::Config). Additionally, if an error occurs, it is
+    /// converted from a [`StartError`] to a [`MatchError`] using the offset
+    /// information in the given [`Input`].
     ///
     /// # Errors
     ///
@@ -251,23 +281,30 @@ pub unsafe trait Automaton {
     fn start_state_forward(
         &self,
         input: &Input<'_>,
-    ) -> Result<StateID, MatchError>;
+    ) -> Result<StateID, MatchError> {
+        let config = start::Config::from_input_forward(input);
+        self.start_state(&config).map_err(|err| match err {
+            StartError::Quit { byte } => {
+                let offset = input
+                    .start()
+                    .checked_sub(1)
+                    .expect("no quit in start without look-behind");
+                MatchError::quit(byte, offset)
+            }
+            StartError::UnsupportedAnchored { mode } => {
+                MatchError::unsupported_anchored(mode)
+            }
+        })
+    }
 
-    /// Return the ID of the start state for this lazy DFA when executing a
-    /// reverse search.
+    /// Return the ID of the start state for this DFA when executing a reverse
+    /// search.
     ///
-    /// Unlike typical DFA implementations, the start state for DFAs in this
-    /// crate is dependent on a few different factors:
-    ///
-    /// * The [`Anchored`] mode of the search. Unanchored, anchored and
-    /// anchored searches for a specific [`PatternID`] all use different start
-    /// states.
-    /// * The position at which the search begins, via [`Input::start`]. This
-    /// and the byte immediately preceding the start of the search (if one
-    /// exists) influence which look-behind assertions are true at the start
-    /// of the search. This in turn influences which start state is selected.
-    /// * Whether the search is a forward or reverse search. This routine can
-    /// only be used for reverse searches.
+    /// This is a convenience routine for calling [`Automaton::start_state`]
+    /// that converts the given [`Input`] to a [start
+    /// configuration](start::Config). Additionally, if an error occurs, it is
+    /// converted from a [`StartError`] to a [`MatchError`] using the offset
+    /// information in the given [`Input`].
/// /// # Errors /// @@ -278,7 +315,18 @@ pub unsafe trait Automaton { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result; + ) -> Result { + let config = start::Config::from_input_reverse(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that @@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).next_eoi_state(current) } + #[inline] + fn start_state( + &self, + config: &start::Config, + ) -> Result { + (**self).start_state(config) + } + #[inline] fn start_state_forward( &self, @@ -2015,6 +2071,90 @@ impl OverlappingState { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either based on +/// incorrect configuration or even based on whether the look-behind byte +/// triggers a quit state. Typically one does not need to handle this error +/// if you're using [`Automaton::start_state_forward`] (or its reverse +/// counterpart), as that routine automatically converts `StartError` to a +/// [`MatchError`] for you. +/// +/// This error may be returned by the [`Automaton::start_state`] routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError {} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. 
/// diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 6da865f97..7af38b546 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -30,7 +30,7 @@ use crate::{ use crate::{ dfa::{ accel::Accels, - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, @@ -40,8 +40,8 @@ use crate::{ int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -2885,31 +2885,33 @@ impl OwnedDFA { fn set_universal_starts(&mut self) { assert_eq!(6, Start::len(), "expected 6 start configurations"); - let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + let start_id = |dfa: &mut OwnedDFA, + anchored: Anchored, + start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. - dfa.st.start(inp, start).expect("valid Input configuration") + dfa.st.start(anchored, start).expect("valid Input configuration") }; if self.start_kind().has_unanchored() { - let inp = Input::new("").anchored(Anchored::No); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::No; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_unanchored = Some(sid); } } if self.start_kind().has_anchored() { - let inp = Input::new("").anchored(Anchored::Yes); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::Yes; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_anchored = Some(sid); } @@ -3216,35 +3218,21 @@ unsafe impl> Automaton for DFA { } #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - 
let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -4180,28 +4168,27 @@ impl> StartTable { #[cfg_attr(feature = "perf-inline", inline(always))] fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; @@ -5086,6 +5073,8 @@ impl core::fmt::Display for BuildError { #[cfg(all(test, feature = "syntax", feature = "dfa-build"))] mod tests { + use crate::{Input, MatchError}; + use super::*; #[test] diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index 4bb870435..fd58cac23 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -320,7 +320,7 @@ dramatically. 
#[cfg(feature = "dfa-search")] pub use crate::dfa::{ - automaton::{Automaton, OverlappingState}, + automaton::{Automaton, OverlappingState, StartError}, start::StartKind, }; diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 5d8ec2340..a5ccf9add 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -52,7 +52,7 @@ use alloc::{vec, vec::Vec}; use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, @@ -63,8 +63,8 @@ use crate::{ int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -1207,35 +1207,21 @@ unsafe impl> Automaton for DFA { } #[inline] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[inline] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[inline] @@ -2145,28 +2131,27 @@ impl> StartTable { /// panics. 
fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result<StateID, MatchError> { + ) -> Result<StateID, StartError> { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 67261c1a3..102cfb6fe 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -13,7 +13,7 @@ use alloc::vec::Vec; use crate::{ hybrid::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::{LazyStateID, LazyStateIDError}, search, }, @@ -28,7 +28,7 @@ use crate::{ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, }, sparse_set::SparseSets, - start::{Start, StartByteMap}, + start::{self, Start, StartByteMap}, }, }; @@ -1518,8 +1518,8 @@ impl DFA { Lazy::new(self, cache).cache_next_state(current, unit) } - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this lazy DFA for the given + /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches, + /// although, as a convenience, if you have an [`Input`], it + /// may be more succinct to use [`DFA::start_state_forward`] or + /// [`DFA::start_state_reverse`]. Note, for example, that the convenience + /// routines return a [`MatchError`] on failure, whereas this routine + /// returns a [`StartError`]. + /// + /// # Errors + /// + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte + /// or if the cache has become inefficient). 
This can also return an + /// error if the given configuration contains an unsupported [`Anchored`] + /// configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state( + &self, + cache: &mut Cache, + config: &start::Config, + ) -> Result { + let lazy = LazyRef::new(self, cache); + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.start_map.get(byte) + } + }; + let start_id = lazy.get_cached_start_id(anchored, start)?; + if !start_id.is_unknown() { + return Ok(start_id); + } + Lazy::new(self, cache).cache_start_group(anchored, start) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_forward(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.start()), + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. 
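To make the split between `StartError` and `MatchError` concrete, here is a minimal usage sketch of driving the lazy DFA's new `start_state` entry point directly with a `start::Config` (an editorial illustration, not code from this patch; it assumes the `syntax` feature for `DFA::new`, and the pattern and handling logic are made up):

    use regex_automata::{
        hybrid::{dfa::DFA, StartError},
        util::start,
        Anchored,
    };

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let dfa = DFA::new(r"[a-z]+")?;
        let mut cache = dfa.create_cache();
        // No look-behind byte is set, so the search is treated as
        // logically beginning at the start of a haystack.
        let config = start::Config::new().anchored(Anchored::Yes);
        match dfa.start_state(&mut cache, &config) {
            // A usable start state: begin stepping the DFA from here.
            Ok(sid) => assert!(!sid.is_unknown()),
            // The look-behind byte was in the DFA's quit set.
            Err(StartError::Quit { byte }) => println!("quit byte: {:?}", byte),
            // The cache was too inefficient to make progress.
            Err(StartError::Cache { .. }) => println!("cache gave up"),
            // E.g., an unsupported `Anchored` mode. The enum is
            // non-exhaustive, so a catch-all arm is required.
            Err(err) => println!("start state failed: {}", err),
        }
        Ok(())
    }

The convenience wrappers shown above perform exactly this kind of match themselves, converting each `StartError` variant into a `MatchError` with the offsets taken from the `Input`.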
This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2122,16 +2159,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2178,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2610,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. 
fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2678,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result<LazyStateID, MatchError> { + ) -> Result<LazyStateID, StartError> { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index 604daf3c3..d134e7ec9 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons: the configuration +/// may be incorrect, or the look-behind byte may trigger a quit state. +/// Typically you do not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache efficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. 
+ mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/regex-automata/src/hybrid/mod.rs b/regex-automata/src/hybrid/mod.rs index 44e67e129..2feb839d1 100644 --- a/regex-automata/src/hybrid/mod.rs +++ b/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index bb739df1d..b3eef64e6 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -40,6 +40,7 @@ pub mod look; pub mod pool; pub mod prefilter; pub mod primitives; +pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; @@ -52,6 +53,5 @@ pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; -pub(crate) mod start; pub(crate) mod unicode_data; pub(crate) mod utf8; diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083..f2d1922c9 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -1,17 +1,195 @@ /*! -Provides some helpers for dealing with start state configurations in DFAs. - -[`Start`] represents the possible starting configurations, while -[`StartByteMap`] represents a way to retrieve the `Start` configuration for a -given position in a haystack. +Provides helpers for dealing with start state configurations in DFAs. 
*/ use crate::util::{ look::LookMatcher, - search::Input, + search::{Anchored, Input}, wire::{self, DeserializeError, SerializeError}, }; +/// The configuration used to determine a DFA's start state for a search. +/// +/// A DFA has a single starting state in the typical textbook description. That +/// is, it corresponds to the set of all starting states for the NFA that built +/// it, along with their epsilon closures. In this crate, however, DFAs have +/// many possible start states due to a few factors: +/// +/// * DFAs support the ability to run either anchored or unanchored searches. +/// Each type of search needs its own start state. For example, an unanchored +/// search requires starting at a state corresponding to a regex with a +/// `(?s-u:.)*?` prefix, which will match through anything. +/// * DFAs also optionally support starting an anchored search for any one +/// specific pattern. Each such pattern requires its own start state. +/// * If a look-behind assertion like `^` or `\b` is used in the regex, then +/// the DFA will need to inspect a single byte immediately before the start of +/// the search to choose the correct start state. +/// +/// Indeed, this configuration precisely encapsulates all of the above factors. +/// The [`Config::anchored`] method sets which kind of anchored search to +/// perform while the [`Config::look_behind`] method provides a way to set +/// the byte that occurs immediately before the start of the search. +/// +/// Generally speaking, this type is only useful when you want to run searches +/// without using an [`Input`](crate::Input). In particular, an `Input` wants a +/// haystack slice, but callers may not have a contiguous sequence of bytes as +/// a haystack in all cases. This type provides a lower level of control such +/// that callers can provide their own anchored configuration and look-behind +/// byte explicitly. +/// +/// # Example +/// +/// This shows basic usage that permits running a search with a DFA without +/// using the `Input` abstraction. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter() { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This example shows how to correctly run a search that doesn't begin at +/// the start of a haystack. Notice how we set the look-behind byte, and as +/// a result, the `\b` assertion does not match. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new() +/// .anchored(Anchored::Yes) +/// .look_behind(Some(b'q')); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // No match! +/// assert!(!dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// If we had instead not set a look-behind byte, then the DFA would assume +/// that it was starting at the beginning of the haystack, and thus `\b` should +/// match. 
This in turn would result in erroneously reporting a match: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// // Whoops, forgot the look-behind byte... +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // And now we get a match unexpectedly. +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + look_behind: Option, + anchored: Anchored, +} + +impl Config { + /// Create a new default start configuration. + /// + /// The default is an unanchored search that starts at the beginning of the + /// haystack. + pub fn new() -> Config { + Config { anchored: Anchored::No, look_behind: None } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a forward search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// preceding the start of the search. If the start of the search is at + /// offset `0`, then no look-behind byte is set. + pub fn from_input_forward(input: &Input<'_>) -> Config { + let look_behind = input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i).copied()); + Config { look_behind, anchored: input.get_anchored() } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a reverse search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// following the end of the search. If the end of the search is at + /// offset `haystack.len()`, then no look-behind byte is set. + pub fn from_input_reverse(input: &Input<'_>) -> Config { + let look_behind = input.haystack().get(input.end()).copied(); + Config { look_behind, anchored: input.get_anchored() } + } + + /// Set the look-behind byte at the start of a search. + /// + /// Unless the search is intended to logically start at the beginning of a + /// haystack, this should _always_ be set to the byte immediately preceding + /// the start of the search. If no look-behind byte is set, then the start + /// configuration will assume it is at the beginning of the haystack. For + /// example, the anchor `^` will match. + /// + /// The default is that no look-behind byte is set. + pub fn look_behind(mut self, byte: Option) -> Config { + self.look_behind = byte; + self + } + + /// Set the anchored mode of a search. + /// + /// The default is an unanchored search. + pub fn anchored(mut self, mode: Anchored) -> Config { + self.anchored = mode; + self + } + + /// Return the look-behind byte in this configuration, if one exists. + pub fn get_look_behind(&self) -> Option { + self.look_behind + } + + /// Return the anchored mode in this configuration. + pub fn get_anchored(&self) -> Anchored { + self.anchored + } +} + /// A map from every possible byte value to its corresponding starting /// configuration. /// @@ -71,30 +249,11 @@ impl StartByteMap { StartByteMap { map } } - /// Return the forward starting configuration for the given `input`. 
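As a quick illustration of the two convenience constructors (a hedged editorial sketch, not part of the patch), `Config::from_input_forward` derives the look-behind byte from the byte just before the search start, and omits it when the search starts at offset `0`:

    use regex_automata::{util::start, Anchored, Input};

    let haystack = "quartz";

    // A forward search over haystack[1..]: the look-behind byte is the
    // byte immediately preceding the start of the search, i.e. b'q'.
    let input = Input::new(haystack).range(1..);
    let config = start::Config::from_input_forward(&input);
    assert_eq!(Some(b'q'), config.get_look_behind());
    assert_eq!(Anchored::No, config.get_anchored());

    // A forward search from offset 0 has no look-behind byte, so
    // assertions like `^` will match.
    let config = start::Config::from_input_forward(&Input::new(haystack));
    assert_eq!(None, config.get_look_behind());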
- #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); From 201e055ef31760cb70893a0faa93a0941fd49c25 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:52:10 -0400 Subject: [PATCH 071/136] automata: fix doc links --- regex-automata/src/dfa/dense.rs | 8 ++-- regex-automata/src/dfa/regex.rs | 2 +- regex-automata/src/dfa/sparse.rs | 75 +++++++++++++----------------- regex-automata/src/hybrid/dfa.rs | 16 +++---- regex-automata/src/hybrid/regex.rs | 2 +- regex-automata/src/util/start.rs | 6 +-- 6 files changed, 50 insertions(+), 59 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 7af38b546..25dcac989 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -66,8 +66,9 @@ const VERSION: u32 = 2; /// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if -/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by -/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. 
+/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is +/// not by default) and an [`Anchored::Pattern`] mode is requested via +/// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { @@ -113,8 +114,7 @@ impl Config { /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. /// - /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for - /// an example. + /// See [`Automaton::accelerator`] for an example. /// /// This is enabled by default. pub fn accelerate(mut self, yes: bool) -> Config { diff --git a/regex-automata/src/dfa/regex.rs b/regex-automata/src/dfa/regex.rs index f39c1c055..5e7e6e38a 100644 --- a/regex-automata/src/dfa/regex.rs +++ b/regex-automata/src/dfa/regex.rs @@ -853,7 +853,7 @@ impl Builder { } /// Set the dense DFA compilation configuration for this builder using - /// [`dense::Config`](dense::Config). + /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index a5ccf9add..7862d48a2 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). -Unlike the [`dense`](super::dense) module, this module does not contain a -builder or configuration specific for sparse DFAs. Instead, the intended -way to build a sparse DFA is either by using a default configuration with -its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the -construction of a dense DFA with [`dense::Builder`](super::dense::Builder) -and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For -example, this configures a sparse DFA to do an overlapping search: +Unlike the [`dense`] module, this module does not contain a builder or +configuration specific for sparse DFAs. Instead, the intended way to build a +sparse DFA is either by using a default configuration with its constructor +[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a +dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. +For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ @@ -74,18 +73,17 @@ const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// -/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses -/// a more space efficient representation for its transitions. Consequently, -/// sparse DFAs may use much less memory than dense DFAs, but this comes at a -/// price. In particular, reading the more space efficient transitions takes -/// more work, and consequently, searching using a sparse DFA is typically -/// slower than a dense DFA. +/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient +/// representation for its transitions. Consequently, sparse DFAs may use much +/// less memory than dense DFAs, but this comes at a price. In particular, +/// reading the more space efficient transitions takes more work, and +/// consequently, searching using a sparse DFA is typically slower than a dense +/// DFA. /// /// A sparse DFA can be built using the default configuration via the -/// [`DFA::new`] constructor. 
Otherwise, one can configure various aspects -/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), -/// and then convert a dense DFA to a sparse DFA using -/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a +/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse +/// DFA using [`dense::DFA::to_sparse`]. /// /// In general, a sparse DFA supports all the same search operations as a dense /// DFA. @@ -140,11 +138,9 @@ impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -167,11 +163,9 @@ impl DFA> { /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -511,10 +505,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -553,10 +546,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -595,10 +587,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. 
/// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the /// endianness of the target on which you're compiling the DFA. @@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> { /// /// If any of the above are not true, then an error will be returned. /// - /// Note that unlike deserializing a - /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has - /// no alignment requirements. That is, an alignment of `1` is valid. + /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse + /// DFA has no alignment requirements. That is, an alignment of `1` is + /// valid. /// /// # Panics /// diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 102cfb6fe..9466e1e76 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3213,12 +3213,12 @@ impl Config { /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// - /// When enabling this option, callers _must_ be prepared to handle - /// a [`MatchError`](crate::MatchError) error during search. - /// When using a [`Regex`](crate::hybrid::regex::Regex), this - /// corresponds to using the `try_` suite of methods. Alternatively, - /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::quit`] error will never be returned while searching. + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. Alternatively, if callers can guarantee that + /// their input is ASCII only, then a [`MatchError::quit`] error will never + /// be returned while searching. /// /// This is disabled by default. /// @@ -3304,8 +3304,8 @@ impl Config { /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// - /// When enabling this option, callers _must_ be prepared to handle a - /// [`MatchError`](crate::MatchError) error during search. When using a + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// diff --git a/regex-automata/src/hybrid/regex.rs b/regex-automata/src/hybrid/regex.rs index 75667daf9..b3b1fe317 100644 --- a/regex-automata/src/hybrid/regex.rs +++ b/regex-automata/src/hybrid/regex.rs @@ -878,7 +878,7 @@ impl Builder { } /// Set the lazy DFA compilation configuration for this builder using - /// [`dfa::Config`](dfa::Config). + /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or how the cache should behave. diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index f2d1922c9..27153780e 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -31,9 +31,9 @@ use crate::util::{ /// the byte that occurs immediately before the start of the search. /// /// Generally speaking, this type is only useful when you want to run searches -/// without using an [`Input`](crate::Input). In particular, an `Input` wants a -/// haystack slice, but callers may not have a contiguous sequence of bytes as -/// a haystack in all cases. This type provides a lower level of control such +/// without using an [`Input`]. 
In particular, an `Input` wants a haystack +/// slice, but callers may not have a contiguous sequence of bytes as a +/// haystack in all cases. This type provides a lower level of control such /// that callers can provide their own anchored configuration and look-behind /// byte explicitly. /// From 1c0bf9411b1ae1fc247d87dcb210eb374a014b5c Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Sun, 23 Jul 2023 21:33:41 +0800 Subject: [PATCH 072/136] automata: fix one outdated regex-cli test command Ref #1053 --- regex-automata/src/dfa/dense.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 25dcac989..28b525eb7 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1233,8 +1233,8 @@ impl Builder { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match dense --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quitset.is_empty() { set.add_set(&quitset); } From 9a4e2281a193a47cc396fdff8b813a76a3ed3873 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 12:38:50 -0400 Subject: [PATCH 073/136] automata: fix more out-dated regex-cli commands That should cover all of them. Closes #1053 --- regex-automata/src/dfa/accel.rs | 13 +++++++------ regex-automata/src/dfa/automaton.rs | 2 +- regex-automata/src/dfa/dense.rs | 5 +++-- regex-automata/src/hybrid/dfa.rs | 10 ++++++---- regex-automata/src/hybrid/search.rs | 10 +++++----- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/map.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 6 ++---- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-automata/src/util/look.rs | 4 +++- 10 files changed, 30 insertions(+), 26 deletions(-) diff --git a/regex-automata/src/dfa/accel.rs b/regex-automata/src/dfa/accel.rs index 5ea2423dd..c0ba18ea8 100644 --- a/regex-automata/src/dfa/accel.rs +++ b/regex-automata/src/dfa/accel.rs @@ -6,15 +6,16 @@ // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its // DFA with regex-cli: // -// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC -// dense::DFA( +// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table // D 000000: // Q 000001: // *000002: -// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 -// >000004: \x00-` => 3, a => 4, b-\xFF => 3 -// 000005: \x00-\xFF => 2, EOI => 2 -// ) +// A 000003: \x00-` => 3, a => 8, b-\xFF => 3 +// A 000004: \x00-` => 4, a => 7, b-\xFF => 4 +// 000005: \x00-` => 4, b-\xFF => 4 +// 000006: \x00-` => 3, a => 6, b-\xFF => 3 +// 000007: \x00-\xFF => 2, EOI => 2 +// 000008: \x00-\xFF => 2, EOI => 2 // // In particular, state 3 is accelerated (shown via the 'A' indicator) since // the only way to leave that state once entered is to see an 'a' byte. If diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index cd597947e..fcfcf2997 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -1132,7 +1132,7 @@ pub unsafe trait Automaton { /// // implementation defined. /// // /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. - /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`. 
/// let id = StateID::new(3 * dfa.stride()).unwrap(); /// let accelerator = dfa.accelerator(id); /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 28b525eb7..c9fe3b381 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1228,8 +1228,9 @@ impl Builder { } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other - // bytes. Otherwise, a non-quit byte may end up in the same class - // as a quit byte, and thus cause the DFA stop when it shouldn't. + // bytes. Otherwise, a non-quit byte may end up in the same + // class as a quit byte, and thus cause the DFA to stop when it + // shouldn't. // // Test case: // diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 9466e1e76..bd9179b19 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -2103,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -3830,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index f23283685..1f4a505db 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 065e9ef27..a188017d8 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1466,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. 
Here's an example to try: // - // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c36ce5386..c92d4c0b8 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 2108fa338..1f57f8ebd 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f..75c9b796b 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index a34ea1d75..81b4eb718 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -1024,7 +1024,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError { // There are perhaps other choices as well. Why did I stop at these 4? Because // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA // approach eventually, as the benefits of the DFA approach are somewhat -// compelling. The 'boundary-words-holmes' benchmark tests this: +// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that +// the commands below no longer work. If necessary, we should re-capitulate +// the benchmark from whole cloth in rebar.) 
// // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv // From 6d2b09ed6fbc136cca007ce0c57ec9cbae16f3b4 Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Fri, 21 Jul 2023 20:32:37 +0800 Subject: [PATCH 074/136] syntax: optimize most of the IntervalSet routines This reduces or eliminates allocation when combining Unicode classes and should make some things faster. It's unlikely for these optimizations to matter much in practice, but they are likely to help in niche or pathological cases where there are a lot of ops in a class. Closes #1051 --- regex-syntax/src/hir/interval.rs | 282 ++++++++++++++++++++----------- 1 file changed, 185 insertions(+), 97 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e063390a8..e3051bf31 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -19,7 +19,7 @@ use crate::unicode; // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. -// In many cases, we do use linear extra memory, but it is at most 2x and it +// In some cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite @@ -81,14 +81,45 @@ impl IntervalSet { /// Add a new interval to this set. pub fn push(&mut self, interval: I) { - // TODO: This could be faster. e.g., Push the interval such that - // it preserves canonicalization. - self.ranges.push(interval); - self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; + + if self.ranges.is_empty() { + self.ranges.push(interval); + return; + } + + // Find the first range that is not greater than the new interval. + // This is the first range that could possibly be unioned with the + // new interval. + let mut drain_end = self.ranges.len(); + while drain_end > 0 + && self.ranges[drain_end - 1].lower() > interval.upper() + && !self.ranges[drain_end - 1].is_contiguous(&interval) + { + drain_end -= 1; + } + + // Try to union the new interval with old intervals backwards. + if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval) + { + self.ranges[drain_end - 1] = + self.ranges[drain_end - 1].union(&interval).unwrap(); + for i in (0..drain_end - 1).rev() { + if let Some(union) = + self.ranges[drain_end - 1].union(&self.ranges[i]) + { + self.ranges[drain_end - 1] = union; + } else { + self.ranges.drain(i + 1..drain_end - 1); + break; + } + } + } else { + self.ranges.insert(drain_end, interval); + } } /// Return an iterator over all intervals in this set. @@ -192,34 +223,13 @@ impl IntervalSet { // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // - // The following is basically my Shitty First Draft. Therefore, in - // order to grok it, you probably need to read each line carefully. - // Simplifications are most welcome! - // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. 
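The observable effect of the new `push` is that overlapping and adjacent intervals are merged eagerly, keeping the set canonical without a separate sort-and-merge pass. A hedged sketch of that invariant through `regex-syntax`'s public `ClassUnicode` wrapper, which is backed by `IntervalSet` (illustrative values, not a test from this patch):

    use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange};

    // Overlapping and adjacent pushes are unioned as they are inserted,
    // so the three ranges below collapse to a single canonical range.
    let mut class = ClassUnicode::empty();
    class.push(ClassUnicodeRange::new('a', 'c'));
    class.push(ClassUnicodeRange::new('b', 'e')); // overlaps with a-c
    class.push(ClassUnicodeRange::new('f', 'g')); // adjacent to a-e
    assert_eq!(class.ranges(), &[ClassUnicodeRange::new('a', 'g')]);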
let drain_end = self.ranges.len(); - let (mut a, mut b) = (0, 0); - 'LOOP: while a < drain_end && b < other.ranges.len() { - // Basically, the easy cases are when neither range overlaps with - // each other. If the `b` range is less than our current `a` - // range, then we can skip it and move on. - if other.ranges[b].upper() < self.ranges[a].lower() { - b += 1; - continue; - } - // ... similarly for the `a` range. If it's less than the smallest - // `b` range, then we can add it as-is. - if self.ranges[a].upper() < other.ranges[b].lower() { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; - continue; - } - // Otherwise, we have overlapping ranges. - assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + let mut b = 0; + for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could // yield two ranges and 2) after subtracting a range, it's possible // that future ranges can have an impact. The loop below advances // the `b` ranges until they can't possibly impact the current // range. // // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - let mut range = self.ranges[a]; + self.ranges.push(self.ranges[a]); + // The `b` range can only apply to the current `a` range when it + // does not begin past the end of `a`. while b < other.ranges.len() - && !range.is_intersection_empty(&other.ranges[b]) + && other.ranges[b].lower() <= self.ranges[a].upper() { - let old_range = range; - range = match range.difference(&other.ranges[b]) { - (None, None) => { - // We lost the entire range, so move on to the next - // without adding this one. - a += 1; - continue 'LOOP; + match self.ranges.pop().unwrap().difference(&other.ranges[b]) { + (Some(range1), None) | (None, Some(range1)) => { + self.ranges.push(range1); } - (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - range2 + self.ranges.push(range2); } - }; - // It's possible that the `b` range has more to contribute - // here. In particular, if it is greater than the original - // range, then it might impact the next `a` range *and* it - // has impacted the current `a` range as much as possible, - // so we can quit. We don't bump `b` so that the next `a` - // range can apply it. - if other.ranges[b].upper() > old_range.upper() { - break; + (None, None) => {} } - // Otherwise, the next `b` range might apply to the current + // The next `b` range might apply to the current // `a` range. b += 1; } - self.ranges.push(range); - a += 1; - } - while a < drain_end { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; + // It's possible that the last `b` range has more to + // contribute to the next `a`. We don't bump the last + // `b` so that the next `a` range can apply it. + b = b.saturating_sub(1); } self.ranges.drain(..drain_end); - self.folded = self.folded && other.folded; + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Compute the symmetric difference of the two sets, in place. @@ -282,11 +279,83 @@ impl<I: Interval> IntervalSet<I> { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) { 
- let mut intersection = self.clone(); - intersection.intersect(other); - self.union(other); - self.difference(&intersection); + if self.ranges.is_empty() { + self.ranges.extend(&other.ranges); + self.folded = other.folded; + return; + } + if other.ranges.is_empty() { + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the symmetric difference to the end of this range, and then drain + // it before we're done. + let drain_end = self.ranges.len(); + let mut b = 0; + let mut b_range = Some(other.ranges[b]); + for a in 0..drain_end { + self.ranges.push(self.ranges[a]); + while b_range + .map_or(false, |r| r.lower() <= self.ranges[a].upper()) + { + let (range1, range2) = match self + .ranges + .pop() + .unwrap() + .symmetric_difference(&b_range.as_ref().unwrap()) + { + (Some(range1), None) | (None, Some(range1)) => { + (Some(range1), None) + } + (Some(range1), Some(range2)) => { + (Some(range1), Some(range2)) + } + (None, None) => (None, None), + }; + if let Some(range) = range1 { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + } + if let Some(range) = range2 { + self.ranges.push(range); + } + + b_range = if self.ranges.len() > drain_end + && self.ranges.last().unwrap().upper() + > self.ranges[a].upper() + { + Some(*self.ranges.last().unwrap()) + } else { + b += 1; + other.ranges.get(b).cloned() + }; + } + } + while let Some(range) = b_range { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + b += 1; + b_range = other.ranges.get(b).cloned(); + } + + self.ranges.drain(..drain_end); + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Negate this interval set. @@ -302,28 +371,44 @@ impl IntervalSet { return; } - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the negation to the end of this range, and then drain it before - // we're done. - let drain_end = self.ranges.len(); - // We do checked arithmetic below because of the canonical ordering // invariant. 
         if self.ranges[0].lower() > I::Bound::min_value() {
-            let upper = self.ranges[0].lower().decrement();
-            self.ranges.push(I::create(I::Bound::min_value(), upper));
-        }
-        for i in 1..drain_end {
-            let lower = self.ranges[i - 1].upper().increment();
-            let upper = self.ranges[i].lower().decrement();
-            self.ranges.push(I::create(lower, upper));
-        }
-        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
-            let lower = self.ranges[drain_end - 1].upper().increment();
-            self.ranges.push(I::create(lower, I::Bound::max_value()));
+            let mut pre_upper = self.ranges[0].upper();
+            self.ranges[0] = I::create(
+                I::Bound::min_value(),
+                self.ranges[0].lower().decrement(),
+            );
+            for i in 1..self.ranges.len() {
+                let lower = pre_upper.increment();
+                pre_upper = self.ranges[i].upper();
+                self.ranges[i] =
+                    I::create(lower, self.ranges[i].lower().decrement());
+            }
+            if pre_upper < I::Bound::max_value() {
+                self.ranges.push(I::create(
+                    pre_upper.increment(),
+                    I::Bound::max_value(),
+                ));
+            }
+        } else {
+            for i in 1..self.ranges.len() {
+                self.ranges[i - 1] = I::create(
+                    self.ranges[i - 1].upper().increment(),
+                    self.ranges[i].lower().decrement(),
+                );
+            }
+            if self.ranges.last().unwrap().upper() < I::Bound::max_value() {
+                self.ranges.last_mut().map(|range| {
+                    *range = I::create(
+                        range.upper().increment(),
+                        I::Bound::max_value(),
+                    )
+                });
+            } else {
+                self.ranges.pop();
+            }
         }
-        self.ranges.drain(..drain_end);

         // We don't need to update whether this set is folded or not, because
         // it is conservatively preserved through negation. Namely, if a set
         // is not folded, then it is possible that its negation is folded, for
@@ -337,6 +422,7 @@
         // of case folded characters. Negating it in turn means that all
         // equivalence classes in the set are negated, and any equivalence
         // class that was previously not in the set is now entirely in the set.
+        self.folded = self.ranges.is_empty() || self.folded;
     }

     /// Converts this set into a canonical ordering.
@@ -347,24 +433,20 @@
         self.ranges.sort();
         assert!(!self.ranges.is_empty());

-        // Is there a way to do this in-place with constant memory? I couldn't
-        // figure out a way to do it. So just append the canonicalization to
-        // the end of this range, and then drain it before we're done.
-        let drain_end = self.ranges.len();
-        for oldi in 0..drain_end {
-            // If we've added at least one new range, then check if we can
-            // merge this range in the previously added range.
-            if self.ranges.len() > drain_end {
-                let (last, rest) = self.ranges.split_last_mut().unwrap();
-                if let Some(union) = last.union(&rest[oldi]) {
-                    *last = union;
-                    continue;
-                }
+        // We maintain the canonicalization results in-place at `0..newi`.
+        // `newi` will keep track of the end of the canonicalized ranges.
+        let mut newi = 0;
+        for oldi in 1..self.ranges.len() {
+            // The last new range gets merged with the current old range
+            // when they are unionable. If not, we update `newi` and store
+            // it as a new range.
+            if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) {
+                self.ranges[newi] = union;
+            } else {
+                newi += 1;
+                self.ranges[newi] = self.ranges[oldi];
             }
-            let range = self.ranges[oldi];
-            self.ranges.push(range);
         }
-        self.ranges.drain(..drain_end);
+        self.ranges.truncate(newi + 1);
     }

     /// Returns true if and only if this class is in a canonical ordering.
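The rewritten `canonicalize` above is the clearest instance of the new
approach: rather than appending merged ranges to the tail of the vector
and draining the prefix afterwards, it keeps the merged results at
`0..=newi` and truncates once at the end. A minimal standalone sketch of
the same merge-in-place idea, assuming a simplified inclusive `(u8, u8)`
range type instead of the crate's actual `Interval` trait:

    // Sketch only: merges a list of inclusive ranges in place,
    // unioning ranges that overlap or are adjacent (contiguous).
    fn canonicalize(ranges: &mut Vec<(u8, u8)>) {
        ranges.sort();
        if ranges.is_empty() {
            return;
        }
        // Merged results live at `0..=newi`; everything past `newi` is
        // left-over input that gets truncated at the end.
        let mut newi = 0;
        for oldi in 1..ranges.len() {
            let (lo, hi) = ranges[oldi];
            if lo <= ranges[newi].1.saturating_add(1) {
                // Overlapping or adjacent: union into the last merged range.
                ranges[newi].1 = ranges[newi].1.max(hi);
            } else {
                newi += 1;
                ranges[newi] = (lo, hi);
            }
        }
        ranges.truncate(newi + 1);
    }

This allocates nothing beyond the input vector itself, which is the
property the patch gives `canonicalize` (and approximates, with more
bookkeeping, in `difference` and `symmetric_difference`).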
@@ -486,7 +568,13 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), + None => { + return if self.upper() < other.lower() { + (Some(self.clone()), Some(other.clone())) + } else { + (Some(other.clone()), Some(self.clone())) + } + } Some(union) => union, }; let intersection = match self.intersect(other) { From baf5b1ef29eec3136884a1595bda4833044a9bee Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 09:19:57 -0400 Subject: [PATCH 075/136] syntax and automata: bump LookSet representation from u16 to u32 This is in preparation for adding 8 new word boundary look-around assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}, along with Unicode and ASCII-only variants of each. Ref #469 --- regex-automata/src/dfa/dense.rs | 8 ++-- regex-automata/src/dfa/onepass.rs | 2 +- regex-automata/src/util/determinize/state.rs | 39 ++++++++++---------- regex-automata/src/util/look.rs | 26 +++++++------ regex-automata/tests/hybrid/api.rs | 4 +- regex-syntax/src/hir/mod.rs | 26 +++++++------ 6 files changed, 55 insertions(+), 50 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index c9fe3b381..902f4b273 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -882,20 +882,20 @@ impl Config { /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 600KB isn't enough! + /// // 700KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(600_000)) + /// .determinize_size_limit(Some(700_000)) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 700KB probably is! + /// // ... but 800KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(700_000)) + /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 353bb1e17..e62bbd383 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2815,7 +2815,7 @@ impl Epsilons { /// Return the set of look-around assertions in these epsilon transitions. fn looks(self) -> LookSet { - LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index e64123587..effa6f44d 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -197,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -348,16 +348,17 @@ impl StateBuilderNFA { /// generated by a transition over a "word" byte. (Callers may not always set /// this. For example, if the NFA has no word boundary assertion, then needing /// to track whether a state came from a word byte or not is superfluous and -/// wasteful.) +/// wasteful.) 
Bit 3 is set to 1 if the state was generated by a transition +/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is +/// enabled. /// -/// Byte 1 corresponds to the look-behind assertions that were satisfied by -/// the transition that created this state. This generally only includes the -/// StartLF and Start assertions. (Look-ahead assertions are not tracked as -/// part of states. Instead, these are applied by re-computing the epsilon -/// closure of a state when computing the transition function. See `next` in -/// the parent module.) +/// Bytes 1..5 correspond to the look-behind assertions that were satisfied +/// by the transition that created this state. (Look-ahead assertions are not +/// tracked as part of states. Instead, these are applied by re-computing the +/// epsilon closure of a state when computing the transition function. See +/// `next` in the parent module.) /// -/// Byte 2 corresponds to the set of look-around assertions (including both +/// Bytes 5..9 correspond to the set of look-around assertions (including both /// look-behind and look-ahead) that appear somewhere in this state's set of /// NFA state IDs. This is used to determine whether this state's epsilon /// closure should be re-computed when computing the transition function. @@ -366,7 +367,7 @@ impl StateBuilderNFA { /// function, we should only re-compute the epsilon closure if those new /// assertions are relevant to this particular state. /// -/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer /// corresponding to the number of patterns encoded in this state. If the state /// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is /// PatternID::ZERO, then no integer is encoded at this position. Instead, byte @@ -452,7 +453,7 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::read_repr(&self.0[3..]) + LookSet::read_repr(&self.0[5..]) } /// Returns the total number of match pattern IDs in this state. @@ -476,7 +477,7 @@ impl<'a> Repr<'a> { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 9 + index * PatternID::SIZE; + let offset = 13 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. wire::read_pattern_id_unchecked(&self.0[offset..]).0 @@ -507,7 +508,7 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[9..self.pattern_offset_end()]; + let mut pids = &self.0[13..self.pattern_offset_end()]; while !pids.is_empty() { let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; @@ -539,11 +540,11 @@ impl<'a> Repr<'a> { fn pattern_offset_end(&self) -> usize { let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 5; + return 9; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -557,7 +558,7 @@ impl<'a> Repr<'a> { } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. 
- usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() } } @@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> { /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { - set(self.look_need()).write_repr(&mut self.0[3..]); + set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least @@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 9; + let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - wire::NE::write_u32(count32, &mut self.0[5..9]); + wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index 81b4eb718..f87b963ad 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -125,17 +125,17 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0001 => Some(Look::Start), 0b00_0000_0010 => Some(Look::End), @@ -191,7 +191,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -379,29 +379,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } /// Checks that all assertions in this set can be matched. 
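Since this representation change is easy to get wrong at call sites that
serialize `LookSet`s, a quick round trip through the new 4-byte form may
help. This is a sketch against the public API in
`regex_automata::util::look`, not a test from this patch:

    use regex_automata::util::look::{Look, LookSet};

    fn main() {
        let set = LookSet::empty().insert(Look::Start).insert(Look::WordAscii);
        // A serialized LookSet now occupies 4 bytes instead of 2.
        let mut buf = [0u8; 4];
        set.write_repr(&mut buf);
        assert_eq!(set, LookSet::read_repr(&buf));
    }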
@@ -456,9 +458,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } diff --git a/regex-automata/tests/hybrid/api.rs b/regex-automata/tests/hybrid/api.rs index e82d808e3..4b04c4f8f 100644 --- a/regex-automata/tests/hybrid/api.rs +++ b/regex-automata/tests/hybrid/api.rs @@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); - let err = MatchError::gave_up(25); + let err = MatchError::gave_up(24); // Notice that we make the same amount of progress in each search! That's // because the cache is reused and already has states to handle the first // N bytes. @@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. cache.reset(&dfa); - let err = MatchError::gave_up(27); + let err = MatchError::gave_up(26); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index f8a3d4a9e..361ca41af 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1664,17 +1664,17 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0001 => Some(Look::Start), 0b00_0000_0010 => Some(Look::End), @@ -2600,7 +2600,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -2788,29 +2788,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. 
     #[inline]
     pub fn write_repr(self, slice: &mut [u8]) {
         let raw = self.bits.to_ne_bytes();
         slice[0] = raw[0];
         slice[1] = raw[1];
+        slice[2] = raw[2];
+        slice[3] = raw[3];
     }
 }

@@ -2843,9 +2845,9 @@ impl Iterator for LookSetIter {
             return None;
         }
         // We'll never have more than u8::MAX distinct look-around assertions,
-        // so 'repr' will always fit into a u16.
-        let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
-        let look = Look::from_repr(1 << repr)?;
+        // so 'bit' will always fit into a u16.
+        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+        let look = Look::from_repr(1 << bit)?;
         self.set = self.set.remove(look);
         Some(look)
     }

From 19e54d89f94a8785892b8f8f4568ac7d37066c09 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 6 Oct 2023 21:58:38 -0400
Subject: [PATCH 076/136] syntax/ast: add support for additional word boundary assertions

This adds AST support for the following new assertions: \b{start},
\b{end}, \b{start-half}, \b{end-half}, \< and \>. The last two, \< and
\>, are aliases for \b{start} and \b{end}.

The parsing for this is a little delicate, since there is some ambiguity
between, e.g., \b{5} and \b{start}. We handle it by letting the parser
look for one of the new special assertions first, and then backing up if
it fails to find one so that it can try to parse a counted repetition
instead.

Ref #469
---
 regex-syntax/src/ast/mod.rs       |  47 +++++++
 regex-syntax/src/ast/parse.rs     | 226 ++++++++++++++++++++++++++++--
 regex-syntax/src/ast/print.rs     |   6 +
 regex-syntax/src/hir/translate.rs |  14 ++
 regex-syntax/src/lib.rs           |   3 +
 5 files changed, 281 insertions(+), 15 deletions(-)

diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index 9e0f92606..6a77ee134 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -162,6 +162,18 @@ pub enum ErrorKind {
     /// `(?i)*`. It is, however, possible to create a repetition operating on
     /// an empty sub-expression. For example, `()*` is still considered valid.
     RepetitionMissing,
+    /// The special word boundary syntax, `\b{something}`, was used, but
+    /// either EOF without `}` was seen, or an invalid character in the
+    /// braces was seen.
+    SpecialWordBoundaryUnclosed,
+    /// The special word boundary syntax, `\b{something}`, was used, but
+    /// `something` was not recognized as a valid word boundary kind.
+    SpecialWordBoundaryUnrecognized,
+    /// The syntax `\b{` was observed, but afterwards the end of the pattern
+    /// was observed without being able to tell whether it was meant to be a
+    /// bounded repetition on the `\b` or the beginning of a special word
+    /// boundary assertion.
+    SpecialWordOrRepetitionUnexpectedEof,
     /// The Unicode class is not valid. This typically occurs when a `\p` is
     /// followed by something other than a `{`.
UnicodeClassInvalid, @@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } @@ -1293,6 +1328,18 @@ pub enum AssertionKind { WordBoundary, /// `\B` NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index f7bae7759..593b14fbc 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1528,18 +1528,115 @@ impl<'s, P: Borrow> ParserI<'s, P> { span, kind: ast::AssertionKind::EndText, })), - 'b' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundary, - })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } 'B' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::NotWordBoundary, })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. 
+ fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + /// Parse an octal representation of a Unicode codepoint up to 3 digits /// long. This expects the parser to be positioned at the first octal /// digit and advances the parser to the first character immediately @@ -1967,9 +2064,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { // because parsing cannot fail with any interesting error. For example, // in order to use an ASCII character class, it must be enclosed in // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character characters have the syntax `[:NAME:]` - // which can only appear within character brackets." This means that - // things like `[[:lower:]A]` are legal constructs. + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. 
// // However, if one types an incorrect ASCII character class, e.g., // `[[:loower:]]`, then we treat that as a normal nested character @@ -3295,6 +3392,23 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), @@ -4381,6 +4495,48 @@ bar kind: ast::AssertionKind::WordBoundary, })) ); + assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { @@ -4418,20 +4574,60 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); - // But also, < and > are banned, so that we may evolve them into - // start/end word boundary assertions. (Not sure if we will...) + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. assert_eq!( - parser(r"\<").parse_escape().unwrap_err(), + parser(r"\b{").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( - parser(r"\>").parse_escape().unwrap_err(), + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. 
+ assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. + assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 7dedf7f48..1ceb3c7fa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -261,6 +261,12 @@ impl Writer { EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 56d261aa1..4ae279f92 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -962,6 +962,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAsciiNegate }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryEndHalf => { + Hir::look(if unicode { todo!() } else { todo!() }) + } }) } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index a552099c6..38c8d88d4 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -334,6 +334,9 @@ pub fn is_escapeable_character(c: char) -> bool { // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } From bbb98bbb1bd15f7a90469ec2470cb3fdfd2f8db8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 09:59:51 -0400 Subject: [PATCH 077/136] syntax/hir: add new special word boundaries to HIR This builds on the previous commit to bring word boundary support to the HIR, and updates AST->HIR translation to produce them from the corresponding AST elements. 
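As a concrete illustration of the intended translation (a sketch using
the top-level `regex_syntax::parse` convenience function, not a test
from this commit):

    use regex_syntax::{hir::{Hir, Look}, parse};

    fn main() {
        // With Unicode enabled (the default), \b{start} becomes the
        // Unicode start-of-word assertion...
        let hir = parse(r"\b{start}").unwrap();
        assert_eq!(hir, Hir::look(Look::WordStartUnicode));
        // ...and with Unicode disabled, the ASCII variant.
        let hir = parse(r"(?-u)\b{start}").unwrap();
        assert_eq!(hir, Hir::look(Look::WordStartAscii));
    }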
Ref #469 --- regex-syntax/src/hir/mod.rs | 95 ++++++++++++++++++++++++++----- regex-syntax/src/hir/print.rs | 24 ++++++++ regex-syntax/src/hir/translate.rs | 26 +++++++-- 3 files changed, 126 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 361ca41af..ce38ead7b 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1635,6 +1635,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. 
+ WordEndHalfUnicode = 1 << 17, } impl Look { @@ -1656,6 +1692,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -1676,16 +1720,24 @@ impl Look { #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -1710,6 +1762,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -2703,13 +2763,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. 
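The enum discriminants double as the bit positions used by `LookSet`, so
the `from_repr`/`as_repr` round trip and the `reversed` pairing can be
checked directly. A sketch, not part of this patch's test suite:

    use regex_syntax::hir::Look;

    fn main() {
        // 18 assertions now need 18 bits, which is why the set
        // representation grew from u16 to u32.
        let look = Look::WordStartHalfUnicode;
        assert_eq!(look.as_repr(), 1 << 16);
        assert_eq!(Look::from_repr(1 << 16), Some(look));
        // Start and end variants swap under reversal, which is what
        // reverse searches rely on.
        assert_eq!(look.reversed(), Look::WordEndHalfUnicode);
    }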
@@ -3769,7 +3838,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -3787,6 +3856,6 @@ mod tests { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index aa737a092..dfa6d4032 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -202,6 +202,30 @@ impl Visitor for Writer { hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } }, HirKind::Capture(hir::Capture { ref name, .. }) => { self.wtr.write_str("(")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 4ae279f92..55ca074fa 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -964,18 +964,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { }), ast::AssertionKind::WordBoundaryStart | ast::AssertionKind::WordBoundaryStartAngle => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) } ast::AssertionKind::WordBoundaryEnd | ast::AssertionKind::WordBoundaryEndAngle => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) } ast::AssertionKind::WordBoundaryStartHalf => { - Hir::look(if unicode { todo!() } else { todo!() }) - } - ast::AssertionKind::WordBoundaryEndHalf => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), }) } From 21eb31e38c31073258fc670cd80a8a26e96d11aa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 18:04:56 -0400 Subject: [PATCH 078/136] automata: add special word boundaries to regex-automata In this commit, all of the regex engines now support the new special word boundary assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}. Of course, when they are Unicode-aware, the DFAs will quit upon seeing a non-ASCII character, just like for the \b and \B assertions. For now, we don't add support to the one-pass DFA, since it would either make it use more memory or reduce the number of capture groups it supports. I think these assertions will be rare enough that it isn't worth adding support yet. This is a breaking change because it adds new variants to the `Look` enum. 
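To make the intended semantics concrete, here is a small usage sketch
against the meta regex engine as of this series; it is illustrative
only, not a test added by this patch:

    use regex_automata::meta::Regex;

    fn main() {
        // \b{start} only matches at the start of a word, so the
        // "ranges" inside "oranges" does not match here.
        let re = Regex::new(r"\b{start}ranges\b{end}").unwrap();
        assert!(!re.is_match("oranges"));
        assert!(re.is_match("ranges are sorted"));
    }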
--- regex-automata/src/nfa/thompson/compiler.rs | 8 + regex-automata/src/util/determinize/mod.rs | 60 +- regex-automata/src/util/look.rs | 898 ++++++++++++++++++-- regex-automata/tests/dfa/suite.rs | 6 +- regex-automata/tests/lib.rs | 1 + testdata/word-boundary-special.toml | 653 ++++++++++++++ tests/lib.rs | 1 + 7 files changed, 1563 insertions(+), 64 deletions(-) create mode 100644 testdata/word-boundary-special.toml diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index a188017d8..2d2172957 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1557,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 30a82afb8..d320fabc3 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -145,9 +145,10 @@ pub(crate) fn next( } Some(_) => {} None => { - look_have = look_have.insert(Look::End); - look_have = look_have.insert(Look::EndLF); - look_have = look_have.insert(Look::EndCRLF); + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { @@ -160,11 +161,26 @@ pub(crate) fn next( look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { - look_have = look_have.insert(Look::WordUnicodeNegate); - look_have = look_have.insert(Look::WordAsciiNegate); + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); } else { - look_have = look_have.insert(Look::WordUnicode); - look_have = look_have.insert(Look::WordAscii); + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // assertions that exist in this state (that is, just because we added @@ -220,6 +236,14 @@ pub(crate) fn next( { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. 
+ if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } @@ -564,7 +588,12 @@ pub(crate) fn set_lookbehind_from_start( let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); match *start { - Start::NonWordByte => {} + Start::NonWordByte => { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } Start::WordByte => { builder.set_is_from_word(); } @@ -573,6 +602,8 @@ pub(crate) fn set_lookbehind_from_start( have.insert(Look::Start) .insert(Look::StartLF) .insert(Look::StartCRLF) + .insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) }); } Start::LineLF => { @@ -585,6 +616,10 @@ pub(crate) fn set_lookbehind_from_start( if lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } Start::LineCR => { if rev { @@ -595,6 +630,10 @@ pub(crate) fn set_lookbehind_from_start( if lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } Start::CustomLineTerminator => { builder.set_look_have(|have| have.insert(Look::StartLF)); @@ -604,6 +643,11 @@ pub(crate) fn set_lookbehind_from_start( // state as having come from a word byte. if utf8::is_word_byte(lineterm) { builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } } } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index f87b963ad..ddf8fb129 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -96,6 +96,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. 
That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -117,6 +153,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -137,16 +181,24 @@ impl Look { #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -171,6 +223,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -294,13 +354,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set 
contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -568,6 +637,23 @@ impl LookMatcher { } /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, @@ -588,6 +674,26 @@ impl LookMatcher { Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } } } @@ -682,6 +788,46 @@ impl LookMatcher { return false; } } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } true } @@ -705,7 +851,15 @@ impl LookMatcher { Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode - | Look::WordUnicodeNegate => { + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | 
Look::WordEndUnicode
+            | Look::WordStartHalfAscii
+            | Look::WordEndHalfAscii
+            | Look::WordStartHalfUnicode
+            | Look::WordEndHalfUnicode => {
                 // We need to mark all ranges of bytes whose pairs result in
                 // evaluating \b differently. This isn't technically correct
                 // for Unicode word boundaries, but DFAs can't handle those
@@ -933,6 +1087,177 @@ impl LookMatcher {
         };
         Ok(word_before == word_after)
     }
+
+    /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
+    /// position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_before && word_after
+    }
+
+    /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
+    /// position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        word_before && !word_after
+    }
+
+    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(!word_before && word_after)
+    }
+
+    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(word_before && !word_after)
+    }
+
+    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_half_ascii(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        !word_before
+    }
+
+    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_after
+    }
+
+    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_start_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the right side must be in \w.
+        let word_before = at > 0
+            && match utf8::decode_last(&haystack[..at]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
+            };
+        Ok(!word_before)
+    }
+
+    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_end_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the left side must be in \w.
+        let word_after = at < haystack.len()
+            && match utf8::decode(&haystack[at..]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
+            };
+        Ok(!word_after)
+    }
 }
 
 impl Default for LookMatcher {
@@ -1660,50 +1985,478 @@ mod tests {
     }
 
     #[test]
-    fn look_set() {
-        let mut f = LookSet::default();
-        assert!(!f.contains(Look::Start));
-        assert!(!f.contains(Look::End));
-        assert!(!f.contains(Look::StartLF));
-        assert!(!f.contains(Look::EndLF));
-        assert!(!f.contains(Look::WordUnicode));
-        assert!(!f.contains(Look::WordUnicodeNegate));
-        assert!(!f.contains(Look::WordAscii));
-        assert!(!f.contains(Look::WordAsciiNegate));
+    fn look_matches_word_start_ascii() {
+        let look = Look::WordStartAscii;
 
-        f = f.insert(Look::Start);
-        assert!(f.contains(Look::Start));
-        f = f.remove(Look::Start);
-        assert!(!f.contains(Look::Start));
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
 
-        f = f.insert(Look::End);
-        assert!(f.contains(Look::End));
-        f = f.remove(Look::End);
-        assert!(!f.contains(Look::End));
+        // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); - f = f.insert(Look::StartLF); - assert!(f.contains(Look::StartLF)); - f = f.remove(Look::StartLF); - assert!(!f.contains(Look::StartLF)); + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); - f = f.insert(Look::EndLF); - assert!(f.contains(Look::EndLF)); - f = f.remove(Look::EndLF); - assert!(!f.contains(Look::EndLF)); + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); - f = f.insert(Look::StartCRLF); - assert!(f.contains(Look::StartCRLF)); - f = f.remove(Look::StartCRLF); - assert!(!f.contains(Look::StartCRLF)); + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); - f = f.insert(Look::EndCRLF); - assert!(f.contains(Look::EndCRLF)); - f = f.remove(Look::EndCRLF); - assert!(!f.contains(Look::EndCRLF)); + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); - f = f.insert(Look::WordUnicode); - assert!(f.contains(Look::WordUnicode)); - f = f.remove(Look::WordUnicode); + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_start_half_ascii() { + let look = Look::WordStartHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_half_ascii() { + let look = Look::WordEndHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. 
+ assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. 
+ assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); @@ -1720,6 +2473,46 @@ mod tests { assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + 
assert!(f.contains(Look::WordStartHalfUnicode));
+        f = f.remove(Look::WordStartHalfUnicode);
+        assert!(!f.contains(Look::WordStartHalfUnicode));
+
+        f = f.insert(Look::WordEndHalfUnicode);
+        assert!(f.contains(Look::WordEndHalfUnicode));
+        f = f.remove(Look::WordEndHalfUnicode);
+        assert!(!f.contains(Look::WordEndHalfUnicode));
     }
 
     #[test]
@@ -1728,7 +2521,7 @@ mod tests {
         assert_eq!(0, set.iter().count());
 
         let set = LookSet::full();
-        assert_eq!(10, set.iter().count());
+        assert_eq!(18, set.iter().count());
 
         let set =
            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -1739,6 +2532,9 @@
 
         let set = LookSet::empty().insert(Look::WordAsciiNegate);
         assert_eq!(1, set.iter().count());
+
+        let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
+        assert_eq!(1, set.iter().count());
     }
 
     #[test]
@@ -1747,6 +2543,6 @@ mod tests {
         let res = alloc::format!("{:?}", LookSet::empty());
         assert_eq!("∅", res);
         let res = alloc::format!("{:?}", LookSet::full());
-        assert_eq!("Az^$rRbB𝛃𝚩", res);
+        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
     }
 }
diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs
index f3445e02a..8ed6dd007 100644
--- a/regex-automata/tests/dfa/suite.rs
+++ b/regex-automata/tests/dfa/suite.rs
@@ -9,7 +9,6 @@ use {
         util::{prefilter::Prefilter, syntax},
         Anchored, Input, PatternSet,
     },
-    regex_syntax::hir,
     regex_test::{
         CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
         TestRunner,
@@ -285,10 +284,7 @@ fn compiler(
     // That is, Unicode word boundaries when searching non-ASCII text.
     if !test.haystack().is_ascii() {
         for hir in hirs.iter() {
-            let looks = hir.properties().look_set();
-            if looks.contains(hir::Look::WordUnicode)
-                || looks.contains(hir::Look::WordUnicodeNegate)
-            {
+            if hir.properties().look_set().contains_word_unicode() {
                 return Ok(CompiledRegex::skip());
             }
         }
diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs
index 1465e51eb..67c979aa8 100644
--- a/regex-automata/tests/lib.rs
+++ b/regex-automata/tests/lib.rs
@@ -61,6 +61,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");
diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml
new file mode 100644
index 000000000..c1689f5cc
--- /dev/null
+++ b/testdata/word-boundary-special.toml
@@ -0,0 +1,653 @@
+# These tests are for the "special" word boundary assertions. That is,
+# \b{start}, \b{end}, \b{start-half}, \b{end-half}. These are specialty
+# assertions for more niche use cases, but hitting those cases without these
+# assertions is difficult. For example, \b{start-half} and \b{end-half} are
+# used to implement the -w/--word-regexp flag in a grep program.
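+#
+# As a rough illustration of that use case (a sketch, not one of the tests
+# below): a grep-style -w flag can be emulated by rewriting a user pattern
+# like `cat` into `\b{start-half}(?:cat)\b{end-half}`. Unlike wrapping the
+# pattern in \b{start} and \b{end}, this keeps working when the user's
+# pattern begins or ends with a non-word character, since each half
+# assertion only constrains the haystack on one side.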
+ +# Tests for (?-u:\b{start}) + +[[test]] +name = "word-start-ascii-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-start-ascii-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[4, 4]] +unicode = false + +[[test]] +name = "word-start-ascii-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = false + +# Tests for (?-u:\b{end}) + +[[test]] +name = "word-end-ascii-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = false + +[[test]] +name = "word-end-ascii-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[1, 1]] +unicode = false + +# Tests for \b{start} + +[[test]] +name = "word-start-unicode-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = 
[[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end} + +[[test]] +name = "word-end-unicode-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Tests for (?-u:\b{start-half}) + +[[test]] +name = "word-start-half-ascii-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = false + +[[test]] +name = "word-start-half-ascii-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = false + +[[test]] +name = "word-start-half-ascii-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060-noutf8" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +[[test]] +name = "word-start-half-ascii-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-half-ascii-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-start-half-ascii-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-start-half-ascii-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0], [5, 5]] +unicode = false + +# Tests for (?-u:\b{end-half}) + +[[test]] +name = "word-end-half-ascii-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode 
= false + +[[test]] +name = "word-end-half-ascii-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = false + +[[test]] +name = "word-end-half-ascii-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-end-half-ascii-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-half-ascii-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-end-half-ascii-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-end-half-ascii-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[0, 0], [5, 5]] +unicode = false + +[[test]] +name = "word-end-half-ascii-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[1, 1], [5, 5]] +unicode = false + +# Tests for \b{start-half} + +[[test]] +name = "word-start-half-unicode-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = true + +[[test]] +name = "word-start-half-unicode-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = true + +[[test]] +name = "word-start-half-unicode-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-half-unicode-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [6, 6]] +unicode = true + +[[test]] +name = "word-start-half-unicode-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [8, 8]] +unicode = true + +[[test]] +name = "word-start-half-unicode-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end-half} + +[[test]] +name = "word-end-half-unicode-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-half-unicode-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = true + +[[test]] +name = "word-end-half-unicode-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = true + +[[test]] +name = "word-end-half-unicode-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-end-half-unicode-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] 
+name = "word-end-half-unicode-060"
+regex = '\b{end-half}'
+haystack = "𝛃"
+matches = [[4, 4]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-060-bounds"
+regex = '\b{end-half}'
+haystack = "𝛃"
+bounds = [2, 3]
+matches = []
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-070"
+regex = '\b{end-half}'
+haystack = " 𝛃 "
+matches = [[0, 0], [5, 5], [6, 6]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-080"
+regex = '\b{end-half}'
+haystack = "𝛃𐆀"
+matches = [[4, 4], [8, 8]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-090"
+regex = '\b{end-half}'
+haystack = "𝛃b"
+matches = [[5, 5]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-110"
+regex = '\b{end-half}'
+haystack = "b𝛃"
+matches = [[5, 5]]
+unicode = true
diff --git a/tests/lib.rs b/tests/lib.rs
index badd57455..b3f69423d 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -49,6 +49,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");

From 2743a7a0181cf16069445ad11d56977b1b991674 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 8 Oct 2023 09:29:27 -0400
Subject: [PATCH 079/136] doc: explain the new word boundary assertions

Closes #469
---
 CHANGELOG.md |  7 ++++++
 src/lib.rs   | 70 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 265f5cd48..7f90e45a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,9 +3,16 @@ TBD
 
 New features:
 
+* [FEATURE #469](https://github.com/rust-lang/regex/issues/469):
+Add support for `\<` and `\>` word boundary assertions.
 * [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031):
 DFAs now have a `start_state` method that doesn't use an `Input`.
 
+Performance improvements:
+
+* [PERF #1051](https://github.com/rust-lang/regex/pull/1051):
+Unicode character class operations have been optimized in `regex-syntax`.
+
 Bug fixes:
 
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
diff --git a/src/lib.rs b/src/lib.rs
index 1e191b692..6dbd3c202 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -543,8 +543,10 @@ scalar value, even when it is encoded using multiple bytes. When Unicode mode
 is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases.
 * The character classes `\w`, `\d` and `\s` are all Unicode-aware by default.
 Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions.
-* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. To
-get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`.
+* Similarly, `\b` and `\B` use a Unicode definition of a "word" character.
+To get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. This also
+applies to the special word boundary assertions. (That is, `\b{start}`,
+`\b{end}`, `\b{start-half}`, `\b{end-half}`.)
 * `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only
 recognize `\n` (assuming CRLF mode is not enabled) and not any of the other
 forms of line terminators defined by Unicode.
@@ -723,12 +725,16 @@ x{n}? exactly n x
 
 ### Empty matches

-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-\B    not a Unicode word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B              not a Unicode word boundary
+\b{start}, \<   a Unicode start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \>     a Unicode end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of a Unicode start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of a Unicode end-of-word boundary (\W|\z on the right)
 
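+
+As a quick illustrative sketch (not exhaustive), the start-of-word assertion
+can be used to find every word along with its starting offset:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"\b{start}\w+").unwrap();
+let words: Vec<&str> =
+    re.find_iter("one, two; three").map(|m| m.as_str()).collect();
+assert_eq!(words, vec!["one", "two", "three"]);
+```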
The empty regex is valid and matches the empty string. For example, the @@ -856,28 +862,32 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\123        octal character code, up to three digits (when enabled)
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\p{Letter}  Unicode character class
-\P{Letter}  negated Unicode character class
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\123            octal character code, up to three digits (when enabled)
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\p{Letter}      Unicode character class
+\P{Letter}      negated Unicode character class
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
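+
+For example, a brief sketch of the `\<` and `\>` escapes (aliases for
+`\b{start}` and `\b{end}`):
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"\<cat\>").unwrap();
+assert!(re.is_match("the cat sat"));
+assert!(!re.is_match("concatenate"));
+```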
### Perl character classes (Unicode friendly) From dbc5e6d98ba731ddd7f5cddd8f96f5e147d14b51 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 10:20:24 -0400 Subject: [PATCH 080/136] lite: add special word boundaries to regex-lite This was substantially easier. Coupling, private abstractions and slow code are so much easier to deal with. Ref #469 --- regex-lite/src/hir/mod.rs | 42 +++++++++++++++++ regex-lite/src/hir/parse.rs | 89 +++++++++++++++++++++++++++++++++++-- regex-lite/src/lib.rs | 58 +++++++++++++----------- regex-lite/tests/lib.rs | 1 + 4 files changed, 162 insertions(+), 28 deletions(-) diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index f73a5420a..3d61ce8c9 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -592,6 +592,24 @@ pub(crate) enum Look { Word = 1 << 6, /// Match an ASCII-only negation of a word boundary. WordNegate = 1 << 7, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStart = 1 << 8, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEnd = 1 << 9, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalf = 1 << 10, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalf = 1 << 11, } impl Look { @@ -631,6 +649,30 @@ impl Look { at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before == word_after } + WordStart => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + WordEnd => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + WordStartHalf => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + WordEndHalf => { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } } } } diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index cc3c21fe6..33bb97a7d 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = "character class difference is not supported"; const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = "character class symmetric difference is not supported"; +const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = + "special word boundary assertion is unclosed or has an invalid character"; +const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = + "special word boundary assertion is unrecognized"; +const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = + "found start of special word boundary or repetition without an end"; /// A regular expression parser. 
///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
             'v' => special('\x0B'),
             'A' => Ok(Hir::look(hir::Look::Start)),
             'z' => Ok(Hir::look(hir::Look::End)),
-            'b' => Ok(Hir::look(hir::Look::Word)),
+            'b' => {
+                let mut hir = Hir::look(hir::Look::Word);
+                if !self.is_done() && self.char() == '{' {
+                    if let Some(special) =
+                        self.maybe_parse_special_word_boundary()?
+                    {
+                        hir = special;
+                    }
+                }
+                Ok(hir)
+            }
             'B' => Ok(Hir::look(hir::Look::WordNegate)),
+            '<' => Ok(Hir::look(hir::Look::WordStart)),
+            '>' => Ok(Hir::look(hir::Look::WordEnd)),
             _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
         }
     }
 
+    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
+    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
+    ///
+    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
+    /// if it fails it will just return `None` with no error. This is done
+    /// because `\b{5}` is a valid expression and we want to let that be parsed
+    /// by the existing counted repetition parsing code. (I thought about just
+    /// invoking the counted repetition code from here, but it seemed a little
+    /// ham-fisted.)
+    ///
+    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
+    /// Namely, if we definitely know it isn't a counted repetition, then we
+    /// return an error specific to the specialty word boundaries.
+    ///
+    /// This assumes the parser is positioned at a `{` immediately following
+    /// a `\b`. When `None` is returned, the parser is returned to the position
+    /// at which it started: pointing at a `{`.
+    ///
+    /// The position given should correspond to the start of the `\b`.
+    fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
+        assert_eq!(self.char(), '{');
+
+        let is_valid_char = |c| match c {
+            'A'..='Z' | 'a'..='z' | '-' => true,
+            _ => false,
+        };
+        let start = self.pos();
+        if !self.bump_and_bump_space() {
+            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
+        }
+        // This is one of the critical bits: if the first non-whitespace
+        // character isn't in [-A-Za-z] (i.e., this can't be a special word
+        // boundary), then we bail and let the counted repetition parser deal
+        // with this.
+        if !is_valid_char(self.char()) {
+            self.pos.set(start);
+            self.char.set(Some('{'));
+            return Ok(None);
+        }
+
+        // Now collect up our chars until we see a '}'.
+        let mut scratch = String::new();
+        while !self.is_done() && is_valid_char(self.char()) {
+            scratch.push(self.char());
+            self.bump_and_bump_space();
+        }
+        if self.is_done() || self.char() != '}' {
+            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
+        }
+        self.bump();
+        let kind = match scratch.as_str() {
+            "start" => hir::Look::WordStart,
+            "end" => hir::Look::WordEnd,
+            "start-half" => hir::Look::WordStartHalf,
+            "end-half" => hir::Look::WordEndHalf,
+            _ => {
+                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
+            }
+        };
+        Ok(Some(Hir::look(kind)))
+    }
+
     /// Parse a hex representation of a Unicode codepoint. This handles both
     /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
     /// be positioned at the `x`, `u` or `U` prefix.
The parser is advanced to @@ -1948,8 +2028,6 @@ bar assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); @@ -1983,6 +2061,11 @@ bar assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); } #[test] diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 8008b9e59..68d54824f 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -466,12 +466,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
-\B    not an ASCII word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B              not an ASCII word boundary
+\b{start}       an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}         an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
 
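+
+For example, a small sketch using the ASCII-only definitions above, which is
+roughly how a grep-style -w/--word-regexp flag can be emulated:
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"\b{start-half}\w+\b{end-half}").unwrap();
+let m = re.find("  hello  ").unwrap();
+assert_eq!(m.as_str(), "hello");
+```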
The empty regex is valid and matches the empty string. For example, the @@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
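+
+A brief sketch of the special word boundary escapes (ASCII-only in this
+crate):
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"\<cat\>").unwrap();
+assert!(re.is_match("a cat."));
+assert!(!re.is_match("concatenate"));
+```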
### Perl character classes (ASCII only) diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs index 757b39441..89635f2d7 100644 --- a/regex-lite/tests/lib.rs +++ b/regex-lite/tests/lib.rs @@ -38,6 +38,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); From 07dcf208ef201df907fcd1dc09c83cac61d1503b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 14:55:20 -0400 Subject: [PATCH 081/136] doc: remove HACKING document It is almost completely wrong now. Instead of rewriting it---which would be a huge endeavor---we just point folks toward my blog on regex internals. Closes #1058 --- HACKING.md | 341 ----------------------------------------------------- README.md | 15 +++ 2 files changed, 15 insertions(+), 341 deletions(-) delete mode 100644 HACKING.md diff --git a/HACKING.md b/HACKING.md deleted file mode 100644 index 34af5b517..000000000 --- a/HACKING.md +++ /dev/null @@ -1,341 +0,0 @@ -Your friendly guide to hacking and navigating the regex library. - -This guide assumes familiarity with Rust and Cargo, and at least a perusal of -the user facing documentation for this crate. - -If you're looking for background on the implementation in this library, then -you can do no better than Russ Cox's article series on implementing regular -expressions using finite automata: https://swtch.com/~rsc/regexp/ - - -## Architecture overview - -As you probably already know, this library executes regular expressions using -finite automata. In particular, a design goal is to make searching linear -with respect to both the regular expression and the text being searched. -Meeting that design goal on its own is not so hard and can be done with an -implementation of the Pike VM (similar to Thompson's construction, but supports -capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html ---- This library contains such an implementation in src/pikevm.rs. - -Making it fast is harder. One of the key problems with the Pike VM is that it -can be in more than one state at any point in time, and must shuffle capture -positions between them. The Pike VM also spends a lot of time following the -same epsilon transitions over and over again. We can employ one trick to -speed up the Pike VM: extract one or more literal prefixes from the regular -expression and execute specialized code to quickly find matches of those -prefixes in the search text. The Pike VM can then be avoided for most the -search, and instead only executed when a prefix is found. The code to find -prefixes is in the regex-syntax crate (in this repository). The code to search -for literals is in src/literals.rs. When more than one literal prefix is found, -we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one -literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and -Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this -library also uses elementary frequency analysis to choose the right byte to run -`memchr` with. - -Of course, detecting prefix literals can only take us so far. Not all regular -expressions have literal prefixes. To remedy this, we try another approach -to executing the Pike VM: backtracking, whose implementation can be found in -src/backtrack.rs. One reason why backtracking can be faster is that it avoids -excessive shuffling of capture groups. 
Of course, backtracking is susceptible -to exponential runtimes, so we keep track of every state we've visited to make -sure we never visit it again. This guarantees linear time execution, but we -pay for it with the memory required to track visited states. Because of the -memory requirement, we only use this engine on small search strings *and* small -regular expressions. - -Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. -It is distinct from the Pike VM in that the DFA is explicitly represented in -memory and is only ever in one state at a time. It is said to be "lazy" because -the DFA is computed as text is searched, where each byte in the search text -results in at most one new DFA state. It is made fast by caching states. DFAs -are susceptible to exponential state blow up (where the worst case is computing -a new state for every input byte, regardless of what's in the state cache). To -avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache -is full, it is wiped and state computation starts over again. If the cache is -wiped too frequently, then the DFA gives up and searching falls back to one of -the aforementioned algorithms. - -All of the above matching engines expose precisely the same matching semantics. -This is indeed tested. (See the section below about testing.) - -The following sub-sections describe the rest of the library and how each of the -matching engines are actually used. - -### Parsing - -Regular expressions are parsed using the regex-syntax crate, which is -maintained in this repository. The regex-syntax crate defines an abstract -syntax and provides very detailed error messages when a parse error is -encountered. Parsing is done in a separate crate so that others may benefit -from its existence, and because it is relatively divorced from the rest of the -regex library. - -The regex-syntax crate also provides sophisticated support for extracting -prefix and suffix literals from regular expressions. - -### Compilation - -The compiler is in src/compile.rs. The input to the compiler is some abstract -syntax for a regular expression and the output is a sequence of opcodes that -matching engines use to execute a search. (One can think of matching engines as -mini virtual machines.) The sequence of opcodes is a particular encoding of a -non-deterministic finite automaton. In particular, the opcodes explicitly rely -on epsilon transitions. - -Consider a simple regular expression like `a|b`. Its compiled form looks like -this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' (goto: 4) - 003 'b' - 004 Save(1) - 005 Match - -The first column is the instruction pointer and the second column is the -instruction. Save instructions indicate that the current position in the input -should be stored in a captured location. Split instructions represent a binary -branch in the program (i.e., epsilon transitions). The instructions `'a'` and -`'b'` indicate that the literal bytes `'a'` or `'b'` should match. - -In older versions of this library, the compilation looked like this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' - 003 Jump(5) - 004 'b' - 005 Save(1) - 006 Match - -In particular, empty instructions that merely served to move execution from one -point in the program to another were removed. Instead, every instruction has a -`goto` pointer embedded into it. This resulted in a small performance boost for -the Pike VM, because it was one fewer epsilon transition that it had to follow. 
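As an aside, here is a minimal sketch of the goto-embedded encoding described above. The type and field names are invented for illustration and are not the crate's actual types; the real (and much richer) instruction set lived in src/prog.rs:

```
// Illustrative only: a toy version of the opcode layout sketched above.
#[derive(Debug)]
enum Inst {
    /// Store the current input position in capture slot `slot`, then
    /// continue at `goto`.
    Save { slot: usize, goto: usize },
    /// An epsilon transition with two successors, tried in order.
    Split { goto1: usize, goto2: usize },
    /// Match one literal byte, then continue at `goto`.
    Byte { byte: u8, goto: usize },
    /// Report a match.
    Match,
}

fn main() {
    // The compiled form of `a|b` shown above, with a goto embedded in each
    // instruction rather than a separate Jump opcode.
    let prog = vec![
        Inst::Save { slot: 0, goto: 1 },    // 000 Save(0)
        Inst::Split { goto1: 2, goto2: 3 }, // 001 Split(2, 3)
        Inst::Byte { byte: b'a', goto: 4 }, // 002 'a' (goto: 4)
        Inst::Byte { byte: b'b', goto: 4 }, // 003 'b'
        Inst::Save { slot: 1, goto: 5 },    // 004 Save(1)
        Inst::Match,                        // 005 Match
    ];
    println!("{prog:?}");
}
```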
- -There exist more instructions and they are defined and documented in -src/prog.rs. - -Compilation has several knobs and a few unfortunately complicated invariants. -Namely, the output of compilation can be one of two types of programs: a -program that executes on Unicode scalar values or a program that executes -on raw bytes. In the former case, the matching engine is responsible for -performing UTF-8 decoding and executing instructions using Unicode codepoints. -In the latter case, the program handles UTF-8 decoding implicitly, so that the -matching engine can execute on raw bytes. All matching engines can execute -either Unicode or byte based programs except for the lazy DFA, which requires -byte based programs. In general, both representations were kept because (1) the -lazy DFA requires byte based programs so that states can be encoded in a memory -efficient manner and (2) the Pike VM benefits greatly from inlining Unicode -character classes into fewer instructions as it results in fewer epsilon -transitions. - -N.B. UTF-8 decoding is built into the compiled program by making use of the -utf8-ranges crate. The compiler in this library factors out common suffixes to -reduce the size of huge character classes (e.g., `\pL`). - -A regrettable consequence of this split in instruction sets is we generally -need to compile two programs; one for NFA execution and one for the lazy DFA. - -In fact, it is worse than that: the lazy DFA is not capable of finding the -starting location of a match in a single scan, and must instead execute a -backwards search after finding the end location. To execute a backwards search, -we must have compiled the regular expression *in reverse*. - -This means that every compilation of a regular expression generally results in -three distinct programs. It would be possible to lazily compile the Unicode -program, since it is never needed if (1) the regular expression uses no word -boundary assertions and (2) the caller never asks for sub-capture locations. - -### Execution - -At the time of writing, there are four matching engines in this library: - -1. The Pike VM (supports captures). -2. Bounded backtracking (supports captures). -3. Literal substring or multi-substring search. -4. Lazy DFA (no support for Unicode word boundary assertions). - -Only the first two matching engines are capable of executing every regular -expression program. They also happen to be the slowest, which means we need -some logic that (1) knows various facts about the regular expression and (2) -knows what the caller wants. Using this information, we can determine which -engine (or engines) to use. - -The logic for choosing which engine to execute is in src/exec.rs and is -documented on the Exec type. Exec values contain regular expression Programs -(defined in src/prog.rs), which contain all the necessary tidbits for actually -executing a regular expression on search text. - -For the most part, the execution logic is straight-forward and follows the -limitations of each engine described above pretty faithfully. The hairiest -part of src/exec.rs by far is the execution of the lazy DFA, since it requires -a forwards and backwards search, and then falls back to either the Pike VM or -backtracking if the caller requested capture locations. - -The Exec type also contains mutable scratch space for each type of matching -engine. This scratch space is used during search (for example, for the lazy -DFA, it contains compiled states that are reused on subsequent searches). 
- -### Programs - -A regular expression program is essentially a sequence of opcodes produced by -the compiler plus various facts about the regular expression (such as whether -it is anchored, its capture names, etc.). - -### The regex! macro - -The `regex!` macro no longer exists. It was developed in a bygone era as a -compiler plugin during the infancy of the regex crate. Back then, then only -matching engine in the crate was the Pike VM. The `regex!` macro was, itself, -also a Pike VM. The only advantages it offered over the dynamic Pike VM that -was built at runtime were the following: - - 1. Syntax checking was done at compile time. Your Rust program wouldn't - compile if your regex didn't compile. - 2. Reduction of overhead that was proportional to the size of the regex. - For the most part, this overhead consisted of heap allocation, which - was nearly eliminated in the compiler plugin. - -The main takeaway here is that the compiler plugin was a marginally faster -version of a slow regex engine. As the regex crate evolved, it grew other regex -engines (DFA, bounded backtracker) and sophisticated literal optimizations. -The regex macro didn't keep pace, and it therefore became (dramatically) slower -than the dynamic engines. The only reason left to use it was for the compile -time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint -tool) has a lint that checks your regular expression validity, which mostly -replaces that use case. - -Additionally, the regex compiler plugin stopped receiving maintenance. Nobody -complained. At that point, it seemed prudent to just remove it. - -Will a compiler plugin be brought back? The future is murky, but there is -definitely an opportunity there to build something that is faster than the -dynamic engines in some cases. But it will be challenging! As of now, there -are no plans to work on this. - - -## Testing - -A key aspect of any mature regex library is its test suite. A subset of the -tests in this library come from Glenn Fowler's AT&T test suite (its online -presence seems gone at the time of writing). The source of the test suite is -located in src/testdata. The scripts/regex-match-tests.py takes the test suite -in src/testdata and generates tests/matches.rs. - -There are also many other manually crafted tests and regression tests in -tests/tests.rs. Some of these tests were taken from RE2. - -The biggest source of complexity in the tests is related to answering this -question: how can we reuse the tests to check all of our matching engines? One -approach would have been to encode every test into some kind of format (like -the AT&T test suite) and code generate tests for each matching engine. The -approach we use in this library is to create a Cargo.toml entry point for each -matching engine we want to test. The entry points are: - -* `tests/test_default.rs` - tests `Regex::new` -* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` -* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex. -* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *arbitrary* byte based programs. -* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *UTF-8* byte based programs. -* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use - backtracking on every regex. 
-* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *arbitrary* byte based programs. -* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *UTF-8* byte based programs. -* `tests/test_crates_regex.rs` - tests to make sure that all of the - backends behave in the same way against a number of quickcheck - generated random inputs. These tests need to be enabled through - the `RUST_REGEX_RANDOM_TEST` environment variable (see - below). - -The lazy DFA and pure literal engines are absent from this list because -they cannot be used on every regular expression. Instead, we rely on -`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. - -Since the tests are repeated several times, and because `cargo test` runs all -entry points, it can take a while to compile everything. To reduce compile -times slightly, try using `cargo test --test default`, which will only use the -`tests/test_default.rs` entry point. - -The random testing takes quite a while, so it is not enabled by default. -In order to run the random testing you can set the -`RUST_REGEX_RANDOM_TEST` environment variable to anything before -invoking `cargo test`. Note that this variable is inspected at compile -time, so if the tests don't seem to be running, you may need to run -`cargo clean`. - -## Benchmarking - -The benchmarking in this crate is made up of many micro-benchmarks. Currently, -there are two primary sets of benchmarks: the benchmarks that were adopted -at this library's inception (in `bench/src/misc.rs`) and a newer set of -benchmarks meant to test various optimizations. Specifically, the latter set -contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter -set are all executed on the same lengthy input whereas the former benchmarks -are executed on strings of varying length. - -There is also a smattering of benchmarks for parsing and compilation. - -Benchmarks are in a separate crate so that its dependencies can be managed -separately from the main regex crate. - -Benchmarking follows a similarly wonky setup as tests. There are multiple entry -points: - -* `bench_rust.rs` - benchmarks `Regex::new` -* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` -* `bench_pcre.rs` - benchmarks PCRE -* `bench_onig.rs` - benchmarks Oniguruma - -The PCRE and Oniguruma benchmarks exist as a comparison point to a mature -regular expression library. In general, this regex library compares favorably -(there are even a few benchmarks that PCRE simply runs too slowly on or -outright can't execute at all). I would love to add other regular expression -library benchmarks (especially RE2). - -If you're hacking on one of the matching engines and just want to see -benchmarks, then all you need to run is: - - $ (cd bench && ./run rust) - -If you want to compare your results with older benchmarks, then try: - - $ (cd bench && ./run rust | tee old) - $ ... make it faster - $ (cd bench && ./run rust | tee new) - $ cargo benchcmp old new --improvements - -The `cargo-benchcmp` utility is available here: -https://github.com/BurntSushi/cargo-benchcmp - -The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See -`./bench/bench --help`. - -## Dev Docs - -When digging your teeth into the codebase for the first time, the -crate documentation can be a great resource. 
By default `rustdoc` -will strip out all documentation of private crate members in an -effort to help consumers of the crate focus on the *interface* -without having to concern themselves with the *implementation*. -Normally this is a great thing, but if you want to start hacking -on regex internals it is not what you want. Many of the private members -of this crate are well documented with rustdoc style comments, and -it would be a shame to miss out on the opportunity that presents. -You can generate the private docs with: - -``` -$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments -``` - -Then just point your browser at `target/doc/regex/index.html`. - -See https://github.com/rust-lang/rust/issues/15347 for more info -about generating developer docs for internal use. diff --git a/README.md b/README.md index 7454c166d..a23a266d3 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,21 @@ $ rebar cmp results.csv See the `rebar` documentation for more details on how it works and how to compare results with other regex engines. + +### Hacking + +The `regex` crate is, for the most part, a pretty thin wrapper around the +[`meta::Regex`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html) +from the +[`regex-automata` crate](https://docs.rs/regex-automata/latest/regex_automata/). +Therefore, if you're looking to work on the internals of this crate, you'll +likely either want to look in `regex-syntax` (for parsing) or `regex-automata` +(for construction of finite automata and the search routines). + +My [blog on regex internals](https://blog.burntsushi.net/regex-internals/) +goes into more depth. + + ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. From f9671471ea8d69242a5aac5f8edc66aabedf3901 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 14:57:53 -0400 Subject: [PATCH 082/136] changelog: add note about decreasing memory usage Ref #1090 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f90e45a8..a813c4fdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ Performance improvements: * [PERF #1051](https://github.com/rust-lang/regex/pull/1051): Unicode character class operations have been optimized in `regex-syntax`. +* [PERF #1090](https://github.com/rust-lang/regex/issues/1090): +Make patterns containing lots of literal characters use less memory. Bug fixes: From ed8032195119caf2d691862efc0f5ff0377c8275 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 15:05:16 -0400 Subject: [PATCH 083/136] test: disable some tests on non-64-bit Some doc tests make 64-bit assumptions and fail on 32-bit. I'd be open to perhaps refactoring the tests somehow to make them work on both, but I literally have no easy way to run doc tests in a 32-bit environment. Without being able to actually run them myself, I don't feel comfortable doing anything other than squashing the tests in that case. Closes #1041 --- regex-lite/src/string.rs | 1 + src/builders.rs | 4 ++++ src/regex/bytes.rs | 1 + src/regex/string.rs | 1 + 4 files changed, 7 insertions(+) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 1c6eb4ab9..af0a5b629 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -2063,6 +2063,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. 
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex_lite::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
diff --git a/src/builders.rs b/src/builders.rs
index 46c4824c5..c111a96c0 100644
--- a/src/builders.rs
+++ b/src/builders.rs
@@ -679,6 +679,7 @@ pub(crate) mod string {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::RegexBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -1246,6 +1247,7 @@ pub(crate) mod string {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::RegexSetBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -1856,6 +1858,7 @@ pub(crate) mod bytes {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::bytes::RegexBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -2428,6 +2431,7 @@ pub(crate) mod bytes {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::bytes::RegexSetBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs
index cc53482cb..c742b095a 100644
--- a/src/regex/bytes.rs
+++ b/src/regex/bytes.rs
@@ -2025,6 +2025,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
 /// This example shows how to create and use `CaptureLocations` in a search.
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex::bytes::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
diff --git a/src/regex/string.rs b/src/regex/string.rs
index d5908ae0d..177a2af34 100644
--- a/src/regex/string.rs
+++ b/src/regex/string.rs
@@ -2028,6 +2028,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
 /// This example shows how to create and use `CaptureLocations` in a search.
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();

From f9671471ea8d69242a5aac5f8edc66aabedf3901 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 15:23:17 -0400
Subject: [PATCH 084/136] syntax: fix panics that occur with non-sensical Ast
 values

These panics I do not believe can occur from an actual pattern, since
the parser will either never produce such things or will return an
error. But still, the Ast->Hir translator shouldn't panic in such cases.

Actually, the non-sensical Ast values are somewhat sensible, and they
don't map to invalid regexes. These panics were likely the result of
the regex crate not supporting empty patterns or "fail" patterns
particularly well in the past. But now that we do, we can just let the
Asts through and generate the Hir you'd expect.

Fixes #1047
---
 CHANGELOG.md                      |  3 ++
 regex-syntax/src/hir/translate.rs | 59 ++++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a813c4fdb..2c0d193a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,9 @@ Bug fixes:
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
 Fix a bug that could result in incorrect match spans when using a Unicode word
 boundary and searching non-ASCII strings.
+* [BUG(regex-syntax) #1047](https://github.com/rust-lang/regex/issues/1047):
+Fix panics that can occur in `Ast->Hir` translation (not reachable from `regex`
+crate).
 * [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088):
 Remove guarantees in the API that connect the `u` flag with a specific HIR
 representation.
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 55ca074fa..2b500cc2f 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -354,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                 .unwrap_or_else(|| self.flags());
             self.push(HirFrame::Group { old_flags });
         }
-        Ast::Concat(ref x) if x.asts.is_empty() => {}
         Ast::Concat(_) => {
             self.push(HirFrame::Concat);
         }
-        Ast::Alternation(ref x) if x.asts.is_empty() => {}
-        Ast::Alternation(_) => {
+        Ast::Alternation(ref x) => {
             self.push(HirFrame::Alternation);
-            self.push(HirFrame::AlternationBranch);
+            if !x.asts.is_empty() {
+                self.push(HirFrame::AlternationBranch);
+            }
         }
         _ => {}
     }
@@ -3652,4 +3652,55 @@ mod tests {
             ]),
         );
     }
+
+    #[test]
+    fn regression_alt_empty_concat() {
+        use crate::ast::{self, Ast};
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::alternation(ast::Alternation {
+            span,
+            asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_empty_alt() {
+        use crate::ast::{self, Ast};
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_singleton_alt() {
+        use crate::{
+            ast::{self, Ast},
+            hir::Dot,
+        };
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![Ast::dot(span)],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+    }
 }

From 674a952cf46318f07b6fde5f4fa14bca2159066a Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 16:06:14 -0400
Subject: [PATCH 085/136] changelog: start filling out the 1.10 release

---
 CHANGELOG.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c0d193a1..b51142218 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,30 @@
-TBD
-===
+1.10.0 (2023-10-09)
+===================
+This is a new minor release of `regex` that adds support for start and end
+word boundary assertions. That is, `\<` and `\>`. The minimum supported Rust
+version has also been raised to 1.65, which was released about one year ago.
+
+The new word boundary assertions are:
+
+* `\<` or `\b{start}`: a Unicode start-of-word boundary (`\W|\A` on the left,
+`\w` on the right).
+* `\>` or `\b{end}`: a Unicode end-of-word boundary (`\w` on the left, `\W|\z`
+on the right).
+* `\b{start-half}`: half of a Unicode start-of-word boundary (`\W|\A` on the
+left).
+* `\b{end-half}`: half of a Unicode end-of-word boundary (`\W|\z` on the
+right).
+
+The `\<` and `\>` are GNU extensions to POSIX regexes. They have been added
+to the `regex` crate because they enjoy somewhat broad support in other regex
+engines as well (for example, vim). The `\b{start}` and `\b{end}` assertions
+are aliases for `\<` and `\>`, respectively.
+
+The `\b{start-half}` and `\b{end-half}` assertions are not found in any
+other regex engine (although regex engines with general look-around support
+can certainly express them). They were added principally to support the
+implementation of word matching in grep programs, where one generally wants to
+be a bit more flexible in what is considered a word boundary.
 
 New features:
 
@@ -27,6 +52,29 @@ crate).
 Remove guarantees in the API that connect the `u` flag with a specific HIR
 representation.
 
+`regex-automata` breaking change release:
+
+This release includes a `regex-automata 0.4.0` breaking change release, which
+was necessary in order to support the new word boundary assertions. For
+example, the `Look` enum has new variants and the `LookSet` type now uses `u32`
+instead of `u16` to represent a bitset of look-around assertions. These are
+overall very minor changes, and most users of `regex-automata` should be able
+to move to `0.4` from `0.3` without any changes at all.
+
+`regex-syntax` breaking change release:
+
+This release also includes a `regex-syntax 0.8.0` breaking change release,
+which, like `regex-automata`, was necessary in order to support the new word
+boundary assertions. This release also includes some changes to the `Ast`
+type to reduce heap usage in some cases. If you are using the `Ast` type
+directly, your code may require some minor modifications. Otherwise, users of
+`regex-syntax 0.7` should be able to migrate to `0.8` without any code changes.
+
+`regex-lite` release:
+
+The `regex-lite 0.1.1` release contains support for the new word boundary
+assertions. There are no breaking changes.
+
 1.9.6 (2023-09-30)
 ==================

From 356d3c950414abfb5ba67124cdbc7ef3d9a018dc Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 21:31:16 -0400
Subject: [PATCH 086/136] automata: fix subtle DFA performance bug

This commit fixes a subtle *performance* bug in the start state
computation. The issue here is rather tricky, but it boils down to the
fact that the way the look-behind assertions are computed in the start
state is not quite precisely equivalent to how they're computed during
normal state generation. Namely, in normal state generation, we only
compute look-behind assertions if the NFA actually has one (or one
similar to it) in its graph somewhere. If it doesn't, then there's no
point in saving whether the assertion is satisfied or not.

Logically speaking, this doesn't matter too much, because if the
look-around assertions don't match up with how they're computed in the
start state, a new state will simply be created. Not a huge deal, but
wasteful.

The real problem is that the new state will no longer be considered a
start state. It will just be like any other normal state. We rely on
being able to detect start states at search time to know when to
trigger the prefilter. So if we re-generate start states as non-start
states, then we may end up not triggering the prefilter. That's bad.

rebar actually caught this bug via the
`imported/sherlock/line-boundary-sherlock-holmes` benchmark, which
recorded a 20x slowdown due to the prefilter not running. Owch! This
specifically was caused by the start states unconditionally attaching
half-starting word boundary assertions whenever they were satisfied,
whereas normal state generation only does this when there is actually a
half-starting word boundary assertion in the NFA.
So this led to re-generating start states needlessly. Interestingly, the start state computation was unconditionally attaching all different types of look-behind assertions, and thus in theory, this problem already existed under different circumstances. My hypothesis is that it wasn't "as bad" because it was mostly limited to line terminators. But the half-starting word boundary assertion is much more broadly applicable. We remedy this not only for the half-starting word boundary assertion, but for all others as well. I also did manual mutation testing in this start state computation and found a few branches not covered by tests. We add those tests here. Thanks rebar! --- regex-automata/src/util/determinize/mod.rs | 102 +++++++++++++-------- testdata/line-terminator.toml | 12 +++ testdata/word-boundary-special.toml | 34 +++++++ 3 files changed, 111 insertions(+), 37 deletions(-) diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index d320fabc3..ba32991d0 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -587,67 +587,95 @@ pub(crate) fn set_lookbehind_from_start( ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); match *start { Start::NonWordByte => { - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::WordByte => { - builder.set_is_from_word(); + if lookset.contains_word() { + builder.set_is_from_word(); + } } Start::Text => { - builder.set_look_have(|have| { - have.insert(Look::Start) - .insert(Look::StartLF) - .insert(Look::StartCRLF) - .insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineLF => { if rev { - builder.set_is_half_crlf(); - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } } else { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } } - if lineterm == b'\n' { + if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineCR => { - if rev { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); - } else { - builder.set_is_half_crlf(); + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } } - if lineterm == b'\r' { + if 
lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::CustomLineTerminator => { - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } // This is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. - if utf8::is_word_byte(lineterm) { - builder.set_is_from_word(); - } else { - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } } } diff --git a/testdata/line-terminator.toml b/testdata/line-terminator.toml index 4de72de31..a398dafa2 100644 --- a/testdata/line-terminator.toml +++ b/testdata/line-terminator.toml @@ -38,6 +38,18 @@ unescape = true line-terminator = '\xFF' utf8 = false +# This tests a tricky case where the line terminator is set to \r. This ensures +# that the StartLF look-behind assertion is tracked when computing the start +# state. +[[test]] +name = "carriage" +regex = '(?m)^[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '\r' + # This tests that we can set the line terminator to a byte corresponding to a # word character, and things work as expected. [[test]] diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml index c1689f5cc..2b5a2a0ac 100644 --- a/testdata/word-boundary-special.toml +++ b/testdata/word-boundary-special.toml @@ -651,3 +651,37 @@ regex = '\b{end-half}' haystack = "b𝛃" matches = [[5, 5]] unicode = true + +# Specialty tests. + +# Since \r is special cased in the start state computation (to deal with CRLF +# mode), this test ensures that the correct start state is computed when the +# pattern starts with a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-carriage" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Since \n is also special cased in the start state computation, this test +# ensures that the correct start state is computed when the pattern starts with +# a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-linefeed" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\nabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Like the carriage return test above, but with a custom line terminator. +[[test]] +name = "word-start-half-ascii-customlineterm" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC!abc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '!' 
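For an end-to-end picture of the case these tests pin down, here is a sketch against the public `regex` API. It is not part of the patch and assumes `RegexBuilder::line_terminator`, which the crate exposes for this configuration:

```
use regex::RegexBuilder;

fn main() {
    // With the line terminator set to \r, (?m)^ must match right after the
    // \r. The bug fixed here was about performance rather than correctness:
    // the start state computed at such positions failed to record
    // look-behind assertions like StartLF, so equivalent non-start states
    // were regenerated and the prefilter stopped being triggered.
    let re = RegexBuilder::new(r"(?m)^[a-z]+")
        .line_terminator(b'\r')
        .build()
        .unwrap();
    let m = re.find("ABC\rabc").unwrap();
    assert_eq!((m.start(), m.end()), (4, 7));
}
```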
From 8e13494bc898db42c64ef6a750203b1e6ce47214 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 22:15:37 -0400 Subject: [PATCH 087/136] msrv: bump to Rust 1.65 This MSRV bump is mostly motivated by "good sense," and in particular, Rust 1.65 means we can use 'let ... else'. We don't actually start peppering the code with 'let ... else' just yet, but we fix a few outstanding small issues and update our Rust version everywhere. Also, Rust 1.65 is about a year old at time of writing. Let's keep the trains moving. --- .github/workflows/ci.yml | 2 +- Cargo.toml | 2 +- README.md | 2 +- regex-automata/Cargo.toml | 1 + regex-automata/src/util/lazy.rs | 6 +---- regex-automata/src/util/look.rs | 3 +-- regex-automata/src/util/pool.rs | 43 +++++++++++++++++++++++++++++---- regex-cli/Cargo.toml | 1 + regex-lite/Cargo.toml | 2 +- regex-lite/README.md | 2 +- regex-syntax/Cargo.toml | 2 +- regex-syntax/src/hir/literal.rs | 21 ++++++---------- regex-syntax/src/lib.rs | 12 --------- 13 files changed, 56 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08cc60d9a..2813a1676 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,7 +141,7 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: 1.60.0 + toolchain: 1.65.0 # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it # turned out that on aarch64, it was using something that wasn't stabilized # until Rust 1.61[1]. (This was an oversight on my part. I had previously diff --git a/Cargo.toml b/Cargo.toml index 46664f669..6f94dc4ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" [workspace] members = [ diff --git a/README.md b/README.md index a23a266d3..f1e4c404a 100644 --- a/README.md +++ b/README.md @@ -307,7 +307,7 @@ goes into more depth. ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if regex 1.0 requires Rust diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 7d47140b0..2d08cec75 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -11,6 +11,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] edition = "2021" autoexamples = false +rust-version = "1.65" [lib] bench = false diff --git a/regex-automata/src/util/lazy.rs b/regex-automata/src/util/lazy.rs index de27a2a6e..0d0b4fb2a 100644 --- a/regex-automata/src/util/lazy.rs +++ b/regex-automata/src/util/lazy.rs @@ -384,11 +384,7 @@ mod lazy { // SAFETY: state is DONE if and only if data has been fully // initialized. At which point, it is safe to drop. unsafe { - // MSRV(1.60): Use assume_init_drop. The below is how - // assume_init_drop is implemented. 
-                core::ptr::drop_in_place(
-                    (*self.data.as_ptr()).as_mut_ptr(),
-                )
+                self.data.get_mut().assume_init_drop();
             }
         }
     }
diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs
index ddf8fb129..73e51c0f6 100644
--- a/regex-automata/src/util/look.rs
+++ b/regex-automata/src/util/look.rs
@@ -1651,8 +1651,7 @@ mod is_word_char {
     fn is_word_character(c: char) -> bool {
         use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
 
-        // MSRV(1.59): Use 'u8::try_from(c)' instead.
-        if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) {
+        if u8::try_from(c).map_or(false, utf8::is_word_byte) {
             return true;
         }
         PERL_WORD
diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs
index 95afa4a0d..d90d4ecff 100644
--- a/regex-automata/src/util/pool.rs
+++ b/regex-automata/src/util/pool.rs
@@ -455,11 +455,44 @@ mod inner {
         /// Create a new pool. The given closure is used to create values in
         /// the pool when necessary.
         pub(super) fn new(create: F) -> Pool<T, F> {
-            // MSRV(1.63): Mark this function as 'const'. I've arranged the
-            // code such that it should "just work." Then mark the public
-            // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
-            // is already 'const', so that should "just work" too.) The only
-            // thing we're waiting for is Mutex::new to be const.
+            // FIXME: Now that we require 1.65+, Mutex::new is available as
+            // const... So we can almost mark this function as const. But of
+            // course, we're creating a Vec of stacks below (we didn't when I
+            // originally wrote this code). It seems like the best way to work
+            // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead
+            // of a `Vec<Stack>`. I refrained from making this change at time
+            // of writing (2023/10/08) because I was making a lot of other
+            // changes at the same time and wanted to do this more carefully.
+            // Namely, because of the cache line optimization, that `[Stack;
+            // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if
+            // at all) that would be.
+            //
+            // Another choice would be to lazily allocate the stacks, but...
+            // I'm not so sure about that. Seems like a fair bit of complexity?
+            //
+            // Maybe there's a simple solution I'm missing.
+            //
+            // ... OK, I tried to fix this. First, I did it by putting `stacks`
+            // in an `UnsafeCell` and using a `Once` to lazily initialize it.
+            // I benchmarked it and everything looked okay. I then made this
+            // function `const` and thought I was just about done. But the
+            // public pool type wraps its inner pool in a `Box` to keep its
+            // size down. Blech.
+            //
+            // So then I thought that I could push the box down into this
+            // type (and leave the non-std version unboxed) and use the same
+            // `UnsafeCell` technique to lazily initialize it. This has the
+            // downside of the `Once` now needing to get hit in the owner fast
+            // path, but maybe that's OK? However, I then realized that we can
+            // only lazily initialize `stacks`, `owner` and `owner_val`. The
+            // `create` function needs to be put somewhere outside of the box.
+            // So now the pool is a `Box`, `Once` and a function. Now we're
+            // starting to defeat the point of boxing in the first place. So I
+            // backed out that change too.
+            //
+            // Back to square one. Maybe we just don't make a pool's
+            // constructor const and live with it. It's probably not a huge
+            // deal.
             let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
             for _ in 0..stacks.capacity() {
                 stacks.push(CacheLine(Mutex::new(vec![])));
             }
diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml
index f9dec0024..b5de2b5e7 100644
--- a/regex-cli/Cargo.toml
+++ b/regex-cli/Cargo.toml
@@ -12,6 +12,7 @@ license = "MIT OR Apache-2.0"
 categories = ["text-processing"]
 autotests = false
 edition = "2021"
+rust-version = "1.65"
 
 [[bin]]
 name = "regex-cli"
diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml
index 1dc144b31..21330fd4e 100644
--- a/regex-lite/Cargo.toml
+++ b/regex-lite/Cargo.toml
@@ -10,7 +10,7 @@ A lightweight regex engine that optimizes for binary size and compilation time.
 """
 workspace = ".."
 edition = "2021"
-rust-version = "1.60.0"
+rust-version = "1.65"
 autotests = false
 
 # Features are documented in the "Crate features" section of the crate docs:
diff --git a/regex-lite/README.md b/regex-lite/README.md
index 34c749b21..758fac6ae 100644
--- a/regex-lite/README.md
+++ b/regex-lite/README.md
@@ -78,7 +78,7 @@ year: 2014, month: 10, day: 14
 
 ### Minimum Rust version policy
 
-This crate's minimum supported `rustc` version is `1.60.0`.
+This crate's minimum supported `rustc` version is `1.65.0`.
 
 The policy is that the minimum Rust version required to use this crate
 can be increased in semver compatible updates.
diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml
index aaceeee7f..e5e541302 100644
--- a/regex-syntax/Cargo.toml
+++ b/regex-syntax/Cargo.toml
@@ -8,7 +8,7 @@ documentation = "https://docs.rs/regex-syntax"
 description = "A regular expression parser."
 workspace = ".."
 edition = "2021"
-rust-version = "1.60.0"
+rust-version = "1.65"
 
 # Features are documented in the "Crate features" section of the crate docs:
 # https://docs.rs/regex-syntax/*/#crate-features
diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs
index afcd506e0..a5a3737f6 100644
--- a/regex-syntax/src/hir/literal.rs
+++ b/regex-syntax/src/hir/literal.rs
@@ -2235,24 +2235,19 @@ impl PreferenceTrie {
     /// after them and because any removed literals are guaranteed to never
     /// match.
     fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
-        use core::cell::RefCell;
-
-        // MSRV(1.61): Use retain_mut here to avoid interior mutability.
-        let trie = RefCell::new(PreferenceTrie {
+        let mut trie = PreferenceTrie {
             states: vec![],
             matches: vec![],
             next_literal_index: 1,
-        });
+        };
         let mut make_inexact = vec![];
-        literals.retain(|lit| {
-            match trie.borrow_mut().insert(lit.as_bytes()) {
-                Ok(_) => true,
-                Err(i) => {
-                    if !keep_exact {
-                        make_inexact.push(i.checked_sub(1).unwrap());
-                    }
-                    false
+        literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+            Ok(_) => true,
+            Err(i) => {
+                if !keep_exact {
+                    make_inexact.push(i.checked_sub(1).unwrap());
                 }
+                false
             }
         });
         for i in make_inexact {
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index 38c8d88d4..20f25db71 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -168,18 +168,6 @@ The following features are available:
 #![forbid(unsafe_code)]
 #![deny(missing_docs, rustdoc::broken_intra_doc_links)]
 #![warn(missing_debug_implementations)]
-// MSRV(1.62): Allow unused warnings. Needed for the 'allow' below,
-// since the warning is no longer triggered in newer Rust releases.
-// Once the 'allow(mutable_borrow_reservation_conflict)' can be
-// removed, we can remove the 'allow(renamed_and_removed_lints)' too.
-#![allow(renamed_and_removed_lints)] -// MSRV(1.62): This gets triggered on Rust <1.62, and since our MSRV -// is Rust 1.60 at the time of writing, a warning is displayed. But -// the lang team decided the code pattern flagged by this warning is -// OK, so the warning is innocuous. We can remove this explicit allow -// once we get to a Rust release where the warning is no longer -// triggered. I believe that's Rust 1.62. -#![allow(mutable_borrow_reservation_conflict)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #[cfg(any(test, feature = "std"))] From 0689353e43ef958f7a547d3c796f59f03c08b531 Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Sat, 15 Jul 2023 16:00:21 +0200 Subject: [PATCH 088/136] fuzz: institute sane limits for arbitrary-based fuzzers Closes #1043 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61570 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62436 --- fuzz/ast-fuzzers.options | 2 ++ fuzz/oss-fuzz-build.sh | 5 ++++- ...e-minimized-ast_fuzz_match-5990349284442112 | Bin 0 -> 169710 bytes ...e-minimized-ast_fuzz_match-6114393576046592 | Bin 0 -> 51466 bytes ...mized-ast_fuzz_match_bytes-4820641084473344 | Bin 0 -> 47681 bytes 5 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 fuzz/ast-fuzzers.options create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 diff --git a/fuzz/ast-fuzzers.options b/fuzz/ast-fuzzers.options new file mode 100644 index 000000000..678d526b1 --- /dev/null +++ b/fuzz/ast-fuzzers.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index f96474739..81f619dcb 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -14,5 +14,8 @@ targets=( ast_fuzz_match_bytes ) for target in "${targets[@]}"; do - cp fuzz/target/x86_64-unknown-linux-gnu/release/$target $OUT/ + cp "fuzz/target/x86_64-unknown-linux-gnu/release/${target}" "${OUT}/" + if [[ "$target" == ast_* ]]; then + cp fuzz/ast-fuzzers.options "${OUT}/${target}.options" + fi done diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 new file mode 100644 index 0000000000000000000000000000000000000000..8de974975d4227f6da75038a92caa08dc6feac23 GIT binary patch literal 169710 zcmeI5F^=Rm7KRJh#20YS8C(koK0yPUj2C+e!*FE4FyO?Q53n875h|J!IGAH7oowR? 
zeE_4EeAFZPC_VK^WYx~Pzk(sY$C5bG&yT-IRdxCF_U-fE-abA)o*v))=ed48{^Q^A zzsE~@{e3Q1i*Vd({Z{3;2IE=xpX!r;va6?ho%grb9?zwIo$BOPv7AY0+`ocp{jJpt z(_J?k)Sa}9m-a#*s9=|i+w$H%sQz^gLmAKI?l1ZG-*3PE`R%v=(mkF(emp%sKm2sc zuXi6FPt{eFaJ83I&2XV*&&8#KUlvCN1HXtM_+2im<4Sq4<4S9tGP(`!r`G0_xSyp5 z{Nk_~@)BVk?6eyEz;6g_%SOPjo`YJn}wDekxA=wOW~sy648xSz!k{0_plJPiDB zzcjd(Y}}y+{J;gCF=ECv5Xb@MG?m2G=Gg zw|W!&zz_VI(Z$juArZDLJ(&At7$jin(YVF&RP@v}hEswc_#G!~^GNVx?w1DFCMLIf z^A+Q_kW{n?wW&MSw$oZAKYOrH@_2;^?A(R5n!B;Zkjdb2-kKx95BE!hs{!LaDfodO z_%&mV2Z0HRu)XFeXq00UJekZSJLU;Z7u+Y6lhsL{Q!@86UCJ-6jYAXsRxD+QXIL;7 zS?nl&(a*6pt;XIgH{dtS->|n8dsEsb4QvU1;McE!Fcq4yY}MIRCnW~Ie(~oPL{z=Y zIwd#Xf?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp z)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7 zANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCu zf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&i zmaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%f zf?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rL zuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchv zAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4 zo>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M80D{q~=f{Ob^)WbYSnz!4(} z&Nj zYc5wWUDe#abnwess$k$3aeV&x@$~ro@Y5;3-hFsHWwGi2{Ax8; zoz-!ryI5P-TBiiRD2#)hR)e3V2mIo&8S)Zgl{2RVKkyp@+p-bxd$w%V*;FSb#{HHY z9jp;i9oN%p+^?V)Bk(&2+ww5*!~Nu$&JvB=)&L&h2?OzY_WfVr2Y%q!j5|K>NJxb3 zHAg|C9Gl?DWG2}$L2bI=KB=6nt}dLCMThB979Hcz1V6`8qA-)Fv)Jsgl*O^OBK9L6 zw>hq-)i1l>!^4N)^>upq&;hx=Ff@0i&4nGNHt$eQH`u`&gb*5KmYQ>_var& zq_1Vo`M0y#Bj)GxU2foVwt|$3<&-aLXu+JbRnql-9RBg@ph`Fi?{fX}@7wdJPAP^{ zlh5huh3T%F4eCx>#*1YLJJ@M8_$e-_1iv_JhP*^j<;*F;5B!F}wrm9a>PdjbjGOP0 zm%%mQ2YzYL4F>Dengc)Z1HWdhu<}hvgzYs)L8Ba-;K^hr*|AD$y5K&koUF6t&}x<* zrb~HrKMqasb1WqaGl@Ej%??Xh9BV6LKl0p9ZO$oiKMM``4YNNH*1=Az!S7Y#S6%cl z-1amCn%tNZgL|kMwlDaBANV!HiH9-?iLkxqC}@;p6Yx;R$T26Yvv*a~YT3~D41QTq z6^x%RiOUpzzJ$LaY|Wx0mre{Ml^b#A&^ z&CS=)$G2gI(U*q4a^{ra2Y%y-GcDnM+g!RLIj6+^ zEHvOZ%>G1J2Rp3>Kkyp{oNWmFxcR2RWx2Rh27cfNe$8Ox!An9SY_B;A8s*poJa{p3 z%*pBuUDdQ&HuUjj`=X}`#+U8mQiLzt^JV+iEIM)-X1c_6N>;wbjAFEE>$;1zb***E z*l|4fbFkBD@U!%QUx(k>Lq%BS%qhVS{46!YGVpVjlTnyS)LCqBKk(~UK(GqUEbz(| zQ;tjke*NOlEr_npO;@YA`5O9IdJLm44SnUzDZvl?#t~;)!u?+F&G$NwssKbaz6>5A z;S;DVjta&{0P#k}M*uv7ZOx)1=T20+aQ2)OOiezgECV}$vxkbJ&iq@enTJ~p!Ot?( zk%8a;#{D|FOb?Z=3yB>YbvD&Wi5WgBVRtE_s!cm3?p@G}5%_gurwxD~?w1Ca5^>iO z{J;MS-pEM;-5t%&`|SC`f1oD%o5(171C`x9Xu?6eyEz;76EwjuE2=9>nW<>F2m_<C#y4bRnuzO(8u=`i=HYN-&c%F5x%d;XTGgjbmTJ3 zbcySftbB_Z#c0*mbr);vTI-at<9P1pV5imKXXydI4!^U9im=L=Q-UA(S!#x5;O8tS zqcD@Gv)JH%;McE!U=^BK;FT+;9GL+8`o*7H5M7;{u2ysNHT1Fc7)DiLkxqC}@;p6FiyBBs*3~O&8oJm6LVmd|J)i z&vYqEk8x;%pJORem`T)GY<5`6;#gY|`;iab)aINL_p{J|-!S_VVIAzW8vMX-7;v^B z@Z;v22AAdHP8s-tANVzcjR!9YiLkxqC}@;p6Y${0$T26YGjvtcYT3}om+gz5Di~k3 zk4q80Y|poyTeIlMWtiy_*C|=~7Bh;`s;%oT*4DMwDPza++|R*ItHICG1AZNTXAc!& zl{2RVKk&2E49mdJSx!b_CQ)ax!TrFmUje}?G_$}fS4=rF0r>TcKer&dIyYUd=H_eY zdu>Y(d-3uBi1pG%2ZvKKW-K;XHkRG5fG`!Bv24}ZR3{|{zkc!O7DQCX^|TuIE9k`t z{5~`M9v(jYuCLR>AMd_&UvIwp^Q5ouVi58#QpML&R9E)R`F#HM=U;yK{`_Nz^tG%x z|8_Qe#Qc0dS!P1Uhi({&g#sg~hFf1K!s-pnDZ%friJuy4AgH>)mM6Y_l*LiO`1(h5 z3cmiql(sdCj+{GD?ZVk}QZP07oH7yW0L~sNiaHY@t!5%*F$6!$P)7!SxL+Dv9S|Kq z6#T#s{F?E@$~Peqwyb=a`(+p;VD8tr#qm`1)HQ}vf*<%DCv5Xb@MG?m2G=Ggw|W!& zzz_VI(Z$juArZDLJ(&At7$jin(YVF&RP=O~99j*2;CGy`%_G5&xnCMwo0#0{P4ELh z@M}gFOOJ#^*s}Cs?w4VZfTc&{7ROW3Q`ZCL8eE&0-0ICS`2BzU z0iX3S$)c?M+MILO(`xR1mK-Ku!`x_${n+lAv(;YGl~lNZ>Y(;=SbAo0R51S3EV?0o zYQ~?MwPw+g%P7;ktW$FH&0ZwIs;%oT*4DMwDPzYm_j9n*YVfo4fM18-*+WHG<;*F; 
z5Bw}O!!q!zmp~RXerb3ZW?nM_&Jsmg_%U1#b$@4ERMAmu^;)o!*M;W#{Dcb;5W?vL|Em_DZvl?h5=_A z0zYoPX>eIC?v#Na_<>(D*m&@gkOYCcNnnk1HsQ(Mn+*KQD?Ei{kECf@vdm=xSm$yewIM++X2}2uHg52-A|w81VXCu z)$+o_uPlxV#>208XX4=(Pa#^f=*YPf)h?VpCk0cJ&nXk34&dyeqNq-|osxOD#Sr`~ zLme6T;eKgwbwG6dQ1Amk@N32oE8m1f*s}6v?w4VZfVp4e7ROW3(^+z8HTZ$wal$r_ z1V83}X>e^~a;rDN5B$Kd8C@(r5)xs{(u28QhCu?B9*tWZPeo5%V>l)Ff!}e$Hje~9 z=6-2#ZDMk(H^C45z^@ryEIkquVaw8kxnG7s0+t?)TO3bCPiM)Y)!+wy#|hg!68xC^ zrNOm{$*tZTgWqTQ|C4=5vN4yQ`mWio-kx<97tX8Ny**2&y;&R;j6ZXU^AP^bg+Fs? z&7vchQKs`-r)1ijy-0#pTi0Ezt!u4Q#*X8+UL5SS8vHCh;Md`I_D~U4Ide+z`)lH- z)5<_Zb%BXN8}J+Ej3TUqomPV%_zeTjHUxh4BEe$DuM%2TZjgZ=_@zO21Grvp z68yjq{F-sXC)5dvu;s}ZOOFhL1gv};w>X}Pp1Q6P960r1W+~RmDdg>a(DPP9#v;2eX{{n8r{}2EG literal 0 HcmV?d00001 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 new file mode 100644 index 0000000000000000000000000000000000000000..a34eeaf2c0d2ba02b5f35ae85b06cfec4f1089b0 GIT binary patch literal 51466 zcmeIzv2Gi60LSrbDi(x?A|#eFAdnbH!Eqc1q*^DsWbKp=nM~z~LNJv8DH0M=pCE5w z1`ln=NGv=;o}v;9D?Gt*=S#x*n{Px!1ysuKKREt+ei}o6-qvq3B&eq}PMt{5i zdv#vdZZsF#FBawEhs9!k-WQAan-})?o5$|?>ftwEKc0N|=<{w|{`4m&Cr3v|H|u^} zpI?rT$Fur;G<#9KRkznZKD^V&H0QVWOXsnus`7qyx2h&`ewxqA?+@qm`3KJ?v)Oq3 z$LSmE&w2OD$?Q@0O0NEVttjqn4n|v_)^91_C~g(&r>FTAH*OVgo`0*`opZN-TXXjH zw69(9aX1_nwbsGG!A5_p*PN9%{`K-;`Qn9#d;j39{q1`$d$7t2SzmAC^6~7CKpQXpGqAjEH@CFYV^`X{{0DuRSM;M0 zg9H$pK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y* zW{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ? zkW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd z3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P z5*Y*W{^w;El2>d86;Cd3lczVUU#HZlk3iK z&we-9fLY_p0A!?e||C<)DNGn f=FS>-p3BoI6|S`Rdhg-iyUo>x&9asVEJiu`sPrYNauaQXt@)zBjvj`?=lw z?)f{pb-|dnAAYX*q}F;lx%~f83^Ia z)560l(1J&Yhr_+0%gh~{p>v}^BMQ&$%X zk&slGBeiHw4~1@ZB9s$m#6FfYrwYlV2%mn2YT*8X}+z0e%pI0Y;Rq1qGrgMqBwW(B}d^0A>MK zsWESolIzFjo=G%P^N5OTnQU1kkwalwj1MrzB0rpced0`WXl(U#%$!3CNiY~KToI`-2OE%aI_D#P^|#8!x_cJ#qJh&hpXxSZ;@%CoH{wE*VXLm zkRn|#2d^u}k;wB%pxa#!xEOE77l1u7)IPo^gn!Gls7$de#b`z};Dl`PE9e&krfJ+^ z2PkERjIPIIg;iJ^=7U`9Uc7j5^_{Cup2Pv($&*43^@7Dub7>@(Wl*ZUQeHLDj1e9? 
zboTC!*_Y}L%z5#%+((A*8QA`5{ku!fTz$)~Cm*@%qwgGfs&M1_W{2yVy$|1Z{Jx>9 zcYosU{9xg8e_1}a^X&I7ugl+F=>OGse{;vVdme49e|Kf~GoSX%-TA@nm#*5oa`j5j zsc8!~{o(xA&OLF9yX}p7qj6)0+qH$#j~;n4p~Os~N6#k2ip%$;3IrppmcW}B z8uUTrh=}7^fVQnI4L-D}06uG1-UKE3pSyCq%sovb;ZRptC=G%Ot#nq%i zfL7>=&*U)qlh7h`7z}ixLF?Hwy$35IdniRnXeO6XdK@GJ{&Df}uAZc_mXM27VEc{> zHgXZj_R;aqNd^PIOp3CaKv8Cr0xucB$ym+gkfyzsm6nU;RgTJX(Si$HJRCR@H01Lk%vOf{8NR#t+HzmicdW@_pQ4g5n7YP&VsfL|%_=P?B(7rnj6QZ(GM(cRYJ zZgBbBjWV}9tEo7ORXk>d_k~#!Z(7)iW*H%uDb4HIcfbfsbq?C8POOy@P^d5`=sflU z8HFX8ARS<7uOKv+(4YY{)PL~72OeY)kPP;0`%!zl$KU8{Z*Ok%w6wQ-+dA6drLi&A z!*AlDlp0OO=V6S>IuORJw7$vMSFe_2?Y$ECFpi(zMMg-Z;WY*)Btv-~mn@;2>f1n6 z5I+e7n7H8bxP7f156IGi`~|5ZTU1wn_~G?9^j{VVJ&po_f(=YcqAvw(7UE-NQbKP6 z{jFzPH>5n}E?V@HLFUR~?KL$>?3}qhoD=IJdXAuSf5b zQF7*yr}nDsXq0~QXe45D8h=b4Y)+Y@;}l4q0t!#lvq}A%#7bk_l&E%_nuKs91@qjc zZsUY?$}`*$;YN8}rc8~?(yOV2HQb{SUZ2p2PiQsDtm?Z)dR0q-$;#3P&bGzKk7o4C zKz`JTj|WUk7fwZfEE_!j6pE7?B5)Z}ks>FI3<+`p6A#QP8*`gLGUVX#plxC#@wQ_7 zZSZ(XdrWiSs;Pw)}cP)lrsQp^3Ehg}UaFC`{jT=TsXYX2?8_*`- zWWQROkR8VxE}b>0RO4v4G>L{vt*wjLet!0VH3c9%E(IrJ?iI*ZB^nl9CG)TbAg~V&3t(_S<@H>;o03R3*x~d+^pFYt4{h#-UD0{+5fdc_xT`rg zv)NKGrGRFpNhQtPNo^!-QHt#V%7$d5$!VNTL+Qw1itJ^TLS5<`nKIb+l#LQt6o9~9 zwrHJBny8au^L;i)X_V~&9-UDdV@GE5{7Dv2Q{^SAZrMX#Q&=c3|=b7*K{R@m&tViF3TMjrQ^6OyC+^$*&js%z6hoqM&V~p)G<0VRDR|pdM0Qr&zz{O6_xhD zl)N6Sc(L|mtrBKx4-9$fM~MO((@F=<_`aA{#0eW|>XmrHeyY*R_y!n|jaFJ&NeA_Y zg+iMzNnMV-8{sf}bB4pOu<89ZUNY-uzkh?D`u!tx06a&e;P6}%wI#4Tr`Cp%)@pY2dIUfystDp2iY8 z>E*=gF{V{xvVA!_zYc4={OiQk-y_kRQ_7&}h&Lw7?2B;$I@tgc6p6t_FUnYw=_VFM*gCgRvZ^+b zN}7@39tQZA;T;K-=(xw8xhSUWuQg7eW^af9^8-2L3QP63S(vIRVm%vB)v4NcH_jy8zQI+7a+RT7UDd0K zaHNBy90t&a;%bAOKpJ0cQ}@EPhAMhqWD{yiqI$v)8NoPhr#x0m>8diWH55DY@dla& z>KH5Jb#%^F+r4_n@fj(=wT2og!rwUs?4f|d(qs6i#zA7G8V9f{@v>lQ62h*UYYk-= zP$Yfp1lJl0b&}saAy1Xd5@Z3_8X9{iL{Cpc6W1D=nHb5nhVmP)qaAh@1_Psjk{_5% zYPaMhopHB&xWu)Fq7qH6H8dNHE6Ql4tL&!gfegLUO|DVQysGiZ{&Xk%jr-SA)lxBENf%I&)1);Vt~C_YH6hRfcmVlfjCS!b5X|LumzN3EOG<;N zQ{e!GV{JRc3JSLxGgvHKq$_+uw;MM2CwqmjZO6K})=;h)Jf@MS?7;hYMl#;VV~+r& zh`aHF_wn}j@m8u)xzOD=lVM$!_E)$|Qpq0uos>4j?^HzA6t6W=4FZFV%#<(H<{Tn%U_ ztDD{i-2^A;Ng;=NVLHukah0VeGP6b(45JoQz#zS7!m{aOZ=0*ZEjwDa8&nV}Uy4Wt z-I}H`OE?#Z;B-3Az)u5^e?sc&;O;tmY)M{#dV^mf-pQClkw;Q*@5p^-@znH!vWw>ridtQ*M0kN zxDXkc`l!wE-Qj9!EOBja2!dc-QBh9C{5^%Wu1%h;P2w>zPo(4+Dk$8B4+-Y%DRi-K ze}m|(6RUhGHngFPsZBxeD Date: Mon, 9 Oct 2023 13:31:11 -0400 Subject: [PATCH 089/136] automata: remove 'is_quit_state' debug assertions It's not feasible for us to check such things when deserializing a DFA, so we just have no real choice but to remove the assert and let the search proceed with incorrect results. I had previously wrote these as real asserts and then swapped them to debug_asserts, but of course, the fuzzer still trips over them. 
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60652 --- ...tomata_deserialize_sparse_dfa-5415338693754880 | Bin 0 -> 992 bytes regex-automata/src/dfa/search.rs | 10 ---------- 2 files changed, 10 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 new file mode 100644 index 0000000000000000000000000000000000000000..cac835c53eda6ae94833f300257aec639d5d6b3c GIT binary patch literal 992 zcmb_Y!3~2j40Hfh;@J&O?7)K!I#&1S7L~dge>ewBDFzw=CpmWR>`Uaw8&{ztfLuht z9OEkMlRm@GPMN61D$F&Cn6+>ZrBLm@n~)%A$SW)_>nMgNphd}`#b1aGREYySj+v-< z*XI>_Ifh~0ogs77L89<;!BdWzdZxm)BoN!fKgw;ivK4+3tmiE)FJRP+7h?(Y-O4>9 COL@rv literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index 8c012a594..5a82261f9 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -176,7 +176,6 @@ fn find_fwd_imp( // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -297,7 +296,6 @@ fn find_rev_imp( } else if dfa.is_dead_state(sid) { return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -600,9 +596,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) @@ -631,9 +624,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } From 39d8b45d0f485376f77fdde316210d7d3fd0e587 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 13:50:42 -0400 Subject: [PATCH 090/136] automata: fix invalid accelerators It's possible for DFA deserialization to result in an otherwise valid DFA, but one that records accelerated DFA states without any actual accelerator. We remedy that by checking for it at deserialization time. 
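For context, here is a round-trip sketch of the serialization API that this validation guards. It is not part of the patch and closely mirrors the dense DFA documentation's example:

```
use regex_automata::{
    dfa::{dense::DFA, Automaton},
    HalfMatch, Input,
};

fn main() {
    // Serialize a dense DFA and deserialize it again. `from_bytes` is the
    // validation point: after this patch, an accelerated state that points
    // at a missing or empty accelerator is rejected with a DeserializeError
    // instead of slipping through and misbehaving at search time.
    let dfa = DFA::new(r"foo[0-9]+").unwrap();
    let (bytes, pad) = dfa.to_bytes_little_endian();
    let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..]).unwrap().0;

    let expected = HalfMatch::must(0, 8);
    let got = dfa.try_search_fwd(&Input::new("foo12345")).unwrap();
    assert_eq!(Some(expected), got);
}
```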
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60739 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61255 fixup --- ...ata_deserialize_dense_dfa-5883983265923072 | Bin 0 -> 2734 bytes ...ata_deserialize_dense_dfa-6363062083649536 | Bin 0 -> 2735 bytes regex-automata/src/dfa/dense.rs | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 new file mode 100644 index 0000000000000000000000000000000000000000..233fcbc950a61bc614dc0e0a7418724fa0c36c56 GIT binary patch literal 2734 zcmZQ%VEF(4KNFB(WMp6fqESE~1R4m) DFA<&'a [u32]> { dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. + for state in dfa.states() { + // If the state is an accel state, then it must have a non-empty + // accelerator. + if dfa.is_accel_state(state.id()) { + let index = dfa.accelerator_index(state.id()); + if index >= dfa.accels.len() { + return Err(DeserializeError::generic( + "found DFA state with invalid accelerator index", + )); + } + let needles = dfa.accels.needles(index); + if !(1 <= needles.len() && needles.len() <= 3) { + return Err(DeserializeError::generic( + "accelerator needles has invalid length", + )); + } + } + } Ok((dfa, nread)) } From fc9a11a452adbd262d63990d6be813b577b96687 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 13:54:25 -0400 Subject: [PATCH 091/136] lite: reduce size limit to avoid timeouts Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60779 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61434 --- fuzz/fuzz_targets/fuzz_regex_lite_match.rs | 7 +++++-- ...ized-fuzz_regex_lite_match-5690981331369984 | Bin 0 -> 133532 bytes ...ized-fuzz_regex_lite_match-5888324890656768 | Bin 0 -> 233677 bytes 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs index 579078c71..155fa6d8d 100644 --- a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -57,8 +57,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .dot_matches_new_line(case.dot_matches_new_line) .swap_greed(case.swap_greed) .ignore_whitespace(case.ignore_whitespace) - .size_limit(1<<20) - .build() else { return Corpus::Reject }; + .size_limit(1 << 16) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 new file mode 100644 index 0000000000000000000000000000000000000000..d892bc31c496d70f689adcd583a32f42896f5fb5 GIT binary patch literal 133532 zcmeHQ&8j5HbsnLO+@|;1*$jx0rgcTTJv~3*VGxZPc{@U55YlXfAHX1kp_#|<6Zi!T 
z-dHf&NPE43Hrn_#TyaiDeDR$;aUwFSvTok0yo=weC!VMfg{6gWy9s_U%3(U{Y6CC_ak>g{ zNT7vG6vjcUo^TE&0f0XlJ%Ng4^qhb`a7s~8?vy?|<5c;kG=aBMRX=vqFlN`+Y2*vK zS*?^sb1B;|A0SteW#}7z>NS-X@`Da?VH7k~oo*c0@iVPXWE%-63X)LIx&=txPhCL5 zrY$nBvXMzF33Hv65JA5QD;vOBEt`SoY%uA;3`V3XWTJ2^#OetLo~KMKY$Slz5mEsT zA!P7e&QbKnhI$Hs`v5wn_fzGY5(K_NBMrS!$xlx>@Vu_7fkDJ$M!pWkfx1CAtAUqB zeb~O?r(RQ;K1hy?`nfO)s)d>8t}$QzfHNK_`Khk3MFI+gBowZ0f%4x99uh88^+!A* z5yePP)(urXKj^>0t6Il5SqT_b+0Bide1h+r3KSZuLJo`(t0(M3RZ!|bNRA4mv~79I z=FQue(_N!=x0r_RtK9Nk7`fvX78d(A98<%G$Clpjj7;0EIOkLFy%?oM-tVxu61Nnk(Rs%1M`mlY&LpA+pbBOUx3B-&pX#g`uq$*^ha8ShZg$*(h z>gNJ@FzO+PBu2gt!RCk*S|lI={k)b?%2l%2i%j_I(hJi%nxZ=W1E-D}R2dz|7)e8v zA!M{lsU}a8`Jn$wM^1$-`-W|IVIQ0b5Aj5WDC{=RBtunC0dU`BC$j7;MKl%*Vy_e$ zs=ZJNa!=TYDq2&JL2>{Pqrf19%*fZFI8Zm}W;O8As1MsW{M2hIErjIAsGkd?pjw@7 z9M|zPT|tWk6a`7BXWarM@Q|=+i_EKRWD-llT&E>O&~L)Z25?qWJMf$hCOw$Z(es3q zbcIe74u)7g;lTB}PO|us3vg!!&VdX0>dX*)Lzu5 zwcC8UmyfhP{bl_13H)G>Cu<|YMNap>oML0MRLgd%_gk<^%8qo3x?_wUe~TXA8M!O%x>7saOvqC4Q{V|*GsPC zZWkA^X)$P?p)N`Y`eBjU5@EOqPIVTn7)yM!tRP|1CH5{IpnHzst^Y4icg6ji%V#c$ zK~}C6YyjbG*7-3!se7|{Dj}oJ^B@0lN&P&-+QajY*`u8;p6rM_i$V7vd@?^gf{Y(3 z_}<9ZiP9rhA_2`GNvLPtBE7&v!W>?Cl2+LW#6_yp5-Ky{U~ho5JTVAt);G8QNDyh$ z+){yrL=?_>&a|Pb&0q@nCOfsy&Qh^}+Bc=K1Z7Q#NwpX1B4Hn@f>P)~a#X;iv=hph zj(k@4aZejfSRU>+4M~6u)x}sRo~RIorDy!(ZRtjqXPJVWjOy+;Io`B_*)RK?R+V7F z`Au=GMy9OT&52Ic1T`M{I)3V>4~nEZX0S`B)Tc&f68&4DH%RX$+33!_z+~V6-UIMZ!5Rj_3YC5&$s@ z3_{3^d>x7d`GRg%122vGuzkbs@-}zIiSkW}Rz}APFi5IGCJF~dEMGYJvCY%Z1@K_h zLk>xdd>w+#5h=7tKmwvq%}nP2>Q&hsbYzZrVuC0J-DKTR)$;?wmQmF?zR60!sH#3f zsS4EDhk;5M}WHcQzhmIyt z4GvEbG9zDyVDkY}p+y3EJg+6DBfN1fyS3PjU%@U4RgyEc0T;kzF&D5Ul=-O!P$ev^ z>MJx4B#2anOq7Ca z&YDwLIPko#s)0eoV@AGCBSh$CH4~8t96$Mxva-_>0$QWSBB1L5NoehLyLP##Y>;j4 zB9ZD$bCsEJIO@btV(L0c2A*?QvqD2v$bm6p^@Ib@>lz85bu^`FI1{VTb0PY`$w^V} zl-^I3Z%ShK6_6lO6*5r@t~qN?Vd21YKRE!1QD6{4X5{NMLWFKsGZCr41LR~>mCQF> z2^dw?M|;o^4>Duc&e)G8SZheNj{kGrVpM`(k0?zc-& zsn79!5b~+bv^jlXql{DLk1ql+`B=CH>_xovdNkFJo9SGx=<`cM2|aV|nw}&wsAYRY zFt@0CT1Cpbepv_0GSHJA%FvmG>|wYrKZ^LeNIY*5x{Vr2j(F86NP%_Uc{$F|gA;uS znUSyKUlNI`UKd3IdSyfs>QuMNPFp}BVRQXfUchCQhO&?YbHwTi`^gcl>GE*s$_Y>r z8fe)GcxlwPPmvVmNw4g~kHVJhAMY2RGHfzarAKl!*qx&0m!h^J3do=ao=^tbWA%o{99HZ!s z4fXT@hcc?>q7R@FFsiDLPzGah6=0fB2$?7x6tQ~37q4ny5b>Ci=onga*LOfcBOkDm z)xb+5E#&bi7rFQg3UCM^gFg-jjZf!N z&aPh7piNL#GxBvP4%8mHSq;22>NlfrxHFxzGoE<9DS_xK06nP+nJ63-v3y~fJ}@t% zelCoHYGEe2Ys|O~!RCk*S|p$-NJ2g979fF#gpEJqi3(AS^km&o)zbqU%Bbob-()3V zR8=3LRQdKz1qv`tszN3T2Su!&aM09KkU??)u-tjtU~oLjU7ysUo09vwNjwBOyfJ@# zAZC5TY@vC6MUM}}oZdCa&$n;!TdA+~fW&^Q!l>pL-lzTkNt$IR-51`1X(@92p+aDAk29%dGUPFL2^V1>(sy~&~ zDVD2Bp`j||z!-kg63~^7B-E*HmCYDQ*tA9FRW>q- zB~<_zv7blpSMxsy-n0Rn)q)*(&IUO$b$#pKqOBL+_LhnFO9{s}N0rO9D-O=TN9LLvOwx;sJld>3?vIyRF8%oEtJnB-C^0fr#Qi^RniBE^ zqZxu6zBaJ(*ty{>k|(~^=ez8=_`dpSr~(&*JmhVQdLIYLvT9y&aDKTzdiB{ypMCtv ztIu9fUlpJeYdD7U5v*Kbo=U?ZL9%Y0HhX+i1t@Q8VGWnMtMtBiXK1T~-OJks51`1Xx*Kme%69L- zdy=5}H{qkZ_M#gJC<>BLf4T)opdew>7MWMs$Rw78xlW!$FbCeW0i4I<4CDey4vKJm z+H9AfHj^q&eUCcEqdZNLu1fCUM?r(b%OchW$8pI~XZ@Myw6zGKHoi_Y2Sqm$&`C`a z>QA@IM%@#}B>X%1Yc%-t@~#3aB9OU1qHrdNH5icKhCdli1$pqv?9<3pyRD40aEfDZ^?t|U45WU;EuP{LS;V6(ngpe8eIur*&gl<*?FOB-JeZx<^rrw&+&jqrHQO+y3FZkZb*YPtg z`U)))kbo|~#7nmbgue5iAFX9FphY>i*ZgaQH*a6j!wdJf9gmxOZt{jMq@Lg%V%Bf< z`1Q01tuLUbIW!^mb#V5=sA==2pd(w_1ukeK|DzxQMe&z{6~DS=^(qp1PhWw ztXZbenKG)Onn;#*^D4jWv%lPulcGH7mHi+{uzXXqtUv)HQWY{$7zeR>!gW31Pe%RZ z7zNcr8Qr7kxDLgE5Mdx=2QQ8KuzkZrH9fwj=#*hp0Ha)sRDDqWObcJFL;~7KwZzm~ zu#ZtKcH_S|qv@M+k^UhI7~N4m7KNJo^onbSR;p1NgX4CpniJh>P~-z6Ux(_8e9hIs zOQTJX13Rv%Ktpn5)X#-cP_0fkj_Wi^h%!Y2ih?B6vu*(rcu3f^1@T0MC`Nj+Zm8<% 
z0S;v}WjV?ECMyA>s`?0}TGM?~fdWjEs*s7oK@qDb?9WsMrT&BDs6g_*{gnp4Sr75#G3#-CFF%uV5F2D#@AJfD2%< zmxynp99ChL+F?F3J1JAjuS)rjSf+;k=6m`N$Lc`N7>Vcq1*t7-lM1?3u zda`b)>gfRvWmI(zo2(dCC8IZVwU?Qv{~n*5hyYO-GqHSOac3M&(1TEog6!rQnJjvI zkLK$TY>r5wMFJ9#^LfAJuGErjJ?sQRGay?};U+VByw)|96)lRELB~*^Ks1f`WKUEA z=^6ict*epcS*9Q-qk1jiifsEUOxZ8{>@N;}Q_Q;0m_;*I>#=WYmKBpQ*(KFEg}pBl z&Us0Qjpu%HjDqY220igWhvGnpu##PYmqvZqzTu&owqNV2)H5#5sqZz2AAyppkcq
WzFI zKlRi3OsZp6+n}ajfAO2|eDB4(*FXN`n|B|7^Ww#eXYXE~#vV=ix;+V2e>3&AXm$WI z&$6NSPJh2lo0spo`rn@f6(J0KHyfDSKIP7oA*XV;9V?521+DwR>@^Cq>xb$|g68Wq z8dO?SbRz*>?MOoX=~mg}<(L~HikTvlbwgFpK{|btwwKgQvAsNZh~1uNh-pGGrxeF? z)||q^K2*_~f(())VzSxd#f5D1cmpYCV)_4}mp}IP2GS7Z@O_oz4W#9bvZcoP6@8>P zkiOh9D`U1LcW?4owyKKtVC zdS7@uRG)3HWG@`-4g5l7IHQPpd*!3>Hr$HRj<}x>J0B6+(f#+CEt`fbo-!zLdbzJV)cZ5s0vE`2gy+ZlhRHoV>-9@>%&u85I&U&IfOti4R~pE zuPrP2Lvm!)&xKJ?t&4PMszsxOC{rY$C`dxl>J}go0|}e9$h^u%Cb1;UwN|nBD~8{M zl?~v0>?2ruRsD*8i=GZ3VKQ3RxW-8LDbeXN=X8_I-udw^irQBt+g%j3E;Lrshptcd z70nRj@UrkpT~bXqY*SD6UY84kUfCYkQ9Hk)1IfAN3A~Ftg=hE5(FW^NIF!3vTgO+< z`s3Bw(r2gB966}ujh_3ok29}c<3_L;d4hdcf}anbU^GLJgA;=xeec-0;Vf$RpPq3) z!FB9nkcaBEMZJ%Uo%wpPJ>~pzfAs3Jk3ReOlUJX;rVEH+=;tF?d6<4G4Tl7kqv&vQ zuO|qU!K3kutx^|>YY1|<)Lo_buX%8M_3@_zv*`(z@nVqAZ<>JwdnJD=%T+KnQ|_i4 z(|q&0f8*7^XsHy^_DT46^4IuG(DQrv zym5VWrSzEu6HkqXi#r*TwfDvYQTWnQaRl~CL2*e14QLonwQ20kDx?qFUYI< znO#8UX-}P2IdTd$PT6KsGZDy1bCi?SwBfcv;5i%QMpOUCDb7IRxDx#-;G68M5`L9@ zQ&Kx$0c}O9LM93aMXa81;JKe10K_OT2q81_btn#m2;Hm(UK;gb`-Y!-O^+qsTnd=B zpsGs~5yePP3$vkW?>(ReWmI*JZ?Y0Fs;Ym}QHzpn{w zUenpQTw9qx+F*TVC3kOrlKxWb|Mt`L5X!!#erxOa%2~hRyGiQ{=xGj3h`Joh;T8Q)|I1MzvV?OAZzZ z`;jR`^&hkv0JcY%FpwdHUVjIu#0nXhQ`qDp;lT6M8CTB7D)gLyK5%kUlqbEij{?fb zPnB;fP=HlZ6*5s62eEp>K2!yz{ufp?FsSwzta`#!s18OH2ee=%y80dO1l2bs zd|#oVD&)Wzv3kOR=PAe_IRIF07H%*&e*1HMQitv%@9V+HAAbMmZ@>NHU;bKdb9)cy z!du_RTMO$OW(&=sCJZaO^7-YBe3sCDxyTkZ@@-;)$7ta2BF9>2$Y{yY1ggQ|2|{M% z>kw@3EFcollZRSDDb>*(AZoE2zk*#9s${5h=A<%Ha=A)K@bUsr2Gc*7!Dvw!i-dis z8m0OlI=uo^J3?gO5JFDkanjtl;ZHe}R1(Ex5m~qWw9{n{&5=9OL?i-V2F>WFqcfee zGoE<9DM8>X06nP+nJ63-v3y}EB+ScbrzK3WDKv_UL!RZ3jPIa39nLB4Kc1~c8*$F8sAXIk?q+eknYP7>-*x2T8i zr!F92(-y=VbZ#E__tQyEK5w^FNf)qVRG6m_LB)ui2$?7xH?e%-yXhQ1v6LZBp05y9 zLQ8%(89iR>n$7V|*QMD6Zkz+~N;uO(2V*n~gr|ZC=K0{Th@Wx*%NLzsfiYrU#Ah`T+KwJ0^8KI{aRPm5{tq~;C%%IN|_KcQ8<2L^@Ib@>&g$%I!g0g z|G*)H41U=Hn$a5@>S>z>SQ1N){PZ8eV?=2XGEq1vV)cY`UJ?NKlhL}W1_q6N?D{$s z2SS{yftN;o*uLROPmO*3#c#gzy%+CZ|M-({-hKSdix)4Ry?c2Ydo<`zK-8VqObXl1T=pnp`LY%^a2kFb9l#M*f`CmoJlH)VzP*=pR*YH53nyoJ>EcC zpU0t>Klb$o(h%hEeU;-4r1cH6h35GceWW*#zT6@k%WO*Sj_*x5eHTXWSDo;>nPhmV z>Grkqd7a^WJqmvl#l}A~G-cI;HcA_+M7W@@oc_lbsEn5A>T*#}Tpi5JVT7){7`;Wu znl&r|JDs27a!5wgh3s(+c)Ia%4>t#ct2bh;yVUcq^G(Z{-J)aYvRjz*&K3S5%mc!5 z0c(KVew8C!9k`h}q2XpjfMjDqyrI#fyaNdcdEI zc6uxV%8Z@+s1KVGRFpdtQJu9P6c0Y80eV6qWTF&YbJm=agr$36UPk>~7zNeRM)VAN z;(<;OkwS|E6h$pD;qpM#V#lYaOolqGO$Fq>gP}axQVl?*w6m(O&`=d}V2oHj;lT5{ z2?5YLn)+}!6RXghY^vu61Es1gIVsAOfKioQKUKb|Kmn#nRmemsxaO=mg@t{nqP4E7 zfkCnAZG4?Zh%k`VOhhVhcQuvigXG9)rzK1UQ)rDEi-4{NB%$HycI|Re*&y58MRocd zjvA?8+SBf$lbbZbg;wC%S5Vb+fAs2u&*RS@Ag2}9Iu*gJLU8R7>_etb#p$8WV?W`T z-TmE+FmzjaUyr1QAcuFtkB9M__0uZD=T~&)(>Ofa*!S-}1X-2b9S`I0_F??g8bed) z*4FWrv;KGnAFL-EvyLAz**3w^IOTyl^Q8ycO<;yx(lX+GqNESqp~hGu9nD3APPmUv0KxGR;+PuYB_u zx1vbyc5w|G-uYV;KKfyi+7e;7c-x+83s#IJzFAg~FzFI|mk!W9NATAFm!~`D{%zpx zR<3IWdx^rdY@8p%le#sF=La&H9(0wbE|!f457mQ>`pLtP0)$9a$V6cr#PWp&AI!_> zcz9RRSl$5C6!|&y(^qU5w(RC-Z7re9PvwC=)st0y1yw!w;a&2y&h@%ZLclu090Di8 zLp)I-ik~<`RZqcFN8jXAUk;GqprO`PH82RTW;EB44`J`(w6S$O9HyUt;4HTW@~NPt zZ&eSP#GBoD9eQhm81Uzag49V(66#d9tvUZY`D?weeGebQ89gA(Q%e+H@3bbGdWQo> z?u3?5=BN5WEAZ?qG*pEg7$a6sIPkn~g#&0EwR*VKy^P+|wjQ_&en4c^C=8~EOels+ zM3}&kX@hRcfT61A2mM!g)uQK{tOSgz>@Lq8>Qgm3OcM$r6Quy3v*r{Q_MwW_y1oqz zidApp>oh`yfvjdCQh_H}Q%$(E0)mv$PD_yzuh1H`5&`XPlF;yUi`GZbBy7ri4~OL- zequ?+h+>k3t$}nA3I-7F3AsLs5)1M%}t&_j=I{Ar9?L{5_rxAlL{0t zB2^(1g>ew8CtQ1GGsmZ6Z2_-c5BQVOPLD-EnR|)*s1KVOwkS_}WgmVt8&aL))3NzD z1;J#p4)ZIzjl+d`cuTM|PAb#*`` zgRB4{QWY{$7zeR>!ah_5rT+I^v5cM*FqJR>PAMwNoziD#TqNI=CNN1(f11nUN_I(g zPB}x^hpM0yNst^BVDEuJPh36a*iZin+a literal 0 HcmV?d00001 diff 
--git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 new file mode 100644 index 0000000000000000000000000000000000000000..86126585263977b28dc771444064df04f8961bdf GIT binary patch literal 233677 zcmeI*&2Agl6$fx;6M%~X-t+;yC?Ewb%n>P34ip&3(tUw0gf4>W4T5IT&dP7$7c$TS z>DG_a`JRj6mFA92i5&7LAh9Axwj|Cy|Bv%O=idKgQM~=7c>DXWZ}0Q_>RmSSCTrA2DzWrtJf&2V#JHKCk(oY|Y|NGy0@ykDdxc}|P`|rxnzs{G7{NdvAU#on1 zoxl6Tn@#s$)q8pM8~q3Kcgx^^X+HVqyYiFjFX^HC!Lp>2yt z+7mPO+&P=hCO|396x@829*naLRR*8CxI|k^IzNRu<8%1jv?oTTRcTe)=^lkjtJ12p zB|RAk9e@H*017~9ni0eD05{?03~{sThiR^LZALxBvRSHCsw1rNJ;|jc5l93QfkYq? zB&h_(%BY;|%}!>q2@*v_M)Ny6pP^l>U0elVc}}9a637W*ZbFO@Bg6LY5S!`5zqtl06owFJcOcJuXApsHN96i_{J!pw%q=_YkSQ(X-lY?zxTkA|Vumadns-+!_ zF)^c}@2{fU-(`Q7{ar4kjoU^ltx9W0P~jB~@&_D54k8DUgIEb1o>P$MwF;^8$fnS0 zfC5ke3P1rU1EyVjftm~Dg8@(oF)8dTw=@Fd?AS}p|E=#&B>57PCR$5gYxd#`Pbfvj@^d`MYZ_=CeCcSArMY*@< zrE&Aq=dX!>2RquzH7nO@O(#|*RwY&?R#hVOSl9<{!cDk2Yux<)Ia$C&TD?koyRySm zrEQbDBg(jGJIL&D-Cx>0NOvfXh6TASS+_dXRy&%9R>Z!Cf3h((==NSd$eh@QT< zzPP@)zPP@)zIX=;@%?)wUtoN|O}Gg+Um7=e1HO{alFu#}k`j>;krI&-krL@OOaESe zu*&lXjDKQv@F9TDq35rEBS0x|Xh`YkM`@zjruY>ta3^^G(5K z@kWzLCX;48nG|Fl>eI}?WMDEd8JG-A1||bDnA>fxLNXj#;Nyr`6(J31)!ZYb-+m>H z2&o9^OGili^#(}>hAV0MA6P6#qk^AM1y5WgGIyZzZH75|uO`}ob{rY)Xo928rnBj6 zI-91ki)SDtEm}~$EKAa{042M*-L6+_fC5ke3P1rUr_!5&go}r6&~Q0;t|>@>5AXp# zz#kMw3KDL@P4oagKo8IZ^x$cFz&>T479EJ`6Fg%HSSjJ6^qh(l)m{S=M8ftiGmCLCwsHG;dJ4X=B=$Hl~effA){r$NCe^u5>WzA%-ip&OASu)v_14zNG!{B$L#wnZtxBuX?nj7a6@?ps zmJ(!7rahT8&>CyQX<1%pVtK7seHvNHrghYZnif4k56}bjplP@b%LB`c<;C(UJ0oOu zY*!-#lYz;=+$|5;wi}z9TM?pi-5Hti0DH|NIDuRZfA!ta9hPYPIwTe^P zUi@HwFh7_d%n#mVF5+_$pNse+OR^ie6&cPV#0W7$j1b#Tpf!3IFoi;)P$(1%g+ig+ zZYY#vP$=%{4ZHlh)hE%dc0E)OppR&M(T#6j#3Ium(;(9jjn?Ls!Wr2!sRSuO>ukHW zN!ljq$1kkn7;+3bh8#nVA;*woSc>{I-V8V4CftOZa1(CAO}H8L>FV-0yP>R4pTA;3 zVbc=XL-vq8WDnUx_K-bf@07@%SM&H@&3Rrr+T{^s{7xd{*KBF>YI-$g@d}Wz=Fonj zU03KRvUnW3<0El!6K=vyxGBRPt<6V)o6L{sYctcnrB~Cd`9!n*`!}6KRX23^Sc2y1 z*ws5ks@`r|>Jg{KX>nSd_TZfM!`<2%rR|yAR_EI+neC!_Gs)MJICk`r+R^mF|#|#gLt4&>P z>S|LXJFWzexP0hA*@2i+9H{mAS?50Gbmc2`)!!AGEJPF)%iFD77MJq?fTb=onIP;dux{ud= zypLM~`lkA(`lkA(`lkA(`lkA(QDe+troqhR7*jNuX)tr0ZP)ZBz4>gt*&CGN;ycqM zEJ>E6_PzGK_I*^P7_%~FRd1LdJRWcpZo*Bt2{#9Y@jT<=A^|KK7R|FQnx<12<|EIu zj;>*-d{P;4sdd}&GfMrxcMC1%*rye zjGnZHxh`FO*B`^`J!!q_p{b^iQFqjx7sLxX4KHZD>Yd|v^@}=Xm}kuA;05u5Tqd&1 zH&OTJQ1|G-fD&XWj-@!$p<4-3g1GLypo6;Z?&)Q%vDTuz7Cysg_za&9iqEbJcU3q) zSi86w!PONHVgb71opPH}9&lQm7N^B&aoTA9j2FZU;ss^5>y44StM`IA7njw&Uq?xEuaorD)uIK#m_NKI7eSUQ%DrDLp(3Fop(3F&=Y&dPv8OlbO?s2wq&Ht$Z@P3;rB!KFT9x*t zD{akbKiru;li5o%Nm*VjFP0a}Yp<496`N1nwbONa*fB%`kw7F639lm(4E7l8xkzl4 z5D7%Wd5eVIF{fHGq>)?$#eE{&rp28_tfUX=L;8?Dqz~!S`C;*8)oG@Fw(S-h?-Cb# zP0DkIhM*z#bJ}ZSuSt(M{d@MBz)gEi65UbqoqV50DEaC9^z-)9`N2=@g9j)81)%VQ z`N90)5lx=z2fG5*6{u4zZwiG%p-_wk>~$;vr^RVsg44Q^ChS&l4^F~8a3xLe5yn!s zl#d;N={o?+XDRK8p?oMG%7^lyeCzDM3n*xcG?t zOlyGyl;Z&uz9L_dugF&%DeSM{E4sz$cu>J^4%%4e`%3<^sr&UWQ(^~Vu41lYuG7yA zZHxt-iP<{g7AN69S%6p;pn8O+&=TcUc|DWz>f1c#mYq9vjlLLtG5Yc*Fy3n-^9oiw z7K8;&P+#36>bddUpq0l=E8V(^R-%8#f^v@0hp-)Mgl%Uv0oR)!eOjAlkNquI!@k>lRTi{2Uy zPy)UIH{mASgqv^^Zog6wwXQjpS&RGk+3i-oji9b1lrZFn7czGR0qrrIDYJ=!WQq6{}69m_DmBcD$4z zB`CKXFJBH!D?*GA_nC|8$P7@wMrR@z$NC*jUR-2pmu61bV^JtDo_5+Ys zuQvw0%B(1`q98qU=4?8fR{Knc?js=ME$;|y?^DzqzRhyE@cA@p|8*04=}@sM9HTNXe3V-}0W zp9j{PyGhcZ-LgieKuF_5WRc_~GNnzkd8b DTKo+f literal 0 HcmV?d00001 From 914198fd288329cfb67290076286e32296496be0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 14:02:40 -0400 Subject: [PATCH 
092/136] regex: reject large patterns when fuzzing Otherwise we risk timeouts. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61484 --- fuzz/fuzz_targets/fuzz_regex_match.rs | 10 ++++++++-- ...minimized-fuzz_regex_match-6659953212129280 | Bin 0 -> 399135 bytes 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index 6c375510d..a5dda53d6 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -54,6 +54,9 @@ re.is_match({haystack:?}); fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); + if case.pattern.len() > (16 * (1 << 10)) { + return Corpus::Reject; + } if case.haystack.len() > (16 * (1 << 10)) { return Corpus::Reject; } @@ -65,8 +68,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .ignore_whitespace(case.ignore_whitespace) .unicode(case.unicode) .octal(case.octal) - .size_limit(1<<18) - .build() else { return Corpus::Reject }; + .size_limit(1 << 18) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 new file mode 100644 index 0000000000000000000000000000000000000000..b8cdc138a42791040d908484abc77478ca1defec GIT binary patch literal 399135 zcmeI*Ps^tFS;q1ENi$g5B8ZEMB5fgc^ogStUDTkYbnCKEg)o>bvXDv^QlW)OX|OM3 z-hiv_Y)jsX@1WqU+yrmtAV(g)lRV~{oWJ|;H<)ss(<|royFS0;d|H?$-rPL7;s5=3 z{|gWKi+Sv3x7*$Q3+{Q;$L{Le-2?DX%fC-kyPk>bQaetP$K_n*x@q`+OfeSJS)ANH|Qq!45i z2vaEECcZwbTPacqvI)dJq`)S=K5XntkwTD7AWWfroA~;$Zly>e$R-f?kOG_d`mnJp zMG8SSfiQ*gZQ|?0x|Je@Ae%tkLkeu->%+#b6e$GR1i}=`w~4P0>sE>sf@}hD4=J#T zuMZo$Qlt=M69`i%-zL63tXnBk2(k&pJ*2=UzCLX1N|8d4O(0C6e4F_Cux_PDA;=~W z_mBdc`1-K1D@6)HHi0mO@@?Yl!@89sg&>;@s2O54rd|bq^`fyEnw35M&()Q+V+| zcuMJ)mcj)C#oADQUFYDjvx>~JCYEl zaAar?>au+Jic+6e$D|1OjQ@LkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4L zBMD&&M~3#GPAf$UK?H$7TKA9wU~+Q=fdJZ(gfN98LwiuCl_G^8fmE`7Om2=K5I{SU5T5D26_#vVR<`oV8*j`3n&`hoyL;m?U9g18bC zIO1jW)viP-Qd~R;Q+V+o+za+gOK~Mx@AsbI)w_oj=xv!!PzbUPgeg419~J$0&sGXo z0u<*C;vP}}7wqy20s*um31JFHhW4OND@6)H1c5+W_mBc$a&rWM0NRm+Foh#Sdr+s9 zB84D=Kp?GqNC7apIf6g{?MOnH!jYjpsMAW3LJ&b9kk&n<0GQkyK_Gy3Bq2=U$j~0t zX{AUZh#(M1>mE`7Om2=K5I{SU5T5D27o4=Dg9H%AZ%pdCpFQ#dlT z2X$I0QV1dl1k#?^9zJ{e^IzWk_p2IYT%#Zt19D=!=pH^7aKSES!TKTh->G{@fx@M@ zpRW)5SSfS^#peNG3e6J`1rY=SXh#y_9#UY1a%Ya*nbRtO;(U3T!t?!-UWuPjiWGvJ zK8Slrfz!XUp77`kcJu?3;>E(V!DIx4+cGk=2X$I0QV1dl1k$>P6abT(BM1c0jwFOB z92we!I;|8b1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$?jZ%h(97s;r4U3A2%sHFh_RS*cE9Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np(8QOz7 ztrRH)5d;Eh-9rk1$;}Z20%%7P!W51S?LnPZiWGtf0)e#dAqBwX<_H1-v?B>&3P*ADQUFYDjvx>~JCYEl zaAar?>au+Jic+6e$D|1OjQ@LkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4L zBMD&&M~3#GPAf$UK?H$7TKA9wU~+Q=fdJZ(gfN98LwiuCl_G^8ffJ*M z^tMbVC|4=J!hxid%Z z%xM)salX7v;rV_^uf$I%MG8SqAH+SR!0F#vPw0Y`BE|WFFooy)CA|_qp%mwy&VIG{@f!@6#28AH&K$yad|G`sAzqAxC7$^=8;vP}}SK_(~0s*um31JFHhW4OND@6)H z1c5+W_mBc$a&rWM0NRm+Foh#Sdr+s9B84D=Kp?GqNC7apIf6g{?MOnH!jYjpsMAW3 zLJ&b9kk&n<0GQkyK_Gy3Bq2=U$j~0tX{AUZh#(M1>mE`7Om2=K5I{SU5T5D27o4=Dg9H%AZ%pdCpFQ#dlT2X$I0QV1dl1k$>P6abT(BM1c0jwFOB92we! 
zI;|8b1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$7r2N0&O?U&0v!*@ca%Hx zr4>MNzE|%aQs82)wjXl7>Aap#U<$pn-hB!|1c3nBk%YL16j-6$nIm_0TGJkUI^YF< z>1X^uu^IVNB906_t)Q&r^Ghp$!XRw|aSti5iLVbEyHcbOWD^KeDBmW&KCD|QQV6mM z#66_ICcZvw>`IYBkWC;=p?sV8`mk=LNFm535ciM*oA~;$u`5LiK{kOfh4O9U>%+R0 zB84EEK-@zLY~t&~#;z181la__6w0@WuMg{1iWGuu0&x#1u!*k^8@p1Z5M&bwQz+ji zzCNs5DN+cs3B*05z$U&vZ0t&rLXb@$Ord<6`1-JJrAQ&jCJ^_K0-N~yu(2ye3PCo3 zFop7M;_Jh@l_G^8n?T${3T)!*!^W-@DFoRB!W7E4iLVdqR*DpYYyxo)DX@vJ4;#Bu zq!45i2vaEECcZwbTPacqvI)dJq`)S=K5XntkwTD7AWY$P@_pko{uwrV%IgF{hVHh4 z+|`&$hA)pfw>YK-@zLtWfSuLtKg!g2aI-)Wx7c0IdmG z0pcD~V1;sL8sbu<5F`#vp)Ljm0%%Rh3J~{@0xOg|(-4;;g&=WY3Ux6k5I}1}R)DyN z6j-6$nTEI&DFlfFQ>cqUfdE<)vI4|Cq`(U0&NRfONFhiZm_l6)3Ix!akQE^AAq7?_ zccvjOMG8UUz!d6YP#}QTgscE@4=J!hxibxMDN+a$2c}RLg8~7xCS(POdq{y5%AIM5 zOOZm5I535}7!(MgH6bfN+(QbiQ0`1aT#6Ke#DOW)#h^d{tqEBH;vQ09g>q*a;!>m# zBo0iWE(QexXidlp5ciM*E0jCa5SJo_AaP&{bulOqKx;x)fVhVgSfSjRhPV_d1c?Jv zsEa{?09q5W0>nL}zzXHgG{mJyAxIpULR}0B1kjq06(H^*1y(3`rXemx3PIw)6zXD7 zAb{3{tN?KjDX>DhGYxSmQV0?Urcf7y0s*upWCe(ONP!i~ooR?mkwTC-Fon7p6bPU- zAuB-KLkg@=?o2~miWGvxfhp9*pg;hv30VQ+9#UY1a%URiQlt5so2dvf!69@`(p$L=r=--5h%Z1)j8BrkXDknip``qEOQ5ailG z+(QanTLSvME`KRf91?^nJmkCk*}k+ChfhFX?Jbnz@VDR|Qs9V}(O0_?r8xXb^wo#F z1*Y(j?>@>0`_c-a@am5U;vQ1qh?mhxqrtk!RRP^ILTPa)#P@Fr6dq@FXu*)w91kjEogee>u+Jic+6e$D|1OjQ@ zLkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4LBMD&&M~3#GPAf$UK?H$7 zTKA9wU~+Q=fdJZ(gfN98LwnE*n3iw3fDbw3CzvVqxjrAQ%&AP`9F9#Q~IZjK-jKs%BUrf_6v z59+j1q!2_92&8onDF7xnM-T|09Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np( z8QOz7trRH)5d;Eh-9rk1$;}Z20%%7P!W51S?Lm(+?I>@*K*yt8o-b{DQwXvN#66_I zCcZvw>`IYBkWC;=p?sV8`mk=LNFm535ciM*oA~;@s2O54rd|bq^`fyEnw35M&()Q+V+|cuMJ)mcj)C#oADQUFYDjvx>~JCYElaAar?>au+Jic+6e$D|1OjQ@LkfV& z%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4LBMD&&M~3#GPAf$UK?H$7TKA9w zU~+Q=fdJZ(gfN98LwnG3nf8s(_-Fg0krog#66_ICcZv=zp{^20EKP=VG4CIC=ftvLRNsdhZI<$ z+?j^B6e$FW15>DrL4g2T6S4xtJ*2=2<<2z3rAQ%29GF5~3yaVb&=5(lPG7lQ%;v?gQ)hA)pfw>YK-@zLtWfSuLtKg!g2aI-)Wx7c0IdmG0pcD~V1;sL8sbu<5F`#v zp)Ljm0%%Rh3J~{@0xOg|(-4;;g&=WY3Ux6k5I}1}R)DyN6j-6$nTEI&DFlfFQ>cqU zfdE<)vI4|Cq`(U0&NRfONFhiZm_l6)3Ix!akQE^AAq7?_ccvjOMG8UUz!d6YP#}QT zgscE@4=J!hxibxMDN+a$2c}RLg8~7xCS(POdq{y5%AIM5OOZm5I535}7!(MgH6bfN z+(QbiQ0`1aT#6Ke#DOW)#h^d{tqEBH;vQ09g>q*a;!>m#Bo0iWE(QexXidlp5ciM* zE0jCa5SJo_AaP&{bulOqKx;x)fVhVgSfSjRhPV_d1c?JvsEa{?09q5W0>nL}zzXHg zG{mJyAxIpULR}0B1kjq06(H^*1y(3`rXemx3PIw)6zXD7Ab{3{tN?KjDX>DhGYxSm zQV0?Urcf7y0s*upWCe(ONP!i~ooR?mkzxbLv!@^Y>fy(?^~!HQ{P20b|J7gLeEh3- ziVcH1^v++-WA2{Ze4fYl2l25xjKjAe?;YEHL=VZ!9XsT^yN$lI6e$F`HW2ra0@s#+ zey_`4iWG+gVG0lV?tZo}Eydvz&{um4r8xX8xQ7%t;$`&Ju0$ygzY=}*A#Z^xJmkBN z^1;5e0w}!tBZ9bx6gc8#^wq9JDNmE`7Om2=K5I{SU5T z5D27o4=Dg9H%AZ%pdCpFQ#dlT2X$I0QV1dl1k$>P6abT(BM1c0jwFOB92we!I;|8b z1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$?jZ%hqxjrAQ%&AP`9F9#Q~IZjK-jKs%BUrf_6v z59+j1q!2_92&8onDF7xnM-T|09Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np( z8QOz7trRH)5d;Eh-9rk1$;}Z20%%7P!W51S?LnPZiWGtf0)e#dAqBwX<_H1-v?fFl z2%t3~fDl~JM(8}It{lbh#(L^YeE2mRv$qifYyYZCdfZe{DDtP>y3j& z5J4b-)`SQG0kkGW5D1_(A%Z{vtqBnX0%%Q$AP_)nLIi;TS`%`rApe7d?OQJkCV~h8 z0kkGW5D1_(AukWId;EvH-R|o@-2L(HOZ#H~$M@d*(HoC{^soQ)#>4;Tf8Mx%sq!x` zFYnDK0W(MjDFPQ`U=%lCfz?%#j;@i%{b^VY-NpM2$OyMOzK#}D?u3G?r}x4-nYCvUy}r*FQq`?Gg; z55N7|n~xsv-gxvo|MLCU9z6N(`?r6-|MSuJ-ul7Me)jmiJAeG{?)P@j+vmTP^LKx- z+r7T8zWk57-A{IpU;Xy(o6mp!lYiL{`K$e(kN)<%fBl`mdH=s}-@8kZdIiXfZ^5tl zQb8bq)`VOI$p7x}nfvyOT>URT`shDC`sf!A{H@1N1R(rHcroe=Ue?7-yYV;s&*eA! 
eU)$Z@zJ7aq`=9@{|Cjvn`t94dx9|Mm5B?9F4sLb; literal 0 HcmV?d00001 From 3feff9e10e028eed26336ff1934d6b89fc6c74e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 14:55:59 -0400 Subject: [PATCH 093/136] automata: improve sparse DFA validation This rejiggers some code so that we can more reliably check whether start state IDs are valid or not. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62726 --- ...ta_deserialize_sparse_dfa-4903112680538112 | Bin 0 -> 953 bytes regex-automata/src/dfa/sparse.rs | 159 +++++++++--------- 2 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 new file mode 100644 index 0000000000000000000000000000000000000000..3056bca2f335559837ff22c307040e7d200693b5 GIT binary patch literal 953 zcmcgrzb^z)5dPkCdx%D_SrUavRGOo46+(5LlBjgl;_fdHC6V|8G}v1iCd3X&#o?ZHHI$Q{9vE`2)E1aXc z4dV*ibs}S9a$sF5@Ts&8Nm;m!h_LYvQgx@O&0-~*MqvXc*1mdu8%`|_4(ibjUcp(+ zUaNf^t&($zmCZ!j&`hg&V+sEZeFMdJm&UDZ#tk}9nN`%b_4cnaUQ`}Jy`sc(1@{;A cVu5+$3H6G4Prat{2Q-<>jd&&eHq)$ literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 7862d48a2..38096d994 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -992,8 +992,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.special, &dfa.tt)?; + let seen = dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &seen)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1388,63 +1388,8 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { - // In order to validate everything, we not only need to make sure we - // can decode every state, but that every transition in every state - // points to a valid state. There are many duplicative transitions, so - // we record state IDs that we've verified so that we don't redo the - // decoding work. - // - // Except, when in no_std mode, we don't have dynamic memory allocation - // available to us, so we skip this optimization. It's not clear - // whether doing something more clever is worth it just yet. If you're - // profiling this code and need it to run faster, please file an issue. - // - // OK, so we also use this to record the set of valid state IDs. Since - // it is possible for a transition to point to an invalid state ID that - // still (somehow) deserializes to a valid state. So we need to make - // sure our transitions are limited to actually correct state IDs. - // The problem is, I'm not sure how to do this verification step in - // no-std no-alloc mode. I think we'd *have* to store the set of valid - // state IDs in the DFA itself. For now, we don't do this verification - // in no-std no-alloc mode. The worst thing that can happen is an - // incorrect result. But no panics or memory safety problems should - // result. 
Because we still do validate that the state itself is - // "valid" in the sense that everything it points to actually exists. - // - // ---AG - struct Seen { - #[cfg(feature = "alloc")] - set: alloc::collections::BTreeSet, - #[cfg(not(feature = "alloc"))] - set: core::marker::PhantomData, - } - - #[cfg(feature = "alloc")] - impl Seen { - fn new() -> Seen { - Seen { set: alloc::collections::BTreeSet::new() } - } - fn insert(&mut self, id: StateID) { - self.set.insert(id); - } - fn contains(&self, id: &StateID) -> bool { - self.set.contains(id) - } - } - - #[cfg(not(feature = "alloc"))] - impl Seen { - fn new() -> Seen { - Seen { set: core::marker::PhantomData } - } - fn insert(&mut self, _id: StateID) {} - fn contains(&self, _id: &StateID) -> bool { - false - } - } - - let mut verified: Seen = Seen::new(); + fn validate(&self, sp: &Special) -> Result { + let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. @@ -1521,7 +1466,7 @@ impl> Transitions { "mismatching sparse state length", )); } - Ok(()) + Ok(verified) } /// Converts these transitions to a borrowed value. @@ -1659,7 +1604,7 @@ impl> Transitions { let state = &state[nr..]; if npats == 0 { return Err(DeserializeError::generic( - "state marked as a match, but has no pattern IDs", + "state marked as a match, but pattern length is zero", )); } @@ -1681,6 +1626,21 @@ impl> Transitions { } else { (&[][..], state) }; + if is_match && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked special as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) != is_match { + return Err(DeserializeError::generic( + "whether state is a match or not is inconsistent", + )); + } // Now read this state's accelerator info. The first byte is the length // of the accelerator, which is typically 0 (for no acceleration) but @@ -2061,28 +2021,19 @@ impl> StartTable { fn validate( &self, sp: &Special, - trans: &Transitions, + seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { + if !seen.contains(&id) { + return Err(DeserializeError::generic( + "found invalid start state ID", + )); + } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } - // Confirm that the start state points to a valid state. - let state = trans.try_state(sp, id)?; - // And like for the transition table, confirm that the transitions - // on all start states themselves point to a valid state. - // - // It'd probably be better to integrate this validation with the - // transition table, or otherwise store a sorted sequence of all - // valid state IDs in the sparse DFA itself. That way, we could - // check that every pointer to a state corresponds precisely to a - // correct and valid state. - for i in 0..state.ntrans { - let to = state.next_at(i); - let _ = trans.try_state(sp, to)?; - } } Ok(()) } @@ -2537,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +// In order to validate everything, we not only need to make sure we +// can decode every state, but that every transition in every state +// points to a valid state. There are many duplicative transitions, so +// we record state IDs that we've verified so that we don't redo the +// decoding work. 
+//
+// Except, when in no_std mode, we don't have dynamic memory allocation
+// available to us, so we skip this optimization. It's not clear
+// whether doing something more clever is worth it just yet. If you're
+// profiling this code and need it to run faster, please file an issue.
+//
+// OK, so we also use this to record the set of valid state IDs. Since
+// it is possible for a transition to point to an invalid state ID that
+// still (somehow) deserializes to a valid state. So we need to make
+// sure our transitions are limited to actually correct state IDs.
+// The problem is, I'm not sure how to do this verification step in
+// no-std no-alloc mode. I think we'd *have* to store the set of valid
+// state IDs in the DFA itself. For now, we don't do this verification
+// in no-std no-alloc mode. The worst thing that can happen is an
+// incorrect result. But no panics or memory safety problems should
+// result. Because we still do validate that the state itself is
+// "valid" in the sense that everything it points to actually exists.
+//
+// ---AG
+#[derive(Debug)]
+struct Seen {
+    #[cfg(feature = "alloc")]
+    set: alloc::collections::BTreeSet<StateID>,
+    #[cfg(not(feature = "alloc"))]
+    set: core::marker::PhantomData<()>,
+}
+
+#[cfg(feature = "alloc")]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: alloc::collections::BTreeSet::new() }
+    }
+    fn insert(&mut self, id: StateID) {
+        self.set.insert(id);
+    }
+    fn contains(&self, id: &StateID) -> bool {
+        self.set.contains(id)
+    }
+}
+
+#[cfg(not(feature = "alloc"))]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: core::marker::PhantomData }
+    }
+    fn insert(&mut self, _id: StateID) {}
+    fn contains(&self, _id: &StateID) -> bool {
+        false
+    }
+}
+
 /*
 /// A binary search routine specialized specifically to a sparse DFA state's
 /// transitions. Specifically, the transitions are defined as a set of pairs

From 2c44e2a6b63920bf1752a61231ee1349154ae717 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Mon, 9 Oct 2023 15:12:22 -0400
Subject: [PATCH 094/136] fuzz: add regression test for AST roundtripping

I couldn't get this to reproduce. Maybe some of my recent changes to
regex-syntax fixed this? Not sure. I'm not a huge fan of this fuzzer in
general because it isn't really testing a rock-solid guarantee that we
provide. And the positions are tough to deal with.
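
For reference, the property this fuzzer pokes at is roughly the sketch
below. (The real target is fancier and also has to reason about span
positions, which is the painful part.)

    use regex_syntax::ast::{parse::Parser, print::Printer};

    // Parse, print the AST back to a pattern, and check that reparsing
    // the printed form prints identically. Comparing printed forms
    // sidesteps the span mismatches mentioned above.
    fn roundtrips(pattern: &str) -> bool {
        let ast = match Parser::new().parse(pattern) {
            Ok(ast) => ast,
            Err(_) => return true, // unparseable input is out of scope
        };
        let mut printed = String::new();
        Printer::new().print(&ast, &mut printed).unwrap();
        let reparsed = Parser::new().parse(&printed).unwrap();
        let mut reprinted = String::new();
        Printer::new().print(&reparsed, &mut reprinted).unwrap();
        printed == reprinted
    }
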
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62382 --- ...tcase-minimized-ast_roundtrip-5633607856947200 | Bin 0 -> 491 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 new file mode 100644 index 0000000000000000000000000000000000000000..726609cf21cf2933ef0fa2a49301f10f675f091a GIT binary patch literal 491 zcma)0!4ZHk2=nO}S-}n1%X1I6tpU6uwYBnSNdhE5i9(c-gjGZd5jX?* Date: Mon, 9 Oct 2023 16:51:39 -0400 Subject: [PATCH 095/136] regex-lite-0.1.1 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 21330fd4e..225193c38 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.0" #:version +version = "0.1.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From ea8f6c05f30e5148cea40194db1646de460869cd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:52:00 -0400 Subject: [PATCH 096/136] regex-syntax-0.8.0 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e5e541302..f14298299 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.5" #:version +version = "0.8.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 68b701808a1694e53d3aae8a2390eaa7a8ba9403 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:54:45 -0400 Subject: [PATCH 097/136] deps: bump regex-syntax to 0.8.0 --- Cargo.toml | 2 +- regex-automata/Cargo.toml | 2 +- regex-cli/Cargo.toml | 2 +- regex-lite/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6f94dc4ae..17120a0a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,7 +181,7 @@ features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.7.5" +version = "0.8.0" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 2d08cec75..719f68c66 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.7.4", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.8.0", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index b5de2b5e7..571191721 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -31,6 +31,6 @@ memmap2 = "0.5.10" regex = { version = "1.9.0", path = ".." 
} regex-automata = { version = "0.3.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } -regex-syntax = { version = "0.7.3", path = "../regex-syntax" } +regex-syntax = { version = "0.8.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } textwrap = { version = "0.16.0", default-features = false } diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 225193c38..b378018c2 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -14,7 +14,7 @@ rust-version = "1.65" autotests = false # Features are documented in the "Crate features" section of the crate docs: -# https://docs.rs/regex-syntax/*/#crate-features +# https://docs.rs/regex-lite/*/#crate-features # # (Currently there are no supported features. 'std' is technically one, but it # is currently required.) From dc0d79e97e16dba1558a44aa5f68d1da4932bc33 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:55:17 -0400 Subject: [PATCH 098/136] regex-automata-0.4.0 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 719f68c66..3792f53e6 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.9" #:version +version = "0.4.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 26d8e3ad1ffe3ab88679d185103f6a7fe5a562b5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:56:44 -0400 Subject: [PATCH 099/136] deps: bump regex-automata to 0.4.0 --- Cargo.toml | 2 +- regex-cli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 17120a0a4..9bc90d0e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.9" +version = "0.4.0" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 571191721..ab570a30f 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -29,7 +29,7 @@ lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } memmap2 = "0.5.10" regex = { version = "1.9.0", path = ".." 
} -regex-automata = { version = "0.3.0", path = "../regex-automata", features = ["logging"] } +regex-automata = { version = "0.4.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } regex-syntax = { version = "0.8.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } From 2cbd34215d1df6415aeac8ed93018ca8ada0cfca Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:56:54 -0400 Subject: [PATCH 100/136] 1.10.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9bc90d0e3..88f96b0b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.6" #:version +version = "1.10.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 951eebd20781671a7aa6f5ceb6b9f284923b425d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 17:14:50 -0400 Subject: [PATCH 101/136] regex-cli-0.1.1 --- regex-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index ab570a30f..3fe5390aa 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.0" #:version +version = "0.1.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular From f01f71b66940279835de25ee8687a7e0d30e854d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 17:15:35 -0400 Subject: [PATCH 102/136] lite: add \< and \> to the syntax docs This was probably a copy & paste error. 
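
As a quick illustration of the aliases being documented (assuming
regex-lite resolves `\<` and `\>` exactly like `\b{start}` and
`\b{end}`):

    use regex_lite::Regex;

    // `cat` must begin and end at ASCII word boundaries.
    let re = Regex::new(r"\<cat\>").unwrap();
    assert!(re.is_match("the cat sat"));
    assert!(!re.is_match("concatenate"));
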
--- regex-lite/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 68d54824f..9b394a480 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -472,8 +472,8 @@ $ the end of a haystack (or end-of-line with multi-line mode) \z only the end of a haystack (even with multi-line mode enabled) \b an ASCII word boundary (\w on one side and \W, \A, or \z on other) \B not an ASCII word boundary -\b{start} an ASCII start-of-word boundary (\W|\A on the left, \w on the right) -\b{end} an ASCII end-of-word boundary (\w on the left, \W|\z on the right)) +\b{start}, \< an ASCII start-of-word boundary (\W|\A on the left, \w on the right) +\b{end}, \> an ASCII end-of-word boundary (\w on the left, \W|\z on the right)) \b{start-half} half of an ASCII start-of-word boundary (\W|\A on the left) \b{end-half} half of an ASCII end-of-word boundary (\W|\z on the right) From 452bc3211635a38a89190da993d4c87a6eeaaf9f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 17:15:59 -0400 Subject: [PATCH 103/136] regex-lite-0.1.2 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index b378018c2..5a6c2ac8a 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.1" #:version +version = "0.1.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From a2a1986b13aebbafc54ef4b7d9a76626270a0a24 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 18:15:08 -0400 Subject: [PATCH 104/136] automata: fix sparse DFA state validation in no-std The verified set isn't tracked in no-std/no-alloc because it is probably not worth doing (and bloating the size of the sparse DFA itself to store the state IDs). So when we deserialize a DFA without std enabled, the verified set of states was always reporting `false`, and this now trips an error 100% of the time in the new start state validation code. We fix this by always reporting `true`, thus treating every possible state ID as possibly valid on its own. Not great, but maintains the status quo. --- regex-automata/src/dfa/sparse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 38096d994..d461e0a0f 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -2540,7 +2540,7 @@ impl Seen { } fn insert(&mut self, _id: StateID) {} fn contains(&self, _id: &StateID) -> bool { - false + true } } From dd04a57e1db3099fdcee337b8219eddc58ce4eb4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 18:17:15 -0400 Subject: [PATCH 105/136] regex-automata-0.4.1 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 3792f53e6..63554314f 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.0" #:version +version = "0.4.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." 
documentation = "https://docs.rs/regex-automata" From f5b8cb4d52ca0fa01a9c4e8e70bcd3e4b6673368 Mon Sep 17 00:00:00 2001 From: Fabio Valentini Date: Tue, 10 Oct 2023 15:42:29 +0200 Subject: [PATCH 106/136] lite: fix doctests on 32-bit Returning early on non-64-bit architectures is not enough, since the doctest failed to compile due to two numeric literals being too large for usize on 32-bit architectures. PR #1101 --- regex-lite/src/string.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index af0a5b629..4e4de9068 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -2063,7 +2063,6 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` -/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); @@ -2076,7 +2075,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] From 6ec0a00c0046d0fbbf64276cc28258ad7a4a7317 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 10 Oct 2023 09:42:40 -0400 Subject: [PATCH 107/136] regex-lite-0.1.3 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 5a6c2ac8a..e09229723 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.2" #:version +version = "0.1.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From d5144b2f5fcf931a4fdf9e247ba20c93000391c3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 11 Oct 2023 11:32:12 -0400 Subject: [PATCH 108/136] syntax: add regression tests for new bugs in internal set ops Ref https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 Ref https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 Ref https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 Ref https://github.com/rust-lang/regex/pull/1051 fixup --- ...-minimized-ast_fuzz_regex-6345245270605824 | Bin 0 -> 3933 bytes ...inimized-fuzz_regex_match-5736465767989248 | Bin 0 -> 452 bytes ...inimized-fuzz_regex_match-6413499984904192 | Bin 0 -> 27 bytes regex-syntax/src/hir/translate.rs | 44 ++++++++++++++++++ 4 files changed, 44 insertions(+) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6413499984904192 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 new file mode 100644 index 0000000000000000000000000000000000000000..312767e97b7e1cff042e7a32d26ea481adeaf6c6 GIT binary patch literal 3933 zcmeyoaKVEGjT0IsH1s!2*a)J5(9^}3g^{6|fr%kB*e8U6fq@Z-S^oe3uL`7jfiwZY 
z$Z$;QP6Y!40cGS#f5)Zmz7<5~@#F6Q|93Y3|ND59g(D0`!-wKDf-`a51cO*GSqUU} zLIEfb0f7*Z3roQ;DIlE>0xMUpwAZZ8g@Ysv0F#24#lV}J>%G?-sK?ve+iK-#q6cPt zYpfX`tKbkSA*fKC4fP;ED$4LAbUxBEHj=@xOwEl&44ztGir@ldgzSL>`UkFGzkcA> zfddDCdHE2K`u`uu{QnmQ6kJZ(t4QfjFrqaF4Wr`=dW0jJvkm;USGmZVt5l429E8E_v} zFtM~GA5{b^50Y*cEIf^xfHWC~T3eKX!M!N4AR{$5HQsrDX)&-IU@A#1g7|6`8UTte zfQQB)N*HiAk<5S=W$Xxz?_u2vgb*^z7ny@Mz2i3r#TYaWFb))8LPVLO22zb0AF$#O z+JMApSSI0ZSpplTAYiXq32JDj)YjG}d^-^HpNoM({WmnoG8h&v1Q(gq_wmBVkN<=6Jpg;~t&9Kw literal 0 HcmV?d00001 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 new file mode 100644 index 0000000000000000000000000000000000000000..30a3a3ba0ef9fbdaff1b01233c44da7977d4bb7a GIT binary patch literal 452 zcmXSqE{~2Yx92VAh0+in4w3;c02za(k%57cA(nwbR|`af04f*EgGexFAxWZ{f@v;9 z1W5s7v1_fpY1%+=43jhEASKtQ#bkPY$ literal 0 HcmV?d00001 diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2b500cc2f..2eff6318c 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3703,4 +3703,48 @@ mod tests { let mut t = Translator::new(); assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 + #[test] + fn regression_fuzz_match() { + let pat = "[(\u{6} \0-\u{afdf5}] \0 "; + let ast = ParserBuilder::new() + .octal(false) + .ignore_whitespace(true) + .build() + .parse(pat) + .unwrap(); + let hir = TranslatorBuilder::new() + .utf8(true) + .case_insensitive(false) + .multi_line(false) + .dot_matches_new_line(false) + .swap_greed(true) + .unicode(true) + .build() + .translate(pat, &ast) + .unwrap(); + assert_eq!( + hir, + Hir::concat(vec![ + hir_uclass(&[('\0', '\u{afdf5}')]), + hir_lit("\0"), + ]) + ); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 + #[cfg(feature = "unicode")] + #[test] + fn regression_fuzz_difference1() { + let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; + let _ = t(pat); // shouldn't panic + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 + #[test] + fn regression_fuzz_char_decrement1() { + let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0] Date: Wed, 11 Oct 2023 11:34:47 -0400 Subject: [PATCH 109/136] syntax: revert interval set optimizations This reverts commit 6d2b09ed6fbc136cca007ce0c57ec9cbae16f3b4. Sadly I just don't have the time to fix this code myself. It's too subtle. So I'm just reverting it entirely for now. 
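
For posterity, the canonical format invariant that the reverted
optimizations kept subtly violating is: ranges sorted, non-overlapping
and non-adjacent. A toy checker over u32 bounds (the real code is
generic over the Interval trait):

    // Each range is an inclusive (lower, upper) pair. Canonical means
    // every range is well formed and separated from the next by at
    // least one missing value.
    fn is_canonical(ranges: &[(u32, u32)]) -> bool {
        ranges.iter().all(|&(lo, up)| lo <= up)
            && ranges.windows(2).all(|w| {
                let ((_, up1), (lo2, _)) = (w[0], w[1]);
                // "up1 + 1 < lo2" gives sorted, disjoint and
                // non-contiguous all at once.
                up1 < u32::MAX && up1 + 1 < lo2
            })
    }
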
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 Ref https://github.com/rust-lang/regex/pull/1051 --- regex-syntax/src/hir/interval.rs | 282 +++++++++++-------------------- 1 file changed, 97 insertions(+), 185 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e3051bf31..e063390a8 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -19,7 +19,7 @@ use crate::unicode; // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. -// In some cases, we do use linear extra memory, but it is at most 2x and it +// In many cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite @@ -81,45 +81,14 @@ impl IntervalSet { /// Add a new interval to this set. pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; - - if self.ranges.is_empty() { - self.ranges.push(interval); - return; - } - - // Find the first range that is not greater than the new interval. - // This is the first range that could possibly be unioned with the - // new interval. - let mut drain_end = self.ranges.len(); - while drain_end > 0 - && self.ranges[drain_end - 1].lower() > interval.upper() - && !self.ranges[drain_end - 1].is_contiguous(&interval) - { - drain_end -= 1; - } - - // Try to union the new interval with old intervals backwards. - if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval) - { - self.ranges[drain_end - 1] = - self.ranges[drain_end - 1].union(&interval).unwrap(); - for i in (0..drain_end - 1).rev() { - if let Some(union) = - self.ranges[drain_end - 1].union(&self.ranges[i]) - { - self.ranges[drain_end - 1] = union; - } else { - self.ranges.drain(i + 1..drain_end - 1); - break; - } - } - } else { - self.ranges.insert(drain_end, interval); - } } /// Return an iterator over all intervals in this set. @@ -223,13 +192,34 @@ impl IntervalSet { // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... 
similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); - let mut b = 0; - for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could @@ -241,34 +231,47 @@ impl IntervalSet { // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - self.ranges.push(self.ranges[a]); - // Only when `b` is not above `a`, `b` might apply to current - // `a` range. + let mut range = self.ranges[a]; while b < other.ranges.len() - && other.ranges[b].lower() <= self.ranges[a].upper() + && !range.is_intersection_empty(&other.ranges[b]) { - match self.ranges.pop().unwrap().difference(&other.ranges[b]) { - (Some(range1), None) | (None, Some(range1)) => { - self.ranges.push(range1); + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; } + (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - self.ranges.push(range2); + range2 } - (None, None) => {} + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; } - // The next `b` range might apply to the current + // Otherwise, the next `b` range might apply to the current // `a` range. b += 1; } - // It's possible that the last `b` range has more to - // contribute to the next `a`. We don't bump the last - // `b` so that the next `a` range can apply it. - b = b.saturating_sub(1); + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; } - self.ranges.drain(..drain_end); - self.folded = self.ranges.is_empty() || (self.folded && other.folded); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -279,83 +282,11 @@ impl IntervalSet { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { - if self.ranges.is_empty() { - self.ranges.extend(&other.ranges); - self.folded = other.folded; - return; - } - if other.ranges.is_empty() { - return; - } - - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the symmetric difference to the end of this range, and then drain - // it before we're done. 
- let drain_end = self.ranges.len(); - let mut b = 0; - let mut b_range = Some(other.ranges[b]); - for a in 0..drain_end { - self.ranges.push(self.ranges[a]); - while b_range - .map_or(false, |r| r.lower() <= self.ranges[a].upper()) - { - let (range1, range2) = match self - .ranges - .pop() - .unwrap() - .symmetric_difference(&b_range.as_ref().unwrap()) - { - (Some(range1), None) | (None, Some(range1)) => { - (Some(range1), None) - } - (Some(range1), Some(range2)) => { - (Some(range1), Some(range2)) - } - (None, None) => (None, None), - }; - if let Some(range) = range1 { - if self.ranges.len() > drain_end - && self.ranges.last().unwrap().is_contiguous(&range) - { - self.ranges - .last_mut() - .map(|last| *last = last.union(&range).unwrap()); - } else { - self.ranges.push(range); - } - } - if let Some(range) = range2 { - self.ranges.push(range); - } - - b_range = if self.ranges.len() > drain_end - && self.ranges.last().unwrap().upper() - > self.ranges[a].upper() - { - Some(*self.ranges.last().unwrap()) - } else { - b += 1; - other.ranges.get(b).cloned() - }; - } - } - while let Some(range) = b_range { - if self.ranges.len() > drain_end - && self.ranges.last().unwrap().is_contiguous(&range) - { - self.ranges - .last_mut() - .map(|last| *last = last.union(&range).unwrap()); - } else { - self.ranges.push(range); - } - b += 1; - b_range = other.ranges.get(b).cloned(); - } - - self.ranges.drain(..drain_end); - self.folded = self.ranges.is_empty() || (self.folded && other.folded); + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); } /// Negate this interval set. @@ -371,44 +302,28 @@ impl IntervalSet { return; } + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + // We do checked arithmetic below because of the canonical ordering // invariant. 
if self.ranges[0].lower() > I::Bound::min_value() { - let mut pre_upper = self.ranges[0].upper(); - self.ranges[0] = I::create( - I::Bound::min_value(), - self.ranges[0].lower().decrement(), - ); - for i in 1..self.ranges.len() { - let lower = pre_upper.increment(); - pre_upper = self.ranges[i].upper(); - self.ranges[i] = - I::create(lower, self.ranges[i].lower().decrement()); - } - if pre_upper < I::Bound::max_value() { - self.ranges.push(I::create( - pre_upper.increment(), - I::Bound::max_value(), - )); - } - } else { - for i in 1..self.ranges.len() { - self.ranges[i - 1] = I::create( - self.ranges[i - 1].upper().increment(), - self.ranges[i].lower().decrement(), - ); - } - if self.ranges.last().unwrap().upper() < I::Bound::max_value() { - self.ranges.last_mut().map(|range| { - *range = I::create( - range.upper().increment(), - I::Bound::max_value(), - ) - }); - } else { - self.ranges.pop(); - } + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); } + self.ranges.drain(..drain_end); // We don't need to update whether this set is folded or not, because // it is conservatively preserved through negation. Namely, if a set // is not folded, then it is possible that its negation is folded, for @@ -422,7 +337,6 @@ impl IntervalSet { // of case folded characters. Negating it in turn means that all // equivalence classes in the set are negated, and any equivalence // class that was previously not in the set is now entirely in the set. - self.folded = self.ranges.is_empty() || self.folded; } /// Converts this set into a canonical ordering. @@ -433,20 +347,24 @@ impl IntervalSet { self.ranges.sort(); assert!(!self.ranges.is_empty()); - // We maintain the canonicalization results in-place at `0..newi`. - // `newi` will keep track of the end of the canonicalized ranges. - let mut newi = 0; - for oldi in 1..self.ranges.len() { - // The last new range gets merged with currnet old range when - // unionable. If not, we update `newi` and store it as a new range. - if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) { - self.ranges[newi] = union; - } else { - newi += 1; - self.ranges[newi] = self.ranges[oldi]; + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } } + let range = self.ranges[oldi]; + self.ranges.push(range); } - self.ranges.truncate(newi + 1); + self.ranges.drain(..drain_end); } /// Returns true if and only if this class is in a canonical ordering. 
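As an aside, the `canonicalize` rewrite above is at heart the classic merge of sorted, possibly overlapping ranges. A minimal standalone sketch of that idea, using plain `(u32, u32)` pairs in place of the crate's `Interval` trait (purely illustrative, not the crate's actual code):

```
// Merge overlapping or contiguous ranges after sorting. This mirrors the
// invariant that `canonicalize` establishes: sorted, non-overlapping,
// non-adjacent ranges.
fn canonicalize(mut ranges: Vec<(u32, u32)>) -> Vec<(u32, u32)> {
    ranges.sort();
    let mut out: Vec<(u32, u32)> = Vec::new();
    for (lo, hi) in ranges {
        match out.last_mut() {
            // Overlapping or contiguous with the previous range: union.
            Some(last) if lo <= last.1.saturating_add(1) => {
                last.1 = last.1.max(hi);
            }
            _ => out.push((lo, hi)),
        }
    }
    out
}

fn main() {
    let got = canonicalize(vec![(5, 10), (0, 3), (4, 6), (20, 30)]);
    // (0,3) is contiguous with (4,6), which in turn overlaps (5,10).
    assert_eq!(vec![(0, 10), (20, 30)], got);
}
```

The versions restored here append their results to the end of `self.ranges` and then `drain` the old prefix; the in-place variants they replace were part of the errant 0.8.0 optimizations that could leave ranges out of order, as the regression tests below describe.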
@@ -568,13 +486,7 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => { - return if self.upper() < other.lower() { - (Some(self.clone()), Some(other.clone())) - } else { - (Some(other.clone()), Some(self.clone())) - } - } + None => return (Some(self.clone()), Some(other.clone())), Some(union) => union, }; let intersection = match self.intersect(other) { From b99cff05449ba7f61e38e0efb18d4c95c8bc28e3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 11 Oct 2023 12:43:42 -0400 Subject: [PATCH 110/136] regex-syntax-0.8.1 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index f14298299..b0ba658b8 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.0" #:version +version = "0.8.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From ef3e01be4aa73afbc4a3b61d4f08c417a57e0c61 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 09:28:15 -0400 Subject: [PATCH 111/136] syntax: add regression test for the errant HIR interval set optimizations Fixes #1103 Ref #1051, Ref #1102 --- testdata/regression.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/testdata/regression.toml b/testdata/regression.toml index 09b2b1d1c..2954c9118 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -800,3 +800,16 @@ name = "non-prefix-literal-quit-state" regex = '.+\b\n' haystack = "β77\n" matches = [[0, 5]] + +# This is a regression test for some errant HIR interval set operations that +# were made in the regex-syntax 0.8.0 release and then reverted in 0.8.1. The +# issue here is that the HIR produced from the regex had out-of-order ranges. +# +# See: https://github.com/rust-lang/regex/issues/1103 +# Ref: https://github.com/rust-lang/regex/pull/1051 +# Ref: https://github.com/rust-lang/regex/pull/1102 +[[test]] +name = "hir-optimization-out-of-order-class" +regex = '^[[:alnum:]./-]+$' +haystack = "a-b" +matches = [[0, 3]] From 69051b797ba3065663564b382b024c3cb3484bf4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 10:16:36 -0400 Subject: [PATCH 112/136] fuzz: add another HIR interval set regression This is a new test case revealed by OSS-fuzz. It passes in the current releases. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63203 --- ...case-minimized-ast_fuzz_regex-4596093180313600 | Bin 0 -> 329 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 new file mode 100644 index 0000000000000000000000000000000000000000..711817e4ed98c89f3eac4def9acfbea0451dbbf3 GIT binary patch literal 329 zcmeybps-MZMDQQ1{yzjLfGLOx{~ Date: Fri, 13 Oct 2023 09:51:09 -0400 Subject: [PATCH 113/136] bench: add a redirect This directory was linked in a fair number of places, so we re-introduce it with a README pointing folks toward rebar. 
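Similarly, the new `symmetric_difference` fallback in the revert above leans on the identity that the symmetric difference of two sets equals their union minus their intersection. That identity is easy to sanity-check in isolation; a small demonstration with std's `BTreeSet` standing in for the crate's interval sets:

```
use std::collections::BTreeSet;

fn main() {
    let a: BTreeSet<u32> = [1, 2, 3, 4].into_iter().collect();
    let b: BTreeSet<u32> = [3, 4, 5, 6].into_iter().collect();

    // (A union B) minus (A intersect B) ...
    let union: BTreeSet<u32> = a.union(&b).copied().collect();
    let intersection: BTreeSet<u32> = a.intersection(&b).copied().collect();
    let via_identity: BTreeSet<u32> =
        union.difference(&intersection).copied().collect();

    // ... equals the symmetric difference computed directly.
    let direct: BTreeSet<u32> = a.symmetric_difference(&b).copied().collect();
    assert_eq!(via_identity, direct); // both are {1, 2, 5, 6}
}
```

Computing it this way costs extra allocations, hence the `TODO(burntsushi)` in the patch, but it is straightforwardly correct, which is the point of the revert.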
---
 bench/README.md | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 bench/README.md

diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 000000000..3cc6a1a7a
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,2 @@
+Benchmarks for this crate have been moved into the rebar project:
+https://github.com/BurntSushi/rebar

From cfd0ca2428c986777e21542a531b450178bc0cf2 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Thu, 12 Oct 2023 14:15:48 -0400
Subject: [PATCH 114/136] automata/meta: force some prefilter inlining

In some ad hoc profiling, I noticed an extra function call that really
didn't need to be there.

---
 regex-automata/src/meta/strategy.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index ea6c6ab57..5b96d888a 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -353,6 +353,7 @@ impl Pre<()> {
 // strategy when len(patterns)==1 if the number of literals is large. In that
 // case, literal extraction gives up and will return an infinite set.)
 impl<P: PrefilterI> Strategy for Pre<P> {
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn group_info(&self) -> &GroupInfo {
         &self.group_info
     }
@@ -378,6 +379,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         self.pre.memory_usage()
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
         if input.is_done() {
             return None;
@@ -393,6 +395,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
             .map(|sp| Match::new(PatternID::ZERO, sp))
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search_half(
         &self,
         cache: &mut Cache,
@@ -401,10 +404,12 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
         self.search(cache, input).is_some()
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search_slots(
         &self,
         cache: &mut Cache,
@@ -421,6 +426,7 @@ impl<P: PrefilterI> Strategy for Pre<P>
{ Some(m.pattern()) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, From 04f5d7be4efc542864cc400f5d43fbea4eb9bab6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 14:16:20 -0400 Subject: [PATCH 115/136] syntax: loosen ASCII compatible rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, patterns like `(?-u:☃)` were banned under the logic that Unicode scalar values shouldn't be available unless Unicode mode is enabled. But since patterns are required to be UTF-8, there really isn't any difficulty in just interpreting Unicode literals as their corresponding UTF-8 encoding. Note though that Unicode character classes, even things like `(?-u:[☃])`, remain banned. We probably could make character classes work too, but it's unclear how that plays with ASCII compatible mode requiring that a single byte is the fundamental atom of matching (where as Unicode mode requires that Unicode scalar values are the fundamental atom of matching). --- regex-syntax/src/hir/translate.rs | 46 +++++++------------------------ src/bytes.rs | 4 +-- 2 files changed, 12 insertions(+), 38 deletions(-) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2eff6318c..313a1e9e8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -388,17 +388,10 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err( - self.error(x.span, ErrorKind::UnicodeNotAllowed) - ); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } - } + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, }, Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); @@ -872,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { - if c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + if !c.is_ascii() { + return Ok(None); } // If case folding won't do anything, then don't bother trying. match c { @@ -1211,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { - let cp = u32::from(ch); - if cp <= 0x7F { - Ok(u8::try_from(cp).unwrap()) + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. 
Byte classes don't @@ -1661,16 +1653,7 @@ mod tests { assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?-u)☃"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7) - ), - } - ); + assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { @@ -1748,16 +1731,7 @@ mod tests { ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?i-u)β"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(6, 1, 7), - Position::new(8, 1, 8), - ), - } - ); + assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] diff --git a/src/bytes.rs b/src/bytes.rs index 3f53a3ea5..383ac4a5b 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -68,8 +68,8 @@ bytes: 1. The `u` flag can be disabled even when disabling it might cause the regex to match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in "ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode scalar values nor Unicode -character classes are allowed. +2. In ASCII compatible mode, Unicode character classes are not allowed. Literal +Unicode scalar values outside of character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps to `[[:digit:]]` and `\s` maps to `[[:space:]]`. From 8a8d599f9d2f2d78e9ad84e4084788c2d563afa5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 13 Oct 2023 14:52:09 -0400 Subject: [PATCH 116/136] automata/meta: tweak reverse suffix prefilter strategy Previously, we were only use the reverse suffix optimization if it found a non-empty longest common suffix *and* if the prefilter thought itself was fast. This was a heuristic used in the old regex crate before we grew the "is prefilter fast" heuristic. We change this optimization to just use the "is prefilter fast" heuristic instead of requiring a non-empty longest common suffix. This is, after all, what the inner literal optimization does. And in the inner literal case, one should probably be even more conservative because of the extra work that needs to be done. So if things are going okay with the inner literal optimization, then we should be fine with the reverse suffix optimization doing essentially the same thing. 
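For readers following along: the "longest common suffix" in question is the longest string that every extracted literal ends with. A rough standalone sketch of that computation (a hypothetical helper, not regex-syntax's actual `Seq::longest_common_suffix`):

```
// Returns the longest byte string that every literal in `lits` ends with.
fn longest_common_suffix<'a>(lits: &[&'a [u8]]) -> &'a [u8] {
    let Some(&first) = lits.first() else { return &[] };
    let mut len = first.len();
    for lit in lits.iter().skip(1) {
        // Count how many trailing bytes the current candidate suffix
        // shares with this literal, then shrink the candidate to that.
        let common = first[first.len() - len..]
            .iter()
            .rev()
            .zip(lit.iter().rev())
            .take_while(|(x, y)| x == y)
            .count();
        len = len.min(common);
    }
    &first[first.len() - len..]
}

fn main() {
    let lits: &[&[u8]] = &[b"foobar", b"quuxbar", b"bar"];
    assert_eq!(b"bar".as_slice(), longest_common_suffix(lits));

    // An empty result here is what made the old code skip the
    // optimization entirely.
    let lits: &[&[u8]] = &[b"abc", b"xyz"];
    assert_eq!(b"".as_slice(), longest_common_suffix(lits));
}
```

The commit above drops the requirement that this string be non-empty in favor of the `is_fast` heuristic alone; a later patch in this series reverts that broadening.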
--- regex-automata/src/meta/strategy.rs | 37 ++++++++++------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 5b96d888a..4cb3b29b9 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1167,34 +1167,21 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixes = crate::util::prefilter::suffixes(kind, hirs); - let lcs = match suffixes.longest_common_suffix() { - None => { - debug!( - "skipping reverse suffix optimization because \ - a longest common suffix could not be found", - ); - return Err(core); - } - Some(lcs) if lcs.is_empty() => { - debug!( - "skipping reverse suffix optimization because \ - the longest common suffix is the empty string", - ); - return Err(core); - } - Some(lcs) => lcs, + let suffixseq = crate::util::prefilter::suffixes(kind, hirs); + let Some(suffixes) = suffixseq.literals() else { + debug!( + "skipping reverse suffix optimization because \ + the extract suffix sequence is not finite", + ); + return Err(core); }; - let pre = match Prefilter::new(kind, &[lcs]) { - Some(pre) => pre, - None => { - debug!( - "skipping reverse suffix optimization because \ + let Some(pre) = Prefilter::new(kind, suffixes) else { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); - } + ); + return Err(core); }; if !pre.is_fast() { debug!( From 049d063ba1e8cb2f7203684865b58d6af44357e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:07:52 -0400 Subject: [PATCH 117/136] changelog: 1.10.1 --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b51142218..b5f31bec0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +1.10.1 (2023-10-14) +=================== +This is a new patch release with a minor increase in the number of valid +patterns and a broadening of some literal optimizations. + +New features: + +* [FEATURE 04f5d7be](https://github.com/rust-lang/regex/commit/04f5d7be4efc542864cc400f5d43fbea4eb9bab6): +Loosen ASCII-compatible rules such that regexes like `(?-u:☃)` are now allowed. + +Performance improvements: + +* [PERF 8a8d599f](https://github.com/rust-lang/regex/commit/8a8d599f9d2f2d78e9ad84e4084788c2d563afa5): +Broader the reverse suffix optimization to apply in more cases. 
+ + 1.10.0 (2023-10-09) =================== This is a new minor release of `regex` that adds support for start and end From 1dbeee73b9fcde708502d3d5f799b198fe3a6cf5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:08:27 -0400 Subject: [PATCH 118/136] regex-syntax-0.8.2 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index b0ba658b8..c9ce87da7 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.1" #:version +version = "0.8.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From ee01ec2725279273630d2b1ebc99775b932131b2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:04 -0400 Subject: [PATCH 119/136] deps: bump regex-syntax to 0.8.2 --- Cargo.toml | 2 +- regex-automata/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 88f96b0b1..f3eaf7961 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,7 +181,7 @@ features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.8.0" +version = "0.8.2" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 63554314f..99f9a9220 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.8.0", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.8.2", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" From 488604dd6f053104b008a22b9808e383f283992d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:15 -0400 Subject: [PATCH 120/136] regex-automata-0.4.2 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 99f9a9220..f9f59feb3 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.1" #:version +version = "0.4.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From d242ede2ab07df6b32b9ee86f9ae2ae43252ebfa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:51 -0400 Subject: [PATCH 121/136] deps: bump regex-automata to 0.4.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f3eaf7961..7d5a210b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.4.0" +version = "0.4.2" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 5dff4bd7e3bf8b87a272e31c23f1b64417e4c5de Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:54 -0400 Subject: [PATCH 122/136] 1.10.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7d5a210b0..45132a906 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.0" #:version +version = "1.10.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 466e42ca2bea2480ff367e0e26e3967435ac3e30 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 12:48:09 -0400 Subject: [PATCH 123/136] lite: fix stack overflow in NFA compiler This commit fixes a bug where the parser could produce a very deeply nested Hir value beyond the configured nested limit. This was caused by the fact that the Hir can have some of its nested structures added to it without a corresponding recursive call in the parser. For example, repetition operators. This means that even if we don't blow the nest limit in the parser, the Hir itself can still become nested beyond the limit. This in turn will make it possible to unintentionally overflow the stack in subsequent recursion over the Hir value, such as in the Thompson NFA compiler. We fix this by checking the nesting limit both on every recursive parse call and also on the depth of the final Hir value once parsing is finished but before it has returned to the caller. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 --- ...zed-fuzz_regex_lite_match-4692452983046144 | Bin 0 -> 5437 bytes regex-lite/src/hir/parse.rs | 60 ++++++++++++++++-- regex-lite/tests/fuzz/mod.rs | 17 +++++ 3 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 new file mode 100644 index 0000000000000000000000000000000000000000..184b6ed7019033ef791366a8dfd3287d7d347965 GIT binary patch literal 5437 zcmdPk(umX40wR6=va~WS2!Jy6pj;v;b0XD^Dx+}-aM{z?f>Gl~Lx7YJ;37rwsM^sG z7?~lUOf_$FQN<+cm_ik&j2be+Ltr#5j_?F9>U~HEj8+oxl)wdNjWS3Ifzf~?DY%GK sJQ}J*2Go#K#>4;uYz#;aze`hqWDyORTr@Bfi;-4X#ON% Parser<'a> { /// own routine. impl<'a> Parser<'a> { pub(super) fn parse(&self) -> Result { + let hir = self.parse_inner()?; + // While we also check nesting during parsing, that only checks the + // number of recursive parse calls. It does not necessarily cover + // all possible recursive nestings of the Hir itself. For example, + // repetition operators don't require recursive parse calls. So one + // can stack them arbitrarily without overflowing the stack in the + // *parser*. But then if one recurses over the resulting Hir, a stack + // overflow is possible. So here we check the Hir nesting level + // thoroughly to ensure it isn't nested too deeply. + // + // Note that we do still need the nesting limit check in the parser as + // well, since that will avoid overflowing the stack during parse time + // before the complete Hir value is constructed. 
+ check_hir_nesting(&hir, self.config.nest_limit)?; + Ok(hir) + } + + fn parse_inner(&self) -> Result { let depth = self.increment_depth()?; let mut alternates = vec![]; let mut concat = vec![]; @@ -806,7 +824,7 @@ impl<'a> Parser<'a> { if self.bump_if("?P<") || self.bump_if("?<") { let index = self.next_capture_index()?; let name = Some(Box::from(self.parse_capture_name()?)); - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name, sub }; Ok(Some(Hir::capture(cap))) } else if self.bump_if("?") { @@ -826,11 +844,11 @@ impl<'a> Parser<'a> { } else { assert_eq!(':', self.char()); self.bump(); - self.parse().map(Some) + self.parse_inner().map(Some) } } else { let index = self.next_capture_index()?; - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name: None, sub }; Ok(Some(Hir::capture(cap))) } @@ -1263,6 +1281,38 @@ impl<'a> Parser<'a> { } } +/// This checks the depth of the given `Hir` value, and if it exceeds the given +/// limit, then an error is returned. +fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { + fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { + if depth > limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } + let Some(next_depth) = depth.checked_add(1) else { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + }; + match *hir.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => Ok(()), + HirKind::Repetition(hir::Repetition { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Capture(hir::Capture { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { + for sub in subs.iter() { + recurse(sub, limit, next_depth)?; + } + Ok(()) + } + } + } + recurse(hir, limit, 0) +} + /// Converts the given Hir to a literal char if the Hir is just a single /// character. Otherwise this returns an error. /// @@ -1344,12 +1394,12 @@ mod tests { use super::*; fn p(pattern: &str) -> Hir { - Parser::new(Config::default(), pattern).parse().unwrap() + Parser::new(Config::default(), pattern).parse_inner().unwrap() } fn perr(pattern: &str) -> String { Parser::new(Config::default(), pattern) - .parse() + .parse_inner() .unwrap_err() .to_string() } diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs index 6eb37b50b..747aab040 100644 --- a/regex-lite/tests/fuzz/mod.rs +++ b/regex-lite/tests/fuzz/mod.rs @@ -14,6 +14,23 @@ fn captures_wrong_order_min() { let _ = run(data); } +// Simpler regression test from a failure found by OSS-fuzz[1]. This test, +// when it failed, caused a stack overflow. We fixed it by adding another nest +// check on the Hir value itself, since the Hir type can have depth added to +// it without recursive calls in the parser (which is where the existing nest +// check was). +// +// Many thanks to Addison Crump for coming up with this test case[2]. +// +// [1]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 +// [2]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608#c1 +#[test] +fn many_zero_to_many_reps() { + let pat = format!(".{}", "*".repeat(1 << 15)); + let Ok(re) = regex_lite::RegexBuilder::new(&pat).build() else { return }; + re.is_match(""); +} + // This is the fuzz target function. We duplicate it here since this is the // thing we use to interpret the data. It is ultimately what we want to // succeed. 
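The effect of the fix is easy to observe from regex-lite's public API, since stacked repetition operators nest the `Hir` one level per `*` without any recursive parse calls. A small sketch, assuming regex-lite 0.1.4 or newer:

```
fn main() {
    // 2^15 stacked `*` repetitions: each adds one layer of Hir nesting.
    let pat = format!(".{}", "*".repeat(1 << 15));
    // With the fix, this returns a nest-limit error quickly. Before the
    // fix, building (and later dropping) the deeply nested Hir could
    // overflow the stack.
    match regex_lite::Regex::new(&pat) {
        Ok(re) => {
            re.is_match("");
        }
        Err(err) => println!("rejected: {}", err),
    }
}
```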
From cd79881df40755707ad9f1944b5f34881e1172b0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 12:50:49 -0400 Subject: [PATCH 124/136] regex-lite-0.1.4 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index e09229723..704970f2f 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.3" #:version +version = "0.1.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From 4ae14720e9599830f653ca1a881b42e620eba11e Mon Sep 17 00:00:00 2001 From: Fabio Valentini Date: Sun, 15 Oct 2023 14:33:18 +0200 Subject: [PATCH 125/136] tests: fix compilation of doctests on 32-bit architectures PR #1107 --- regex-automata/src/util/captures.rs | 3 ++- src/regex/bytes.rs | 4 +++- src/regex/string.rs | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index cd3a5f8f7..05db6a993 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -433,7 +433,6 @@ impl Captures { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; @@ -445,6 +444,8 @@ impl Captures { /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); /// // Looking for a non-existent capturing group will return None: /// assert_eq!(None, caps.get_group(3)); + /// # // literals are too big for 32-bit usize: #1039 + /// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, caps.get_group(9944060567225171988)); /// /// # Ok::<(), Box>(()) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index c742b095a..19f5701af 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -2025,7 +2025,6 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` -/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); @@ -2038,7 +2037,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] diff --git a/src/regex/string.rs b/src/regex/string.rs index 177a2af34..880d6082a 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -2028,7 +2028,6 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` -/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); @@ -2041,7 +2040,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. 
/// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] From 0086dec69a77a9e1153e97cd050ab567b5c7f109 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 13:18:09 -0400 Subject: [PATCH 126/136] lite: fix stack overflow test It turns out that we missed another case where the stack could overflow: dropping a deeply nested Hir. Namely, since we permit deeply nested Hirs to be constructed and only reject them after determining they are too deeply nested, they still then need to be dropped. We fix this by implementing a custom a Drop impl that uses the heap to traverse the Hir and drop things without using unbounded stack space. An alternative way to fix this would be to adjust the parser somehow to avoid building deeply nested Hir values in the first place. But that seems trickier, so we just stick with this for now. --- regex-lite/src/hir/mod.rs | 60 ++++++++++++++++++++++++++++++++++++ regex-lite/src/hir/parse.rs | 6 ++-- regex-lite/tests/fuzz/mod.rs | 2 +- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index 3d61ce8c9..6e5348a5b 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -366,6 +366,24 @@ impl Hir { } } +impl HirKind { + /// Returns a slice of this kind's sub-expressions, if any. + fn subs(&self) -> &[Hir] { + use core::slice::from_ref; + + match *self { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct Class { pub(crate) ranges: Vec, @@ -747,3 +765,45 @@ fn prev_char(ch: char) -> Option { // and U+E000 yields a valid scalar value. 
Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) } + +impl Drop for Hir { + fn drop(&mut self) { + use core::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index 0dcccdd46..ca93b8838 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -1328,8 +1328,10 @@ fn into_class_item_range(hir: Hir) -> Result { } } -fn into_class_item_ranges(hir: Hir) -> Result, Error> { - match hir.kind { +fn into_class_item_ranges( + mut hir: Hir, +) -> Result, Error> { + match core::mem::replace(&mut hir.kind, HirKind::Empty) { HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), HirKind::Class(hir::Class { ranges }) => Ok(ranges), _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs index 747aab040..5a721f142 100644 --- a/regex-lite/tests/fuzz/mod.rs +++ b/regex-lite/tests/fuzz/mod.rs @@ -27,7 +27,7 @@ fn captures_wrong_order_min() { #[test] fn many_zero_to_many_reps() { let pat = format!(".{}", "*".repeat(1 << 15)); - let Ok(re) = regex_lite::RegexBuilder::new(&pat).build() else { return }; + let Ok(re) = regex_lite::Regex::new(&pat) else { return }; re.is_match(""); } From e7bd19dd3ebf4b1a861275f0353202bf93a39ab1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 15 Oct 2023 09:24:20 -0400 Subject: [PATCH 127/136] regex-lite-0.1.5 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 704970f2f..0ba53485b 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.4" #:version +version = "0.1.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From eb950f65e660a45c7e123f3c6fba9f2c86b4a256 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:20:25 -0400 Subject: [PATCH 128/136] automata/meta: revert broadening of reverse suffix optimization This reverts commit 8a8d599f9d2f2d78e9ad84e4084788c2d563afa5 and includes a regression test, as well as a tweak to a log message. Essentially, the broadening was improper. We have to be careful when dealing with suffixes as opposed to prefixes. Namely, my logic previously was that the broadening was okay because we were already doing it for the reverse inner optimization. But the reverse inner optimization works with prefixes, not suffixes. So the comparison wasn't quite correct. 
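Concretely, the sort of case the broadened optimization mishandled is the one captured by the regression test added later in this patch, via the ruff report linked below. As a runnable sketch against the fixed crate, assuming regex 1.10.2 or newer:

```
fn main() {
    let re = regex::Regex::new(r"(\\N\{[^}]+})|([{}])").unwrap();
    // The suffix-based prefilter must not cause the leftmost match to be
    // missed or misreported; the correct match here is `\N{snowman}`.
    let m = re.find(r"hiya \N{snowman} bye").unwrap();
    assert_eq!((5, 16), (m.start(), m.end()));
}
```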
This goes back to only applying the reverse suffix optimization when there is a non-empty single common suffix. Fixes #1110 Ref https://github.com/astral-sh/ruff/pull/7980 --- regex-automata/src/meta/strategy.rs | 39 +++++++++++++++++++---------- testdata/regression.toml | 15 +++++++++++ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 4cb3b29b9..04f2ba3c3 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1167,21 +1167,34 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixseq = crate::util::prefilter::suffixes(kind, hirs); - let Some(suffixes) = suffixseq.literals() else { - debug!( - "skipping reverse suffix optimization because \ - the extract suffix sequence is not finite", - ); - return Err(core); + let suffixes = crate::util::prefilter::suffixes(kind, hirs); + let lcs = match suffixes.longest_common_suffix() { + None => { + debug!( + "skipping reverse suffix optimization because \ + a longest common suffix could not be found", + ); + return Err(core); + } + Some(lcs) if lcs.is_empty() => { + debug!( + "skipping reverse suffix optimization because \ + the longest common suffix is the empty string", + ); + return Err(core); + } + Some(lcs) => lcs, }; - let Some(pre) = Prefilter::new(kind, suffixes) else { - debug!( - "skipping reverse suffix optimization because \ + let pre = match Prefilter::new(kind, &[lcs]) { + Some(pre) => pre, + None => { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); + ); + return Err(core); + } }; if !pre.is_fast() { debug!( @@ -1268,7 +1281,7 @@ impl ReverseSuffix { e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( - "using lazy DFA for reverse inner search at {:?}, \ + "using lazy DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, diff --git a/testdata/regression.toml b/testdata/regression.toml index 2954c9118..53b0701a3 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -813,3 +813,18 @@ name = "hir-optimization-out-of-order-class" regex = '^[[:alnum:]./-]+$' haystack = "a-b" matches = [[0, 3]] + +# This is a regression test for an improper reverse suffix optimization. This +# occurred when I "broadened" the applicability of the optimization to include +# multiple possible literal suffixes instead of only sticking to a non-empty +# longest common suffix. It turns out that, at least given how the reverse +# suffix optimization works, we need to stick to the longest common suffix for +# now. 
+# +# See: https://github.com/rust-lang/regex/issues/1110 +# See also: https://github.com/astral-sh/ruff/pull/7980 +[[test]] +name = 'improper-reverse-suffix-optimization' +regex = '(\\N\{[^}]+})|([{}])' +haystack = 'hiya \N{snowman} bye' +matches = [[[5, 16], [5, 16], []]] From 50fe7d177db6854ea1a2b1d04d3db75ec544f39c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:45:26 -0400 Subject: [PATCH 129/136] changelog: 1.10.2 --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5f31bec0..420e08f74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +1.10.2 (2023-10-16) +=================== +This is a new patch release that fixes a search regression where incorrect +matches could be reported. + +Bug fixes: + +* [BUG #1110](https://github.com/rust-lang/regex/issues/1110): +Revert broadening of reverse suffix literal optimization introduced in 1.10.1. + + 1.10.1 (2023-10-14) =================== This is a new patch release with a minor increase in the number of valid From 61242b1e0e9941dadc5ec7c6cd7391db3cca5710 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:45:33 -0400 Subject: [PATCH 130/136] regex-automata-0.4.3 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index f9f59feb3..3cb3d7c8e 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.2" #:version +version = "0.4.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 1a54a829ba730257cbb8ed53521db11be318c43e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:46:01 -0400 Subject: [PATCH 131/136] deps: bump regex-automata to 0.4.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 45132a906..55108a968 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.4.2" +version = "0.4.3" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 5f1f1c8b6db4d1fd373ef1ab4eab05a8f66c4235 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:46:02 -0400 Subject: [PATCH 132/136] 1.10.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 55108a968..3ba14c904 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.1" #:version +version = "1.10.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 20b5317f7a8accbf64ee21245b0a37f636017e13 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 Oct 2023 07:52:52 -0400 Subject: [PATCH 133/136] automata: fix panic in dense DFA deserialization This fixes a hole in the validation logic that accidentally permitted a dense DFA to contain a match state with zero pattern IDs. Since search code is permitted to assume that every match state has at least one corresponding pattern ID, this led to a panic. 
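For reference, the validation being hardened lives behind the checked deserialization entry point, so the new rejection surfaces as a `DeserializeError` rather than a panic during search. A round-trip sketch, assuming regex-automata with its default features (which include the `dfa` support used here):

```
use regex_automata::{
    dfa::{dense, Automaton},
    HalfMatch, Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new(r"foo[0-9]+")?;
    // Serialize, then deserialize through the checked API. `from_bytes`
    // validates the transition table, start table, match states and
    // accelerators before handing back a usable DFA.
    let (bytes, pad) = dfa.to_bytes_native_endian();
    let dfa2: dense::DFA<&[u32]> = dense::DFA::from_bytes(&bytes[pad..])?.0;
    let expected = Some(HalfMatch::must(0, 8));
    assert_eq!(expected, dfa2.try_search_fwd(&Input::new("foo12345"))?);
    Ok(())
}
```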
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63391 --- ...ata_deserialize_dense_dfa-5624222820728832 | Bin 0 -> 749 bytes regex-automata/src/dfa/dense.rs | 20 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 new file mode 100644 index 0000000000000000000000000000000000000000..e236ae735c7f413c90a0e9b61cc4add46ced15e7 GIT binary patch literal 749 zcmd5)TMED+469=gkFon-(j{nQDi!=03sh-q(;8}bMmo$a!2W|zr>V_O2(ZE8>!v1* z6U=#_h8}h#XfDExxv!cs^^Zrt{#L1#-lYZ{(nKt)H^)$Ct|9sII*#X6$oXD13{eTq K(N?9_h4%nug9@bp literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fd96bc878..6fc61dc4f 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -2340,8 +2340,8 @@ impl<'a> DFA<&'a [u32]> { // table, match states and accelerators below. If any validation fails, // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.tt)?; + dfa.tt.validate(&dfa)?; + dfa.st.validate(&dfa)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, @@ -3593,7 +3593,8 @@ impl> TransitionTable { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let sp = &dfa.special; for state in self.states() { // We check that the ID itself is well formed. That is, if it's // a special state then it must actually be a quit, dead, accel, @@ -3611,6 +3612,13 @@ impl> TransitionTable { wasn't actually special", )); } + if sp.is_match_state(state.id()) + && dfa.match_len(state.id()) == 0 + { + return Err(DeserializeError::generic( + "found match state with zero pattern IDs", + )); + } } for (_, to) in state.transitions() { if !self.is_valid(to) { @@ -4127,10 +4135,8 @@ impl> StartTable { /// it against the given transition table (which must be for the same DFA). /// /// That is, every state ID can be used to correctly index a state. - fn validate( - &self, - tt: &TransitionTable, - ) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let tt = &dfa.tt; if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal unanchored starting state ID", From 6b72eec64b428859702ae5ee811048112af5269e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 25 Oct 2023 09:37:53 -0400 Subject: [PATCH 134/136] syntax: add Hir::literal example for `char` The example shows a succinct way of creating an HIR literal from a `char` value by first encoding it to UTF-8. 
Closes #1114 --- regex-syntax/src/hir/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ce38ead7b..ae3ba318e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -322,6 +322,22 @@ impl Hir { /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` + /// + /// # Example: building a literal from a `char` + /// + /// This example shows how to build a single `Hir` literal from a `char` + /// value. Since a [`Literal`] is just bytes, we just need to UTF-8 + /// encode a `char` value: + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let ch = '☃'; + /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); + /// + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, got.kind()); + /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); From 662a8b93afa55b5c489f14bca83565ebe62ccf67 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Nov 2023 11:52:44 -0400 Subject: [PATCH 135/136] cli: change --no-captures to --captures (all|implicit|none) When we added the WhichCaptures type, we didn't update the CLI to expose the full functionality. This change does that. --- regex-automata/src/nfa/thompson/map.rs | 2 +- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-cli/args/flags.rs | 52 +++++++++++++++++++ regex-cli/args/thompson.rs | 24 +++------ 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c92d4c0b8..7f074a353 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 75c9b796b..cd77cc150 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-cli/args/flags.rs b/regex-cli/args/flags.rs index db8a847ef..61732a28e 100644 --- a/regex-cli/args/flags.rs +++ b/regex-cli/args/flags.rs @@ -152,3 +152,55 @@ impl std::str::FromStr for MatchKind { Ok(MatchKind { kind }) } } + +/// Provides an implementation of the --captures flag, for use with Thompson +/// NFA configuration. +#[derive(Debug)] +pub struct WhichCaptures { + pub which: regex_automata::nfa::thompson::WhichCaptures, +} + +impl WhichCaptures { + pub const USAGE: Usage = Usage::new( + "--captures ", + "One of: all, implicit or none.", + r#" +Selects which capture states should be included in the Thompson NFA. 
The +choices are 'all' (the default), 'implicit' or 'none'. + +'all' means that both explicit and implicit capture states are included. + +'implicit' means that only implicit capture states are included. That is, the +Thompson NFA will only be able to report the overall match offsets and not the +match offsets of each explicit capture group. + +'none' means that no capture states will be included. This is useful when +capture states aren't needed (like when building a DFA) or if they aren't +supported (like when building a reverse NFA). +"#, + ); +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures { + which: regex_automata::nfa::thompson::WhichCaptures::All, + } + } +} + +impl std::str::FromStr for WhichCaptures { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let which = match s { + "all" => regex_automata::nfa::thompson::WhichCaptures::All, + "implicit" => { + regex_automata::nfa::thompson::WhichCaptures::Implicit + } + "none" => regex_automata::nfa::thompson::WhichCaptures::None, + unk => anyhow::bail!("unrecognized captures option '{}'", unk), + }; + Ok(WhichCaptures { which }) + } +} diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 151fc6a0b..bd8388d11 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -70,11 +70,11 @@ impl Configurable for Config { Arg::Long("shrink") => { self.thompson = self.thompson.clone().shrink(true); } - Arg::Long("no-captures") => { - self.thompson = self - .thompson - .clone() - .which_captures(thompson::WhichCaptures::None); + Arg::Long("captures") => { + let which: flags::WhichCaptures = + args::parse(p, "--captures")?; + self.thompson = + self.thompson.clone().which_captures(which.which); } Arg::Long("line-terminator") => { let byte: flags::OneByte = @@ -136,19 +136,7 @@ spent shrinking the NFA can lead to far larger savings in the subsequent DFA determinization. "#, ), - Usage::new( - "--no-captures", - "Disable capture states.", - r#" -Disables capture states. By default, NFAs include special "capture" states that -instruct some regex engines (like the PikeVM) to record offset positions in -ancillary state. - -It can be useful to disable capture states in order to reduce "clutter" in the -automaton when debugging it. Also, at time of writing, reverse NFAs require -that capture groups are disabled. -"#, - ), + flags::WhichCaptures::USAGE, Usage::new( "--line-terminator", "Set the line terminator used by line anchors.", From 837fd85e79fac2a4ea64030411b9a4a7b17dfa42 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Nov 2023 11:53:34 -0400 Subject: [PATCH 136/136] regex-cli-0.2.0 --- regex-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 3fe5390aa..a107c09df 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.1" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular