From 43ba6b810f2e3f60d7a57f4bcc8e1831b78a011c Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 11 Jul 2023 03:10:29 +0200 Subject: [PATCH 001/136] syntax: improve literal extraction from certain repetitions When repetitions didn't have an explicit max value, like in `(ab){2,}` the literal extractor was producing sub-optimal literals, like `"ab"` instead of `"abab"`. Close #1032 --- regex-syntax/src/hir/literal.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 9461db989..afcd506e0 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -477,7 +477,7 @@ impl Extractor { } seq } - hir::Repetition { min, max: Some(max), .. } if min < max => { + hir::Repetition { min, .. } => { assert!(min > 0); // handled above let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); @@ -491,10 +491,6 @@ impl Extractor { seq.make_inexact(); seq } - hir::Repetition { .. } => { - subseq.make_inexact(); - subseq - } } } @@ -2655,6 +2651,12 @@ mod tests { ]), e(r"(ab|cd)(ef|gh)(ij|kl)") ); + + assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); } #[test] From 5e8eaf1f7ab92b68bfabaa004561ccb1269ecb9f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 10 Jul 2023 21:10:57 -0400 Subject: [PATCH 002/136] regex-syntax-0.7.4 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e6d7965be..b7a149c23 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.3" #:version +version = "0.7.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From bbb285b81fd1108536eedc52990a95f30ca6bdf5 Mon Sep 17 00:00:00 2001 From: CosmicHorror Date: Mon, 10 Jul 2023 20:53:21 -0600 Subject: [PATCH 003/136] regex-cli: update installation instructions PR #1035 --- regex-cli/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/regex-cli/README.md b/regex-cli/README.md index 36dc50e77..376d89091 100644 --- a/regex-cli/README.md +++ b/regex-cli/README.md @@ -7,11 +7,10 @@ various regex development tasks such as generating tests. ### Installation -Currently `regex-cli` is not on crates.io and should be installed from this -git repository: +Simply use `cargo` to install from crates.io. ``` -$ cargo install --git https://github.com/rust-lang/regex regex-cli +$ cargo install regex-cli ``` From 40585afe940294bc50aad7fc563588668f860f51 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 09:05:44 -0400 Subject: [PATCH 004/136] pikevm: fix anchored search bug This fixes a bug where one could ask the PikeVM to perform an anchored search, but in some cases it could return a match where the start of the match is greater than the start of the search. For example, an anchored search of the pattern '.c' on the haystack 'abc' starting at '0' would report a match at '1..3'. No other engine (other than the meta engine, which we'll address in a subsequent commit) had this bug. The issue in the pikevm was our simulation of the '(?s-u:.)*?' prefix for implementing unanchored searches. 
Namely, instead of using the NFA itself to implement the unanchored search (it has both unanchored and anchored start states), the PikeVM simulates it in code for performance reasons. This simulation was actually incorrect for the anchored case, because we were re-computing the epsilon closure for every step in the search. Effectively, we were simulating an unanchored search unconditionally. Now the reason why this bug wasn't caught is because the PikeVM only gets things half wrong. Namely, the regex '[b-z]c' does not match 'abc' when starting the search at offset '0' and that's correct. The reason is that the '[b-z]' doesn't match 'a', whereas '.' in the aforementioned regex does. Since the PikeVM doesn't match there, its current list of states becomes empty, and *this* case is anchor-aware and knows not to continue the search in this case. In other words, the PikeVM only half-implemented the unanchored search simulation. It gets it right in some cases, but not all. We fix the bug by requiring that we only do the epsilon closure when the search is unanchored, or if it's anchored, that the current position is at the start of the search. We add a regression test from #1036 as well. Partially resolves #1036 --- regex-automata/src/nfa/thompson/pikevm.rs | 10 +++++++++- testdata/anchored.toml | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d737fb71e..79ce3c60d 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1356,7 +1356,15 @@ impl PikeVM { // matches their behavior. (Generally, 'allmatches' is useful for // overlapping searches or leftmost anchored searches to find the // longest possible match by ignoring match priority.) - if !pid.is_some() || allmatches { + // + // Additionally, when we're running an anchored search, this + // epsilon closure should only be computed at the beginning of the + // search. If we re-computed it at every position, we would be + // simulating an unanchored search when we were tasked to perform + // an anchored search. + if (!pid.is_some() || allmatches) + && (!anchored || at == input.start()) + { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is // guaranteed to have the right length but where every element diff --git a/testdata/anchored.toml b/testdata/anchored.toml index cca561de1..7023335ec 100644 --- a/testdata/anchored.toml +++ b/testdata/anchored.toml @@ -69,3 +69,13 @@ haystack = 'abcβ' matches = [[0, 3]] anchored = true unicode = false + +# Tests that '.c' doesn't match 'abc' when performing an anchored search from +# the beginning of the haystack. This test found two different bugs in the +# PikeVM and the meta engine. +[[test]] +name = "no-match-at-start" +regex = '.c' +haystack = 'abc' +matches = [] +anchored = true From 70c7f575a24c1f465fcd51b3f2d0e25ba935da6e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 09:49:33 -0400 Subject: [PATCH 005/136] meta: fix anchored search bugs It turns out that all three of the "reverse" optimizations in the meta regex engine did not support anchored searches correctly. This was intended, and in particular, none of these optimizations are active when the regex is anchored at the beginning. However, a caller can still request an anchored search even when the regex itself isn't anchored.
In this case, the general best approach is to just do a standard forward regex search. Namely, the reverse suffix and reverse inner optimizations are generally throughput optimizations, and anchored searches tend to be more heavily dominated by latency. Now it is plausible that we will want to do some optimizations in the anchored case. For example, we might want to confirm that a required literal is in the haystack before running a standard forward regex search. But I think that's future work and will probably benefit from being a distinct strategy. It's also somewhat tricky to do because while it will make performance in the "no match" case much better, it will likely regress performance in the "always match" case. Anyway, we add more regression tests covering all of these cases and fix the bug. We fix it by just checking whether the caller requested an anchored search, and if so, fall back to the core engine. Fixes #1036 --- regex-automata/src/meta/strategy.rs | 49 +++++++++++++++++++++++++++++ testdata/anchored.toml | 46 +++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 2de2c385e..aa1d61ef3 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -845,6 +845,14 @@ impl ReverseAnchored { ); return Err(core); } + // Note that the caller can still request an anchored search even when + // the regex isn't anchored at the start. We detect that case in the + // search routines below and just fallback to the core engine. This + // is fine because both searches are anchored. It's just a matter of + // picking one. Falling back to the core engine is a little simpler, + // since if we used the reverse anchored approach, we'd have to add an + // extra check to ensure the match reported starts at the place where + // the caller requested the search to start. if core.info.is_always_anchored_start() { debug!( "skipping reverse anchored optimization because \ @@ -930,6 +938,9 @@ impl Strategy for ReverseAnchored { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -948,6 +959,9 @@ impl Strategy for ReverseAnchored { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -973,6 +987,9 @@ impl Strategy for ReverseAnchored { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -1034,6 +1051,13 @@ impl ReverseSuffix { // requires a reverse scan after a literal match to confirm or reject // the match. (Although, in the case of confirmation, it then needs to // do another forward scan to find the end position.) + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. 
Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. if core.info.is_always_anchored_start() { debug!( "skipping reverse suffix optimization because \ @@ -1211,6 +1235,9 @@ impl Strategy for ReverseSuffix { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix optimization failed: {}", _err); @@ -1255,6 +1282,9 @@ impl Strategy for ReverseSuffix { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix half optimization failed: {}", _err); @@ -1309,6 +1339,9 @@ impl Strategy for ReverseSuffix { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; @@ -1396,6 +1429,13 @@ impl ReverseInner { // or when the literal scan matches. If it matches, then confirming the // match requires a reverse scan followed by a forward scan to confirm // or reject, which is a fair bit of work. + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. if core.info.is_always_anchored_start() { debug!( "skipping reverse inner optimization because \ @@ -1635,6 +1675,9 @@ impl Strategy for ReverseInner { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner optimization failed: {}", _err); @@ -1654,6 +1697,9 @@ impl Strategy for ReverseInner { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner half optimization failed: {}", _err); @@ -1675,6 +1721,9 @@ impl Strategy for ReverseInner { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; diff --git a/testdata/anchored.toml b/testdata/anchored.toml index 7023335ec..0f2248d09 100644 --- a/testdata/anchored.toml +++ b/testdata/anchored.toml @@ -79,3 +79,49 @@ regex = '.c' haystack = 'abc' matches = [] anchored = true + +# Like above, but at a non-zero start offset. 
+[[test]] +name = "no-match-at-start-bounds" +regex = '.c' +haystack = 'aabc' +bounds = [1, 4] +matches = [] +anchored = true + +# This is like no-match-at-start, but hits the "reverse inner" optimization +# inside the meta engine. (no-match-at-start hits the "reverse suffix" +# optimization.) +[[test]] +name = "no-match-at-start-reverse-inner" +regex = '.c[a-z]' +haystack = 'abcz' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-reverse-inner-bounds" +regex = '.c[a-z]' +haystack = 'aabcz' +bounds = [1, 5] +matches = [] +anchored = true + +# Same as no-match-at-start, but applies to the meta engine's "reverse +# anchored" optimization. +[[test]] +name = "no-match-at-start-reverse-anchored" +regex = '.c[a-z]$' +haystack = 'abcz' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-reverse-anchored-bounds" +regex = '.c[a-z]$' +haystack = 'aabcz' +bounds = [1, 5] +matches = [] +anchored = true From 961a882e9408b794eb8a9294c04e0aa20a32d95f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 10:14:34 -0400 Subject: [PATCH 006/136] regex-automata-0.3.3 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 86eb7d8f5..1936cf783 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.2" #:version +version = "0.3.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 48daadc0dc5865ced38495258cfffcd3951682e4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 12 Jul 2023 11:10:50 -0400 Subject: [PATCH 007/136] regex-automata/test: ignore some tests in 32-bit targets One of these tests (the captures one) is very specific to 64-bit since it uses a numeric literal that is bigger than what can be fit into 32 bits. The other two tests, for determinize_size_limit, are not specific to 64-bit targets but do somewhat depend on the specific memory usages in play. We could probably find some limits that work for both 32-bit and 64-bit, but since 'cross' doesn't run doc tests, doing this is pretty annoying. So just ignore the tests. Fixes #1039 --- regex-automata/src/dfa/dense.rs | 2 ++ regex-automata/src/util/captures.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 35f037ca6..75ca85e6e 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -879,6 +879,7 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// /// // 600KB isn't enough! 
@@ -912,6 +913,7 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{ /// dfa::{dense, Automaton, StartKind}, /// Anchored, Input, diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index 60b6df7e2..c6517348d 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -433,6 +433,7 @@ impl Captures { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; From 855c5c4d984e0cfcfd8557ddacdf55eb05828bf2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 13 Jul 2023 09:44:28 -0400 Subject: [PATCH 008/136] fuzz: add all fuzzers to OSS-fuzz I forgot to do this step, and as a result, OSS-fuzz hasn't been running any of the new fuzzers. Hopefully this is enough. Ref #1037 --- fuzz/Cargo.toml | 3 +++ fuzz/oss-fuzz-build.sh | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 8688e73e0..a7eec2c81 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -32,6 +32,9 @@ features = ["atty", "humantime", "termcolor"] [workspace] members = ["."] +# NOTE: If you add a new fuzzer below, please make sure to add it to the +# oss-fuzz-build.sh script, otherwise it won't get run in OSS-fuzz. + [[bin]] name = "fuzz_regex_match" path = "fuzz_targets/fuzz_regex_match.rs" diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index 38750250b..f96474739 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -1,4 +1,18 @@ #!/bin/bash -eu + cd $SRC/regex -cargo fuzz build -O --debug-assertions -cp fuzz/target/x86_64-unknown-linux-gnu/release/fuzz_regex_match $OUT/ +cargo fuzz build -O --debug-assertions + +targets=( + fuzz_regex_match + fuzz_regex_lite_match + fuzz_regex_automata_deserialize_dense_dfa + fuzz_regex_automata_deserialize_sparse_dfa + ast_roundtrip + ast_fuzz_match + ast_fuzz_regex + ast_fuzz_match_bytes +) +for target in "${targets[@]}"; do + cp fuzz/target/x86_64-unknown-linux-gnu/release/$target $OUT/ +done From e55e96ce3aa2744cb2ca2bbd2d41f49d4171fb4f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Jul 2023 08:07:13 -0400 Subject: [PATCH 009/136] doc: clarify ambiguous wording Fixes #1050 --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 191aa2e1a..cd98be103 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -665,8 +665,8 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). Any named character class may appear inside a bracketed `[...]` character -class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII -digit. `[\p{Greek}&&\pL]` matches Greek letters. +class. For example, `[\p{Greek}[:digit:]]` matches any ASCII digit or any +codepoint in the `Greek` script. `[\p{Greek}&&\pL]` matches Greek letters. 
Precedence in character classes, from most binding to least: From 7bc8f884257ecd53e0599e9f1ae97a3ed751d99c Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Fri, 21 Jul 2023 20:10:13 +0800 Subject: [PATCH 010/136] doc: update the old UTF-8 automata algorithm in comment regex-cli went through a few iterations before its initial release, but this means some comments in the code that reference it are now probably stale. This fixes one of them. PR #1049 --- regex-automata/src/nfa/thompson/compiler.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 497fc62b4..2021d93ea 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1319,7 +1319,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // - // regex-cli find nfa thompson pikevm -b @$smallishru '(?m)^\w{20}' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes @@ -1338,7 +1338,7 @@ impl Compiler { .map(|rng| self.c_range(rng.start, rng.end)); self.c_concat(it) }); - self.c_alt(it) + self.c_alt_iter(it) */ } From 87f7f3f5125c98eb8c831b3f6ac3688526d5b331 Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Sun, 23 Jul 2023 20:05:00 +0800 Subject: [PATCH 011/136] automata/doc: fix typo when describing implicit unanchored prefix PR #1052 --- regex-automata/src/util/search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index b7bf934ea..39aec522b 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -246,7 +246,7 @@ impl<'h> Input<'h> { /// When a search is anchored (so that's [`Anchored::Yes`] or /// [`Anchored::Pattern`]), a match must begin at the start of a search. /// When a search is not anchored (that's [`Anchored::No`]), regex engines - /// will behave as if the pattern started with a `(?:s-u.)*?`. This prefix + /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix /// permits a match to appear anywhere. /// /// By default, the anchored mode is [`Anchored::No`]. 
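The anchored-search fixes in patches 004 and 005, and the unanchored prefix notation corrected in patch 011, all surface through the same API. A minimal sketch of the regression from #1036, assuming the regex-automata 0.3 `Input`/`Anchored` API:

```rust
use regex_automata::{nfa::thompson::pikevm::PikeVM, Anchored, Input};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new(r".c")?;
    let mut cache = re.create_cache();
    // An anchored search must begin its match at the search start
    // position, so '.c' must not match 'abc' here. Reporting a match
    // at 1..3 was the bug fixed in patches 004 and 005.
    let input = Input::new("abc").anchored(Anchored::Yes);
    assert_eq!(None, re.find(&mut cache, input));
    // An unanchored search behaves as if the pattern were prefixed
    // with '(?s-u:.)*?' and so finds 'bc' at 1..3.
    let input = Input::new("abc").anchored(Anchored::No);
    assert!(re.find(&mut cache, input).is_some());
    Ok(())
}
```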
From 9a8720f6b5d946cdb2d2e9be92986e595cda60c5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 28 Jul 2023 19:42:33 -0400 Subject: [PATCH 012/136] automata: bump regex-syntax to latest version Fixes #1056 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 1936cf783..c64df5efc 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -85,7 +85,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.5.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.7.0", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.7.4", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" From a1910244f873e003efbe6e80ad9302c8ea949430 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 28 Jul 2023 19:43:04 -0400 Subject: [PATCH 013/136] regex-automata-0.3.4 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index c64df5efc..1f423c605 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.3" #:version +version = "0.3.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From e10c9d7b56d3f33f48abf487a3d353f64f67897b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 3 Aug 2023 11:11:15 -0400 Subject: [PATCH 014/136] automata: add new 'WhichCaptures' config This is the first step in fixing a regression in memory usage. The underlying problem is that regex-automata now natively supports multi-pattern regexes *with* capturing support. Unfortunately though, this overall doesn't work too well with the current design of the PikeVM, because the amount of memory used is `len(captures) * len(states)`. So basically, as the regex and number of captures increases, the amount of memory used gets quite high. This is new functionality that we hope to improve upon over time, so it's not too big of a deal on its own. But it turns out this impacts previous uses of RegexSet that have capture groups. The old implementation just ignored these capture groups because they weren't supported in a RegexSet, and thus there were no memory problems. But in the new implementation, nothing tells it that it's okay to ignore the capture groups. So it winds up allocating space for them even though the RegexSet APIs don't provide any of that functionality. So my plan to fix this is to introduce a new configuration knob for controlling more granularly which capture states are compiled into the NFA. Previously we only supported "all of them" or "none of them." This commit adds a new (backwards compatible) knob that also permits "just implicit groups." That is, one capture group per pattern. This hopefully leads to less memory usage overall. (Well, it will certainly be less, but hopefully it's a big reduction.) We don't actually change anything here. We just add a new `Config::which_captures` knob, implement the existing `Config::captures` in terms of `Config::which_captures` and deprecate `Config::captures`.
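As a minimal sketch of how the new knob is meant to be used (names as introduced in this patch; the compiler only starts honoring `Implicit` in a follow-up patch):

```rust
use regex_automata::nfa::thompson::{NFA, WhichCaptures};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The deprecated `captures(true)`/`captures(false)` now forward to
    // `WhichCaptures::All`/`WhichCaptures::None`. `Implicit` is the new
    // middle ground: one capture group per pattern, just enough to
    // report overall match offsets.
    let nfa = NFA::compiler()
        .configure(NFA::config().which_captures(WhichCaptures::Implicit))
        .build_many(&[r"(a)(b)", r"(x)(y)"])?;
    // Only the implicit (whole-match) group of each pattern remains.
    assert_eq!(2, nfa.group_info().all_group_len());
    Ok(())
}
```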
If this winds up not being sufficient, then we may need to adapt the PikeVM to work without any capture groups at all and instead just report which patterns match. Which is... probably fine? --- regex-automata/src/dfa/dense.rs | 5 +- regex-automata/src/hybrid/dfa.rs | 5 +- regex-automata/src/meta/strategy.rs | 11 +- regex-automata/src/nfa/thompson/compiler.rs | 174 +++++++++++++++++--- regex-automata/src/nfa/thompson/mod.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 8 +- regex-automata/src/util/captures.rs | 16 +- 7 files changed, 182 insertions(+), 39 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 75ca85e6e..6da865f97 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1170,7 +1170,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(&nfa) diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 86963248f..67261c1a3 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3973,7 +3973,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(nfa) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index aa1d61ef3..52a501bf6 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -13,7 +13,7 @@ use crate::{ regex::{Cache, RegexInfo}, reverse_inner, wrappers, }, - nfa::thompson::{self, NFA}, + nfa::thompson::{self, WhichCaptures, NFA}, util::{ captures::{Captures, GroupInfo}, look::LookMatcher, @@ -452,7 +452,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .captures(true) + .which_captures(WhichCaptures::All) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) @@ -499,7 +499,10 @@ impl Core { // useful with capturing groups in reverse. And of course, // the lazy DFA ignores capturing groups in all cases. 
.configure( - thompson_config.clone().captures(false).reverse(true), + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), ) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; @@ -1480,7 +1483,7 @@ impl ReverseInner { .utf8(core.info.config().get_utf8_empty()) .nfa_size_limit(core.info.config().get_nfa_size_limit()) .shrink(false) - .captures(false) + .which_captures(WhichCaptures::None) .look_matcher(lookm); let result = thompson::Compiler::new() .configure(thompson_config) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2021d93ea..6cc79822a 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -30,7 +30,7 @@ pub struct Config { reverse: Option, nfa_size_limit: Option>, shrink: Option, - captures: Option, + which_captures: Option, look_matcher: Option, #[cfg(test)] unanchored_prefix: Option, @@ -178,12 +178,15 @@ impl Config { /// ``` /// use regex_automata::{ /// dfa::{self, Automaton}, - /// nfa::thompson::NFA, + /// nfa::thompson::{NFA, WhichCaptures}, /// HalfMatch, Input, /// }; /// /// let dfa = dfa::dense::Builder::new() - /// .thompson(NFA::config().captures(false).reverse(true)) + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) /// .build("baz[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!( @@ -277,10 +280,12 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Currently we have to disable captures when enabling reverse NFA. - /// let config = NFA::config().captures(false).reverse(true); + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); /// let not_shrunk = NFA::compiler() /// .configure(config.clone().shrink(false)) /// .build(r"\w")?; @@ -314,18 +319,70 @@ impl Config { /// require capturing groups to be present in the NFA. Building a Pike VM /// with an NFA without capturing groups will result in an error. /// + /// (Note that since this method is deprecated, the example below uses + /// [`Config::which_captures`] to disable capture states.) + /// /// ``` - /// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA}; + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; /// assert!(PikeVM::new_from_nfa(nfa).is_err()); /// /// # Ok::<(), Box>(()) /// ``` - pub fn captures(mut self, yes: bool) -> Config { - self.captures = Some(yes); + #[deprecated(since = "0.3.5", note = "use which_captures instead")] + pub fn captures(self, yes: bool) -> Config { + self.which_captures(if yes { + WhichCaptures::All + } else { + WhichCaptures::None + }) + } + + /// Configures what kinds of capture groups are compiled into + /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a + /// Thompson NFA. + /// + /// Currently, using any option except for [`WhichCaptures::None`] requires + /// disabling the [`reverse`](Config::reverse) setting. If both are + /// enabled, then the compiler will return an error. It is expected that + /// this limitation will be lifted in the future. 
+ /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the + /// overhead of capture states for explicit groups. Usually this occurs + /// when one wants to use the `PikeVM` only for determining the overall + /// match. Otherwise, the `PikeVM` could use much more memory than is + /// necessary. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing groups to be present in the NFA. Building a Pike VM + /// with an NFA without capturing groups will result in an error. + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } @@ -405,8 +462,14 @@ impl Config { } /// Return whether NFA compilation is configured to produce capture states. + #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] pub fn get_captures(&self) -> bool { - self.captures.unwrap_or(true) + self.get_which_captures().is_any() + } + + /// Return what kinds of capture states will be compiled into an NFA. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) } /// Return the look-around matcher for this NFA. @@ -439,7 +502,7 @@ impl Config { reverse: o.reverse.or(self.reverse), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), shrink: o.shrink.or(self.shrink), - captures: o.captures.or(self.captures), + which_captures: o.which_captures.or(self.which_captures), look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), #[cfg(test)] unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), } } } +/// A configuration indicating which kinds of +/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. +/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA).
+ None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + /* This compiler below uses Thompson's construction algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph @@ -800,7 +914,9 @@ impl Compiler { if exprs.len() > PatternID::LIMIT { return Err(BuildError::too_many_patterns(exprs.len())); } - if self.config.get_reverse() && self.config.get_captures() { + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { return Err(BuildError::unsupported_captures()); } @@ -978,7 +1094,7 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if !self.config.get_captures() { + if self.config.get_which_captures().is_none() { return self.c(expr); } @@ -1728,9 +1844,15 @@ mod tests { util::primitives::{PatternID, StateID}, }; + use super::*; + fn build(pattern: &str) -> NFA { NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build(pattern) .unwrap() } @@ -1794,7 +1916,7 @@ mod tests { #[test] fn compile_unanchored_prefix() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false)) + .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( @@ -1827,7 +1949,11 @@ mod tests { // Check that non-UTF-8 literals work. 
let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); @@ -1937,7 +2063,7 @@ mod tests { let nfa = NFA::compiler() .configure( NFA::config() - .captures(false) + .which_captures(WhichCaptures::None) .reverse(true) .shrink(false) .unanchored_prefix(false), @@ -1965,7 +2091,11 @@ mod tests { #[test] fn compile_many_start_pattern() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( @@ -1993,7 +2123,9 @@ mod tests { use regex_syntax::hir::{Class, ClassBytes, Hir}; let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); @@ -2005,7 +2137,9 @@ mod tests { use regex_syntax::hir::{Class, ClassUnicode, Hir}; let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); diff --git a/regex-automata/src/nfa/thompson/mod.rs b/regex-automata/src/nfa/thompson/mod.rs index 3581d738c..cf426736d 100644 --- a/regex-automata/src/nfa/thompson/mod.rs +++ b/regex-automata/src/nfa/thompson/mod.rs @@ -78,4 +78,4 @@ pub use self::{ }, }; #[cfg(feature = "syntax")] -pub use compiler::{Compiler, Config}; +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 86131406c..2108fa338 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -453,10 +453,10 @@ impl NFA { /// predict the anchored starting state. /// /// ``` - /// use regex_automata::nfa::thompson::{NFA, State}; + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("a")?; /// let state = nfa.state(nfa.start_anchored()); /// match *state { @@ -711,7 +711,7 @@ impl NFA { /// or not. /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Obviously has capture states. /// let nfa = NFA::new("(a)")?; @@ -733,7 +733,7 @@ impl NFA { /// // Notice that 'has_capture' is false here even when we have an /// // explicit capture group in the pattern. 
/// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("(a)")?; /// assert!(!nfa.has_capture()); /// diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index c6517348d..cd3a5f8f7 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -1810,10 +1810,10 @@ impl GroupInfo { /// panic even if captures aren't enabled on this NFA: /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build_many(&[ /// r"(?Pa)", /// r"a", @@ -1958,7 +1958,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -1970,13 +1970,13 @@ impl GroupInfo { /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. @@ -2000,7 +2000,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -2017,13 +2017,13 @@ impl GroupInfo { /// assert_eq!(5, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. From a2ec566c7fc0f878effa7e4d36e8cbd4c51dcf71 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:12:36 -0400 Subject: [PATCH 015/136] automata: respect new 'which_captures' option The NFA compiler now implements the 'All', 'Implicit' and 'None' options. We also add some targeted unit tests to confirm basic behavior. 
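The unit tests added below pin down the exact NFA state layouts; condensed into a sketch, the observable effect on group counts looks like this (assuming the `WhichCaptures` API from the previous patch):

```rust
use regex_automata::nfa::thompson::{NFA, WhichCaptures};

fn group_len(which: WhichCaptures) -> usize {
    let nfa = NFA::compiler()
        .configure(NFA::config().which_captures(which))
        .build("a(b)c")
        .unwrap();
    nfa.group_info().all_group_len()
}

fn main() {
    assert_eq!(2, group_len(WhichCaptures::All)); // implicit group + (b)
    assert_eq!(1, group_len(WhichCaptures::Implicit)); // whole match only
    assert_eq!(0, group_len(WhichCaptures::None));
}
```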
--- regex-automata/src/nfa/thompson/compiler.rs | 90 ++++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 6cc79822a..fc3e57710 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1094,8 +1094,13 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if self.config.get_which_captures().is_none() { - return self.c(expr); + match self.config.get_which_captures() { + // No capture states means we always skip them. + WhichCaptures::None => return self.c(expr), + // Implicit captures states means we only add when index==0 since + // index==0 implies the group is implicit. + WhichCaptures::Implicit if index > 0 => return self.c(expr), + _ => {} } let start = self.add_capture_start(index, name)?; @@ -1841,7 +1846,7 @@ mod tests { use crate::{ nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, StateID}, + util::primitives::{PatternID, SmallIndex, StateID}, }; use super::*; @@ -1903,6 +1908,15 @@ mod tests { } } + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + fn s_fail() -> State { State::Fail } @@ -2144,4 +2158,74 @@ mod tests { NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } + + #[test] + fn compile_captures_all() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::All), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_cap(3, 0, 1, 2), + s_byte(b'b', 4), + s_cap(5, 0, 1, 3), + s_byte(b'c', 6), + s_cap(7, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(2, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_implicit() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::Implicit), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_byte(b'b', 3), + s_byte(b'c', 4), + s_cap(5, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(1, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_none() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::None), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] + ); + let ginfo = nfa.group_info(); + assert_eq!(0, ginfo.all_group_len()); + } } From 04b11b6e190dc98a7de14fccc4f50c08fcd31237 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:50:23 -0400 Subject: [PATCH 016/136] automata: add 'which_captures' knob to meta::Regex This propagates the new Thompson NFA compiler option to the meta regex config API. 
--- regex-automata/src/meta/regex.rs | 80 +++++++++++++++++++++++++++++ regex-automata/src/meta/strategy.rs | 2 +- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 6e16ceedb..bc043793d 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -16,6 +16,7 @@ use crate::{ strategy::{self, Strategy}, wrappers, }, + nfa::thompson::WhichCaptures, util::{ captures::{Captures, GroupInfo}, iter, @@ -2429,6 +2430,7 @@ pub struct Config { utf8_empty: Option<bool>, autopre: Option<bool>, pre: Option<Option<Prefilter>>, + which_captures: Option<WhichCaptures>, nfa_size_limit: Option<Option<usize>>, onepass_size_limit: Option<Option<usize>>, hybrid_cache_capacity: Option<usize>, @@ -2619,6 +2621,75 @@ impl Config { Config { pre: Some(pre), ..self } } + /// Configures what kinds of groups are compiled as "capturing" in the + /// underlying regex engine. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the + /// overhead of capture states for explicit groups. + /// + /// Note that another approach to avoiding the overhead of capture groups + /// is by using non-capturing groups in the regex pattern. That is, + /// `(?:a)` instead of `(a)`. This option is useful when you can't control + /// the concrete syntax but know that you don't need the underlying capture + /// states. For example, using `WhichCaptures::Implicit` will behave as if + /// all explicit capturing groups in the pattern were non-capturing. + /// + /// Setting this to `WhichCaptures::None` may result in an error when + /// building a meta regex. + /// + /// # Example + /// + /// This example demonstrates how the results of capture groups can change + /// based on this option. First we show the default (all capture groups in + /// the pattern are capturing): + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, Span}; + /// + /// let re = Regex::new(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And now we show the behavior when we only include implicit capture + /// groups. In this case, we can only find the overall match span, but the + /// spans of any other explicit group don't exist because they are treated + /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, + /// there is no real point in using [`Regex::captures`] since it will never + /// be able to report more information than [`Regex::find`].) + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(None, caps.get_group(1)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } /// Sets the size limit, in bytes, to enforce on the construction of every /// NFA built by the meta regex engine.
/// @@ -2983,6 +3054,14 @@ impl Config { self.pre.as_ref().unwrap_or(&None).as_ref() } + /// Returns the capture configuration, as set by + /// [`Config::which_captures`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. @@ -3126,6 +3205,7 @@ impl Config { utf8_empty: o.utf8_empty.or(self.utf8_empty), autopre: o.autopre.or(self.autopre), pre: o.pre.or_else(|| self.pre.clone()), + which_captures: o.which_captures.or(self.which_captures), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), onepass_size_limit: o .onepass_size_limit diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 52a501bf6..86610fbea 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -452,7 +452,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .which_captures(WhichCaptures::All) + .which_captures(info.config().get_which_captures()) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) From 3127e3b57d5cfe491c86fcc8a2a451ab666beb8f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:51:36 -0400 Subject: [PATCH 017/136] regex: use new 'which_captures' knob for RegexSet While this reduces memory usage by half, unfortunately, it's still quite a bit more than memory usage prior to regex 1.9. This is because we are still allocating room to store two offsets per regex for a rather large regex. --- src/builders.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/builders.rs b/src/builders.rs index d19a0ffe2..a0f9b28b5 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -28,7 +28,9 @@ use alloc::{ vec::Vec, }; -use regex_automata::{meta, util::syntax, MatchKind}; +use regex_automata::{ + meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, +}; use crate::error::Error; @@ -100,8 +102,12 @@ impl Builder { } fn build_many_string(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(true); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(true) + .which_captures(WhichCaptures::Implicit); let syntaxc = self.syntaxc.clone().utf8(true); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -113,8 +119,12 @@ impl Builder { } fn build_many_bytes(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(false); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(false) + .which_captures(WhichCaptures::Implicit); let syntaxc = self.syntaxc.clone().utf8(false); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() From e29b915c3878fadfe63041f29ddb5f4a5bfc4f8d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 13:54:56 -0400 Subject: [PATCH 018/136] automata: make PikeVM and backtracker work without capture states Previously, construction of these engines checked to make sure the NFA given had some capture states in it. If the NFA didn't, construction failed with an error. 
To support the case where the NFA has no capture states at all (to avoid gratuitous memory allocation), we remove this restriction and tweak the engine implementations to stop assuming that the NFA has capture states. This turned out to not be too hard, as we only assumed as much in a few places. The main reason why this restriction existed in the first place was semantics. Namely, it's important that the PikeVM remain infallible. But what happens when you ask for match offsets in a search with an NFA that has no capture states? The PikeVM just doesn't support that. Previously it would panic (and thus the reason construction would fail). But now instead it will just report "no match." It's a little hokey, but we justify it to ourselves because "simplicity" and "avoids footguns" are non-goals of this crate. --- regex-automata/src/meta/regex.rs | 6 ++- regex-automata/src/nfa/thompson/backtrack.rs | 29 ++++++------ regex-automata/src/nfa/thompson/compiler.rs | 46 +++++++++++++++----- regex-automata/src/nfa/thompson/error.rs | 12 ----- regex-automata/src/nfa/thompson/pikevm.rs | 40 +++++++---------- 5 files changed, 72 insertions(+), 61 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index bc043793d..0d40eaa40 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -2635,8 +2635,10 @@ impl Config { /// states. For example, using `WhichCaptures::Implicit` will behave as if /// all explicit capturing groups in the pattern were non-capturing. /// - /// Setting this to `WhichCaptures::None` may result in an error when - /// building a meta regex. + /// Setting this to `WhichCaptures::None` is usually not the right thing to + /// do. When no capture states are compiled, some regex engines (such as + /// the `PikeVM`) won't be able to report match offsets. This will manifest + /// as no match being found. /// /// # Example /// diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 75b6c096b..c68f9fa42 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -300,15 +300,6 @@ impl Builder { &self, nfa: NFA, ) -> Result { - // If the NFA has no captures, then the backtracker doesn't work since - // it relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the backtracker to work with regexes with zero patterns. 
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -954,8 +945,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; return Ok(Some(Match::new(pid, Span { start, end }))); } let ginfo = self.get_nfa().group_info(); @@ -965,8 +962,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; Ok(Some(Match::new(pid, Span { start, end }))) } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index fc3e57710..065e9ef27 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -316,8 +316,8 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. /// /// (Note that since this method is deprecated, the example below uses /// [`Config::which_captures`] to disable capture states.) @@ -329,10 +329,13 @@ impl Config { /// WhichCaptures, /// }; /// - /// let nfa = NFA::compiler() - /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; - /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); /// /// # Ok::<(), Box>(()) /// ``` @@ -364,8 +367,8 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. 
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     pikevm::PikeVM,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
-    /// let nfa = NFA::compiler()
-    ///     .configure(NFA::config().which_captures(WhichCaptures::None))
+    /// let re = PikeVM::builder()
+    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
+    ///     .build(r"[a-z]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert!(re.is_match(&mut cache, "abc"));
+    /// assert_eq!(None, re.find(&mut cache, "abc"));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// The same applies to the bounded backtracker:
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::{
+    ///     backtrack::BoundedBacktracker,
+    ///     NFA,
+    ///     WhichCaptures,
+    /// };
+    ///
+    /// let re = BoundedBacktracker::builder()
+    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
     ///     .build(r"[a-z]+")?;
-    /// assert!(PikeVM::new_from_nfa(nfa).is_err());
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert!(re.try_is_match(&mut cache, "abc")?);
+    /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
     ///
     /// # Ok::<(), Box<dyn std::error::Error>>(())
     /// ```
diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs
index 82648813b..3c2fa8a21 100644
--- a/regex-automata/src/nfa/thompson/error.rs
+++ b/regex-automata/src/nfa/thompson/error.rs
@@ -68,9 +68,6 @@ enum BuildErrorKind {
         /// The invalid index that was given.
         index: u32,
     },
-    /// An error that occurs when one tries to build an NFA simulation (such as
-    /// the PikeVM) without any capturing groups.
-    MissingCaptures,
     /// An error that occurs when one tries to build a reverse NFA with
     /// captures enabled. Currently, this isn't supported, but we probably
     /// should support it at some point.
@@ -126,10 +123,6 @@ impl BuildError {
         BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } }
     }
 
-    pub(crate) fn missing_captures() -> BuildError {
-        BuildError { kind: BuildErrorKind::MissingCaptures }
-    }
-
     #[cfg(feature = "syntax")]
     pub(crate) fn unsupported_captures() -> BuildError {
         BuildError { kind: BuildErrorKind::UnsupportedCaptures }
@@ -181,11 +174,6 @@ impl core::fmt::Display for BuildError {
                 "capture group index {} is invalid (too big or discontinuous)",
                 index,
             ),
-            BuildErrorKind::MissingCaptures => write!(
-                f,
-                "operation requires the NFA to have capturing groups, \
-                 but the NFA given contains none",
-            ),
             #[cfg(feature = "syntax")]
             BuildErrorKind::UnsupportedCaptures => write!(
                 f,
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
index 79ce3c60d..f5c0b200e 100644
--- a/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -275,15 +275,6 @@ impl Builder {
     /// construction of the NFA itself will of course be ignored, since the NFA
     /// given here is already built.
     pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
-        // If the NFA has no captures, then the PikeVM doesn't work since it
-        // relies on them in order to report match locations. However, in
-        // the special case of an NFA with no patterns, it is allowed, since
-        // no matches can ever be produced. And importantly, an NFA with no
-        // patterns has no capturing groups anyway, so this is necessary to
-        // permit the PikeVM to work with regexes with zero patterns.
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } @@ -828,16 +819,16 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = slots[0]?.get(); + let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } @@ -1123,15 +1114,15 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger - // than `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); return got; } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger than - // `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); got } @@ -2108,15 +2099,16 @@ impl SlotTable { // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. - self.slots_for_captures = nfa.group_info().slot_len(); + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); let len = nfa .states() .len() - // We add 1 so that our last row is always empty. We use it as - // "scratch" space for computing the epsilon closure off of the - // starting state. - .checked_add(1) - .and_then(|x| x.checked_mul(self.slots_per_state)) + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. Should we // somehow convert this to an error? What about something similar @@ -2170,7 +2162,7 @@ impl SlotTable { /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. 
    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
-        let i = self.table.len() - self.slots_per_state;
+        let i = self.table.len() - self.slots_for_captures;
         &mut self.table[i..i + self.slots_for_captures]
     }
 }

From 930770bb8b4811b80a9cfbd0237d1f225e7c7c20 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 4 Aug 2023 13:58:30 -0400
Subject: [PATCH 019/136] regex: switch RegexSet to use WhichCaptures::None

And this finally resolves the memory usage problem, as the PikeVM cache
used by the RegexSet in #1059 no longer allocates MBs of memory because
of the existence of impossible-to-use capturing groups.

Fixes #1059
---
 src/builders.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/builders.rs b/src/builders.rs
index a0f9b28b5..46c4824c5 100644
--- a/src/builders.rs
+++ b/src/builders.rs
@@ -107,7 +107,7 @@ impl Builder {
                 .clone()
                 .match_kind(MatchKind::All)
                 .utf8_empty(true)
-                .which_captures(WhichCaptures::Implicit);
+                .which_captures(WhichCaptures::None);
             let syntaxc = self.syntaxc.clone().utf8(true);
             let patterns = Arc::from(self.pats.as_slice());
             meta::Builder::new()
@@ -124,7 +124,7 @@ impl Builder {
                 .clone()
                 .match_kind(MatchKind::All)
                 .utf8_empty(false)
-                .which_captures(WhichCaptures::Implicit);
+                .which_captures(WhichCaptures::None);
             let syntaxc = self.syntaxc.clone().utf8(false);
             let patterns = Arc::from(self.pats.as_slice());
             meta::Builder::new()

From e003cae98254d0ad3ff47b0919143531ddb58689 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 5 Aug 2023 10:41:24 -0400
Subject: [PATCH 020/136] automata: add 'is_match' as its own path to meta
 regex internals

I originally prided myself on not having a dedicated `is_match` routine
on the meta regex engine's internal `Strategy` trait, and actually paid
a fair amount of attention to ensuring that `is_match` and `find`
always returned the same results. That is, `is_match` returns true if
and only if `find` returns a match.

But the fix in the previous commits for #1059 means that a `PikeVM` and
a `BoundedBacktracker` can be used to run a search with an NFA that has
no capture states. Since both engines are implemented to only track
offsets via those capture states, it follows that the only thing that
can be returned in such cases is whether a match occurs (and if so,
which pattern matched). That in turn means that `is_match` can return
`true` while `find` can return `None` for the same search. This is
because the latter returns `None` even when a match is found but there
are no capture states to record the offsets of the match.

This in theory could be resolved by adding APIs to the `PikeVM` and the
`BoundedBacktracker` that return a `HalfMatch` without depending on any
capture states at all. Then `is_match` could be implemented in terms of
those APIs. That is probably the right path, but it's pretty gnarly to
do without breaking changes and I don't want to do any breaking changes
right now.

So instead, we just add a special path to the meta regex engine for
`is_match` and permit some cases to have different results between
`is_match` and `find`. Sigh.
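To make the divergence concrete at the engine level, the following
sketch mirrors the doc examples updated earlier in this series. It is
illustrative only and not part of this patch's diff:

    use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA, WhichCaptures};

    let re = PikeVM::builder()
        .thompson(NFA::config().which_captures(WhichCaptures::None))
        .build(r"[a-z]+")
        .unwrap();
    let mut cache = re.create_cache();

    // A match is detected...
    assert!(re.is_match(&mut cache, "abc"));
    // ...but with no capture states there are no offsets to report,
    // so 'find' reports no match at all.
    assert_eq!(None, re.find(&mut cache, "abc"));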
---
 regex-automata/src/meta/regex.rs    |   9 ++-
 regex-automata/src/meta/strategy.rs | 120 +++++++++++++++++++++++++++-
 regex-automata/src/meta/wrappers.rs |  32 ++++++++
 3 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index 0d40eaa40..3a04b14d8 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -529,7 +529,14 @@ impl Regex {
     #[inline]
     pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
         let input = input.into().earliest(true);
-        self.search_half(&input).is_some()
+        if self.imp.info.is_impossible(&input) {
+            return false;
+        }
+        let mut guard = self.pool.get();
+        let result = self.imp.strat.is_match(&mut guard, &input);
+        // See 'Regex::search' for why we put the guard back explicitly.
+        PoolGuard::put(guard);
+        result
     }
 
     /// Executes a leftmost search and returns the first match that is found,
diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index 86610fbea..ea6c6ab57 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -58,6 +58,8 @@ pub(super) trait Strategy:
         input: &Input<'_>,
     ) -> Option<HalfMatch>;
 
+    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;
+
     fn search_slots(
         &self,
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
     ) -> Option<PatternID>;
 
@@ -399,6 +401,10 @@ impl<P: PrefilterI> Strategy for Pre<P> {

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + fn search_slots( &self, cache: &mut Cache, @@ -623,6 +629,29 @@ impl Core { } } + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } @@ -703,7 +732,7 @@ impl Strategy for Core { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. - return if let Some(e) = self.dfa.get(input) { + if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, @@ -723,7 +752,38 @@ impl Strategy for Core { } } else { self.search_half_nofail(cache, input) - }; + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -983,6 +1043,21 @@ impl Strategy for ReverseAnchored { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1335,6 +1410,28 @@ impl Strategy for ReverseSuffix { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + 
); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1717,6 +1814,25 @@ impl Strategy for ReverseInner { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 8f58363a1..08110d9bb 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -87,6 +87,15 @@ impl PikeVMEngine { Ok(PikeVMEngine(engine)) } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, @@ -212,6 +221,29 @@ impl BoundedBacktrackerEngine { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, From d93ddbefd77f61a771a9a71ac345e117c0c43054 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 13:52:37 -0400 Subject: [PATCH 021/136] automata: add internal HalfMatch APIs for NFA engines Welp, okay, turns out we do need to know at least the end offset of a match even when the NFA has no capture states. This is necessary for correctly handling the case where a regex can match the empty string but the caller has asked that matches not split a codepoint. If we don't know the end offset of a match, then we can't correctly determine whether a match exists or not and are forced to return no match even when a match exists. We can get away with this I think for `find`-style APIs where the caller has specifically requested match offsets while simultaneously configuring the NFA to not track offsets, but with `is_match`-style APIs, we really should be able to handle it correctly. We should eventually just expose the `HalfMatch` APIs on `PikeVM` and `BoundedBacktracker`, but for now we keep them private. 
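As a concrete sketch of why the offset is needed, consider the UTF-8
empty-match rule that `skip_splits_fwd` enforces. The snippet below
uses the public `PikeVM` API and follows from the documented UTF-8 mode
behavior; it is not part of this patch's diff:

    use regex_automata::nfa::thompson::pikevm::PikeVM;

    // An empty regex matches at every position, but in UTF-8 mode a
    // match may never split a codepoint.
    let re = PikeVM::new(r"").unwrap();
    let mut cache = re.create_cache();
    // '☃' is three UTF-8 code units, so the only valid empty matches
    // in "☃" are at offsets 0 and 3. Deciding that offsets 1 and 2 must
    // be skipped requires knowing the match offset, which is why a
    // HalfMatch (and not just a PatternID) has to flow out of the
    // search even when no capture states exist.
    let matches: Vec<(usize, usize)> = re
        .find_iter(&mut cache, "☃")
        .map(|m| (m.start(), m.end()))
        .collect();
    assert_eq!(vec![(0, 0), (3, 3)], matches);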
---
 regex-automata/src/nfa/thompson/backtrack.rs | 59 ++++++++-----------
 regex-automata/src/nfa/thompson/pikevm.rs    | 60 +++++++++-----------
 2 files changed, 49 insertions(+), 70 deletions(-)

diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs
index c68f9fa42..eba037c1d 100644
--- a/regex-automata/src/nfa/thompson/backtrack.rs
+++ b/regex-automata/src/nfa/thompson/backtrack.rs
@@ -19,7 +19,7 @@ use crate::{
         empty, iter,
         prefilter::Prefilter,
         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
-        search::{Anchored, Input, Match, MatchError, Span},
+        search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
     },
 };
 
@@ -1295,12 +1295,14 @@ impl BoundedBacktracker {
     ) -> Result<Option<PatternID>, MatchError> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
         if !utf8empty {
-            return self.try_search_slots_imp(cache, input, slots);
+            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+            return Ok(maybe_hm.map(|hm| hm.pattern()));
         }
         // See PikeVM::try_search_slots for why we do this.
         let min = self.get_nfa().group_info().implicit_slot_len();
         if slots.len() >= min {
-            return self.try_search_slots_imp(cache, input, slots);
+            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+            return Ok(maybe_hm.map(|hm| hm.pattern()));
         }
         if self.get_nfa().pattern_len() == 1 {
             let mut enough = [None, None];
             let got = self.try_search_slots_imp(cache, input, &mut enough)?;
             // This is OK because we know `enough_slots` is strictly bigger
             // than `slots`, otherwise this special case isn't reached.
             slots.copy_from_slice(&enough[..slots.len()]);
-            return Ok(got);
+            return Ok(got.map(|hm| hm.pattern()));
         }
         let mut enough = vec![None; min];
         let got = self.try_search_slots_imp(cache, input, &mut enough)?;
         // This is OK because we know `enough_slots` is strictly bigger than
         // `slots`, otherwise this special case isn't reached.
         slots.copy_from_slice(&enough[..slots.len()]);
-        Ok(got)
+        Ok(got.map(|hm| hm.pattern()))
     }
 
     /// This is the actual implementation of `try_search_slots_imp` that
@@ -1328,30 +1330,17 @@ impl BoundedBacktracker {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Result<Option<PatternID>, MatchError> {
+    ) -> Result<Option<HalfMatch>, MatchError> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
-        let (pid, end) = match self.search_imp(cache, input, slots)? {
+        let hm = match self.search_imp(cache, input, slots)? {
             None => return Ok(None),
-            Some(pid) if !utf8empty => return Ok(Some(pid)),
-            Some(pid) => {
-                let slot_start = pid.as_usize() * 2;
-                let slot_end = slot_start + 1;
-                // OK because we know we have a match and we know our caller
-                // provided slots are big enough (which we make true above if
-                // the caller didn't). Namely, we're only here when 'utf8empty'
-                // is true, and when that's true, we require slots for every
-                // pattern.
-                (pid, slots[slot_end].unwrap().get())
-            }
+            Some(hm) if !utf8empty => return Ok(Some(hm)),
+            Some(hm) => hm,
         };
-        empty::skip_splits_fwd(input, pid, end, |input| {
-            let pid = match self.search_imp(cache, input, slots)? {
-                None => return Ok(None),
-                Some(pid) => pid,
-            };
-            let slot_start = pid.as_usize() * 2;
-            let slot_end = slot_start + 1;
-            Ok(Some((pid, slots[slot_end].unwrap().get())))
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)?
+                .map(|hm| (hm, hm.offset())))
         })
     }
 
@@ -1367,7 +1356,7 @@ impl BoundedBacktracker {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Result<Option<PatternID>, MatchError> {
+    ) -> Result<Option<HalfMatch>, MatchError> {
         // Unlike in the PikeVM, we write our capturing group spans directly
         // into the caller's captures groups. So we have to make sure we're
         // starting with a blank slate first. In the PikeVM, we avoid this
@@ -1414,10 +1403,9 @@ impl BoundedBacktracker {
                     Some(ref span) => at = span.start,
                 }
             }
-            if let Some(pid) =
-                self.backtrack(cache, input, at, start_id, slots)
+            if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
             {
-                return Ok(Some(pid));
+                return Ok(Some(hm));
             }
             at += 1;
         }
@@ -1438,14 +1426,13 @@ impl BoundedBacktracker {
         at: usize,
         start_id: StateID,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         cache.stack.push(Frame::Step { sid: start_id, at });
         while let Some(frame) = cache.stack.pop() {
             match frame {
                 Frame::Step { sid, at } => {
-                    if let Some(pid) = self.step(cache, input, sid, at, slots)
-                    {
-                        return Some(pid);
+                    if let Some(hm) = self.step(cache, input, sid, at, slots) {
+                        return Some(hm);
                     }
                 }
                 Frame::RestoreCapture { slot, offset } => {
@@ -1475,7 +1462,7 @@ impl BoundedBacktracker {
         mut sid: StateID,
         mut at: usize,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         loop {
             if !cache.visited.insert(sid, at - input.start()) {
                 return None;
             }
@@ -1558,7 +1545,7 @@ impl BoundedBacktracker {
                 }
                 State::Fail => return None,
                 State::Match { pattern_id } => {
-                    return Some(pattern_id);
+                    return Some(HalfMatch::new(pattern_id, at));
                 }
             }
         }
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
index f5c0b200e..0128c151a 100644
--- a/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -17,7 +17,9 @@ use crate::{
         empty, iter,
         prefilter::Prefilter,
         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
-        search::{Anchored, Input, Match, MatchKind, PatternSet, Span},
+        search::{
+            Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
+        },
         sparse_set::SparseSet,
     },
 };
@@ -1094,7 +1096,8 @@ impl PikeVM {
    ) -> Option<PatternID> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        if !utf8empty {
-            return self.search_slots_imp(cache, input, slots);
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
        }
        // There is an unfortunate special case where if the regex can
        // match the empty string and UTF-8 mode is enabled, the search
@@ -1109,7 +1112,8 @@ impl PikeVM {
        // this case.
        let min = self.get_nfa().group_info().implicit_slot_len();
        if slots.len() >= min {
-            return self.search_slots_imp(cache, input, slots);
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
        }
        if self.get_nfa().pattern_len() == 1 {
            let mut enough = [None, None];
@@ -1117,14 +1121,14 @@ impl PikeVM {
            // This is OK because we know `enough` is strictly bigger than
            // `slots`, otherwise this special case isn't reached.
            slots.copy_from_slice(&enough[..slots.len()]);
-            return got;
+            return got.map(|hm| hm.pattern());
        }
        let mut enough = vec![None; min];
        let got = self.search_slots_imp(cache, input, &mut enough);
        // This is OK because we know `enough` is strictly bigger than `slots`,
        // otherwise this special case isn't reached.
        slots.copy_from_slice(&enough[..slots.len()]);
-        got
+        got.map(|hm| hm.pattern())
    }
 
    /// This is the actual implementation of `search_slots_imp` that
@@ -1137,30 +1141,17 @@ impl PikeVM {
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
-        let (pid, end) = match self.search_imp(cache, input, slots) {
+        let hm = match self.search_imp(cache, input, slots) {
            None => return None,
-            Some(pid) if !utf8empty => return Some(pid),
-            Some(pid) => {
-                let slot_start = pid.as_usize() * 2;
-                let slot_end = slot_start + 1;
-                // OK because we know we have a match and we know our caller
-                // provided slots are big enough (which we make true above if
-                // the caller didn't). Namely, we're only here when 'utf8empty'
-                // is true, and when that's true, we require slots for every
-                // pattern.
-                (pid, slots[slot_end].unwrap().get())
-            }
+            Some(hm) if !utf8empty => return Some(hm),
+            Some(hm) => hm,
        };
-        empty::skip_splits_fwd(input, pid, end, |input| {
-            let pid = match self.search_imp(cache, input, slots) {
-                None => return Ok(None),
-                Some(pid) => pid,
-            };
-            let slot_start = pid.as_usize() * 2;
-            let slot_end = slot_start + 1;
-            Ok(Some((pid, slots[slot_end].unwrap().get())))
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)
+                .map(|hm| (hm, hm.offset())))
        })
        // OK because the PikeVM never errors.
        .unwrap()
    }
 
    /// The implementation for the 'which_overlapping_matches' API. Basically,
@@ -1235,7 +1226,7 @@ impl PikeVM {
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
        cache.setup_search(slots.len());
        if input.is_done() {
            return None;
        }
@@ -1264,7 +1255,7 @@ impl PikeVM {
        let pre =
            if anchored { None } else { self.get_config().get_prefilter() };
        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
-        let mut pid = None;
+        let mut hm = None;
        // Yes, our search doesn't end at input.end(), but includes it. This
        // is necessary because matches are delayed by one byte, just like
        // how the DFA engines work. The delay is used to handle look-behind
@@ -1283,7 +1274,7 @@ impl PikeVM {
            if curr.set.is_empty() {
                // We have a match and we haven't been instructed to continue
                // on even after finding a match, so we can quit.
-                if pid.is_some() && !allmatches {
+                if hm.is_some() && !allmatches {
                    break;
                }
                // If we're running an anchored search and we've advanced
@@ -1353,7 +1344,7 @@ impl PikeVM {
            // search. If we re-computed it at every position, we would be
            // simulating an unanchored search when we were tasked to perform
            // an anchored search.
-            if (!pid.is_some() || allmatches)
+            if (!hm.is_some() || allmatches)
                && (!anchored || at == input.start())
            {
                // Since we are adding to the 'curr' active states and since
@@ -1372,14 +1363,15 @@ impl PikeVM {
                let slots = next.slot_table.all_absent();
                self.epsilon_closure(stack, slots, curr, input, at, start_id);
            }
-            if let Some(x) = self.nexts(stack, curr, next, input, at, slots) {
-                pid = Some(x);
+            if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
+            {
+                hm = Some(HalfMatch::new(pid, at));
            }
            // Unless the caller asked us to return early, we need to mush on
            // to see if we can extend our match. (But note that 'nexts' will
            // quit right after seeing a match when match_kind==LeftmostFirst,
            // as is consistent with leftmost-first match priority.)
- if input.get_earliest() && pid.is_some() { + if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); @@ -1387,7 +1379,7 @@ impl PikeVM { at += 1; } instrument!(|c| c.eprint(&self.nfa)); - pid + hm } /// The implementation for the 'which_overlapping_matches' API. Basically, From 4f3390c55e98ae2d09312bc27e47216c7a19490c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:31:40 -0400 Subject: [PATCH 022/136] changelog: 1.9.2 --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6a2bcb41..06383f641 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +1.9.2 (2023-08-05) +================== +This is a patch release that fixes another memory usage regression. This +particular regression occurred only when using a `RegexSet`. In some cases, +much more heap memory (by one or two orders of magnitude) was allocated than in +versions prior to 1.9.0. + +Bug fixes: + +* [BUG #1059](https://github.com/rust-lang/regex/issues/1059): +Fix a memory usage regression when using a `RegexSet`. + + 1.9.1 (2023-07-07) ================== This is a patch release which fixes a memory usage regression. In the regex From 2f5bdb07974e037fdd61883fac83942c68b60512 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:31:50 -0400 Subject: [PATCH 023/136] regex-automata-0.3.5 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 1f423c605..b403d8250 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.4" #:version +version = "0.3.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 8c01708a042399d14638dda5112469235c75f40a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:32:15 -0400 Subject: [PATCH 024/136] deps: bump regex-automata to 0.3.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index bfd6aea61..1119eca99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,7 +173,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.1" +version = "0.3.5" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From bbf0b38df618734b92d7b92acc8a8bf31b6d0046 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 14:32:22 -0400 Subject: [PATCH 025/136] 1.9.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1119eca99..54b0e206e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.1" #:version +version = "1.9.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 73f7889021542ea80937b3adacefa5825eaa97fe Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 17:25:31 -0400 Subject: [PATCH 026/136] automata: fix incorrect offsets reported by reverse inner optimization Sadly it seems that my days of squashing optimization bugs are still before me. 
In this particular case, the reverse inner literal optimization (which
is a new optimization introduced in regex 1.9) resulted in reporting
incorrect match offsets in some cases. The offending case here is:

    $ regex-cli find match meta --no-table -p '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})' -y '888:77:66'
    0:1:9:888:77:66

The above reports a match at 1..9, but the correct match is 0..9.

The problem here is that the reverse inner literal optimization is
being applied, which splits the regex into three (conceptual) pieces:

1. `(?:(\d+)[:.])?(\d{1,2})`
2. `[:.]`
3. `(\d{2})`

The reverse inner optimization works by looking for occurrences of (2)
first, then matching (1) in reverse to find the start position of the
match and then searching for (3) in the forward direction to find the
end of the match.

The problem in this particular case is that (2) matches at position `3`
in the `888:77:66` haystack. Since the first section of numbers is
optional, the reverse inner optimization believes a match exists at
offset `1` by virtue of matching (1) in reverse. That is, the
`(\d{1,2})` matches at 1..3 while the `(?:(\d+)[:.])?` doesn't match at
all. The reverse search here is correct in isolation, but it leads to
an overall incorrect result by stopping the search early. The issue is
that the true leftmost match requires (2) to match at 6..7, but since
it matched at 3..4 first, it is considered first and leads to an
incorrect overall match.

To fix this, we add another "trip wire" to the reverse inner
optimization (of which there are already several) that tries to detect
cases where it cannot prove that the match it found is actually the
leftmost match. Namely, if it reports a match offset greater than the
start of the search and otherwise *could* have kept searching, then we
don't know whether we have the true leftmost match. In that case, we
bail on the optimization and let a slower path take over.

This is yet another example of how the nature of regex searching, and
in particular leftmost searching, inhibits the composition of different
regex strategies. Or at least, makes them incredibly subtle.

Fixes #1060
---
 regex-automata/src/meta/limited.rs | 47 ++++++++++++++++++++++++++++++
 testdata/regression.toml           | 17 +++++++++++
 2 files changed, 64 insertions(+)

diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs
index 005878acd..192a2625e 100644
--- a/regex-automata/src/meta/limited.rs
+++ b/regex-automata/src/meta/limited.rs
@@ -88,7 +88,41 @@ pub(crate) fn dfa_try_search_half_rev(
             return Err(RetryError::Quadratic(RetryQuadraticError::new()));
         }
     }
+    let was_dead = dfa.is_dead_state(sid);
     dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
+    // If we reach the beginning of the search and we could otherwise still
+    // potentially keep matching if there was more to match, then we actually
+    // return an error to indicate giving up on this optimization. Why?
+    // Because we can't prove that the real match begins where we would
+    // report it.
+    //
+    // This only happens when all of the following are true:
+    //
+    // 1) We reach the starting point of our search span.
+    // 2) The match we found has an offset greater than the starting point.
+    // 3) The FSM reports we could possibly find a longer match.
+    //
+    // We need (1) because otherwise the search stopped before the starting
+    // point and there is no possible way to find a more leftmost position.
+    //
+    // We need (2) because if the match found has an offset equal to the
+    // minimum possible offset, then no more leftmost match is possible.
+    //
+    // We need (3) because if the FSM couldn't continue anyway (i.e., it's in
+    // a dead state), then we know we couldn't find anything more leftmost
+    // than what we have. (We have to check the state we were in prior to the
+    // EOI transition since the EOI transition will usually bring us to a dead
+    // state by virtue of it representing the end-of-input.)
+    if at == input.start()
+        && mat.map_or(false, |m| m.offset() > input.start())
+        && !was_dead
+    {
+        trace!(
+            "reached beginning of search at offset {} without hitting \
+             a dead state, quitting to avoid potential false positive match",
+            at,
+        );
+        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+    }
     Ok(mat)
 }
 
@@ -140,7 +174,20 @@ pub(crate) fn hybrid_try_search_half_rev(
             return Err(RetryError::Quadratic(RetryQuadraticError::new()));
         }
     }
+    let was_dead = sid.is_dead();
     hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+    // See the comments in the full DFA routine above for why we need this.
+    if at == input.start()
+        && mat.map_or(false, |m| m.offset() > input.start())
+        && !was_dead
+    {
+        trace!(
+            "reached beginning of search at offset {} without hitting \
+             a dead state, quitting to avoid potential false positive match",
+            at,
+        );
+        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+    }
     Ok(mat)
 }
 
diff --git a/testdata/regression.toml b/testdata/regression.toml
index bb5e4fd46..a2efa2ad3 100644
--- a/testdata/regression.toml
+++ b/testdata/regression.toml
@@ -739,3 +739,20 @@ matches = [[0, 9]]
 utf8 = false
 match-kind = "all"
 search-kind = "overlapping"
+
+# See: https://github.com/rust-lang/regex/issues/1060
+[[test]]
+name = "reverse-inner-plus-shorter-than-expected"
+regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})'
+haystack = '102:12:39'
+matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
+
+# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex
+# to demonstrate the extent of the rot. Sigh.
+#
+# See: https://github.com/rust-lang/regex/issues/1060
+[[test]]
+name = "reverse-inner-short"
+regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
+haystack = '102:12:39'
+matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]

From 3d21492e399d77415ebcd2eee4432e2feab87893 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 5 Aug 2023 18:18:48 -0400
Subject: [PATCH 027/136] changelog: 1.9.3

---
 CHANGELOG.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06383f641..764bb11b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,18 @@
+1.9.3 (2023-08-05)
+==================
+This is a patch release that fixes a bug where some searches could result in
+incorrect match offsets being reported. It is difficult to characterize the
+types of regexes susceptible to this bug. They generally involve patterns
+that contain no prefix or suffix literals, but have an inner literal along with
+a regex prefix that can conditionally match.
+
+Bug fixes:
+
+* [BUG #1060](https://github.com/rust-lang/regex/issues/1060):
+Fix a bug with the reverse inner literal optimization reporting incorrect match
+offsets.
+
+
 1.9.2 (2023-08-05)
 ==================
 This is a patch release that fixes another memory usage regression.
This From c892d08c7c9ccdef4278ebbe30b5a83f0a145780 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 18:19:39 -0400 Subject: [PATCH 028/136] regex-automata-0.3.6 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index b403d8250..22af1d9a3 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.5" #:version +version = "0.3.6" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 446ecd6154854274c70b015e6c2718cdf2f48c57 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 18:20:04 -0400 Subject: [PATCH 029/136] deps: bump regex-automata to 0.3.6 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 54b0e206e..e056c5bb7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,7 +173,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.5" +version = "0.3.6" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 72f889ef3cca59ebac6a026f3646e8d92f056d88 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 18:20:06 -0400 Subject: [PATCH 030/136] 1.9.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e056c5bb7..cd2e30a5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.2" #:version +version = "1.9.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 431c4e4867e1eb33eb39b23ed47c9934b2672f8f Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Mon, 7 Aug 2023 14:48:08 +0200 Subject: [PATCH 031/136] doc: fix typo in captures_read PR #1064 --- regex-lite/src/string.rs | 4 ++-- src/regex/bytes.rs | 4 ++-- src/regex/string.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 91b81d008..1c6eb4ab9 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -1186,8 +1186,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Panics /// diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 6522ee7e3..03982544b 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1162,8 +1162,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. 
/// /// # Example /// diff --git a/src/regex/string.rs b/src/regex/string.rs index 65a76740e..b9a3c3390 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1153,8 +1153,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Panics /// From 10faa44da9134c053c28a55857068909ca29a452 Mon Sep 17 00:00:00 2001 From: Gold Edem Hogan Date: Wed, 23 Aug 2023 11:29:59 +0000 Subject: [PATCH 032/136] doc: fix a couple typos PR #1068 --- src/regex/bytes.rs | 2 +- src/regex/string.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 03982544b..cc53482cb 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1154,7 +1154,7 @@ impl Regex { /// /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], /// but does *not* store a reference to the haystack. This makes its API - /// a bit lower level and less convenience. But in exchange, callers + /// a bit lower level and less convenient. But in exchange, callers /// may allocate their own `CaptureLocations` and reuse it for multiple /// searches. This may be helpful if allocating a `Captures` shows up in a /// profile as too costly. diff --git a/src/regex/string.rs b/src/regex/string.rs index b9a3c3390..d5908ae0d 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1145,7 +1145,7 @@ impl Regex { /// /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], /// but does *not* store a reference to the haystack. This makes its API - /// a bit lower level and less convenience. But in exchange, callers + /// a bit lower level and less convenient. But in exchange, callers /// may allocate their own `CaptureLocations` and reuse it for multiple /// searches. This may be helpful if allocating a `Captures` shows up in a /// profile as too costly. From 81e328a29f8c57cb3622930104177e8606270230 Mon Sep 17 00:00:00 2001 From: Xy Qian <102588769+qianxyz@users.noreply.github.com> Date: Wed, 23 Aug 2023 19:40:27 -0700 Subject: [PATCH 033/136] doc: fix typo in module-level doc PR #1069 --- regex-lite/src/lib.rs | 2 +- src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index d8e901678..8008b9e59 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -107,7 +107,7 @@ fn main() { } ``` -Foruth, run it with `cargo run`: +Fourth, run it with `cargo run`: ```text $ cargo run diff --git a/src/lib.rs b/src/lib.rs index cd98be103..1e191b692 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -109,7 +109,7 @@ fn main() { } ``` -Foruth, run it with `cargo run`: +Fourth, run it with `cargo run`: ```text $ cargo run From 7536e055840f74f1f7bda8ffecf851cb3e500147 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 08:14:50 -0400 Subject: [PATCH 034/136] syntax: remove superfluous `borrow` Best guess is that the parser used to use something other than a `&str`, but I can't remember. 
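For the record, the call was an identity conversion either way. A
minimal sketch of why (the `ParserLike` type here is hypothetical, not
the real parser type):

    use std::borrow::Borrow;

    struct ParserLike<'s> {
        pattern: &'s str,
    }

    impl<'s> ParserLike<'s> {
        fn pattern(&self) -> &str {
            // `&str` implements `Borrow<str>` as an identity-style
            // conversion, so this is equivalent to returning
            // `self.pattern` directly.
            self.pattern.borrow()
        }
    }

    let p = ParserLike { pattern: "a|b" };
    assert_eq!("a|b", p.pattern());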
---
 regex-syntax/src/ast/parse.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index 9cf64e9ec..47ea2586b 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -383,7 +383,7 @@ impl<'s, P: Borrow<str>> ParserI<'s, P> {
 
     /// Return a reference to the pattern being parsed.
     fn pattern(&self) -> &str {
-        self.pattern.borrow()
+        self.pattern
     }
 
     /// Create a new error with the given span and error type.

From de0339959b491ae0a26e6f96c0b0dc1635bc0f94 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 26 Aug 2023 08:48:26 -0400
Subject: [PATCH 035/136] automata: fix incorrect use of Aho-Corasick's
 "standard" semantics

This fixes a bug in how prefilters were applied for multi-regexes
compiled with "all" semantics. It turns out that this corresponds to
the regex crate's RegexSet API, but only its `is_match` routine.

See the comment on the regression test added in this PR for an
explanation of what happened. Basically, it came down to incorrectly
using Aho-Corasick's "standard" semantics, which doesn't necessarily
report leftmost matches. Since the regex crate is really all about
leftmost matching, this can lead to skipping over parts of the haystack
and thus lead to missing matches.

Fixes #1070
---
 .../src/util/prefilter/aho_corasick.rs     | 13 ++++++++--
 regex-automata/src/util/prefilter/mod.rs   |  9 -------
 regex-automata/src/util/prefilter/teddy.rs |  9 +++++--
 testdata/regression.toml                   | 26 +++++++++++++++++++
 4 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/regex-automata/src/util/prefilter/aho_corasick.rs b/regex-automata/src/util/prefilter/aho_corasick.rs
index a7474d29a..50cce827e 100644
--- a/regex-automata/src/util/prefilter/aho_corasick.rs
+++ b/regex-automata/src/util/prefilter/aho_corasick.rs
@@ -22,11 +22,20 @@ impl AhoCorasick {
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
+            // We used to use `aho_corasick::MatchKind::Standard` here when
+            // `kind` was `MatchKind::All`, but this is not correct. The
+            // "standard" Aho-Corasick match semantics are to report a match
+            // immediately as soon as it is seen, but `All` isn't like that.
+            // In particular, with "standard" semantics, given the needles
+            // "abc" and "b" and the haystack "abc," it would report a match
+            // at offset 1 before a match at offset 0. This is never what we
+            // want in the context of the regex engine, regardless of whether
+            // we have leftmost-first or 'all' semantics. Namely, we always
+            // want the leftmost match.
             let ac_match_kind = match kind {
-                MatchKind::LeftmostFirst => {
+                MatchKind::LeftmostFirst | MatchKind::All => {
                     aho_corasick::MatchKind::LeftmostFirst
                 }
-                MatchKind::All => aho_corasick::MatchKind::Standard,
             };
             // This is kind of just an arbitrary number, but basically, if we
             // have a small enough set of literals, then we try to use the VERY
diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs
index ea3eb73d8..51fc92233 100644
--- a/regex-automata/src/util/prefilter/mod.rs
+++ b/regex-automata/src/util/prefilter/mod.rs
@@ -195,15 +195,6 @@ impl Prefilter {
     ///     Some(Span::from(6..9)),
     ///     pre.find(hay.as_bytes(), Span::from(0..hay.len())),
     /// );
-    /// // Now we put 'samwise' back before 'sam', but change the match
-    /// // semantics to 'All'. In this case, there is no preference
-    /// // order semantics and the first match detected is returned.
-    /// let pre = Prefilter::new(MatchKind::All, &["samwise", "sam"])
-    ///     .expect("a prefilter");
-    /// assert_eq!(
-    ///     Some(Span::from(6..9)),
-    ///     pre.find(hay.as_bytes(), Span::from(0..hay.len())),
-    /// );
     ///
     /// # Ok::<(), Box<dyn std::error::Error>>(())
     /// ```
diff --git a/regex-automata/src/util/prefilter/teddy.rs b/regex-automata/src/util/prefilter/teddy.rs
index 02210a5ec..fc79f2b2f 100644
--- a/regex-automata/src/util/prefilter/teddy.rs
+++ b/regex-automata/src/util/prefilter/teddy.rs
@@ -50,12 +50,17 @@ impl Teddy {
         // theory we could at least support leftmost-longest, as the
         // aho-corasick crate does, but regex-automata doesn't know about
         // leftmost-longest currently.
+        //
+        // And like the aho-corasick prefilter, if we're using `All`
+        // semantics, then we can still use leftmost semantics for a
+        // prefilter. (This might be a suspicious choice for the literal
+        // engine, which uses a prefilter as a regex engine directly, but
+        // that only happens when using leftmost-first semantics.)
         let (packed_match_kind, ac_match_kind) = match kind {
-            MatchKind::LeftmostFirst => (
+            MatchKind::LeftmostFirst | MatchKind::All => (
                 aho_corasick::packed::MatchKind::LeftmostFirst,
                 aho_corasick::MatchKind::LeftmostFirst,
             ),
-            _ => return None,
         };
         let minimum_len =
             needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
diff --git a/testdata/regression.toml b/testdata/regression.toml
index a2efa2ad3..03b15d6d5 100644
--- a/testdata/regression.toml
+++ b/testdata/regression.toml
@@ -756,3 +756,29 @@ name = "reverse-inner-short"
 regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
 haystack = '102:12:39'
 matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
+
+# This regression test was found via the RegexSet APIs. It triggered a
+# particular code path where a regex was compiled with 'All' match semantics
+# (to support overlapping search), but got funneled down into a standard
+# leftmost search when calling 'is_match'. This is fine on its own, but the
+# leftmost search will use a prefilter and that's where this went awry.
+#
+# Namely, since 'All' semantics were used, the aho-corasick prefilter was
+# incorrectly compiled with 'Standard' semantics. This was wrong because
+# 'Standard' immediately attempts to report a match at every position, even if
+# that would mean reporting a match past the leftmost match before reporting
+# the leftmost match. This breaks the prefilter contract of never having false
+# negatives and leads overall to the engine not finding a match.
+#
+# See: https://github.com/rust-lang/regex/issues/1070
+[[test]]
+name = "prefilter-with-aho-corasick-standard-semantics"
+regex = '(?m)^ *v [0-9]'
+haystack = 'v 0'
+matches = [
+  { id = 0, spans = [[0, 3]] },
+]
+match-kind = "all"
+search-kind = "overlapping"
+unicode = true
+utf8 = true

From c788378d6fe407f4774df98a78436cea5d98525b Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 26 Aug 2023 09:18:30 -0400
Subject: [PATCH 036/136] ci: drop mips, add powerpc and s390x

The main reason we used mips before was to get test coverage on a big
endian target. Now that mips no longer seems to work[1], I wanted to
add at least one other big endian target. From the tier 2 supported
platforms[2], the only big endian targets I could find were powerpc and
s390x. So we just add both here.
[1]: https://github.com/rust-lang/compiler-team/issues/648 [2]: https://doc.rust-lang.org/nightly/rustc/platform-support.html#tier-2-with-host-tools --- .github/workflows/ci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25df2b301..2035178a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,10 +54,14 @@ jobs: os: ubuntu-latest rust: stable target: i686-unknown-linux-gnu - - build: stable-mips + - build: stable-powerpc64 os: ubuntu-latest rust: stable - target: mips64-unknown-linux-gnuabi64 + target: powerpc64-unknown-linux-gnu + - build: stable-s390x + os: ubuntu-latest + rust: stable + target: s390x-unknown-linux-gnu - build: beta os: ubuntu-latest rust: beta From e008f83090c1fa858cd602da94e9ffdf38fd7317 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:50:22 -0400 Subject: [PATCH 037/136] changelog: 1.9.4 --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 764bb11b8..a5f218010 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +1.9.4 (2023-08-26) +================== +This is a patch release that fixes a bug where `RegexSet::is_match(..)` could +incorrectly return false (even when `RegexSet::matches(..).matched_any()` +returns true). + +Bug fixes: + +* [BUG #1070](https://github.com/rust-lang/regex/issues/1070): +Fix a bug where a prefilter was incorrectly configured for a `RegexSet`. + + 1.9.3 (2023-08-05) ================== This is a patch release that fixes a bug where some searches could result in From 990979bbdc28fa841e3ad55934ee445cd710d110 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:50:33 -0400 Subject: [PATCH 038/136] regex-syntax-0.7.5 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index b7a149c23..aaceeee7f 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.4" #:version +version = "0.7.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 89b452af302a00458a129f8f40f3b65daf7a278a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:50:44 -0400 Subject: [PATCH 039/136] regex-automata-0.3.7 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 22af1d9a3..d069b176e 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.6" #:version +version = "0.3.7" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 3f15f1cf355577fe369c15ce60e1d225a163bf29 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:51:18 -0400 Subject: [PATCH 040/136] deps: bump regex-syntax and regex-automata versions --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cd2e30a5d..e8e1608ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,14 +173,14 @@ optional = true # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.3.6" +version = "0.3.7" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.7.3" +version = "0.7.5" default-features = false [dev-dependencies] From f39ab4d1b7229924f0cf310c9f3e19822fa19b8a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 09:51:37 -0400 Subject: [PATCH 041/136] 1.9.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e8e1608ec..0675337d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.3" #:version +version = "1.9.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 2637e11b9fb9f3dc8bbfc3cbc625fd454c091d04 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 14:32:11 -0400 Subject: [PATCH 042/136] ci: remove stale comment --- .github/workflows/ci.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2035178a4..1efa31f07 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -96,12 +96,6 @@ jobs: cd "$dir" curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" tar xf cross-x86_64-unknown-linux-musl.tar.gz - - # We used to install 'cross' from master, but it kept failing. So now - # we build from a known-good version until 'cross' becomes more stable - # or we find an alternative. Notably, between v0.2.1 and current - # master (2022-06-14), the number of Cross's dependencies has doubled. - # cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1 echo "CARGO=cross" >> $GITHUB_ENV echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV - name: Show command used for Cargo From 329c6a32451434fc2f229ad8d3c934c70148ae45 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 14:40:25 -0400 Subject: [PATCH 043/136] ci: use dtolnay@master instead of @v1 I believe dtolnay corrected me on this a while ago, but either the change got reverted or it was for some other project. In any case, we should use @master so we get the latest updates. 
--- .github/workflows/ci.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1efa31f07..c2a38d6d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,7 +81,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} - name: Install and configure Cross @@ -139,7 +139,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: 1.60.0 - name: Basic build @@ -160,7 +160,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -173,7 +173,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -186,7 +186,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -199,7 +199,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -214,7 +214,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: # We use nightly here so that we can use miri I guess? # It caught me by surprise that miri seems to only be @@ -231,7 +231,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable components: rustfmt From 15cdc64869ea5508d96a0e7667c44c7c459986a1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 27 Aug 2023 16:38:25 -0400 Subject: [PATCH 044/136] cli: remove use of deprecated API I deprecated this API a couple releases ago. Update the `regex-cli` tool to be in line with that. --- regex-cli/args/thompson.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 6e7b4afd8..151fc6a0b 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -28,7 +28,11 @@ impl Config { pub fn reversed(&self) -> Config { // Reverse DFAs require that captures are disabled. In practice, there // is no current use case for a reverse NFA with capture groups. 
-        let thompson = self.thompson.clone().reverse(true).captures(false);
+        let thompson = self
+            .thompson
+            .clone()
+            .reverse(true)
+            .which_captures(thompson::WhichCaptures::None);
         Config { thompson }
     }
 
@@ -67,7 +71,10 @@ impl Configurable for Config {
                 self.thompson = self.thompson.clone().shrink(true);
             }
             Arg::Long("no-captures") => {
-                self.thompson = self.thompson.clone().captures(false);
+                self.thompson = self
+                    .thompson
+                    .clone()
+                    .which_captures(thompson::WhichCaptures::None);
             }
             Arg::Long("line-terminator") => {
                 let byte: flags::OneByte =

From 9a505a1804f8f89e3448a2a2c5c70573dc6362e5 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 27 Aug 2023 16:39:04 -0400
Subject: [PATCH 045/136] deps: bump to memchr 2.6

This bumps the minimum memchr version to 2.6, which brings in massive
improvements to aarch64 for single substring search [1]. We also can now
enable the new `alloc` feature in `memchr` when `alloc` is enabled for
`regex` and `regex-automata`.

We also squash some warnings.

[1]: https://github.com/BurntSushi/memchr/pull/129
---
 Cargo.toml                                | 3 ++-
 fuzz/fuzz_targets/ast_fuzz_match.rs       | 9 +++++----
 fuzz/fuzz_targets/ast_fuzz_match_bytes.rs | 9 +++++----
 regex-automata/Cargo.toml                 | 4 ++--
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0675337d7..4cc42b6cd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -52,6 +52,7 @@ std = [
 # to actually emit the log messages somewhere.
 logging = [
   "aho-corasick?/logging",
+  "memchr?/logging",
   "regex-automata/logging",
 ]
 # The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
@@ -167,7 +168,7 @@ optional = true
 
 # For skipping along search text quickly when a leading byte is known.
 [dependencies.memchr]
-version = "2.5.0"
+version = "2.6.0"
 optional = true
 
 # For the actual regex engines.
diff --git a/fuzz/fuzz_targets/ast_fuzz_match.rs b/fuzz/fuzz_targets/ast_fuzz_match.rs index 58a8ebbf8..9ccb407dc 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match.rs @@ -25,11 +25,12 @@ fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); - let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { + let Ok(re) = RegexBuilder::new(&pattern).size_limit(1 << 20).build() + else { return Corpus::Reject; }; - re.is_match(&data.haystack); - re.find(&data.haystack); - re.captures(&data.haystack).map_or(0, |c| c.len()); + let _ = re.is_match(&data.haystack); + let _ = re.find(&data.haystack); + let _ = re.captures(&data.haystack).map_or(0, |c| c.len()); Corpus::Keep }); diff --git a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs index a4fa0bd73..045c1fb18 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs @@ -25,11 +25,12 @@ fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); - let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { + let Ok(re) = RegexBuilder::new(&pattern).size_limit(1 << 20).build() + else { return Corpus::Reject; }; - re.is_match(&data.haystack); - re.find(&data.haystack); - re.captures(&data.haystack).map_or(0, |c| c.len()); + let _ = re.is_match(&data.haystack); + let _ = re.find(&data.haystack); + let _ = re.captures(&data.haystack).map_or(0, |c| c.len()); Corpus::Keep }); diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index d069b176e..3cd9965b0 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -21,7 +21,7 @@ bench = false default = ["std", "syntax", "perf", "unicode", "meta", "nfa", "dfa", "hybrid"] std = ["regex-syntax?/std", "memchr?/std", "aho-corasick?/std", "alloc"] alloc = [] -logging = ["dep:log", "aho-corasick?/logging"] +logging = ["dep:log", "aho-corasick?/logging", "memchr?/logging"] syntax = ["dep:regex-syntax", "alloc"] @@ -84,7 +84,7 @@ internal-instrument-pikevm = ["logging", "std"] [dependencies] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } -memchr = { version = "2.5.0", optional = true, default-features = false } +memchr = { version = "2.6.0", optional = true, default-features = false } regex-syntax = { path = "../regex-syntax", version = "0.7.4", optional = true, default-features = false } [dev-dependencies] From f578d74ff42f3df408378ff52d3bdf4433423532 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 30 Aug 2023 18:28:06 -0400 Subject: [PATCH 046/136] automata: reduce regex contention somewhat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > **Context:** A `Regex` uses internal mutable space (called a `Cache`) > while executing a search. Since a `Regex` really wants to be easily > shared across multiple threads simultaneously, it follows that a > `Regex` either needs to provide search functions that accept a `&mut > Cache` (thereby pushing synchronization to a problem for the caller > to solve) or it needs to do synchronization itself. While there are > lower level APIs in `regex-automata` that do the former, they are > less convenient. The higher level APIs, especially in the `regex` > crate proper, need to do some kind of synchronization to give a > search the mutable `Cache` that it needs. 
>
> The current approach to that synchronization essentially uses a
> `Mutex<Vec<T>>` with an optimization for the "owning" thread
> that lets it bypass the `Mutex`. The owning thread optimization
> makes it so the single threaded use case essentially doesn't pay for
> any synchronization overhead, and that all works fine. But once the
> `Regex` is shared across multiple threads, that `Mutex<Vec<T>>`
> gets hit. And if you're doing a lot of regex searches on short
> haystacks in parallel, that `Mutex` comes under extremely heavy
> contention. To the point that a program can slow down by enormous
> amounts.
>
> This PR attempts to address that problem.
>
> Note that it's worth pointing out that this issue can be worked
> around.
>
> The simplest work-around is to clone a `Regex` and send it to other
> threads instead of sharing a single `Regex`. This won't use any
> additional memory (a `Regex` is reference counted internally),
> but it will force each thread to use the "owner" optimization
> described above. This does mean, for example, that you can't
> share a `Regex` across multiple threads conveniently with a
> `lazy_static`/`OnceCell`/`OnceLock`/whatever.
>
> The other work-around is to use the lower level search APIs on a
> `meta::Regex` in the `regex-automata` crate. Those APIs accept a
> `&mut Cache` explicitly. In that case, you can use the `thread_local`
> crate or even an actual `thread_local!` or something else entirely.

I wish I could say this PR was a home run that fixed the contention
issues with `Regex` once and for all, but it's not. It just makes
things a fair bit better by switching from one stack to eight stacks
for the pool, plus a couple other heuristics. The stack is chosen
by doing `self.stacks[thread_id % 8]`. It's a pretty dumb strategy,
but it limits extra memory usage while at least reducing contention.
Obviously, it works a lot better for the 8-16 thread case, and while
it helps with the 64-128 thread case too, things are still pretty slow
there.

A benchmark for this problem is described in #934. We compare 8 and
16 threads, and for each thread count, we compare a `cloned` and
`shared` approach. The `cloned` approach clones the regex before
sending it to each thread, whereas the `shared` approach shares a
single regex across multiple threads. The `cloned` approach is
expected to be fast (and it is) because it forces each thread into the
owner optimization. The `shared` approach, however, hits the shared
stack behind a mutex and suffers majorly from contention.

Here's what that benchmark looks like before this PR for 64 threads
(on a 24-core CPU).
``` $ hyperfine "REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro" "REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./tmp/repro-master" Benchmark 1: REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro Time (mean ± σ): 9.0 ms ± 0.6 ms [User: 128.3 ms, System: 5.7 ms] Range (min … max): 7.7 ms … 11.1 ms 278 runs Benchmark 2: REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./tmp/repro-master Time (mean ± σ): 1.938 s ± 0.036 s [User: 4.827 s, System: 41.401 s] Range (min … max): 1.885 s … 1.992 s 10 runs Summary 'REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro' ran 215.02 ± 15.45 times faster than 'REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./tmp/repro-master' ``` And here's what it looks like after this PR: ``` $ hyperfine "REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro" "REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./target/release/repro" Benchmark 1: REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro Time (mean ± σ): 9.0 ms ± 0.6 ms [User: 127.6 ms, System: 6.2 ms] Range (min … max): 7.9 ms … 11.7 ms 287 runs Benchmark 2: REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./target/release/repro Time (mean ± σ): 55.0 ms ± 5.1 ms [User: 1050.4 ms, System: 12.0 ms] Range (min … max): 46.1 ms … 67.3 ms 57 runs Summary 'REGEX_BENCH_WHICH=cloned REGEX_BENCH_THREADS=64 ./target/release/repro' ran 6.09 ± 0.71 times faster than 'REGEX_BENCH_WHICH=shared REGEX_BENCH_THREADS=64 ./target/release/repro' ``` So instead of things getting over 215x slower in the 64 thread case, it "only" gets 6x slower. Closes #934 --- regex-automata/src/util/pool.rs | 187 ++++++++++++++++++++++++++++---- 1 file changed, 168 insertions(+), 19 deletions(-) diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index 7f4a1c21e..c03d7b013 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -268,6 +268,64 @@ mod inner { /// do. static THREAD_ID_DROPPED: usize = 2; + /// The number of stacks we use inside of the pool. These are only used for + /// non-owners. That is, these represent the "slow" path. + /// + /// In the original implementation of this pool, we only used a single + /// stack. While this might be okay for a couple threads, the prevalence of + /// 32, 64 and even 128 core CPUs has made it untenable. The contention + /// such an environment introduces when threads are doing a lot of searches + /// on short haystacks (a not uncommon use case) is palpable and leads to + /// huge slowdowns. + /// + /// This constant reflects a change from using one stack to the number of + /// stacks that this constant is set to. The stack for a particular thread + /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind + /// this setup is that there should be a good chance that accesses to the + /// pool will be distributed over several stacks instead of all of them + /// converging to one. + /// + /// This is not a particularly smart or dynamic strategy. Fixing this to a + /// specific number has at least two downsides. First is that it will help, + /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially, + /// it will still help the 128 core case.) Second is that this may wind + /// up being a little wasteful with respect to memory usage. 
Namely, if a
+    /// regex is used on one thread and then moved to another thread, then it
+    /// could result in creating a new copy of the data in the pool even though
+    /// only one is actually needed.
+    ///
+    /// And that memory usage bit is why this is set to 8 and not, say, 64.
+    /// Keeping it at 8 limits, to an extent, how much unnecessary memory can
+    /// be allocated.
+    ///
+    /// In an ideal world, we'd be able to have something like this:
+    ///
+    /// * Grow the number of stacks as the number of concurrent callers
+    /// increases. I spent a little time trying this, but even just adding an
+    /// atomic addition/subtraction for each pop/push for tracking concurrent
+    /// callers led to a big perf hit. Since even more work would seemingly be
+    /// required than just an addition/subtraction, I abandoned this approach.
+    /// * The maximum amount of memory used should scale with respect to the
+    /// number of concurrent callers and *not* the total number of existing
+    /// threads. This is primarily why the `thread_local` crate isn't used,
+    /// as some environments spin up a lot of threads. This led to multiple
+    /// reports of extremely high memory usage (often described as memory
+    /// leaks).
+    /// * Even more ideally, the pool should contract in size. That is, it
+    /// should grow with bursts and then shrink. But this is a pretty thorny
+    /// issue to tackle and it might be better to just not.
+    /// * It would be nice to explore the use of, say, a lock-free stack
+    /// instead of using a mutex to guard a `Vec` that is ultimately just
+    /// treated as a stack. The main thing preventing me from exploring this
+    /// is the ABA problem. The `crossbeam` crate has tools for dealing with
+    /// this sort of problem (via its epoch based memory reclamation strategy),
+    /// but I can't justify bringing in all of `crossbeam` as a dependency of
+    /// `regex` for this.
+    ///
+    /// See this issue for more context and discussion:
+    /// https://github.com/rust-lang/regex/issues/934
+    const MAX_POOL_STACKS: usize = 8;
+
     thread_local!(
         /// A thread local used to assign an ID to a thread.
         static THREAD_ID: usize = {
@@ -291,6 +349,17 @@ mod inner {
         };
     );
 
+    /// This puts each stack in the pool below into its own cache line. This
+    /// is an absolutely critical optimization that tends to have the most
+    /// impact in high contention workloads. Without forcing each
+    /// mutex-protected stack into its own cache line, high contention
+    /// exacerbates the performance problem by causing "false sharing." By
+    /// putting each mutex in its own cache-line, we avoid the false sharing
+    /// problem and the effects of contention are greatly reduced.
+    #[derive(Debug)]
+    #[repr(C, align(64))]
+    struct CacheLine<T>(T);
+
     /// A thread safe pool utilizing std-only features.
     ///
     /// The main difference between this and the simplistic alloc-only pool is
@@ -299,12 +368,16 @@ mod inner {
     /// This makes the common case of running a regex within a single thread
     /// faster by avoiding mutex unlocking.
     pub(super) struct Pool<T, F> {
-        /// A stack of T values to hand out. These are used when a Pool is
-        /// accessed by a thread that didn't create it.
-        stack: Mutex<Vec<Box<T>>>,
         /// A function to create more T values when stack is empty and a caller
         /// has requested a T.
         create: F,
+        /// Multiple stacks of T values to hand out. These are used when a Pool
+        /// is accessed by a thread that didn't create it.
+        ///
+        /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make
+        /// it scale better under high contention work-loads. We index into
+        /// this sequence via `thread_id % stacks.len()`.
+        stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>,
         /// The ID of the thread that owns this pool. The owner is the thread
         /// that makes the first call to 'get'. When the owner calls 'get', it
         /// gets 'owner_val' directly instead of returning a T from 'stack'.
@@ -354,9 +427,17 @@ unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {}
 
     // If T is UnwindSafe, then since we provide exclusive access to any
-    // particular value in the pool, it should therefore also be considered
-    // RefUnwindSafe. Also, since we use std::sync::Mutex, we get poisoning
-    // from it if another thread panics while the lock is held.
+    // particular value in the pool, the pool should therefore also be
+    // considered UnwindSafe.
+    //
+    // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+    // point on demand, so it needs to be unwind safe on both dimensions for
+    // the entire Pool to be unwind safe.
+    impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {}
+
+    // If T is UnwindSafe, then since we provide exclusive access to any
+    // particular value in the pool, the pool should therefore also be
+    // considered RefUnwindSafe.
     //
     // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
     // point on demand, so it needs to be unwind safe on both dimensions for
@@ -375,9 +456,13 @@
             // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
             // is already 'const', so that should "just work" too.) The only
             // thing we're waiting for is Mutex::new to be const.
+            let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
+            for _ in 0..stacks.capacity() {
+                stacks.push(CacheLine(Mutex::new(vec![])));
+            }
             let owner = AtomicUsize::new(THREAD_ID_UNOWNED);
             let owner_val = UnsafeCell::new(None); // init'd on first access
-            Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
+            Pool { create, stacks, owner, owner_val }
         }
     }
@@ -401,6 +486,9 @@
             let caller = THREAD_ID.with(|id| *id);
             let owner = self.owner.load(Ordering::Acquire);
             if caller == owner {
+                // N.B. We could also do a CAS here instead of a load/store,
+                // but ad hoc benchmarking suggests it is slower. And a lot
+                // slower in the case where `get_slow` is common.
                 self.owner.store(THREAD_ID_INUSE, Ordering::Release);
                 return self.guard_owned(caller);
             }
@@ -444,37 +532,82 @@
                     return self.guard_owned(caller);
                 }
             }
-            let mut stack = self.stack.lock().unwrap();
-            let value = match stack.pop() {
-                None => Box::new((self.create)()),
-                Some(value) => value,
-            };
-            self.guard_stack(value)
+            let stack_id = caller % self.stacks.len();
+            // We try to acquire exclusive access to this thread's stack, and
+            // if so, grab a value from it if we can. We put this in a loop so
+            // that it's easy to tweak and experiment with a different number
+            // of tries. In the end, I couldn't see anything obviously better
+            // than one attempt in ad hoc testing.
+            for _ in 0..1 {
+                let mut stack = match self.stacks[stack_id].0.try_lock() {
+                    Err(_) => continue,
+                    Ok(stack) => stack,
+                };
+                if let Some(value) = stack.pop() {
+                    return self.guard_stack(value);
+                }
+                // Unlock the mutex guarding the stack before creating a fresh
+                // value since we no longer need the stack.
+                drop(stack);
+                let value = Box::new((self.create)());
+                return self.guard_stack(value);
+            }
+            // We're only here if we couldn't get access to our stack, so just
+            // create a new value. This seems like it could be wasteful, but
+            // waiting for exclusive access to a stack when there's high
+            // contention is brutal for perf.
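+            // Note: the "transient" guard created below will drop its value
+            // when the guard is dropped instead of putting it back into the
+            // pool, so a burst of contention doesn't permanently grow the
+            // pool's memory usage.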
+ self.guard_stack_transient(Box::new((self.create)())) } /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. fn put_value(&self, value: Box) { - let mut stack = self.stack.lock().unwrap(); - stack.push(value); + let caller = THREAD_ID.with(|id| *id); + let stack_id = caller % self.stacks.len(); + // As with trying to pop a value from this thread's stack, we + // merely attempt to get access to push this value back on the + // stack. If there's too much contention, we just give up and throw + // the value away. + // + // Interestingly, in ad hoc benchmarking, it is beneficial to + // attempt to push the value back more than once, unlike when + // popping the value. I don't have a good theory for why this is. + // I guess if we drop too many values then that winds up forcing + // the pop operation to create new fresh values and thus leads to + // less reuse. There's definitely a balancing act here. + for _ in 0..10 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + stack.push(value); + return; + } } /// Create a guard that represents the special owned T. fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { - PoolGuard { pool: self, value: Err(caller) } + PoolGuard { pool: self, value: Err(caller), discard: false } } /// Create a guard that contains a value from the pool's stack. fn guard_stack(&self, value: Box) -> PoolGuard<'_, T, F> { - PoolGuard { pool: self, value: Ok(value) } + PoolGuard { pool: self, value: Ok(value), discard: false } + } + + /// Create a guard that contains a value from the pool's stack with an + /// instruction to throw away the value instead of putting it back + /// into the pool. + fn guard_stack_transient(&self, value: Box) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: true } } } impl core::fmt::Debug for Pool { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Pool") - .field("stack", &self.stack) + .field("stacks", &self.stacks) .field("owner", &self.owner) .field("owner_val", &self.owner_val) .finish() @@ -490,6 +623,12 @@ mod inner { /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the /// guard has been put back into the pool and should no longer be used. value: Result, usize>, + /// When true, the value should be discarded instead of being pushed + /// back into the pool. We tend to use this under high contention, and + /// this allows us to avoid inflating the size of the pool. (Because + /// under contention, we tend to create more values instead of waiting + /// for access to a stack of existing values.) + discard: bool, } impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { @@ -557,7 +696,17 @@ mod inner { #[inline(always)] fn put_imp(&mut self) { match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) { - Ok(value) => self.pool.put_value(value), + Ok(value) => { + // If we were told to discard this value then don't bother + // trying to put it back into the pool. This occurs when + // the pop operation failed to acquire a lock and we + // decided to create a new value in lieu of contending for + // the lock. + if self.discard { + return; + } + self.pool.put_value(value); + } // If this guard has a value "owned" by the thread, then // the Pool guarantees that this is the ONLY such guard. 
                // Therefore, in order to place it back into the pool and make

From 135e11ba9c54b383072ae98043c31dfe1066886a Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 2 Sep 2023 11:12:49 -0400
Subject: [PATCH 047/136] changelog: 1.9.5

---
 CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5f218010..885bb9bd7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,42 @@
+1.9.5 (2023-09-02)
+==================
+This is a patch release that hopefully mostly fixes a performance bug that
+occurs when sharing a regex across multiple threads.
+
+Issue [#934](https://github.com/rust-lang/regex/issues/934)
+explains this in more detail. It is [also noted in the crate
+documentation](https://docs.rs/regex/latest/regex/#sharing-a-regex-across-threads-can-result-in-contention).
+The bug can appear when sharing a regex across multiple threads simultaneously,
+as might be the case when using a regex from a `OnceLock`, `lazy_static` or
+similar primitive. Usually high contention only results when using many threads
+to execute searches on small haystacks.
+
+One can avoid the contention problem entirely through one of two methods.
+The first is to use lower level APIs from `regex-automata` that require passing
+state explicitly, such as [`meta::Regex::search_with`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html#method.search_with).
+The second is to clone a regex and send it to other threads explicitly. This
+will not use any additional memory compared to sharing the regex. The only
+downside of this approach is that it may be less convenient, for example, it
+won't work with things like `OnceLock` or `lazy_static` or `once_cell`.
+
+With that said, as of this release, the contention performance problems have
+been greatly reduced. This was achieved by changing the free-list so that it
+is sharded across threads, and by ensuring that each sharded mutex occupies a
+single cache line to mitigate false sharing. So while contention may still
+impact performance in some cases, it should be a lot better now.
+
+Because of the changes to how the free-list works, please report any issues you
+find with this release. That not only includes search time regressions but also
+significant regressions in memory usage. Reporting improvements is welcome as
+well! If possible, provide a reproduction.
+
+Bug fixes:
+
+* [BUG #934](https://github.com/rust-lang/regex/issues/934):
+Fix a performance bug where high contention on a single regex led to massive
+slow downs.
+
+
 1.9.4 (2023-08-26)
 ==================
 This is a patch release that fixes a bug where `RegexSet::is_match(..)` could

From 894dcbe11e45d08b23db24f877574e06f3a69a35 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 2 Sep 2023 11:12:58 -0400
Subject: [PATCH 048/136] regex-automata-0.3.8

---
 regex-automata/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml
index 3cd9965b0..c7e949c4c 100644
--- a/regex-automata/Cargo.toml
+++ b/regex-automata/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "regex-automata"
-version = "0.3.7"  #:version
+version = "0.3.8"  #:version
 authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"]
 description = "Automata construction and matching using regular expressions."
documentation = "https://docs.rs/regex-automata" From 48e09a85e46d2b8cc379cd0b69cd98467639f7ff Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 2 Sep 2023 11:13:24 -0400 Subject: [PATCH 049/136] deps: bump regex-automata to 0.3.8 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4cc42b6cd..7afdfdc35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.7" +version = "0.3.8" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 554469b8c1116322f3c0a054ceeb610224f8ac65 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 2 Sep 2023 11:13:28 -0400 Subject: [PATCH 050/136] 1.9.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7afdfdc35..c78ed045f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.4" #:version +version = "1.9.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 4aaf3896ef1147000a5e63f174fa49bfa5d18d65 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 3 Sep 2023 11:09:56 -0400 Subject: [PATCH 051/136] ci: pin to memchr 2.6.2 for MSRV CI job I botched the memchr 2.6 MSRV because it actually requires Rust 1.61 and not Rust 1.60. This crate's MSRV is Rust 1.60, so pin memchr to a version that works on Rust 1.60 (for x86-64 at least). --- .github/workflows/ci.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2a38d6d4..08cc60d9a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,6 +142,25 @@ jobs: uses: dtolnay/rust-toolchain@master with: toolchain: 1.60.0 + # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it + # turned out that on aarch64, it was using something that wasn't stabilized + # until Rust 1.61[1]. (This was an oversight on my part. I had previously + # thought everything I needed was on Rust 1.60.) To resolve that, I just + # bumped memchr's MSRV to 1.61. Since it was so soon after the memchr 2.6 + # release, I treated this as a bugfix. + # + # But the regex crate's MSRV is at Rust 1.60, and it now depends on at + # least memchr 2.6 (to make use of its `alloc` feature). So we can't set + # a lower minimal version. And I can't just bump the MSRV in a patch + # release as a bug fix because regex 1.9 was released quite some time ago. + # I could just release regex 1.10 and bump the MSRV there, but eh, I don't + # want to put out another minor version release just for this. + # + # So... pin memchr to 2.6.2, which at least works on x86-64 on Rust 1.60. 
+      #
+      # [1]: https://github.com/BurntSushi/memchr/issues/136
+    - name: Pin memchr to 2.6.2
+      run: cargo update -p memchr --precise 2.6.2
     - name: Basic build
       run: cargo build --verbose
     - name: Build docs

From cdc0dbd3547462aedb6235197c2b743ec4ea75e5 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 3 Sep 2023 11:22:41 -0400
Subject: [PATCH 052/136] readme: add section about performance and benchmarks

---
 README.md | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/README.md b/README.md
index a9d6fcd37..51188654d 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,77 @@
 The full set of features one can disable are
 [in the "Crate features" section of the documentation](https://docs.rs/regex/1.*/#crate-features).
 
 
+### Performance
+
+One of the goals of this crate is for the regex engine to be "fast." While that
+is a somewhat nebulous goal, it is usually interpreted in one of two ways.
+First, it means that all searches take worst case `O(m * n)` time, where
+`m` is proportional to `len(regex)` and `n` is proportional to `len(haystack)`.
+Second, it means that even aside from the time complexity constraint, regex
+searches are "fast" in practice.
+
+While the first interpretation is pretty unambiguous, the second one remains
+nebulous. While nebulous, it guides this crate's architecture and the sorts of
+trade offs it makes. For example, here are some general architectural
+statements that follow as a result of the goal to be "fast":
+
+* When given the choice between faster regex searches and faster Rust compile
+times, this crate will generally choose faster regex searches.
+* When given the choice between faster regex searches and faster regex compile
+times, this crate will generally choose faster regex searches. That is, it is
+generally acceptable for `Regex::new` to get a little slower if it means that
+searches get faster. (This is a somewhat delicate balance to strike, because
+the speed of `Regex::new` needs to remain somewhat reasonable. But this is why
+one should avoid re-compiling the same regex over and over again.)
+* When given the choice between faster regex searches and simpler API
+design, this crate will generally choose faster regex searches. For example,
+if one didn't care about performance, we could likely get rid of both of
+the `Regex::is_match` and `Regex::find` APIs and instead just rely on
+`Regex::captures`.
+
+There are perhaps more ways that being "fast" influences things.
+
+While this repository used to provide its own benchmark suite, it has since
+been moved to [rebar](https://github.com/BurntSushi/rebar). The benchmarks are
+quite extensive, and there are many more than what is shown in rebar's README
+(which is just limited to a "curated" set meant to compare performance between
+regex engines).
To run all of this crate's benchmarks, first start by cloning +and installing `rebar`: + +```text +$ git clone https://github.com/BurntSushi/rebar +$ cd rebar +$ cargo install --path ./ +``` + +Then build the benchmark harness for just this crate: + +```text +$ rebar build -e '^rust/regex$' +``` + +Run all benchmarks for this crate as tests (each benchmark is executed once to +ensure it works): + +```text +$ rebar measure -e '^rust/regex$' -t +``` + +Record measurements for all benchmarks and save them to a CSV file: + +```text +$ rebar measure -e '^rust/regex$' | tee results.csv +``` + +Explore benchmark timings: + +```text +$ rebar cmp results.csv +``` + +See the `rebar` documentation for more details on how it works and how to +compare results with other regex engines. + ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. From 8275c1b3bef014a4393d9975285757f89d7e4592 Mon Sep 17 00:00:00 2001 From: Yutaka Kamei Date: Thu, 7 Sep 2023 23:00:49 +0900 Subject: [PATCH 053/136] doc: fix a few typos PR #1085 --- regex-automata/src/meta/regex.rs | 2 +- regex-automata/src/nfa/thompson/builder.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 3a04b14d8..ce3bae0fa 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -2706,7 +2706,7 @@ impl Config { /// you're compiling untrusted patterns. /// /// Note that this limit is applied to _each_ NFA built, and if any of - /// them excceed the limit, then construction will fail. This limit does + /// them exceed the limit, then construction will fail. This limit does /// _not_ correspond to the total memory used by all NFAs in the meta regex /// engine. /// diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index b57e5bc0f..6b69e8784 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -61,7 +61,7 @@ enum State { Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureStart { /// The ID of the pattern that this capture was defined. @@ -77,7 +77,7 @@ enum State { }, /// An empty state that records the end of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureEnd { /// The ID of the pattern that this capture was defined. @@ -128,7 +128,7 @@ enum State { } impl State { - /// If this state is an unconditional espilon transition, then this returns + /// If this state is an unconditional epsilon transition, then this returns /// the target of the transition. fn goto(&self) -> Option { match *self { From 061ee815ef2c44101dba7b0b124600fcb03c1912 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:02:09 -0400 Subject: [PATCH 054/136] readme: visually emphasize performance criteria difference There was only a slight wording difference between these two points, and it was easy to gloss over. 
So we emphasize that wording difference to make it a bit easier to notice. PR #1082 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 51188654d..7454c166d 100644 --- a/README.md +++ b/README.md @@ -233,10 +233,10 @@ nebulous. While nebulous, it guides this crate's architecture and the sorts of the trade offs it makes. For example, here are some general architectural statements that follow as a result of the goal to be "fast": -* When given the choice between faster regex searches and faster Rust compile -times, this crate will generally choose faster regex searches. -* When given the choice between faster regex searches and faster regex compile -times, this crate will generally choose faster regex searches. That is, it is +* When given the choice between faster regex searches and faster _Rust compile +times_, this crate will generally choose faster regex searches. +* When given the choice between faster regex searches and faster _regex compile +times_, this crate will generally choose faster regex searches. That is, it is generally acceptable for `Regex::new` to get a little slower if it means that searches get faster. (This is a somewhat delicate balance to strike, because the speed of `Regex::new` needs to remain somewhat reasonable. But this is why From 27a25385c0bd1228716271668febc88bd8c74932 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 28 Sep 2023 13:03:27 -0400 Subject: [PATCH 055/136] automata: add some #[inline] annotations This hopefully ensures these functions can be inlined across crate boundaries. (Although I think they likely already can be due to generics?) --- regex-automata/src/util/pool.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index c03d7b013..95afa4a0d 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -177,6 +177,7 @@ impl T> Pool { /// the value to go back into the pool) and then calling get again is /// *not* guaranteed to return the same value received in the first `get` /// call. + #[inline] pub fn get(&self) -> PoolGuard<'_, T, F> { PoolGuard(self.0.get()) } @@ -200,6 +201,7 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// This circumvents the guard's `Drop` implementation. This can be useful /// in circumstances where the automatic `Drop` results in poorer codegen, /// such as calling non-inlined functions. + #[inline] pub fn put(this: PoolGuard<'_, T, F>) { inner::PoolGuard::put(this.0); } @@ -208,12 +210,14 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { type Target = T; + #[inline] fn deref(&self) -> &T { self.0.value() } } impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + #[inline] fn deref_mut(&mut self) -> &mut T { self.0.value_mut() } @@ -469,6 +473,7 @@ mod inner { impl T> Pool { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. + #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { // Our fast path checks if the caller is the thread that "owns" // this pool. Or stated differently, whether it is the first thread @@ -562,6 +567,7 @@ mod inner { /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. 
+ #[inline] fn put_value(&self, value: Box) { let caller = THREAD_ID.with(|id| *id); let stack_id = caller % self.stacks.len(); @@ -587,11 +593,13 @@ mod inner { } /// Create a guard that represents the special owned T. + #[inline] fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Err(caller), discard: false } } /// Create a guard that contains a value from the pool's stack. + #[inline] fn guard_stack(&self, value: Box) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Ok(value), discard: false } } @@ -599,6 +607,7 @@ mod inner { /// Create a guard that contains a value from the pool's stack with an /// instruction to throw away the value instead of putting it back /// into the pool. + #[inline] fn guard_stack_transient(&self, value: Box) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Ok(value), discard: true } } @@ -633,6 +642,7 @@ mod inner { impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. + #[inline] pub(super) fn value(&self) -> &T { match self.value { Ok(ref v) => &**v, @@ -657,6 +667,7 @@ mod inner { } /// Return the underlying value as a mutable borrow. + #[inline] pub(super) fn value_mut(&mut self) -> &mut T { match self.value { Ok(ref mut v) => &mut **v, @@ -681,6 +692,7 @@ mod inner { } /// Consumes this guard and puts it back into the pool. + #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop @@ -729,6 +741,7 @@ mod inner { } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + #[inline] fn drop(&mut self) { self.put_imp(); } @@ -806,6 +819,7 @@ mod inner { impl T> Pool { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. + #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { let mut stack = self.stack.lock(); let value = match stack.pop() { @@ -815,6 +829,7 @@ mod inner { PoolGuard { pool: self, value: Some(value) } } + #[inline] fn put(&self, guard: PoolGuard<'_, T, F>) { let mut guard = core::mem::ManuallyDrop::new(guard); if let Some(value) = guard.value.take() { @@ -825,6 +840,7 @@ mod inner { /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. + #[inline] fn put_value(&self, value: Box) { let mut stack = self.stack.lock(); stack.push(value); @@ -847,16 +863,19 @@ mod inner { impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. + #[inline] pub(super) fn value(&self) -> &T { self.value.as_deref().unwrap() } /// Return the underlying value as a mutable borrow. + #[inline] pub(super) fn value_mut(&mut self) -> &mut T { self.value.as_deref_mut().unwrap() } /// Consumes this guard and puts it back into the pool. + #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop @@ -878,6 +897,7 @@ mod inner { } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + #[inline] fn drop(&mut self) { self.put_imp(); } @@ -931,6 +951,7 @@ mod inner { /// Lock this mutex and return a guard providing exclusive access to /// `T`. This blocks if some other thread has already locked this /// mutex. 
+ #[inline] fn lock(&self) -> MutexGuard<'_, T> { while self .locked @@ -963,18 +984,21 @@ mod inner { impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { type Target = T; + #[inline] fn deref(&self) -> &T { self.data } } impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { + #[inline] fn deref_mut(&mut self) -> &mut T { self.data } } impl<'a, T> Drop for MutexGuard<'a, T> { + #[inline] fn drop(&mut self) { // Drop means 'data' is no longer accessible, so we can unlock // the mutex. From aa4e4c7120b0090ce0624e3c42a2ed06dd8b918a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 07:24:05 -0400 Subject: [PATCH 056/136] automata: fix unintended panic in max_haystack_len This fixes a bug where the bounded backtracker's `max_haystack_len` could panic if its bitset capacity ended up being smaller than the total number of NFA states. Under a default configuration this seems unlikely to happen due to the default limits on the size of a compiled regex. But if the compiled regex size limit is increased to a large number, then the likelihood of this panicking increases. Of course, one can provoke this even easier by just setting the visited capacity to a small number. Indeed, this is how we provoke it in a regression test. --- regex-automata/src/meta/wrappers.rs | 5 +++- regex-automata/src/nfa/thompson/backtrack.rs | 28 ++++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 08110d9bb..6cb19ba0d 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -212,7 +212,10 @@ impl BoundedBacktrackerEngine { .configure(backtrack_config) .build_from_nfa(nfa.clone()) .map_err(BuildError::nfa)?; - debug!("BoundedBacktracker built"); + debug!( + "BoundedBacktracker built (max haystack length: {:?})", + engine.max_haystack_len() + ); Ok(Some(BoundedBacktrackerEngine(engine))) } #[cfg(not(feature = "nfa-backtrack"))] diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index eba037c1d..df99e456d 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -820,8 +820,11 @@ impl BoundedBacktracker { // bytes to the capacity in bits. let capacity = 8 * self.get_config().get_visited_capacity(); let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); - let real_capacity = blocks * Visited::BLOCK_SIZE; - (real_capacity / self.nfa.states().len()) - 1 + let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE); + // It's possible for `real_capacity` to be smaller than the number of + // NFA states for particularly large regexes, so we saturate towards + // zero. + (real_capacity / self.nfa.states().len()).saturating_sub(1) } } @@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize { (lhs / rhs) + 1 } } + +#[cfg(test)] +mod tests { + use super::*; + + // This is a regression test for the maximum haystack length computation. + // Previously, it assumed that the total capacity of the backtracker's + // bitset would always be greater than the number of NFA states. But there + // is of course no guarantee that this is true. This regression test + // ensures that not only does `max_haystack_len` not panic, but that it + // should return `0`. 
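+    // (For a sense of the arithmetic: `visited_capacity(10)` allows about
+    // 80 bits, rounded up to whole bitset blocks, while this pattern
+    // compiles to well over 100 NFA states. That makes `real_capacity`
+    // divided by the state count at most 1, and the final
+    // `saturating_sub(1)` yields 0.)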
+ #[cfg(feature = "syntax")] + #[test] + fn max_haystack_len_overflow() { + let re = BoundedBacktracker::builder() + .configure(BoundedBacktracker::config().visited_capacity(10)) + .build(r"[0-9A-Za-z]{100}") + .unwrap(); + assert_eq!(0, re.max_haystack_len()); + } +} From e4674083346283cdf24fdc211dc44a4a6f6846b1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:31:27 -0400 Subject: [PATCH 057/136] changelog: 1.9.6 --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 885bb9bd7..a50b811dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +1.9.6 (2023-09-30) +================== +This is a patch release that fixes a panic that can occur when the default +regex size limit is increased to a large number. + +* [BUG aa4e4c71](https://github.com/rust-lang/regex/commit/aa4e4c7120b0090ce0624e3c42a2ed06dd8b918a): +Fix a bug where computing the maximum haystack length for the bounded +backtracker could result underflow and thus provoke a panic later in a search +due to a broken invariant. + + 1.9.5 (2023-09-02) ================== This is a patch release that hopefully mostly fixes a performance bug that From 03f00bd756d85ee21714136e46836c4a5ad1b99c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:31:34 -0400 Subject: [PATCH 058/136] regex-automata-0.3.9 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index c7e949c4c..7d47140b0 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.8" #:version +version = "0.3.9" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 3dda4255e11ddb9257f6b75135bb2f3f8a554acb Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:31:59 -0400 Subject: [PATCH 059/136] deps: bump regex-automata to 0.3.9 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c78ed045f..2d3b8076d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.3.8" +version = "0.3.9" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 11b44439786499014f61afe6e294650fb01550be Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Sep 2023 08:32:02 -0400 Subject: [PATCH 060/136] 1.9.6 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 2d3b8076d..46664f669 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.5" #:version +version = "1.9.6" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 17284451f10aa06c6c42e622e3529b98513901a8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 09:39:24 -0400 Subject: [PATCH 061/136] syntax: fix Markdown for ASCII word class rendering --- regex-syntax/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 47d818a17..a552099c6 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -381,7 +381,7 @@ pub fn try_is_word_character( /// Returns true if and only if the given character is an ASCII word character. /// /// An ASCII word character is defined by the following character class: -/// `[_0-9a-zA-Z]'. +/// `[_0-9a-zA-Z]`. pub fn is_word_byte(c: u8) -> bool { match c { b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, From e598c4db2c5ed4d71ff611350becf42cb6faf1db Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:24:51 -0400 Subject: [PATCH 062/136] automata: clean up regression test The name was quite vague, so add a little specificity. --- regex-automata/src/meta/regex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index ce3bae0fa..a06d2bb48 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3640,8 +3640,8 @@ mod tests { // I found this in the course of building out the benchmark suite for // rebar. #[test] - fn regression() { - env_logger::init(); + fn regression_suffix_literal_count() { + let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); From c5e9de9d6e07786eb1ff7f88d7871e0f0ef28c32 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:25:31 -0400 Subject: [PATCH 063/136] automata: fix line wrapping Breaking lines in the middle of backticks appears to be bad juju for some Markdown renderers. --- regex-automata/src/util/look.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index aee31b34e..a34ea1d75 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -184,8 +184,8 @@ impl Look { pub struct LookSet { /// The underlying representation this set is exposed to make it possible /// to store it somewhere efficiently. The representation is that - /// of a bitset, where each assertion occupies bit `i` where `i = - /// Look::as_repr()`. + /// of a bitset, where each assertion occupies bit `i` where + /// `i = Look::as_repr()`. /// /// Note that users of this internal representation must permit the full /// range of `u16` values to be represented. 
For example, even if the From f15f3dcbc340eb98b40e60cc8b797263963d1e97 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:27:11 -0400 Subject: [PATCH 064/136] automata: fix word boundary bug This fixes a bug that can occur when: 1. The regex has a Unicode word boundary. 2. The haystack contains some non-ASCII Unicode scalar value. 3. An inner or suffix literal optimization is in play. Specifically, this provokes a case where a match is detected in one of the meta engine's ad hoc DFA search routines, but before the match reaches its correct endpoint, a quit state is entered. (Because DFAs can't deal with Unicode word boundaries on non-ASCII haystacks.) The correct thing to do is to return a quit error and let the higher level logic divert to a different engine, but it was returning the match that it had found up until that point instead. The match returned is not technically incorrect in the sense that a match does indeed exist, but the offsets it reports may be shorter than what the true match actually is. So... if a quit state is entered, return an error regardless of whether a match has been found. Fixes #1046 --- CHANGELOG.md | 8 ++++++++ regex-automata/src/meta/limited.rs | 12 ------------ regex-automata/src/meta/stopat.rs | 12 ------------ testdata/regression.toml | 18 ++++++++++++++++++ 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a50b811dd..4a474af1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +TBD +=== + +* [BUG #1046](https://github.com/rust-lang/regex/issues/1046): +Fix a bug that could result in incorrect match spans when using a Unicode word +boundary and searching non-ASCII strings. + + 1.9.6 (2023-09-30) ================== This is a patch release that fixes a panic that can occur when the default diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs index 192a2625e..5653adc9a 100644 --- a/regex-automata/src/meta/limited.rs +++ b/regex-automata/src/meta/limited.rs @@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev( } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -155,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev( } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -209,9 +203,6 @@ fn dfa_eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { @@ -246,9 +237,6 @@ fn hybrid_eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/meta/stopat.rs b/regex-automata/src/meta/stopat.rs index e8d716689..c4dcd797a 100644 --- a/regex-automata/src/meta/stopat.rs +++ b/regex-automata/src/meta/stopat.rs @@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd( } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't 
use a DFA that specialized start states @@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd( } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from @@ -162,9 +156,6 @@ fn dfa_eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -201,9 +192,6 @@ fn hybrid_eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } diff --git a/testdata/regression.toml b/testdata/regression.toml index 03b15d6d5..09b2b1d1c 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -782,3 +782,21 @@ match-kind = "all" search-kind = "overlapping" unicode = true utf8 = true + +# This tests that the PikeVM and the meta regex agree on a particular regex. +# This test previously failed when the ad hoc engines inside the meta engine +# did not handle quit states correctly. Namely, the Unicode word boundary here +# combined with a non-ASCII codepoint provokes the quit state. The ad hoc +# engines were previously returning a match even after entering the quit state +# if a match had been previously detected, but this is incorrect. The reason +# is that if a quit state is found, then the search must give up *immediately* +# because it prevents the search from finding the "proper" leftmost-first +# match. If it instead returns a match that has been found, it risks reporting +# an improper match, as it did in this case. +# +# See: https://github.com/rust-lang/regex/issues/1046 +[[test]] +name = "non-prefix-literal-quit-state" +regex = '.+\b\n' +haystack = "β77\n" +matches = [[0, 5]] From b8c2066b6b6b424de95230ff1d63217a7d9e79c4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 22:45:13 -0400 Subject: [PATCH 065/136] automata/onepass: future proof bit packing This was previously using the raw representation of a `LookSet`, which is fine, but would have errantly overwritten bits unrelated to look-around assertions if they were set in a `LookSet`. This can't happen today because we don't have more than 10 assertions. And the one-pass DFA constructor specifically errors if more assertions exist and are in the pattern. But still, it seems like good form to mask out only the bits we care about. --- regex-automata/src/dfa/onepass.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 44691d0c8..353bb1e17 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2581,10 +2581,11 @@ impl Cache { /// Represents a single transition in a one-pass DFA. /// -/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds -/// to the transition epsilons, which contains the slots that should be saved -/// when this transition is followed and the conditional epsilon transitions -/// that must be satisfied in order to follow this transition. +/// The high 21 bits corresponds to the state ID. The bit following corresponds +/// to the special "match wins" flag. 
The remaining low 42 bits corresponds to +/// the transition epsilons, which contains the slots that should be saved when +/// this transition is followed and the conditional epsilon transitions that +/// must be satisfied in order to follow this transition. #[derive(Clone, Copy, Eq, PartialEq)] struct Transition(u64); @@ -2741,7 +2742,7 @@ impl PatternEpsilons { fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { PatternEpsilons( (self.0 & PatternEpsilons::PATTERN_ID_MASK) - | u64::from(epsilons.0), + | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK), ) } } @@ -2819,7 +2820,10 @@ impl Epsilons { /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { - Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + Epsilons( + (self.0 & Epsilons::SLOT_MASK) + | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), + ) } } From 0ead12869417434fb39ae4b876c4fb97543cbbd8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 15:16:21 -0400 Subject: [PATCH 066/136] syntax: make Ast the size of a pointer This puts every Ast value behind a box to conserve space. It makes things like Vec<Ast> quite a bit smaller than what they would be otherwise, which is especially beneficial for the representation of concatenations and alternations. This doesn't quite solve the memory usage problems though, since an AstKind is still quite big (over 200 bytes). The next step will be boxing each of the variants of an AstKind which should hopefully resolve the issue. Ref #1090 --- regex-syntax/src/ast/mod.rs | 180 ++++++++++------ regex-syntax/src/ast/parse.rs | 328 +++++++++++++++--------------- regex-syntax/src/ast/print.rs | 34 ++-- regex-syntax/src/ast/visitor.rs | 18 +- regex-syntax/src/hir/translate.rs | 44 ++-- 5 files changed, 332 insertions(+), 272 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e4284fee..6a6b58237 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -429,9 +429,19 @@ pub struct Comment { /// /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the `Ast`. +/// +/// This type boxes the actual kind of the AST element so that an `Ast` value +/// itself has a very small size. This in turn makes things like `Vec<Ast>` use +/// a lot less memory than it might otherwise, which is particularly beneficial +/// for representing long concatenations or alternations. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Ast(pub Box<AstKind>); + +/// The kind of an abstract syntax element. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Ast { +pub enum AstKind { /// An empty regex that matches everything. Empty(Span), /// A set of flags, e.g., `(?is)`. @@ -456,26 +466,76 @@ pub enum Ast { } impl Ast { + /// Create an "empty" AST item. + pub fn empty(span: Span) -> Ast { + Ast(Box::new(AstKind::Empty(span))) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast(Box::new(AstKind::Flags(e))) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast(Box::new(AstKind::Literal(e))) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast(Box::new(AstKind::Dot(span))) + } + + /// Create an "assertion" AST item.
+ pub fn assertion(e: Assertion) -> Ast { + Ast(Box::new(AstKind::Assertion(e))) + } + + /// Create a "class" AST item. + pub fn class(e: Class) -> Ast { + Ast(Box::new(AstKind::Class(e))) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast(Box::new(AstKind::Repetition(e))) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast(Box::new(AstKind::Group(e))) + } + + /// Create an "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast(Box::new(AstKind::Alternation(e))) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast(Box::new(AstKind::Concat(e))) + } + /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { - match *self { - Ast::Empty(ref span) => span, - Ast::Flags(ref x) => &x.span, - Ast::Literal(ref x) => &x.span, - Ast::Dot(ref span) => span, - Ast::Assertion(ref x) => &x.span, - Ast::Class(ref x) => x.span(), - Ast::Repetition(ref x) => &x.span, - Ast::Group(ref x) => &x.span, - Ast::Alternation(ref x) => &x.span, - Ast::Concat(ref x) => &x.span, + match *self.0 { + AstKind::Empty(ref span) => span, + AstKind::Flags(ref x) => &x.span, + AstKind::Literal(ref x) => &x.span, + AstKind::Dot(ref span) => span, + AstKind::Assertion(ref x) => &x.span, + AstKind::Class(ref x) => x.span(), + AstKind::Repetition(ref x) => &x.span, + AstKind::Group(ref x) => &x.span, + AstKind::Alternation(ref x) => &x.span, + AstKind::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { - match *self { - Ast::Empty(_) => true, + match *self.0 { + AstKind::Empty(_) => true, _ => false, } } @@ -483,17 +543,17 @@ impl Ast { /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. fn has_subexprs(&self) -> bool { - match *self { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) => false, - Ast::Class(_) - | Ast::Repetition(_) - | Ast::Group(_) - | Ast::Alternation(_) - | Ast::Concat(_) => true, + match *self.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) => false, + AstKind::Class(_) + | AstKind::Repetition(_) + | AstKind::Group(_) + | AstKind::Alternation(_) + | AstKind::Concat(_) => true, } } } @@ -526,14 +586,14 @@ pub struct Alternation { impl Alternation { /// Return this alternation as an AST. /// - /// If this alternation contains zero ASTs, then Ast::Empty is - /// returned. If this alternation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Alternation(self), + _ => Ast::alternation(self), } } } @@ -551,14 +611,14 @@ pub struct Concat { impl Concat { /// Return this concatenation as an AST. /// - /// If this concatenation contains zero ASTs, then Ast::Empty is - /// returned. If this concatenation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + /// If this concatenation contains zero ASTs, then `Ast::empty` is returned.
+ /// If this concatenation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Concat(self), + _ => Ast::concat(self), } } } @@ -1544,43 +1604,43 @@ impl Drop for Ast { fn drop(&mut self) { use core::mem; - match *self { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) + match *self.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => return, - Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, - Ast::Group(ref x) if !x.ast.has_subexprs() => return, - Ast::Alternation(ref x) if x.asts.is_empty() => return, - Ast::Concat(ref x) if x.asts.is_empty() => return, + | AstKind::Class(_) => return, + AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, + AstKind::Group(ref x) if !x.ast.has_subexprs() => return, + AstKind::Alternation(ref x) if x.asts.is_empty() => return, + AstKind::Concat(ref x) if x.asts.is_empty() => return, _ => {} } let empty_span = || Span::splat(Position::new(0, 0, 0)); - let empty_ast = || Ast::Empty(empty_span()); + let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { - match ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) + match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => {} - Ast::Repetition(ref mut x) => { + | AstKind::Class(_) => {} + AstKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - Ast::Group(ref mut x) => { + AstKind::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - Ast::Alternation(ref mut x) => { + AstKind::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } - Ast::Concat(ref mut x) => { + AstKind::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } @@ -1663,9 +1723,9 @@ mod tests { let run = || { let span = || Span::splat(Position::new(0, 0, 0)); - let mut ast = Ast::Empty(span()); + let mut ast = Ast::empty(span()); for i in 0..200 { - ast = Ast::Group(Group { + ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 47ea2586b..b3f04bfdc 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, Position, Span}, + ast::{self, Ast, AstKind, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -53,11 +53,11 @@ impl Primitive { /// Convert this primitive into a proper AST.
fn into_ast(self) -> Ast { match self { - Primitive::Literal(lit) => Ast::Literal(lit), - Primitive::Assertion(assert) => Ast::Assertion(assert), - Primitive::Dot(span) => Ast::Dot(span), - Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), - Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class(ast::Class::Perl(cls)), + Primitive::Unicode(cls) => Ast::class(ast::Class::Unicode(cls)), } } @@ -691,7 +691,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(v); } - concat.asts.push(Ast::Flags(set)); + concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { @@ -764,7 +764,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { group.ast = Box::new(group_concat.into_ast()); } } - prior_concat.asts.push(Ast::Group(group)); + prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } @@ -783,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); - Ok(Ast::Alternation(alt)) + Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( @@ -976,7 +976,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; - concat.asts.push(Ast::Class(class)); + concat.asts.push(Ast::class(class)); } '?' => { concat = self.parse_uncounted_repetition( @@ -1044,8 +1044,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { ) } }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { + match *ast.0 { + AstKind::Empty(_) | AstKind::Flags(_) => { return Err( self.error(self.span(), ast::ErrorKind::RepetitionMissing) ) @@ -1057,7 +1057,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { greedy = false; self.bump(); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: Span::new(op_start, self.pos()), @@ -1096,8 +1096,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { ) } }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { + match *ast.0 { + AstKind::Empty(_) | AstKind::Flags(_) => { return Err( self.error(self.span(), ast::ErrorKind::RepetitionMissing) ) @@ -1159,7 +1159,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) ); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: op_span, @@ -1212,7 +1212,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } else if self.bump_if("?") { if self.is_eof() { @@ -1241,7 +1241,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } else { @@ -1249,7 +1249,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } @@ -2183,43 +2183,43 @@ impl<'p, 's, P: Borrow> ast::Visitor for 
NestLimiter<'p, 's, P> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - let span = match *ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + let span = match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) + | AstKind::Class(ast::Class::Unicode(_)) + | AstKind::Class(ast::Class::Perl(_)) => { // These are all base cases, so we don't increment depth. return Ok(()); } - Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, - Ast::Repetition(ref x) => &x.span, - Ast::Group(ref x) => &x.span, - Ast::Alternation(ref x) => &x.span, - Ast::Concat(ref x) => &x.span, + AstKind::Class(ast::Class::Bracketed(ref x)) => &x.span, + AstKind::Repetition(ref x) => &x.span, + AstKind::Group(ref x) => &x.span, + AstKind::Alternation(ref x) => &x.span, + AstKind::Concat(ref x) => &x.span, }; self.increment_depth(span) } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) + | AstKind::Class(ast::Class::Unicode(_)) + | AstKind::Class(ast::Class::Perl(_)) => { // These are all base cases, so we don't decrement depth. Ok(()) } - Ast::Class(ast::Class::Bracketed(_)) - | Ast::Repetition(_) - | Ast::Group(_) - | Ast::Alternation(_) - | Ast::Concat(_) => { + AstKind::Class(ast::Class::Bracketed(_)) + | AstKind::Repetition(_) + | AstKind::Group(_) + | AstKind::Alternation(_) + | AstKind::Concat(_) => { self.decrement_depth(); Ok(()) } @@ -2426,12 +2426,12 @@ mod tests { /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, @@ -2445,17 +2445,17 @@ mod tests { /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { - Ast::Concat(ast::Concat { span, asts }) + Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { - Ast::Alternation(ast::Alternation { span: span(range), asts }) + Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), @@ -2488,7 +2488,7 @@ mod tests { }, ); } - Ast::Flags(ast::SetFlags { + Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), @@ -2502,7 +2502,7 @@ mod tests { // A nest limit of 0 still allows some types of regexes.
assert_eq!( parser_nest_limit("", 0).parse(), - Ok(Ast::Empty(span(0..0))) + Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); @@ -2516,7 +2516,7 @@ mod tests { ); assert_eq!( parser_nest_limit("a+", 1).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2542,14 +2542,14 @@ mod tests { ); assert_eq!( parser_nest_limit("a+*", 2).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, - ast: Box::new(Ast::Repetition(ast::Repetition { + ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2606,7 +2606,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2776,7 +2776,7 @@ bar vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), @@ -2803,7 +2803,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -2825,7 +2825,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), @@ -2840,7 +2840,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), @@ -2858,7 +2858,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X @@ -2877,7 +2877,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', @@ -2895,9 +2895,9 @@ bar Ok(concat_with( span_range(pat, 0..3), vec![ - Ast::Dot(span_range(pat, 0..1)), + Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), - Ast::Dot(span_range(pat, 2..3)), + Ast::dot(span_range(pat, 2..3)), ] )) ); @@ -2933,7 +2933,7 @@ bar fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2945,7 +2945,7 @@ bar ); assert_eq!( parser(r"a+").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2958,7 +2958,7 @@ bar assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + 
Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2970,7 +2970,7 @@ bar ); assert_eq!( parser(r"a??").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -2982,7 +2982,7 @@ bar ); assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2997,7 +2997,7 @@ bar Ok(concat( 0..3, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -3015,7 +3015,7 @@ bar Ok(concat( 0..4, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -3034,7 +3034,7 @@ bar 0..3, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3048,7 +3048,7 @@ bar ); assert_eq!( parser(r"(ab)?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), @@ -3067,8 +3067,8 @@ bar Ok(alt( 0..3, vec![ - Ast::Empty(span(0..0)), - Ast::Repetition(ast::Repetition { + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3157,7 +3157,7 @@ bar fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), @@ -3171,7 +3171,7 @@ bar ); assert_eq!( parser(r"a{5,}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3185,7 +3185,7 @@ bar ); assert_eq!( parser(r"a{5,9}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3199,7 +3199,7 @@ bar ); assert_eq!( parser(r"a{5}?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3217,7 +3217,7 @@ bar 0..5, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3237,7 +3237,7 @@ bar 0..6, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3255,7 +3255,7 @@ bar assert_eq!( parser(r"a{ 5 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3269,7 +3269,7 @@ bar ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), @@ -3283,7 +3283,7 @@ bar ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), @@ -3414,7 +3414,7 @@ bar fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), 
lit('b', 2)], })) @@ -3424,7 +3424,7 @@ bar Ok(group( 0..5, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) @@ -3433,14 +3433,14 @@ bar assert_eq!( parser(r"a|b|c").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), @@ -3454,7 +3454,7 @@ bar Ok(group( 0..10, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), @@ -3503,7 +3503,7 @@ bar parser(r"|").parse(), Ok(alt( 0..1, - vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( @@ -3511,19 +3511,19 @@ bar Ok(alt( 0..2, vec![ - Ast::Empty(span(0..0)), - Ast::Empty(span(1..1)), - Ast::Empty(span(2..2)), + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( @@ -3533,7 +3533,7 @@ bar 1, alt( 1..2, - vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); @@ -3542,7 +3542,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( @@ -3550,7 +3550,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); @@ -3606,7 +3606,7 @@ bar fn parse_group() { assert_eq!( parser("(?i)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..4), flags: ast::Flags { span: span(2..3), @@ -3621,7 +3621,7 @@ bar ); assert_eq!( parser("(?iU)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..5), flags: ast::Flags { span: span(2..4), @@ -3644,7 +3644,7 @@ bar ); assert_eq!( parser("(?i-U)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..6), flags: ast::Flags { span: span(2..5), @@ -3672,15 +3672,15 @@ bar assert_eq!( parser("()").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..2), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Empty(span(1..1))), + ast: Box::new(Ast::empty(span(1..1))), })) ); assert_eq!( parser("(a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..3), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit('a', 1)), @@ -3688,20 +3688,20 @@ bar ); assert_eq!( parser("(())").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..4), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Group(ast::Group { + ast: Box::new(Ast::group(ast::Group { span: span(1..3), kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::Empty(span(2..2))), + ast: Box::new(Ast::empty(span(2..2))), })), })) ); assert_eq!( parser("(?:a)").parse(), - Ok(Ast::Group(ast::Group { + 
Ok(Ast::group(ast::Group { span: span(0..5), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..2), @@ -3713,7 +3713,7 @@ bar assert_eq!( parser("(?i:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..6), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..3), @@ -3729,7 +3729,7 @@ bar ); assert_eq!( parser("(?i-U:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..5), @@ -3818,7 +3818,7 @@ bar fn parse_capture_name() { assert_eq!( parser("(?z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, @@ -3833,7 +3833,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3848,7 +3848,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3864,7 +3864,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3880,7 +3880,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3896,7 +3896,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3912,7 +3912,7 @@ bar assert_eq!( parser("(?P)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), @@ -3928,7 +3928,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), @@ -3936,7 +3936,7 @@ bar ); assert_eq!( parser("(?P<名字>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), @@ -3952,7 +3952,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), @@ -4494,15 +4494,15 @@ bar ); assert_eq!( parser_octal(r"\778").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', @@ -4512,15 +4512,15 @@ bar ); assert_eq!( parser_octal(r"\7777").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', @@ -4965,7 +4965,7 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), 
false))), @@ -4973,7 +4973,7 @@ bar ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4985,7 +4985,7 @@ bar ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4997,7 +4997,7 @@ bar ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5009,7 +5009,7 @@ bar ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5022,7 +5022,7 @@ bar assert_eq!( parser("[a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), @@ -5030,7 +5030,7 @@ bar ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5048,7 +5048,7 @@ bar ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5067,7 +5067,7 @@ bar ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5078,7 +5078,7 @@ bar ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5089,7 +5089,7 @@ bar ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5100,7 +5100,7 @@ bar ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5112,7 +5112,7 @@ bar ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5124,7 +5124,7 @@ bar ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5144,7 +5144,7 @@ bar assert_eq!( parser("[a-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), @@ -5152,7 +5152,7 @@ bar ); assert_eq!( 
parser("[a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5166,7 +5166,7 @@ bar ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5188,7 +5188,7 @@ bar ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5210,7 +5210,7 @@ bar ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5226,7 +5226,7 @@ bar ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5242,7 +5242,7 @@ bar ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5258,7 +5258,7 @@ bar ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5274,7 +5274,7 @@ bar ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( @@ -5292,7 +5292,7 @@ bar let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5313,7 +5313,7 @@ bar assert_eq!( parser(r"[]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), @@ -5321,7 +5321,7 @@ bar ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5342,7 +5342,7 @@ bar Ok(concat( 0..5, vec![ - Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5353,7 +5353,7 @@ bar } )), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', @@ -5914,15 +5914,15 @@ bar assert_eq!( parser(r"\pNz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class(ast::Class::Unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, 
c: 'z', @@ -5932,15 +5932,15 @@ bar ); assert_eq!( parser(r"\p{Greek}z").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class(ast::Class::Unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -6017,7 +6017,7 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class(ast::Class::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, @@ -6025,15 +6025,15 @@ bar ); assert_eq!( parser(r"\dz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ast::class(ast::Class::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 86a87e143..daf6776f2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -7,7 +7,7 @@ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, - Ast, + Ast, AstKind, }; /// A builder for constructing a printer. @@ -78,9 +78,9 @@ impl Visitor for Writer { } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { - match *ast { - Ast::Group(ref x) => self.fmt_group_pre(x), - Ast::Class(ast::Class::Bracketed(ref x)) => { + match *ast.0 { + AstKind::Group(ref x) => self.fmt_group_pre(x), + AstKind::Class(ast::Class::Bracketed(ref x)) => { self.fmt_class_bracketed_pre(x) } _ => Ok(()), @@ -90,21 +90,21 @@ impl Visitor for Writer { fn visit_post(&mut self, ast: &Ast) -> fmt::Result { use crate::ast::Class; - match *ast { - Ast::Empty(_) => Ok(()), - Ast::Flags(ref x) => self.fmt_set_flags(x), - Ast::Literal(ref x) => self.fmt_literal(x), - Ast::Dot(_) => self.wtr.write_str("."), - Ast::Assertion(ref x) => self.fmt_assertion(x), - Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - Ast::Class(Class::Bracketed(ref x)) => { + match *ast.0 { + AstKind::Empty(_) => Ok(()), + AstKind::Flags(ref x) => self.fmt_set_flags(x), + AstKind::Literal(ref x) => self.fmt_literal(x), + AstKind::Dot(_) => self.wtr.write_str("."), + AstKind::Assertion(ref x) => self.fmt_assertion(x), + AstKind::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + AstKind::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + AstKind::Class(Class::Bracketed(ref x)) => { self.fmt_class_bracketed_post(x) } - Ast::Repetition(ref x) => self.fmt_repetition(x), - Ast::Group(ref x) => self.fmt_group_post(x), - Ast::Alternation(_) => Ok(()), - Ast::Concat(_) => Ok(()), + AstKind::Repetition(ref x) => self.fmt_repetition(x), + AstKind::Group(ref x) => self.fmt_group_post(x), + AstKind::Alternation(_) => Ok(()), + AstKind::Concat(_) => Ok(()), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 03d12a14d..05fdac89c 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,6 +1,6 @@ use alloc::{vec, vec::Vec}; -use crate::ast::{self, Ast}; +use crate::ast::{self, Ast, AstKind}; /// A trait for visiting an abstract syntax tree (AST) in depth 
first order. /// @@ -263,19 +263,19 @@ impl<'a> HeapVisitor<'a> { ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { - Ok(match *ast { - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ok(match *ast.0 { + AstKind::Class(ast::Class::Bracketed(ref x)) => { self.visit_class(x, visitor)?; None } - Ast::Repetition(ref x) => Some(Frame::Repetition(x)), - Ast::Group(ref x) => Some(Frame::Group(x)), - Ast::Concat(ref x) if x.asts.is_empty() => None, - Ast::Concat(ref x) => { + AstKind::Repetition(ref x) => Some(Frame::Repetition(x)), + AstKind::Group(ref x) => Some(Frame::Group(x)), + AstKind::Concat(ref x) if x.asts.is_empty() => None, + AstKind::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) } - Ast::Alternation(ref x) if x.asts.is_empty() => None, - Ast::Alternation(ref x) => Some(Frame::Alternation { + AstKind::Alternation(ref x) if x.asts.is_empty() => None, + AstKind::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 5430b51b2..743218df4 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -7,7 +7,7 @@ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ - ast::{self, Ast, Span, Visitor}, + ast::{self, Ast, AstKind, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, @@ -336,8 +336,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Class(ast::Class::Bracketed(_)) => { + match *ast.0 { + AstKind::Class(ast::Class::Bracketed(_)) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -346,20 +346,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } - Ast::Repetition(_) => self.push(HirFrame::Repetition), - Ast::Group(ref x) => { + AstKind::Repetition(_) => self.push(HirFrame::Repetition), + AstKind::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} - Ast::Concat(_) => { + AstKind::Concat(ref x) if x.asts.is_empty() => {} + AstKind::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + AstKind::Alternation(ref x) if x.asts.is_empty() => {} + AstKind::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } @@ -369,11 +369,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Empty(_) => { + match *ast.0 { + AstKind::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } - Ast::Flags(ref x) => { + AstKind::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in @@ -386,7 +386,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - Ast::Literal(ref x) => { + AstKind::Literal(ref x) => { match self.ast_literal_to_scalar(x)? 
{ Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { @@ -402,13 +402,13 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } // self.push(HirFrame::Expr(self.hir_literal(x)?)); } - Ast::Dot(span) => { + AstKind::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); } - Ast::Assertion(ref x) => { + AstKind::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - Ast::Class(ast::Class::Perl(ref x)) => { + AstKind::Class(ast::Class::Perl(ref x)) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +419,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - Ast::Class(ast::Class::Unicode(ref x)) => { + AstKind::Class(ast::Class::Unicode(ref x)) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - Ast::Class(ast::Class::Bracketed(ref ast)) => { + AstKind::Class(ast::Class::Bracketed(ref ast)) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -444,18 +444,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(expr)); } } - Ast::Repetition(ref x) => { + AstKind::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } - Ast::Group(ref x) => { + AstKind::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - Ast::Concat(_) => { + AstKind::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { @@ -465,7 +465,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } - Ast::Alternation(_) => { + AstKind::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); From 31b4398390e02767fd387c43ed53548413f53dcc Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 16:01:43 -0400 Subject: [PATCH 067/136] syntax: box each AstKind variant This does reduce memory, but not as much as it is reduced if we don't box the Ast. --- regex-syntax/src/ast/mod.rs | 149 ++++++++++++++++++----------- regex-syntax/src/ast/parse.rs | 152 +++++++++++++++--------------- regex-syntax/src/ast/print.rs | 14 +-- regex-syntax/src/ast/visitor.rs | 2 +- regex-syntax/src/hir/translate.rs | 38 ++++---- 5 files changed, 192 insertions(+), 163 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6a6b58237..c346abcb6 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -443,77 +443,92 @@ pub struct Ast(pub Box<AstKind>); #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum AstKind { /// An empty regex that matches everything. - Empty(Span), + Empty(Box<Span>), /// A set of flags, e.g., `(?is)`. - Flags(SetFlags), + Flags(Box<SetFlags>), /// A single character literal, which includes escape sequences. - Literal(Literal), + Literal(Box<Literal>), /// The "any character" class. - Dot(Span), + Dot(Box<Span>), /// A single zero-width assertion. - Assertion(Assertion), - /// A single character class. This includes all forms of character classes - /// except for `.`.
e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. - Class(Class), + Assertion(Box<Assertion>), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box<ClassUnicode>), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box<ClassPerl>), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box<ClassBracketed>), /// A repetition operator applied to an arbitrary regular expression. - Repetition(Repetition), + Repetition(Box<Repetition>), /// A grouped regular expression. - Group(Group), + Group(Box<Group>), /// An alternation of regular expressions. - Alternation(Alternation), + Alternation(Box<Alternation>), /// A concatenation of regular expressions. - Concat(Concat), + Concat(Box<Concat>), } impl Ast { /// Create an "empty" AST item. pub fn empty(span: Span) -> Ast { - Ast(Box::new(AstKind::Empty(span))) + Ast(Box::new(AstKind::Empty(Box::new(span)))) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { - Ast(Box::new(AstKind::Flags(e))) + Ast(Box::new(AstKind::Flags(Box::new(e)))) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { - Ast(Box::new(AstKind::Literal(e))) + Ast(Box::new(AstKind::Literal(Box::new(e)))) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { - Ast(Box::new(AstKind::Dot(span))) + Ast(Box::new(AstKind::Dot(Box::new(span)))) } /// Create an "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { - Ast(Box::new(AstKind::Assertion(e))) + Ast(Box::new(AstKind::Assertion(Box::new(e)))) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast(Box::new(AstKind::ClassUnicode(Box::new(e)))) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast(Box::new(AstKind::ClassPerl(Box::new(e)))) } - /// Create a "class" AST item. - pub fn class(e: Class) -> Ast { - Ast(Box::new(AstKind::Class(e))) + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast(Box::new(AstKind::ClassBracketed(Box::new(e)))) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { - Ast(Box::new(AstKind::Repetition(e))) + Ast(Box::new(AstKind::Repetition(Box::new(e)))) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { - Ast(Box::new(AstKind::Group(e))) + Ast(Box::new(AstKind::Group(Box::new(e)))) } /// Create an "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { - Ast(Box::new(AstKind::Alternation(e))) + Ast(Box::new(AstKind::Alternation(Box::new(e)))) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { - Ast(Box::new(AstKind::Concat(e))) + Ast(Box::new(AstKind::Concat(Box::new(e)))) } /// Return the span of this abstract syntax tree.
@@ -524,7 +539,9 @@ impl Ast { AstKind::Literal(ref x) => &x.span, AstKind::Dot(ref span) => span, AstKind::Assertion(ref x) => &x.span, - AstKind::Class(ref x) => x.span(), + AstKind::ClassUnicode(ref x) => &x.span, + AstKind::ClassPerl(ref x) => &x.span, + AstKind::ClassBracketed(ref x) => &x.span, AstKind::Repetition(ref x) => &x.span, AstKind::Group(ref x) => &x.span, AstKind::Alternation(ref x) => &x.span, @@ -548,8 +565,10 @@ impl Ast { | AstKind::Flags(_) | AstKind::Literal(_) | AstKind::Dot(_) - | AstKind::Assertion(_) => false, - AstKind::Class(_) + | AstKind::Assertion(_) + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) => false, + AstKind::ClassBracketed(_) | AstKind::Repetition(_) | AstKind::Group(_) | AstKind::Alternation(_) @@ -735,31 +754,6 @@ impl HexLiteralKind { } } -/// A single character class expression. -#[derive(Clone, Debug, Eq, PartialEq)] -#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Class { - /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. - Unicode(ClassUnicode), - /// A perl character class, e.g., `\d` or `\W`. - Perl(ClassPerl), - /// A bracketed character class set, which may contain zero or more - /// character ranges and/or zero or more nested classes. e.g., - /// `[a-zA-Z\pL]`. - Bracketed(ClassBracketed), -} - -impl Class { - /// Return the span of this character class. - pub fn span(&self) -> &Span { - match *self { - Class::Perl(ref x) => &x.span, - Class::Unicode(ref x) => &x.span, - Class::Bracketed(ref x) => &x.span, - } - } -} - /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] @@ -1610,8 +1604,10 @@ impl Drop for Ast { | AstKind::Literal(_) | AstKind::Dot(_) | AstKind::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | AstKind::Class(_) => return, + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | AstKind::ClassBracketed(_) => return, AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, AstKind::Group(ref x) if !x.ast.has_subexprs() => return, AstKind::Alternation(ref x) if x.asts.is_empty() => return, @@ -1629,8 +1625,11 @@ impl Drop for Ast { | AstKind::Literal(_) | AstKind::Dot(_) | AstKind::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | AstKind::Class(_) => {} + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | AstKind::ClassBracketed(_) => {} AstKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } @@ -1754,4 +1753,42 @@ mod tests { .join() .unwrap(); } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. 
+    #[test]
+    fn ast_size() {
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+        std::dbg!(core::mem::size_of::());
+
+        let max = core::mem::size_of::<usize>();
+        let size = core::mem::size_of::<Ast>();
+        assert!(
+            size <= max,
+            "Ast size of {} bytes is bigger than suggested max {}",
+            size,
+            max
+        );
+
+        let max = 2 * core::mem::size_of::<usize>();
+        let size = core::mem::size_of::<AstKind>();
+        assert!(
+            size <= max,
+            "AstKind size of {} bytes is bigger than suggested max {}",
+            size,
+            max
+        );
+    }
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index b3f04bfdc..a87be0e02 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -56,8 +56,8 @@ impl Primitive {
             Primitive::Literal(lit) => Ast::literal(lit),
             Primitive::Assertion(assert) => Ast::assertion(assert),
             Primitive::Dot(span) => Ast::dot(span),
-            Primitive::Perl(cls) => Ast::class(ast::Class::Perl(cls)),
-            Primitive::Unicode(cls) => Ast::class(ast::Class::Unicode(cls)),
+            Primitive::Perl(cls) => Ast::class_perl(cls),
+            Primitive::Unicode(cls) => Ast::class_unicode(cls),
         }
     }
 
@@ -850,7 +850,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
     fn pop_class(
         &self,
         nested_union: ast::ClassSetUnion,
-    ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
+    ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> {
         assert_eq!(self.char(), ']');
 
         let item = ast::ClassSet::Item(nested_union.into_item());
@@ -882,7 +882,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
             set.span.end = self.pos();
             set.kind = prevset;
             if stack.is_empty() {
-                Ok(Either::Right(ast::Class::Bracketed(set)))
+                Ok(Either::Right(set))
             } else {
                 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
                 Ok(Either::Left(union))
             }
@@ -976,7 +976,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 '|' => concat = self.push_alternate(concat)?,
                 '[' => {
                     let class = self.parse_set_class()?;
-                    concat.asts.push(Ast::class(class));
+                    concat.asts.push(Ast::class_bracketed(class));
                 }
                 '?' => {
                     concat = self.parse_uncounted_repetition(
@@ -1743,7 +1743,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
     /// is successful, then the parser is advanced to the position immediately
     /// following the closing `]`.
     #[inline(never)]
-    fn parse_set_class(&self) -> Result<ast::Class> {
+    fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
         assert_eq!(self.char(), '[');
 
         let mut union =
@@ -2189,12 +2189,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
         let span = match *ast.0 {
             AstKind::Empty(_)
             | AstKind::Flags(_)
             | AstKind::Literal(_)
             | AstKind::Dot(_)
             | AstKind::Assertion(_)
-            | AstKind::Class(ast::Class::Unicode(_))
-            | AstKind::Class(ast::Class::Perl(_)) => {
+            | AstKind::ClassUnicode(_)
+            | AstKind::ClassPerl(_) => {
                 // These are all base cases, so we don't increment depth.
                 return Ok(());
             }
-            AstKind::Class(ast::Class::Bracketed(ref x)) => &x.span,
+            AstKind::ClassBracketed(ref x) => &x.span,
             AstKind::Repetition(ref x) => &x.span,
             AstKind::Group(ref x) => &x.span,
             AstKind::Alternation(ref x) => &x.span,
             AstKind::Concat(ref x) => &x.span,
         };
         self.increment_depth(span)
     }
 
     fn visit_post(&mut self, ast: &Ast) -> Result<()> {
         match *ast.0 {
             AstKind::Empty(_)
             | AstKind::Flags(_)
             | AstKind::Literal(_)
             | AstKind::Dot(_)
             | AstKind::Assertion(_)
-            | AstKind::Class(ast::Class::Unicode(_))
-            | AstKind::Class(ast::Class::Perl(_)) => {
+            | AstKind::ClassUnicode(_)
+            | AstKind::ClassPerl(_) => {
                 // These are all base cases, so we don't decrement depth.
Ok(()) } - AstKind::Class(ast::Class::Bracketed(_)) + AstKind::ClassBracketed(_) | AstKind::Repetition(_) | AstKind::Group(_) | AstKind::Alternation(_) @@ -2606,7 +2606,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2616,7 +2616,7 @@ mod tests { c: 'a', } )), - }))) + })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), @@ -4965,15 +4965,15 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), - }))) + })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4981,11 +4981,11 @@ bar negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), - }))) + })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4993,11 +4993,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5005,11 +5005,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5017,20 +5017,20 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[a]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), - }))) + })) ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5044,11 +5044,11 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5063,44 +5063,44 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), - }))) + })) ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), - 
}))) + })) ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), - }))) + })) ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5108,11 +5108,11 @@ bar negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), - }))) + })) ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5120,11 +5120,11 @@ bar kind: ast::ClassPerlKind::Word, negated: false, })), - }))) + })) ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5139,20 +5139,20 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[a-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), - }))) + })) ); assert_eq!( parser("[a-cx-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5162,11 +5162,11 @@ bar range(span(4..7), 'x', 'z'), ] ), - }))) + })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5184,11 +5184,11 @@ bar ] ), ), - }))) + })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5206,11 +5206,11 @@ bar negated: false, })), ), - }))) + })) ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5222,11 +5222,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5238,11 +5238,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5254,11 +5254,11 @@ bar })), itemset(lit(span(5..6), '^')), ), - }))) + })) ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5270,11 +5270,11 @@ bar })), itemset(lit(span(5..6), '&')), ), - }))) + })) ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: 
span(0..6), negated: false, kind: intersection( @@ -5286,13 +5286,13 @@ bar ), itemset(empty(span(5..5))), ), - }))) + })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5308,20 +5308,20 @@ bar c: '⛄', }, })), - }))) + })) ); assert_eq!( parser(r"[]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), - }))) + })) ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5335,14 +5335,14 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ - Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5352,7 +5352,7 @@ bar c: '[', } )), - })), + }), Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, @@ -5917,11 +5917,11 @@ bar Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), - })), + }), Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, @@ -5935,11 +5935,11 @@ bar Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), - })), + }), Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, @@ -6017,22 +6017,22 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - }))) + })) ); assert_eq!( parser(r"\dz").parse(), Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::class(ast::Class::Perl(ast::ClassPerl { + Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - })), + }), Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index daf6776f2..10ee56c2c 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,27 +80,21 @@ impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast.0 { AstKind::Group(ref x) => self.fmt_group_pre(x), - AstKind::Class(ast::Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_pre(x) - } + AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - use crate::ast::Class; - match *ast.0 { AstKind::Empty(_) => Ok(()), AstKind::Flags(ref x) => self.fmt_set_flags(x), AstKind::Literal(ref x) => self.fmt_literal(x), AstKind::Dot(_) => self.wtr.write_str("."), AstKind::Assertion(ref x) => self.fmt_assertion(x), - AstKind::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - AstKind::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - 
AstKind::Class(Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_post(x) - } + AstKind::ClassPerl(ref x) => self.fmt_class_perl(x), + AstKind::ClassUnicode(ref x) => self.fmt_class_unicode(x), + AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), AstKind::Repetition(ref x) => self.fmt_repetition(x), AstKind::Group(ref x) => self.fmt_group_post(x), AstKind::Alternation(_) => Ok(()), diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 05fdac89c..2bd4b1956 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -264,7 +264,7 @@ impl<'a> HeapVisitor<'a> { visitor: &mut V, ) -> Result>, V::Err> { Ok(match *ast.0 { - AstKind::Class(ast::Class::Bracketed(ref x)) => { + AstKind::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 743218df4..ab3aa93d7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -337,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast.0 { - AstKind::Class(ast::Class::Bracketed(_)) => { + AstKind::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -386,29 +386,27 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Literal(ref x) => { - match self.ast_literal_to_scalar(x)? { - Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err(self - .error(x.span, ErrorKind::UnicodeNotAllowed)); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } + AstKind::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err( + self.error(x.span, ErrorKind::UnicodeNotAllowed) + ); + } + match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), } } - // self.push(HirFrame::Expr(self.hir_literal(x)?)); - } - AstKind::Dot(span) => { - self.push(HirFrame::Expr(self.hir_dot(span)?)); + }, + AstKind::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); } AstKind::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - AstKind::Class(ast::Class::Perl(ref x)) => { + AstKind::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - AstKind::Class(ast::Class::Unicode(ref x)) => { + AstKind::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - AstKind::Class(ast::Class::Bracketed(ref ast)) => { + AstKind::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( From 17d9c1c6c4368fb0a18f88d0698482063931a361 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 16:09:12 -0400 Subject: [PATCH 068/136] syntax: unbox Ast and remove AstKind The AstKind experiment proved unfruitful. 
I think the issue here is that the savings on Vec<Ast> didn't prove to be
enough to offset the extra heap allocation that resulted from the
indirection. This seems to be a sweet spot. It would be nice to get Ast
down below 16 bytes, but it's not clear how to do that (without much
larger changes that I don't feel inclined to pursue).

Fixes #1090
---
 fuzz/fuzz_targets/ast_roundtrip.rs |  21 ++--
 regex-cli/cmd/generate/fowler.rs   |   4 +-
 regex-syntax/src/ast/mod.rs        | 168 ++++++++++++-----------
 regex-syntax/src/ast/parse.rs      |  62 +++++------
 regex-syntax/src/ast/print.rs      |  34 +++---
 regex-syntax/src/ast/visitor.rs    |  18 ++--
 regex-syntax/src/hir/translate.rs  |  44 ++++----
 7 files changed, 161 insertions(+), 190 deletions(-)

diff --git a/fuzz/fuzz_targets/ast_roundtrip.rs b/fuzz/fuzz_targets/ast_roundtrip.rs
index 040b59d63..c35ac962e 100644
--- a/fuzz/fuzz_targets/ast_roundtrip.rs
+++ b/fuzz/fuzz_targets/ast_roundtrip.rs
@@ -3,7 +3,7 @@
 use {
     libfuzzer_sys::{fuzz_target, Corpus},
     regex_syntax::ast::{
-        parse::Parser, visit, Ast, Flag, Group, GroupKind, SetFlags, Visitor,
+        parse::Parser, visit, Ast, Flag, Flags, GroupKind, Visitor,
     },
 };
 
@@ -32,16 +32,17 @@ impl Visitor for VerboseVisitor {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<Self::Output, Self::Err> {
+        let reject_flags = |flags: &Flags| {
+            flags.flag_state(Flag::IgnoreWhitespace).unwrap_or(false)
+        };
         match ast {
-            Ast::Flags(SetFlags { flags, .. })
-            | Ast::Group(Group {
-                kind: GroupKind::NonCapturing(flags), ..
-            }) if flags
-                .flag_state(Flag::IgnoreWhitespace)
-                .unwrap_or(false) =>
-            {
-                Err(())
-            }
+            Ast::Flags(x) if reject_flags(&x.flags) => return Err(()),
+            Ast::Group(x) => match x.kind {
+                GroupKind::NonCapturing(ref flags) if reject_flags(flags) => {
+                    return Err(())
+                }
+                _ => Ok(()),
+            },
             _ => Ok(()),
         }
     }
diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs
index c0ab1b361..c287f6f52 100644
--- a/regex-cli/cmd/generate/fowler.rs
+++ b/regex-cli/cmd/generate/fowler.rs
@@ -404,7 +404,9 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize {
         | Ast::Literal(_)
         | Ast::Dot(_)
         | Ast::Assertion(_)
-        | Ast::Class(_) => 0,
+        | Ast::ClassUnicode(_)
+        | Ast::ClassPerl(_)
+        | Ast::ClassBracketed(_) => 0,
         Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast),
         Ast::Group(ref group) => {
             let this = if group.is_capturing() { 1 } else { 0 };
diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index c346abcb6..9e0f92606 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -429,19 +429,9 @@ pub struct Comment {
 ///
 /// This type defines its own destructor that uses constant stack space and
 /// heap space proportional to the size of the `Ast`.
-///
-/// This type boxes the actual kind of the AST element so that an `Ast` value
-/// itself has a very small size. This in turn makes things like `Vec<Ast>` use
-/// a lot less memory than it might otherwise, which is particularly beneficial
-/// for representing long concatenations or alternations.
-#[derive(Clone, Debug, Eq, PartialEq)]
-#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub struct Ast(pub Box<AstKind>);
-
-/// The kind of an abstract syntax element.
 #[derive(Clone, Debug, Eq, PartialEq)]
 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub enum AstKind {
+pub enum Ast {
     /// An empty regex that matches everything.
     Empty(Box<Span>),
     /// A set of flags, e.g., `(?is)`.
     Flags(Box<SetFlags>),
     /// A single character literal, which includes escape sequences.
     Literal(Box<Literal>),
     /// The "any character" class.
     Dot(Box<Span>),
     /// A single zero-width assertion.
     Assertion(Box<Assertion>),
@@ -473,86 +463,86 @@ impl Ast {
     /// Create an "empty" AST item.
pub fn empty(span: Span) -> Ast { - Ast(Box::new(AstKind::Empty(Box::new(span)))) + Ast::Empty(Box::new(span)) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { - Ast(Box::new(AstKind::Flags(Box::new(e)))) + Ast::Flags(Box::new(e)) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { - Ast(Box::new(AstKind::Literal(Box::new(e)))) + Ast::Literal(Box::new(e)) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { - Ast(Box::new(AstKind::Dot(Box::new(span)))) + Ast::Dot(Box::new(span)) } /// Create a "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { - Ast(Box::new(AstKind::Assertion(Box::new(e)))) + Ast::Assertion(Box::new(e)) } /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { - Ast(Box::new(AstKind::ClassUnicode(Box::new(e)))) + Ast::ClassUnicode(Box::new(e)) } /// Create a "Perl class" AST item. pub fn class_perl(e: ClassPerl) -> Ast { - Ast(Box::new(AstKind::ClassPerl(Box::new(e)))) + Ast::ClassPerl(Box::new(e)) } /// Create a "bracketed class" AST item. pub fn class_bracketed(e: ClassBracketed) -> Ast { - Ast(Box::new(AstKind::ClassBracketed(Box::new(e)))) + Ast::ClassBracketed(Box::new(e)) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { - Ast(Box::new(AstKind::Repetition(Box::new(e)))) + Ast::Repetition(Box::new(e)) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { - Ast(Box::new(AstKind::Group(Box::new(e)))) + Ast::Group(Box::new(e)) } /// Create a "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { - Ast(Box::new(AstKind::Alternation(Box::new(e)))) + Ast::Alternation(Box::new(e)) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { - Ast(Box::new(AstKind::Concat(Box::new(e)))) + Ast::Concat(Box::new(e)) } /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { - match *self.0 { - AstKind::Empty(ref span) => span, - AstKind::Flags(ref x) => &x.span, - AstKind::Literal(ref x) => &x.span, - AstKind::Dot(ref span) => span, - AstKind::Assertion(ref x) => &x.span, - AstKind::ClassUnicode(ref x) => &x.span, - AstKind::ClassPerl(ref x) => &x.span, - AstKind::ClassBracketed(ref x) => &x.span, - AstKind::Repetition(ref x) => &x.span, - AstKind::Group(ref x) => &x.span, - AstKind::Alternation(ref x) => &x.span, - AstKind::Concat(ref x) => &x.span, + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { - match *self.0 { - AstKind::Empty(_) => true, + match *self { + Ast::Empty(_) => true, _ => false, } } @@ -560,19 +550,19 @@ impl Ast { /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. 
fn has_subexprs(&self) -> bool { - match *self.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) => false, - AstKind::ClassBracketed(_) - | AstKind::Repetition(_) - | AstKind::Group(_) - | AstKind::Alternation(_) - | AstKind::Concat(_) => true, + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, } } } @@ -1598,20 +1588,20 @@ impl Drop for Ast { fn drop(&mut self) { use core::mem; - match *self.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) // Bracketed classes are recursive, they get their own Drop impl. - | AstKind::ClassBracketed(_) => return, - AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, - AstKind::Group(ref x) if !x.ast.has_subexprs() => return, - AstKind::Alternation(ref x) if x.asts.is_empty() => return, - AstKind::Concat(ref x) if x.asts.is_empty() => return, + | Ast::ClassBracketed(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} } @@ -1619,27 +1609,27 @@ impl Drop for Ast { let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { - match *ast.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) // Bracketed classes are recursive, so they get their own Drop // impl. - | AstKind::ClassBracketed(_) => {} - AstKind::Repetition(ref mut x) => { + | Ast::ClassBracketed(_) => {} + Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - AstKind::Group(ref mut x) => { + Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - AstKind::Alternation(ref mut x) => { + Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } - AstKind::Concat(ref mut x) => { + Ast::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } @@ -1760,20 +1750,7 @@ mod tests { // 64-bit target. Wow. 
     #[test]
     fn ast_size() {
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-        std::dbg!(core::mem::size_of::());
-
-        let max = core::mem::size_of::<usize>();
+        let max = 2 * core::mem::size_of::<usize>();
         let size = core::mem::size_of::<Ast>();
         assert!(
             size <= max,
             "Ast size of {} bytes is bigger than suggested max {}",
             size,
             max
         );
-
-        let max = 2 * core::mem::size_of::<usize>();
-        let size = core::mem::size_of::<AstKind>();
-        assert!(
-            size <= max,
-            "AstKind size of {} bytes is bigger than suggested max {}",
-            size,
-            max
-        );
     }
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index a87be0e02..f7bae7759 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -16,7 +16,7 @@ use alloc::{
     vec,
     vec::Vec,
 };
 
 use crate::{
-    ast::{self, Ast, AstKind, Position, Span},
+    ast::{self, Ast, Position, Span},
     either::Either,
     is_escapeable_character, is_meta_character,
 };
@@ -1044,8 +1044,8 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 )
             }
         };
-        match *ast.0 {
-            AstKind::Empty(_) | AstKind::Flags(_) => {
+        match ast {
+            Ast::Empty(_) | Ast::Flags(_) => {
                 return Err(
                     self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                 )
             }
@@ -1096,8 +1096,8 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 )
             }
         };
-        match *ast.0 {
-            AstKind::Empty(_) | AstKind::Flags(_) => {
+        match ast {
+            Ast::Empty(_) | Ast::Flags(_) => {
                 return Err(
                     self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                 )
            }
@@ -2183,43 +2183,43 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
-        let span = match *ast.0 {
-            AstKind::Empty(_)
-            | AstKind::Flags(_)
-            | AstKind::Literal(_)
-            | AstKind::Dot(_)
-            | AstKind::Assertion(_)
-            | AstKind::ClassUnicode(_)
-            | AstKind::ClassPerl(_) => {
+        let span = match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {
                 // These are all base cases, so we don't increment depth.
                 return Ok(());
             }
-            AstKind::ClassBracketed(ref x) => &x.span,
-            AstKind::Repetition(ref x) => &x.span,
-            AstKind::Group(ref x) => &x.span,
-            AstKind::Alternation(ref x) => &x.span,
-            AstKind::Concat(ref x) => &x.span,
+            Ast::ClassBracketed(ref x) => &x.span,
+            Ast::Repetition(ref x) => &x.span,
+            Ast::Group(ref x) => &x.span,
+            Ast::Alternation(ref x) => &x.span,
+            Ast::Concat(ref x) => &x.span,
         };
         self.increment_depth(span)
     }
 
     fn visit_post(&mut self, ast: &Ast) -> Result<()> {
-        match *ast.0 {
-            AstKind::Empty(_)
-            | AstKind::Flags(_)
-            | AstKind::Literal(_)
-            | AstKind::Dot(_)
-            | AstKind::Assertion(_)
-            | AstKind::ClassUnicode(_)
-            | AstKind::ClassPerl(_) => {
+        match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {
                 // These are all base cases, so we don't decrement depth.
Ok(()) } - AstKind::ClassBracketed(_) - | AstKind::Repetition(_) - | AstKind::Group(_) - | AstKind::Alternation(_) - | AstKind::Concat(_) => { + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => { self.decrement_depth(); Ok(()) } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 10ee56c2c..7dedf7f48 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -7,7 +7,7 @@ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, - Ast, AstKind, + Ast, }; /// A builder for constructing a printer. @@ -78,27 +78,27 @@ impl Visitor for Writer { } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { - match *ast.0 { - AstKind::Group(ref x) => self.fmt_group_pre(x), - AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - match *ast.0 { - AstKind::Empty(_) => Ok(()), - AstKind::Flags(ref x) => self.fmt_set_flags(x), - AstKind::Literal(ref x) => self.fmt_literal(x), - AstKind::Dot(_) => self.wtr.write_str("."), - AstKind::Assertion(ref x) => self.fmt_assertion(x), - AstKind::ClassPerl(ref x) => self.fmt_class_perl(x), - AstKind::ClassUnicode(ref x) => self.fmt_class_unicode(x), - AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), - AstKind::Repetition(ref x) => self.fmt_repetition(x), - AstKind::Group(ref x) => self.fmt_group_post(x), - AstKind::Alternation(_) => Ok(()), - AstKind::Concat(_) => Ok(()), + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 2bd4b1956..c1bb24d97 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,6 +1,6 @@ use alloc::{vec, vec::Vec}; -use crate::ast::{self, Ast, AstKind}; +use crate::ast::{self, Ast}; /// A trait for visiting an abstract syntax tree (AST) in depth first order. /// @@ -263,19 +263,19 @@ impl<'a> HeapVisitor<'a> { ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { - Ok(match *ast.0 { - AstKind::ClassBracketed(ref x) => { + Ok(match *ast { + Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } - AstKind::Repetition(ref x) => Some(Frame::Repetition(x)), - AstKind::Group(ref x) => Some(Frame::Group(x)), - AstKind::Concat(ref x) if x.asts.is_empty() => None, - AstKind::Concat(ref x) => { + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] 
}) } - AstKind::Alternation(ref x) if x.asts.is_empty() => None, - AstKind::Alternation(ref x) => Some(Frame::Alternation { + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index ab3aa93d7..56d261aa1 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -7,7 +7,7 @@ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ - ast::{self, Ast, AstKind, Span, Visitor}, + ast::{self, Ast, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, @@ -336,8 +336,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - match *ast.0 { - AstKind::ClassBracketed(_) => { + match *ast { + Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -346,20 +346,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } - AstKind::Repetition(_) => self.push(HirFrame::Repetition), - AstKind::Group(ref x) => { + Ast::Repetition(_) => self.push(HirFrame::Repetition), + Ast::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - AstKind::Concat(ref x) if x.asts.is_empty() => {} - AstKind::Concat(_) => { + Ast::Concat(ref x) if x.asts.is_empty() => {} + Ast::Concat(_) => { self.push(HirFrame::Concat); } - AstKind::Alternation(ref x) if x.asts.is_empty() => {} - AstKind::Alternation(_) => { + Ast::Alternation(ref x) if x.asts.is_empty() => {} + Ast::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } @@ -369,11 +369,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast.0 { - AstKind::Empty(_) => { + match *ast { + Ast::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Flags(ref x) => { + Ast::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in @@ -386,7 +386,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? 
{ Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { if !self.flags().unicode() && ch.len_utf8() > 1 { @@ -400,13 +400,13 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } } }, - AstKind::Dot(ref span) => { + Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); } - AstKind::Assertion(ref x) => { + Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - AstKind::ClassPerl(ref x) => { + Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -417,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - AstKind::ClassUnicode(ref x) => { + Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - AstKind::ClassBracketed(ref ast) => { + Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -442,18 +442,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(expr)); } } - AstKind::Repetition(ref x) => { + Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } - AstKind::Group(ref x) => { + Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - AstKind::Concat(_) => { + Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { @@ -463,7 +463,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } - AstKind::Alternation(_) => { + Ast::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); From 536cf701ade853afd2a7a541684485f24491be91 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:34:42 -0400 Subject: [PATCH 069/136] syntax: remove guarantees in the HIR related to 'u' flag Basically, we never should have guaranteed that a particular HIR would (or wouldn't) be used if the 'u' flag was present (or absent). Such a guarantee generally results in too little flexibility, particularly when it comes to HIR's smart constructors. We could probably uphold that guarantee, but it's somewhat gnarly to do and would require rejiggering some of the HIR types. For example, we would probably need a literal that is an enum of `&str` or `&[u8]` that correctly preserves the Unicode flag. This in turn comes with a bigger complexity cost in various rewriting rules. In general, it's much simpler to require the caller to be prepared for any kind of HIR regardless of what the flags are. I feel somewhat justified in this position due to the fact that part of the point of the HIR is to erase all of the regex flags so that callers no longer need to worry about them. That is, the erasure is the point that provides a simplification for everyone downstream. 
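To make the point concrete, here is a small sketch of what callers
should now be prepared for. (This example is not part of the patch
itself; it assumes the top-level `regex_syntax::parse` convenience
function.)

```
use regex_syntax::parse;

fn main() {
    // Unicode mode is disabled in the pattern, but the resulting HIR
    // may still use the Unicode class variant (e.g., something
    // equivalent to [a\u00A0]). No particular variant is guaranteed,
    // so downstream code must handle both.
    let hir = parse(r"(?-u:a|\xc2\xa0)").unwrap();
    println!("{:?}", hir);
}
```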
Closes #1088
---
 CHANGELOG.md                |  3 +++
 regex-syntax/src/hir/mod.rs | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a474af1b..5b88d9e80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@ TBD
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
 Fix a bug that could result in incorrect match spans when using a Unicode word
 boundary and searching non-ASCII strings.
+* [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088):
+Remove guarantees in the API that connect the `u` flag with a specific HIR
+representation.
 
 
 1.9.6 (2023-09-30)
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 6c1d2745e..f8a3d4a9e 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -797,13 +797,18 @@ impl core::fmt::Debug for Literal {
 /// The high-level intermediate representation of a character class.
 ///
 /// A character class corresponds to a set of characters. A character is either
-/// defined by a Unicode scalar value or a byte. Unicode characters are used
-/// by default, while bytes are used when Unicode mode (via the `u` flag) is
-/// disabled.
+/// defined by a Unicode scalar value or a byte.
 ///
 /// A character class, regardless of its character type, is represented by a
 /// sequence of non-overlapping non-adjacent ranges of characters.
 ///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variant is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
 /// Note that `Bytes` variant may be produced even when it exclusively matches
 /// valid UTF-8. This is because a `Bytes` variant represents an intention by
 /// the author of the regular expression to disable Unicode mode, which in turn
@@ -1326,8 +1331,9 @@ impl ClassUnicodeRange {
     }
 }
 
-/// A set of characters represented by arbitrary bytes (where one byte
-/// corresponds to one character).
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct ClassBytes {
     set: IntervalSet<ClassBytesRange>,

From 7a7ce8348f9cb74aac7c0a8f9836e76620ef035e Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 6 Oct 2023 11:39:14 -0400
Subject: [PATCH 070/136] automata: rejigger DFA start state computation

It turns out that requiring callers to provide an `Input` (and thus a
`&[u8]` haystack) is a bit onerous for all cases. Namely, part of the
point of `regex-automata` was to expose enough guts to make it
tractable to write a streaming regex engine. A streaming regex engine,
especially one that does a byte-at-a-time loop, is somewhat
antithetical to having a haystack in a single `&[u8]` slice. This made
computing start states possible but very awkward and quite unclear in
terms of what the implementation would actually do with the haystack.

This commit fixes that by exposing a lower level `start_state` method
on both of the DFAs that can be called without materializing an
`Input`. Instead, callers must create a new `start::Config` value which
provides all of the information necessary for the DFA to compute the
correct start state. This in turn also exposes the `crate::util::start`
module.
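For example, a byte-at-a-time streaming search might compute its start
state roughly like this. (This is a sketch against the new API rather
than code from this patch, and it assumes the `syntax` and `dfa-build`
features for `dense::DFA::new`.)

```
use regex_automata::{
    dfa::{dense, Automaton},
    util::start,
    Anchored,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new(r"(?m)^[a-z]+$")?;
    // An unanchored search where the byte immediately preceding the
    // current position is a line terminator. A streaming caller can
    // track this one byte of look-behind without keeping the whole
    // haystack in memory.
    let config = start::Config::new()
        .anchored(Anchored::No)
        .look_behind(Some(b'\n'));
    let sid = dfa.start_state(&config)?;
    assert!(!dfa.is_dead_state(sid));
    Ok(())
}
```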
This is ultimately a breaking change because it adds a new required method to the `Automaton` trait. It also makes `start_state_forward` and `start_state_reverse` optional. It isn't really expected for callers to implement the `Automaton` trait themselves (and perhaps I will seal it so we can do such changes in the future without it being breaking), but still, this is technically breaking. Callers using `start_state_forward` or `start_state_reverse` with either DFA remain unchanged and unaffected. Closes #1031 --- CHANGELOG.md | 7 + regex-automata/src/dfa/automaton.rs | 188 ++++++++++++++++++--- regex-automata/src/dfa/dense.rs | 95 +++++------ regex-automata/src/dfa/mod.rs | 2 +- regex-automata/src/dfa/sparse.rs | 59 +++---- regex-automata/src/hybrid/dfa.rs | 179 +++++++++++--------- regex-automata/src/hybrid/error.rs | 115 ++++++++++++- regex-automata/src/hybrid/mod.rs | 2 +- regex-automata/src/util/mod.rs | 2 +- regex-automata/src/util/start.rs | 243 ++++++++++++++++++++++++---- 10 files changed, 662 insertions(+), 230 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b88d9e80..265f5cd48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ TBD === +New features: + +* [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031): +DFAs now have a `start_state` method that doesn't use an `Input`. + +Bug fixes: + * [BUG #1046](https://github.com/rust-lang/regex/issues/1046): Fix a bug that could result in incorrect match spans when using a Unicode word boundary and searching non-ASCII strings. diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a15..cd597947e 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -7,6 +7,7 @@ use crate::{ prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, + start, }, }; @@ -226,8 +227,8 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this DFA for the given starting + /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -235,12 +236,41 @@ pub unsafe trait Automaton { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. 
+    /// Although, as a convenience, if you have an [`Input`], then it may
+    /// be more succinct to use [`Automaton::start_state_forward`] or
+    /// [`Automaton::start_state_reverse`]. Note, for example, that the
+    /// convenience routines return a [`MatchError`] on failure, whereas this
+    /// routine returns a [`StartError`].
+    ///
+    /// # Errors
+    ///
+    /// This may return a [`StartError`] if the search needs to give up when
+    /// determining the start state (for example, if it sees a "quit" byte).
+    /// This can also return an error if the given configuration contains an
+    /// unsupported [`Anchored`] configuration.
+    fn start_state(
+        &self,
+        config: &start::Config,
+    ) -> Result<StateID, StartError>;
+
+    /// Return the ID of the start state for this DFA when executing a forward
+    /// search.
+    ///
+    /// This is a convenience routine for calling [`Automaton::start_state`]
+    /// that converts the given [`Input`] to a [start
+    /// configuration](start::Config). Additionally, if an error occurs, it is
+    /// converted from a [`StartError`] to a [`MatchError`] using the offset
+    /// information in the given [`Input`].
     ///
     /// # Errors
     ///
@@ -251,23 +281,30 @@ pub unsafe trait Automaton {
     fn start_state_forward(
         &self,
         input: &Input<'_>,
-    ) -> Result<StateID, MatchError>;
+    ) -> Result<StateID, MatchError> {
+        let config = start::Config::from_input_forward(input);
+        self.start_state(&config).map_err(|err| match err {
+            StartError::Quit { byte } => {
+                let offset = input
+                    .start()
+                    .checked_sub(1)
+                    .expect("no quit in start without look-behind");
+                MatchError::quit(byte, offset)
+            }
+            StartError::UnsupportedAnchored { mode } => {
+                MatchError::unsupported_anchored(mode)
+            }
+        })
+    }
 
-    /// Return the ID of the start state for this lazy DFA when executing a
-    /// reverse search.
+    /// Return the ID of the start state for this DFA when executing a reverse
+    /// search.
     ///
-    /// Unlike typical DFA implementations, the start state for DFAs in this
-    /// crate is dependent on a few different factors:
-    ///
-    /// * The [`Anchored`] mode of the search. Unanchored, anchored and
-    /// anchored searches for a specific [`PatternID`] all use different start
-    /// states.
-    /// * The position at which the search begins, via [`Input::start`]. This
-    /// and the byte immediately preceding the start of the search (if one
-    /// exists) influence which look-behind assertions are true at the start
-    /// of the search. This in turn influences which start state is selected.
-    /// * Whether the search is a forward or reverse search. This routine can
-    /// only be used for reverse searches.
+    /// This is a convenience routine for calling [`Automaton::start_state`]
+    /// that converts the given [`Input`] to a [start
+    /// configuration](start::Config). Additionally, if an error occurs, it is
+    /// converted from a [`StartError`] to a [`MatchError`] using the offset
+    /// information in the given [`Input`].
/// /// # Errors /// @@ -278,7 +315,18 @@ pub unsafe trait Automaton { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result; + ) -> Result { + let config = start::Config::from_input_reverse(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that @@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).next_eoi_state(current) } + #[inline] + fn start_state( + &self, + config: &start::Config, + ) -> Result { + (**self).start_state(config) + } + #[inline] fn start_state_forward( &self, @@ -2015,6 +2071,90 @@ impl OverlappingState { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either based on +/// incorrect configuration or even based on whether the look-behind byte +/// triggers a quit state. Typically one does not need to handle this error +/// if you're using [`Automaton::start_state_forward`] (or its reverse +/// counterpart), as that routine automatically converts `StartError` to a +/// [`MatchError`] for you. +/// +/// This error may be returned by the [`Automaton::start_state`] routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError {} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. 
/// diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 6da865f97..7af38b546 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -30,7 +30,7 @@ use crate::{ use crate::{ dfa::{ accel::Accels, - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, @@ -40,8 +40,8 @@ use crate::{ int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -2885,31 +2885,33 @@ impl OwnedDFA { fn set_universal_starts(&mut self) { assert_eq!(6, Start::len(), "expected 6 start configurations"); - let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + let start_id = |dfa: &mut OwnedDFA, + anchored: Anchored, + start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. - dfa.st.start(inp, start).expect("valid Input configuration") + dfa.st.start(anchored, start).expect("valid Input configuration") }; if self.start_kind().has_unanchored() { - let inp = Input::new("").anchored(Anchored::No); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::No; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_unanchored = Some(sid); } } if self.start_kind().has_anchored() { - let inp = Input::new("").anchored(Anchored::Yes); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::Yes; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_anchored = Some(sid); } @@ -3216,35 +3218,21 @@ unsafe impl> Automaton for DFA { } #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - 
let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -4180,28 +4168,27 @@ impl> StartTable { #[cfg_attr(feature = "perf-inline", inline(always))] fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; @@ -5086,6 +5073,8 @@ impl core::fmt::Display for BuildError { #[cfg(all(test, feature = "syntax", feature = "dfa-build"))] mod tests { + use crate::{Input, MatchError}; + use super::*; #[test] diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index 4bb870435..fd58cac23 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -320,7 +320,7 @@ dramatically. 
#[cfg(feature = "dfa-search")] pub use crate::dfa::{ - automaton::{Automaton, OverlappingState}, + automaton::{Automaton, OverlappingState, StartError}, start::StartKind, }; diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 5d8ec2340..a5ccf9add 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -52,7 +52,7 @@ use alloc::{vec, vec::Vec}; use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, @@ -63,8 +63,8 @@ use crate::{ int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -1207,35 +1207,21 @@ unsafe impl> Automaton for DFA { } #[inline] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[inline] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[inline] @@ -2145,28 +2131,27 @@ impl> StartTable { /// panics. 
fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result<StateID, MatchError> { + ) -> Result<StateID, StartError> { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 67261c1a3..102cfb6fe 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -13,7 +13,7 @@ use alloc::vec::Vec; use crate::{ hybrid::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::{LazyStateID, LazyStateIDError}, search, }, @@ -28,7 +28,7 @@ use crate::{ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, }, sparse_set::SparseSets, - start::{Start, StartByteMap}, + start::{self, Start, StartByteMap}, }, }; @@ -1518,8 +1518,8 @@ impl DFA { Lazy::new(self, cache).cache_next_state(current, unit) } - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this lazy DFA for the given + /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches, + /// although, as a convenience, if you have an [`Input`], it + /// may be more succinct to use [`DFA::start_state_forward`] or + /// [`DFA::start_state_reverse`]. Note, for example, that the convenience + /// routines return a [`MatchError`] on failure, whereas this routine + /// returns a [`StartError`]. + /// + /// # Errors + /// + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte + /// or if the cache has become inefficient). 
This can also return an + /// error if the given configuration contains an unsupported [`Anchored`] + /// configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state( + &self, + cache: &mut Cache, + config: &start::Config, + ) -> Result { + let lazy = LazyRef::new(self, cache); + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.start_map.get(byte) + } + }; + let start_id = lazy.get_cached_start_id(anchored, start)?; + if !start_id.is_unknown() { + return Ok(start_id); + } + Lazy::new(self, cache).cache_start_group(anchored, start) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_forward(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.start()), + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. 
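To make the split between `StartError` and `MatchError` concrete, here is a minimal usage sketch of driving the lazy DFA's new `start_state` entry point directly with a `start::Config` (an editorial illustration, not code from this patch; it assumes the `syntax` feature for `DFA::new`, and the pattern and handling logic are made up):

    use regex_automata::{
        hybrid::{dfa::DFA, StartError},
        util::start,
        Anchored,
    };

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let dfa = DFA::new(r"[a-z]+")?;
        let mut cache = dfa.create_cache();
        // No look-behind byte is set, so the search is treated as
        // logically beginning at the start of a haystack.
        let config = start::Config::new().anchored(Anchored::Yes);
        match dfa.start_state(&mut cache, &config) {
            // A usable start state: begin stepping the DFA from here.
            Ok(sid) => assert!(!sid.is_unknown()),
            // The look-behind byte was in the DFA's quit set.
            Err(StartError::Quit { byte }) => println!("quit byte: {:?}", byte),
            // The cache was too inefficient to make progress.
            Err(StartError::Cache { .. }) => println!("cache gave up"),
            // E.g., an unsupported `Anchored` mode. The enum is
            // non-exhaustive, so a catch-all arm is required.
            Err(err) => println!("start state failed: {}", err),
        }
        Ok(())
    }

The convenience wrappers shown above perform exactly this kind of match themselves, converting each `StartError` variant into a `MatchError` with the offsets taken from the `Input`.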
This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2122,16 +2159,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2178,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2610,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. 
fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2678,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result<LazyStateID, MatchError> { + ) -> Result<LazyStateID, StartError> { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index 604daf3c3..d134e7ec9 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons: the configuration +/// may be incorrect, or the look-behind byte may trigger a quit state. +/// Typically you do not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache efficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. 
+ mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/regex-automata/src/hybrid/mod.rs b/regex-automata/src/hybrid/mod.rs index 44e67e129..2feb839d1 100644 --- a/regex-automata/src/hybrid/mod.rs +++ b/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index bb739df1d..b3eef64e6 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -40,6 +40,7 @@ pub mod look; pub mod pool; pub mod prefilter; pub mod primitives; +pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; @@ -52,6 +53,5 @@ pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; -pub(crate) mod start; pub(crate) mod unicode_data; pub(crate) mod utf8; diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083..f2d1922c9 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -1,17 +1,195 @@ /*! -Provides some helpers for dealing with start state configurations in DFAs. - -[`Start`] represents the possible starting configurations, while -[`StartByteMap`] represents a way to retrieve the `Start` configuration for a -given position in a haystack. +Provides helpers for dealing with start state configurations in DFAs. 
*/ use crate::util::{ look::LookMatcher, - search::Input, + search::{Anchored, Input}, wire::{self, DeserializeError, SerializeError}, }; +/// The configuration used to determine a DFA's start state for a search. +/// +/// A DFA has a single starting state in the typical textbook description. That +/// is, it corresponds to the set of all starting states for the NFA that built +/// it, along with their epsilon closures. In this crate, however, DFAs have +/// many possible start states due to a few factors: +/// +/// * DFAs support the ability to run either anchored or unanchored searches. +/// Each type of search needs its own start state. For example, an unanchored +/// search requires starting at a state corresponding to a regex with a +/// `(?s-u:.)*?` prefix, which will match through anything. +/// * DFAs also optionally support starting an anchored search for any one +/// specific pattern. Each such pattern requires its own start state. +/// * If a look-behind assertion like `^` or `\b` is used in the regex, then +/// the DFA will need to inspect a single byte immediately before the start of +/// the search to choose the correct start state. +/// +/// Indeed, this configuration precisely encapsulates all of the above factors. +/// The [`Config::anchored`] method sets which kind of anchored search to +/// perform while the [`Config::look_behind`] method provides a way to set +/// the byte that occurs immediately before the start of the search. +/// +/// Generally speaking, this type is only useful when you want to run searches +/// without using an [`Input`](crate::Input). In particular, an `Input` wants a +/// haystack slice, but callers may not have a contiguous sequence of bytes as +/// a haystack in all cases. This type provides a lower level of control such +/// that callers can provide their own anchored configuration and look-behind +/// byte explicitly. +/// +/// # Example +/// +/// This shows basic usage that permits running a search with a DFA without +/// using the `Input` abstraction. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter() { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This example shows how to correctly run a search that doesn't begin at +/// the start of a haystack. Notice how we set the look-behind byte, and as +/// a result, the `\b` assertion does not match. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new() +/// .anchored(Anchored::Yes) +/// .look_behind(Some(b'q')); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // No match! +/// assert!(!dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// If we had instead not set a look-behind byte, then the DFA would assume +/// that it was starting at the beginning of the haystack, and thus `\b` should +/// match. 
This in turn would result in erroneously reporting a match: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// // Whoops, forgot the look-behind byte... +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // And now we get a match unexpectedly. +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + look_behind: Option, + anchored: Anchored, +} + +impl Config { + /// Create a new default start configuration. + /// + /// The default is an unanchored search that starts at the beginning of the + /// haystack. + pub fn new() -> Config { + Config { anchored: Anchored::No, look_behind: None } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a forward search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// preceding the start of the search. If the start of the search is at + /// offset `0`, then no look-behind byte is set. + pub fn from_input_forward(input: &Input<'_>) -> Config { + let look_behind = input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i).copied()); + Config { look_behind, anchored: input.get_anchored() } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a reverse search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// following the end of the search. If the end of the search is at + /// offset `haystack.len()`, then no look-behind byte is set. + pub fn from_input_reverse(input: &Input<'_>) -> Config { + let look_behind = input.haystack().get(input.end()).copied(); + Config { look_behind, anchored: input.get_anchored() } + } + + /// Set the look-behind byte at the start of a search. + /// + /// Unless the search is intended to logically start at the beginning of a + /// haystack, this should _always_ be set to the byte immediately preceding + /// the start of the search. If no look-behind byte is set, then the start + /// configuration will assume it is at the beginning of the haystack. For + /// example, the anchor `^` will match. + /// + /// The default is that no look-behind byte is set. + pub fn look_behind(mut self, byte: Option) -> Config { + self.look_behind = byte; + self + } + + /// Set the anchored mode of a search. + /// + /// The default is an unanchored search. + pub fn anchored(mut self, mode: Anchored) -> Config { + self.anchored = mode; + self + } + + /// Return the look-behind byte in this configuration, if one exists. + pub fn get_look_behind(&self) -> Option { + self.look_behind + } + + /// Return the anchored mode in this configuration. + pub fn get_anchored(&self) -> Anchored { + self.anchored + } +} + /// A map from every possible byte value to its corresponding starting /// configuration. /// @@ -71,30 +249,11 @@ impl StartByteMap { StartByteMap { map } } - /// Return the forward starting configuration for the given `input`. 
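As a quick illustration of the two convenience constructors (a hedged editorial sketch, not part of the patch), `Config::from_input_forward` derives the look-behind byte from the byte just before the search start, and omits it when the search starts at offset `0`:

    use regex_automata::{util::start, Anchored, Input};

    let haystack = "quartz";

    // A forward search over haystack[1..]: the look-behind byte is the
    // byte immediately preceding the start of the search, i.e. b'q'.
    let input = Input::new(haystack).range(1..);
    let config = start::Config::from_input_forward(&input);
    assert_eq!(Some(b'q'), config.get_look_behind());
    assert_eq!(Anchored::No, config.get_anchored());

    // A forward search from offset 0 has no look-behind byte, so
    // assertions like `^` will match.
    let config = start::Config::from_input_forward(&Input::new(haystack));
    assert_eq!(None, config.get_look_behind());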
- #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); From 201e055ef31760cb70893a0faa93a0941fd49c25 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:52:10 -0400 Subject: [PATCH 071/136] automata: fix doc links --- regex-automata/src/dfa/dense.rs | 8 ++-- regex-automata/src/dfa/regex.rs | 2 +- regex-automata/src/dfa/sparse.rs | 75 +++++++++++++----------------- regex-automata/src/hybrid/dfa.rs | 16 +++---- regex-automata/src/hybrid/regex.rs | 2 +- regex-automata/src/util/start.rs | 6 +-- 6 files changed, 50 insertions(+), 59 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 7af38b546..25dcac989 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -66,8 +66,9 @@ const VERSION: u32 = 2; /// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if -/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by -/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. 
+/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is +/// not by default) and an [`Anchored::Pattern`] mode is requested via +/// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { @@ -113,8 +114,7 @@ impl Config { /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. /// - /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for - /// an example. + /// See [`Automaton::accelerator`] for an example. /// /// This is enabled by default. pub fn accelerate(mut self, yes: bool) -> Config { diff --git a/regex-automata/src/dfa/regex.rs b/regex-automata/src/dfa/regex.rs index f39c1c055..5e7e6e38a 100644 --- a/regex-automata/src/dfa/regex.rs +++ b/regex-automata/src/dfa/regex.rs @@ -853,7 +853,7 @@ impl Builder { } /// Set the dense DFA compilation configuration for this builder using - /// [`dense::Config`](dense::Config). + /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index a5ccf9add..7862d48a2 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). -Unlike the [`dense`](super::dense) module, this module does not contain a -builder or configuration specific for sparse DFAs. Instead, the intended -way to build a sparse DFA is either by using a default configuration with -its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the -construction of a dense DFA with [`dense::Builder`](super::dense::Builder) -and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For -example, this configures a sparse DFA to do an overlapping search: +Unlike the [`dense`] module, this module does not contain a builder or +configuration specific for sparse DFAs. Instead, the intended way to build a +sparse DFA is either by using a default configuration with its constructor +[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a +dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. +For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ @@ -74,18 +73,17 @@ const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// -/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses -/// a more space efficient representation for its transitions. Consequently, -/// sparse DFAs may use much less memory than dense DFAs, but this comes at a -/// price. In particular, reading the more space efficient transitions takes -/// more work, and consequently, searching using a sparse DFA is typically -/// slower than a dense DFA. +/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient +/// representation for its transitions. Consequently, sparse DFAs may use much +/// less memory than dense DFAs, but this comes at a price. In particular, +/// reading the more space efficient transitions takes more work, and +/// consequently, searching using a sparse DFA is typically slower than a dense +/// DFA. /// /// A sparse DFA can be built using the default configuration via the -/// [`DFA::new`] constructor. 
Otherwise, one can configure various aspects -/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), -/// and then convert a dense DFA to a sparse DFA using -/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a +/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse +/// DFA using [`dense::DFA::to_sparse`]. /// /// In general, a sparse DFA supports all the same search operations as a dense /// DFA. @@ -140,11 +138,9 @@ impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -167,11 +163,9 @@ impl DFA> { /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -511,10 +505,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -553,10 +546,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -595,10 +587,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. 
/// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the /// endianness of the target on which you're compiling the DFA. @@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> { /// /// If any of the above are not true, then an error will be returned. /// - /// Note that unlike deserializing a - /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has - /// no alignment requirements. That is, an alignment of `1` is valid. + /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse + /// DFA has no alignment requirements. That is, an alignment of `1` is + /// valid. /// /// # Panics /// diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 102cfb6fe..9466e1e76 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3213,12 +3213,12 @@ impl Config { /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// - /// When enabling this option, callers _must_ be prepared to handle - /// a [`MatchError`](crate::MatchError) error during search. - /// When using a [`Regex`](crate::hybrid::regex::Regex), this - /// corresponds to using the `try_` suite of methods. Alternatively, - /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::quit`] error will never be returned while searching. + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. Alternatively, if callers can guarantee that + /// their input is ASCII only, then a [`MatchError::quit`] error will never + /// be returned while searching. /// /// This is disabled by default. /// @@ -3304,8 +3304,8 @@ impl Config { /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// - /// When enabling this option, callers _must_ be prepared to handle a - /// [`MatchError`](crate::MatchError) error during search. When using a + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// diff --git a/regex-automata/src/hybrid/regex.rs b/regex-automata/src/hybrid/regex.rs index 75667daf9..b3b1fe317 100644 --- a/regex-automata/src/hybrid/regex.rs +++ b/regex-automata/src/hybrid/regex.rs @@ -878,7 +878,7 @@ impl Builder { } /// Set the lazy DFA compilation configuration for this builder using - /// [`dfa::Config`](dfa::Config). + /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or how the cache should behave. diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index f2d1922c9..27153780e 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -31,9 +31,9 @@ use crate::util::{ /// the byte that occurs immediately before the start of the search. /// /// Generally speaking, this type is only useful when you want to run searches -/// without using an [`Input`](crate::Input). In particular, an `Input` wants a -/// haystack slice, but callers may not have a contiguous sequence of bytes as -/// a haystack in all cases. This type provides a lower level of control such +/// without using an [`Input`]. 
In particular, an `Input` wants a haystack +/// slice, but callers may not have a contiguous sequence of bytes as a +/// haystack in all cases. This type provides a lower level of control such /// that callers can provide their own anchored configuration and look-behind /// byte explicitly. /// From 1c0bf9411b1ae1fc247d87dcb210eb374a014b5c Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Sun, 23 Jul 2023 21:33:41 +0800 Subject: [PATCH 072/136] automata: fix one outdated regex-cli test command Ref #1053 --- regex-automata/src/dfa/dense.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 25dcac989..28b525eb7 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1233,8 +1233,8 @@ impl Builder { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match dense --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quitset.is_empty() { set.add_set(&quitset); } From 9a4e2281a193a47cc396fdff8b813a76a3ed3873 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 12:38:50 -0400 Subject: [PATCH 073/136] automata: fix more out-dated regex-cli commands That should cover all of them. Closes #1053 --- regex-automata/src/dfa/accel.rs | 13 +++++++------ regex-automata/src/dfa/automaton.rs | 2 +- regex-automata/src/dfa/dense.rs | 5 +++-- regex-automata/src/hybrid/dfa.rs | 10 ++++++---- regex-automata/src/hybrid/search.rs | 10 +++++----- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/map.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 6 ++---- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-automata/src/util/look.rs | 4 +++- 10 files changed, 30 insertions(+), 26 deletions(-) diff --git a/regex-automata/src/dfa/accel.rs b/regex-automata/src/dfa/accel.rs index 5ea2423dd..c0ba18ea8 100644 --- a/regex-automata/src/dfa/accel.rs +++ b/regex-automata/src/dfa/accel.rs @@ -6,15 +6,16 @@ // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its // DFA with regex-cli: // -// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC -// dense::DFA( +// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table // D 000000: // Q 000001: // *000002: -// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 -// >000004: \x00-` => 3, a => 4, b-\xFF => 3 -// 000005: \x00-\xFF => 2, EOI => 2 -// ) +// A 000003: \x00-` => 3, a => 8, b-\xFF => 3 +// A 000004: \x00-` => 4, a => 7, b-\xFF => 4 +// 000005: \x00-` => 4, b-\xFF => 4 +// 000006: \x00-` => 3, a => 6, b-\xFF => 3 +// 000007: \x00-\xFF => 2, EOI => 2 +// 000008: \x00-\xFF => 2, EOI => 2 // // In particular, state 3 is accelerated (shown via the 'A' indicator) since // the only way to leave that state once entered is to see an 'a' byte. If diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index cd597947e..fcfcf2997 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -1132,7 +1132,7 @@ pub unsafe trait Automaton { /// // implementation defined. /// // /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. - /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`. 
/// let id = StateID::new(3 * dfa.stride()).unwrap(); /// let accelerator = dfa.accelerator(id); /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 28b525eb7..c9fe3b381 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1228,8 +1228,9 @@ impl Builder { } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other - // bytes. Otherwise, a non-quit byte may end up in the same class - // as a quit byte, and thus cause the DFA stop when it shouldn't. + // bytes. Otherwise, a non-quit byte may end up in the same + // class as a quit byte, and thus cause the DFA to stop when it + // shouldn't. // // Test case: // diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 9466e1e76..bd9179b19 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -2103,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -3830,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index f23283685..1f4a505db 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 065e9ef27..a188017d8 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1466,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. 
Here's an example to try: // - // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c36ce5386..c92d4c0b8 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 2108fa338..1f57f8ebd 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f..75c9b796b 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index a34ea1d75..81b4eb718 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -1024,7 +1024,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError { // There are perhaps other choices as well. Why did I stop at these 4? Because // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA // approach eventually, as the benefits of the DFA approach are somewhat -// compelling. The 'boundary-words-holmes' benchmark tests this: +// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that +// the commands below no longer work. If necessary, we should re-capitulate +// the benchmark from whole cloth in rebar.) 
// // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv // From 6d2b09ed6fbc136cca007ce0c57ec9cbae16f3b4 Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Fri, 21 Jul 2023 20:32:37 +0800 Subject: [PATCH 074/136] syntax: optimize most of the IntervalSet routines This reduces or eliminates allocation when combining Unicode classes and should make some things faster. It's unlikely for these optimizations to matter much in practice, but they are likely to help in niche or pathological cases where there are a lot of ops in a class. Closes #1051 --- regex-syntax/src/hir/interval.rs | 282 ++++++++++++++++++++----------- 1 file changed, 185 insertions(+), 97 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e063390a8..e3051bf31 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -19,7 +19,7 @@ use crate::unicode; // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. -// In many cases, we do use linear extra memory, but it is at most 2x and it +// In some cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite @@ -81,14 +81,45 @@ impl IntervalSet { /// Add a new interval to this set. pub fn push(&mut self, interval: I) { - // TODO: This could be faster. e.g., Push the interval such that - // it preserves canonicalization. - self.ranges.push(interval); - self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; + + if self.ranges.is_empty() { + self.ranges.push(interval); + return; + } + + // Find the first range that is not greater than the new interval. + // This is the first range that could possibly be unioned with the + // new interval. + let mut drain_end = self.ranges.len(); + while drain_end > 0 + && self.ranges[drain_end - 1].lower() > interval.upper() + && !self.ranges[drain_end - 1].is_contiguous(&interval) + { + drain_end -= 1; + } + + // Try to union the new interval with old intervals backwards. + if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval) + { + self.ranges[drain_end - 1] = + self.ranges[drain_end - 1].union(&interval).unwrap(); + for i in (0..drain_end - 1).rev() { + if let Some(union) = + self.ranges[drain_end - 1].union(&self.ranges[i]) + { + self.ranges[drain_end - 1] = union; + } else { + self.ranges.drain(i + 1..drain_end - 1); + break; + } + } + } else { + self.ranges.insert(drain_end, interval); + } } /// Return an iterator over all intervals in this set. @@ -192,34 +223,13 @@ impl IntervalSet { // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // - // The following is basically my Shitty First Draft. Therefore, in - // order to grok it, you probably need to read each line carefully. - // Simplifications are most welcome! - // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. 
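The observable effect of the new `push` is that overlapping and adjacent intervals are merged eagerly, keeping the set canonical without a separate sort-and-merge pass. A hedged sketch of that invariant through `regex-syntax`'s public `ClassUnicode` wrapper, which is backed by `IntervalSet` (illustrative values, not a test from this patch):

    use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange};

    // Overlapping and adjacent pushes are unioned as they are inserted,
    // so the three ranges below collapse to a single canonical range.
    let mut class = ClassUnicode::empty();
    class.push(ClassUnicodeRange::new('a', 'c'));
    class.push(ClassUnicodeRange::new('b', 'e')); // overlaps with a-c
    class.push(ClassUnicodeRange::new('f', 'g')); // adjacent to a-e
    assert_eq!(class.ranges(), &[ClassUnicodeRange::new('a', 'g')]);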
let drain_end = self.ranges.len(); - let (mut a, mut b) = (0, 0); - 'LOOP: while a < drain_end && b < other.ranges.len() { - // Basically, the easy cases are when neither range overlaps with - // each other. If the `b` range is less than our current `a` - // range, then we can skip it and move on. - if other.ranges[b].upper() < self.ranges[a].lower() { - b += 1; - continue; - } - // ... similarly for the `a` range. If it's less than the smallest - // `b` range, then we can add it as-is. - if self.ranges[a].upper() < other.ranges[b].lower() { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; - continue; - } - // Otherwise, we have overlapping ranges. - assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + let mut b = 0; + for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could // yield two ranges and 2) after subtracting a range, it's possible // that future ranges can have an impact. The loop below advances // the `b` ranges until they can't possibly impact the current // range. // // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - let mut range = self.ranges[a]; + self.ranges.push(self.ranges[a]); + // The `b` range can only apply to the current `a` range when it + // does not begin past the end of `a`. while b < other.ranges.len() - && !range.is_intersection_empty(&other.ranges[b]) + && other.ranges[b].lower() <= self.ranges[a].upper() { - let old_range = range; - range = match range.difference(&other.ranges[b]) { - (None, None) => { - // We lost the entire range, so move on to the next - // without adding this one. - a += 1; - continue 'LOOP; + match self.ranges.pop().unwrap().difference(&other.ranges[b]) { + (Some(range1), None) | (None, Some(range1)) => { + self.ranges.push(range1); } - (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - range2 + self.ranges.push(range2); } - }; - // It's possible that the `b` range has more to contribute - // here. In particular, if it is greater than the original - // range, then it might impact the next `a` range *and* it - // has impacted the current `a` range as much as possible, - // so we can quit. We don't bump `b` so that the next `a` - // range can apply it. - if other.ranges[b].upper() > old_range.upper() { - break; + (None, None) => {} } - // Otherwise, the next `b` range might apply to the current + // The next `b` range might apply to the current // `a` range. b += 1; } - self.ranges.push(range); - a += 1; - } - while a < drain_end { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; + // It's possible that the last `b` range has more to + // contribute to the next `a`. We don't bump the last + // `b` so that the next `a` range can apply it. + b = b.saturating_sub(1); } self.ranges.drain(..drain_end); - self.folded = self.folded && other.folded; + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Compute the symmetric difference of the two sets, in place. @@ -282,11 +279,83 @@ impl<I: Interval> IntervalSet<I> { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) { 
- let mut intersection = self.clone(); - intersection.intersect(other); - self.union(other); - self.difference(&intersection); + if self.ranges.is_empty() { + self.ranges.extend(&other.ranges); + self.folded = other.folded; + return; + } + if other.ranges.is_empty() { + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the symmetric difference to the end of this range, and then drain + // it before we're done. + let drain_end = self.ranges.len(); + let mut b = 0; + let mut b_range = Some(other.ranges[b]); + for a in 0..drain_end { + self.ranges.push(self.ranges[a]); + while b_range + .map_or(false, |r| r.lower() <= self.ranges[a].upper()) + { + let (range1, range2) = match self + .ranges + .pop() + .unwrap() + .symmetric_difference(&b_range.as_ref().unwrap()) + { + (Some(range1), None) | (None, Some(range1)) => { + (Some(range1), None) + } + (Some(range1), Some(range2)) => { + (Some(range1), Some(range2)) + } + (None, None) => (None, None), + }; + if let Some(range) = range1 { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + } + if let Some(range) = range2 { + self.ranges.push(range); + } + + b_range = if self.ranges.len() > drain_end + && self.ranges.last().unwrap().upper() + > self.ranges[a].upper() + { + Some(*self.ranges.last().unwrap()) + } else { + b += 1; + other.ranges.get(b).cloned() + }; + } + } + while let Some(range) = b_range { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + b += 1; + b_range = other.ranges.get(b).cloned(); + } + + self.ranges.drain(..drain_end); + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Negate this interval set. @@ -302,28 +371,44 @@ impl IntervalSet { return; } - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the negation to the end of this range, and then drain it before - // we're done. - let drain_end = self.ranges.len(); - // We do checked arithmetic below because of the canonical ordering // invariant. 
         if self.ranges[0].lower() > I::Bound::min_value() {
-            let upper = self.ranges[0].lower().decrement();
-            self.ranges.push(I::create(I::Bound::min_value(), upper));
-        }
-        for i in 1..drain_end {
-            let lower = self.ranges[i - 1].upper().increment();
-            let upper = self.ranges[i].lower().decrement();
-            self.ranges.push(I::create(lower, upper));
-        }
-        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
-            let lower = self.ranges[drain_end - 1].upper().increment();
-            self.ranges.push(I::create(lower, I::Bound::max_value()));
+            let mut pre_upper = self.ranges[0].upper();
+            self.ranges[0] = I::create(
+                I::Bound::min_value(),
+                self.ranges[0].lower().decrement(),
+            );
+            for i in 1..self.ranges.len() {
+                let lower = pre_upper.increment();
+                pre_upper = self.ranges[i].upper();
+                self.ranges[i] =
+                    I::create(lower, self.ranges[i].lower().decrement());
+            }
+            if pre_upper < I::Bound::max_value() {
+                self.ranges.push(I::create(
+                    pre_upper.increment(),
+                    I::Bound::max_value(),
+                ));
+            }
+        } else {
+            for i in 1..self.ranges.len() {
+                self.ranges[i - 1] = I::create(
+                    self.ranges[i - 1].upper().increment(),
+                    self.ranges[i].lower().decrement(),
+                );
+            }
+            if self.ranges.last().unwrap().upper() < I::Bound::max_value() {
+                self.ranges.last_mut().map(|range| {
+                    *range = I::create(
+                        range.upper().increment(),
+                        I::Bound::max_value(),
+                    )
+                });
+            } else {
+                self.ranges.pop();
+            }
         }
-        self.ranges.drain(..drain_end);

         // We don't need to update whether this set is folded or not, because
         // it is conservatively preserved through negation. Namely, if a set
         // is not folded, then it is possible that its negation is folded, for
@@ -337,6 +422,7 @@
         // of case folded characters. Negating it in turn means that all
         // equivalence classes in the set are negated, and any equivalence
         // class that was previously not in the set is now entirely in the set.
+        self.folded = self.ranges.is_empty() || self.folded;
     }

     /// Converts this set into a canonical ordering.
@@ -347,24 +433,20 @@
         self.ranges.sort();
         assert!(!self.ranges.is_empty());

-        // Is there a way to do this in-place with constant memory? I couldn't
-        // figure out a way to do it. So just append the canonicalization to
-        // the end of this range, and then drain it before we're done.
-        let drain_end = self.ranges.len();
-        for oldi in 0..drain_end {
-            // If we've added at least one new range, then check if we can
-            // merge this range in the previously added range.
-            if self.ranges.len() > drain_end {
-                let (last, rest) = self.ranges.split_last_mut().unwrap();
-                if let Some(union) = last.union(&rest[oldi]) {
-                    *last = union;
-                    continue;
-                }
+        // We maintain the canonicalization results in-place at `0..newi`.
+        // `newi` will keep track of the end of the canonicalized ranges.
+        let mut newi = 0;
+        for oldi in 1..self.ranges.len() {
+            // The last new range gets merged with the current old range
+            // when they are unionable. If not, we update `newi` and store
+            // it as a new range.
+            if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) {
+                self.ranges[newi] = union;
+            } else {
+                newi += 1;
+                self.ranges[newi] = self.ranges[oldi];
             }
-            let range = self.ranges[oldi];
-            self.ranges.push(range);
         }
-        self.ranges.drain(..drain_end);
+        self.ranges.truncate(newi + 1);
     }

     /// Returns true if and only if this class is in a canonical ordering.
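The rewritten `canonicalize` above is the clearest instance of the new
approach: rather than appending merged ranges to the tail of the vector
and draining the prefix afterwards, it keeps the merged results at
`0..=newi` and truncates once at the end. A minimal standalone sketch of
the same merge-in-place idea, assuming a simplified inclusive `(u8, u8)`
range type instead of the crate's actual `Interval` trait:

    // Sketch only: merges a list of inclusive ranges in place,
    // unioning ranges that overlap or are adjacent (contiguous).
    fn canonicalize(ranges: &mut Vec<(u8, u8)>) {
        ranges.sort();
        if ranges.is_empty() {
            return;
        }
        // Merged results live at `0..=newi`; everything past `newi` is
        // left-over input that gets truncated at the end.
        let mut newi = 0;
        for oldi in 1..ranges.len() {
            let (lo, hi) = ranges[oldi];
            if lo <= ranges[newi].1.saturating_add(1) {
                // Overlapping or adjacent: union into the last merged range.
                ranges[newi].1 = ranges[newi].1.max(hi);
            } else {
                newi += 1;
                ranges[newi] = (lo, hi);
            }
        }
        ranges.truncate(newi + 1);
    }

This allocates nothing beyond the input vector itself, which is the
property the patch gives `canonicalize` (and approximates, with more
bookkeeping, in `difference` and `symmetric_difference`).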
@@ -486,7 +568,13 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), + None => { + return if self.upper() < other.lower() { + (Some(self.clone()), Some(other.clone())) + } else { + (Some(other.clone()), Some(self.clone())) + } + } Some(union) => union, }; let intersection = match self.intersect(other) { From baf5b1ef29eec3136884a1595bda4833044a9bee Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 09:19:57 -0400 Subject: [PATCH 075/136] syntax and automata: bump LookSet representation from u16 to u32 This is in preparation for adding 8 new word boundary look-around assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}, along with Unicode and ASCII-only variants of each. Ref #469 --- regex-automata/src/dfa/dense.rs | 8 ++-- regex-automata/src/dfa/onepass.rs | 2 +- regex-automata/src/util/determinize/state.rs | 39 ++++++++++---------- regex-automata/src/util/look.rs | 26 +++++++------ regex-automata/tests/hybrid/api.rs | 4 +- regex-syntax/src/hir/mod.rs | 26 +++++++------ 6 files changed, 55 insertions(+), 50 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index c9fe3b381..902f4b273 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -882,20 +882,20 @@ impl Config { /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 600KB isn't enough! + /// // 700KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(600_000)) + /// .determinize_size_limit(Some(700_000)) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 700KB probably is! + /// // ... but 800KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(700_000)) + /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 353bb1e17..e62bbd383 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2815,7 +2815,7 @@ impl Epsilons { /// Return the set of look-around assertions in these epsilon transitions. fn looks(self) -> LookSet { - LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index e64123587..effa6f44d 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -197,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -348,16 +348,17 @@ impl StateBuilderNFA { /// generated by a transition over a "word" byte. (Callers may not always set /// this. For example, if the NFA has no word boundary assertion, then needing /// to track whether a state came from a word byte or not is superfluous and -/// wasteful.) +/// wasteful.) 
Bit 3 is set to 1 if the state was generated by a transition +/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is +/// enabled. /// -/// Byte 1 corresponds to the look-behind assertions that were satisfied by -/// the transition that created this state. This generally only includes the -/// StartLF and Start assertions. (Look-ahead assertions are not tracked as -/// part of states. Instead, these are applied by re-computing the epsilon -/// closure of a state when computing the transition function. See `next` in -/// the parent module.) +/// Bytes 1..5 correspond to the look-behind assertions that were satisfied +/// by the transition that created this state. (Look-ahead assertions are not +/// tracked as part of states. Instead, these are applied by re-computing the +/// epsilon closure of a state when computing the transition function. See +/// `next` in the parent module.) /// -/// Byte 2 corresponds to the set of look-around assertions (including both +/// Bytes 5..9 correspond to the set of look-around assertions (including both /// look-behind and look-ahead) that appear somewhere in this state's set of /// NFA state IDs. This is used to determine whether this state's epsilon /// closure should be re-computed when computing the transition function. @@ -366,7 +367,7 @@ impl StateBuilderNFA { /// function, we should only re-compute the epsilon closure if those new /// assertions are relevant to this particular state. /// -/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer /// corresponding to the number of patterns encoded in this state. If the state /// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is /// PatternID::ZERO, then no integer is encoded at this position. Instead, byte @@ -452,7 +453,7 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::read_repr(&self.0[3..]) + LookSet::read_repr(&self.0[5..]) } /// Returns the total number of match pattern IDs in this state. @@ -476,7 +477,7 @@ impl<'a> Repr<'a> { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 9 + index * PatternID::SIZE; + let offset = 13 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. wire::read_pattern_id_unchecked(&self.0[offset..]).0 @@ -507,7 +508,7 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[9..self.pattern_offset_end()]; + let mut pids = &self.0[13..self.pattern_offset_end()]; while !pids.is_empty() { let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; @@ -539,11 +540,11 @@ impl<'a> Repr<'a> { fn pattern_offset_end(&self) -> usize { let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 5; + return 9; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -557,7 +558,7 @@ impl<'a> Repr<'a> { } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. 
- usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() } } @@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> { /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { - set(self.look_need()).write_repr(&mut self.0[3..]); + set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least @@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 9; + let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - wire::NE::write_u32(count32, &mut self.0[5..9]); + wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index 81b4eb718..f87b963ad 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -125,17 +125,17 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0001 => Some(Look::Start), 0b00_0000_0010 => Some(Look::End), @@ -191,7 +191,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -379,29 +379,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } /// Checks that all assertions in this set can be matched. 
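Since this representation change is easy to get wrong at call sites that
serialize `LookSet`s, a quick round trip through the new 4-byte form may
help. This is a sketch against the public API in
`regex_automata::util::look`, not a test from this patch:

    use regex_automata::util::look::{Look, LookSet};

    fn main() {
        let set = LookSet::empty().insert(Look::Start).insert(Look::WordAscii);
        // A serialized LookSet now occupies 4 bytes instead of 2.
        let mut buf = [0u8; 4];
        set.write_repr(&mut buf);
        assert_eq!(set, LookSet::read_repr(&buf));
    }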
@@ -456,9 +458,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } diff --git a/regex-automata/tests/hybrid/api.rs b/regex-automata/tests/hybrid/api.rs index e82d808e3..4b04c4f8f 100644 --- a/regex-automata/tests/hybrid/api.rs +++ b/regex-automata/tests/hybrid/api.rs @@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); - let err = MatchError::gave_up(25); + let err = MatchError::gave_up(24); // Notice that we make the same amount of progress in each search! That's // because the cache is reused and already has states to handle the first // N bytes. @@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. cache.reset(&dfa); - let err = MatchError::gave_up(27); + let err = MatchError::gave_up(26); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index f8a3d4a9e..361ca41af 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1664,17 +1664,17 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0001 => Some(Look::Start), 0b00_0000_0010 => Some(Look::End), @@ -2600,7 +2600,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -2788,29 +2788,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. 
     #[inline]
     pub fn write_repr(self, slice: &mut [u8]) {
         let raw = self.bits.to_ne_bytes();
         slice[0] = raw[0];
         slice[1] = raw[1];
+        slice[2] = raw[2];
+        slice[3] = raw[3];
     }
 }

@@ -2843,9 +2845,9 @@ impl Iterator for LookSetIter {
             return None;
         }
         // We'll never have more than u8::MAX distinct look-around assertions,
-        // so 'repr' will always fit into a u16.
-        let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
-        let look = Look::from_repr(1 << repr)?;
+        // so 'bit' will always fit into a u16.
+        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+        let look = Look::from_repr(1 << bit)?;
         self.set = self.set.remove(look);
         Some(look)
     }

From 19e54d89f94a8785892b8f8f4568ac7d37066c09 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 6 Oct 2023 21:58:38 -0400
Subject: [PATCH 076/136] syntax/ast: add support for additional word boundary assertions

This adds AST support for the following new assertions: \b{start},
\b{end}, \b{start-half}, \b{end-half}, \< and \>. The last two, \< and
\>, are aliases for \b{start} and \b{end}.

The parsing for this is a little delicate, since there is some ambiguity
between, e.g., \b{5} and \b{start}. We handle it by letting the parser
look for one of the new special assertions first, and then backing up if
it fails to find one so that it can try to parse a counted repetition
instead.

Ref #469
---
 regex-syntax/src/ast/mod.rs       |  47 +++++++
 regex-syntax/src/ast/parse.rs     | 226 ++++++++++++++++++++++++++++--
 regex-syntax/src/ast/print.rs     |   6 +
 regex-syntax/src/hir/translate.rs |  14 ++
 regex-syntax/src/lib.rs           |   3 +
 5 files changed, 281 insertions(+), 15 deletions(-)

diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index 9e0f92606..6a77ee134 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -162,6 +162,18 @@ pub enum ErrorKind {
     /// `(?i)*`. It is, however, possible to create a repetition operating on
     /// an empty sub-expression. For example, `()*` is still considered valid.
     RepetitionMissing,
+    /// The special word boundary syntax, `\b{something}`, was used, but
+    /// either EOF without `}` was seen, or an invalid character in the
+    /// braces was seen.
+    SpecialWordBoundaryUnclosed,
+    /// The special word boundary syntax, `\b{something}`, was used, but
+    /// `something` was not recognized as a valid word boundary kind.
+    SpecialWordBoundaryUnrecognized,
+    /// The syntax `\b{` was observed, but afterwards the end of the pattern
+    /// was observed without being able to tell whether it was meant to be a
+    /// bounded repetition on the `\b` or the beginning of a special word
+    /// boundary assertion.
+    SpecialWordOrRepetitionUnexpectedEof,
     /// The Unicode class is not valid. This typically occurs when a `\p` is
     /// followed by something other than a `{`.
UnicodeClassInvalid, @@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } @@ -1293,6 +1328,18 @@ pub enum AssertionKind { WordBoundary, /// `\B` NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index f7bae7759..593b14fbc 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1528,18 +1528,115 @@ impl<'s, P: Borrow> ParserI<'s, P> { span, kind: ast::AssertionKind::EndText, })), - 'b' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundary, - })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } 'B' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::NotWordBoundary, })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. 
+ fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + /// Parse an octal representation of a Unicode codepoint up to 3 digits /// long. This expects the parser to be positioned at the first octal /// digit and advances the parser to the first character immediately @@ -1967,9 +2064,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { // because parsing cannot fail with any interesting error. For example, // in order to use an ASCII character class, it must be enclosed in // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character characters have the syntax `[:NAME:]` - // which can only appear within character brackets." This means that - // things like `[[:lower:]A]` are legal constructs. + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. 
// // However, if one types an incorrect ASCII character class, e.g., // `[[:loower:]]`, then we treat that as a normal nested character @@ -3295,6 +3392,23 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), @@ -4381,6 +4495,48 @@ bar kind: ast::AssertionKind::WordBoundary, })) ); + assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { @@ -4418,20 +4574,60 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); - // But also, < and > are banned, so that we may evolve them into - // start/end word boundary assertions. (Not sure if we will...) + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. assert_eq!( - parser(r"\<").parse_escape().unwrap_err(), + parser(r"\b{").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( - parser(r"\>").parse_escape().unwrap_err(), + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. 
+ assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. + assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 7dedf7f48..1ceb3c7fa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -261,6 +261,12 @@ impl Writer { EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 56d261aa1..4ae279f92 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -962,6 +962,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAsciiNegate }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryEndHalf => { + Hir::look(if unicode { todo!() } else { todo!() }) + } }) } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index a552099c6..38c8d88d4 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -334,6 +334,9 @@ pub fn is_escapeable_character(c: char) -> bool { // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } From bbb98bbb1bd15f7a90469ec2470cb3fdfd2f8db8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 09:59:51 -0400 Subject: [PATCH 077/136] syntax/hir: add new special word boundaries to HIR This builds on the previous commit to bring word boundary support to the HIR, and updates AST->HIR translation to produce them from the corresponding AST elements. 
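As a concrete illustration of the intended translation (a sketch using
the top-level `regex_syntax::parse` convenience function, not a test
from this commit):

    use regex_syntax::{hir::{Hir, Look}, parse};

    fn main() {
        // With Unicode enabled (the default), \b{start} becomes the
        // Unicode start-of-word assertion...
        let hir = parse(r"\b{start}").unwrap();
        assert_eq!(hir, Hir::look(Look::WordStartUnicode));
        // ...and with Unicode disabled, the ASCII variant.
        let hir = parse(r"(?-u)\b{start}").unwrap();
        assert_eq!(hir, Hir::look(Look::WordStartAscii));
    }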
Ref #469 --- regex-syntax/src/hir/mod.rs | 95 ++++++++++++++++++++++++++----- regex-syntax/src/hir/print.rs | 24 ++++++++ regex-syntax/src/hir/translate.rs | 26 +++++++-- 3 files changed, 126 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 361ca41af..ce38ead7b 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1635,6 +1635,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. 
+ WordEndHalfUnicode = 1 << 17, } impl Look { @@ -1656,6 +1692,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -1676,16 +1720,24 @@ impl Look { #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -1710,6 +1762,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -2703,13 +2763,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. 
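The enum discriminants double as the bit positions used by `LookSet`, so
the `from_repr`/`as_repr` round trip and the `reversed` pairing can be
checked directly. A sketch, not part of this patch's test suite:

    use regex_syntax::hir::Look;

    fn main() {
        // 18 assertions now need 18 bits, which is why the set
        // representation grew from u16 to u32.
        let look = Look::WordStartHalfUnicode;
        assert_eq!(look.as_repr(), 1 << 16);
        assert_eq!(Look::from_repr(1 << 16), Some(look));
        // Start and end variants swap under reversal, which is what
        // reverse searches rely on.
        assert_eq!(look.reversed(), Look::WordEndHalfUnicode);
    }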
@@ -3769,7 +3838,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -3787,6 +3856,6 @@ mod tests { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index aa737a092..dfa6d4032 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -202,6 +202,30 @@ impl Visitor for Writer { hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } }, HirKind::Capture(hir::Capture { ref name, .. }) => { self.wtr.write_str("(")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 4ae279f92..55ca074fa 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -964,18 +964,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { }), ast::AssertionKind::WordBoundaryStart | ast::AssertionKind::WordBoundaryStartAngle => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) } ast::AssertionKind::WordBoundaryEnd | ast::AssertionKind::WordBoundaryEndAngle => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) } ast::AssertionKind::WordBoundaryStartHalf => { - Hir::look(if unicode { todo!() } else { todo!() }) - } - ast::AssertionKind::WordBoundaryEndHalf => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), }) } From 21eb31e38c31073258fc670cd80a8a26e96d11aa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 18:04:56 -0400 Subject: [PATCH 078/136] automata: add special word boundaries to regex-automata In this commit, all of the regex engines now support the new special word boundary assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}. Of course, when they are Unicode-aware, the DFAs will quit upon seeing a non-ASCII character, just like for the \b and \B assertions. For now, we don't add support to the one-pass DFA, since it would either make it use more memory or reduce the number of capture groups it supports. I think these assertions will be rare enough that it isn't worth adding support yet. This is a breaking change because it adds new variants to the `Look` enum. 
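To make the intended semantics concrete, here is a small usage sketch
against the meta regex engine as of this series; it is illustrative
only, not a test added by this patch:

    use regex_automata::meta::Regex;

    fn main() {
        // \b{start} only matches at the start of a word, so the
        // "ranges" inside "oranges" does not match here.
        let re = Regex::new(r"\b{start}ranges\b{end}").unwrap();
        assert!(!re.is_match("oranges"));
        assert!(re.is_match("ranges are sorted"));
    }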
--- regex-automata/src/nfa/thompson/compiler.rs | 8 + regex-automata/src/util/determinize/mod.rs | 60 +- regex-automata/src/util/look.rs | 898 ++++++++++++++++++-- regex-automata/tests/dfa/suite.rs | 6 +- regex-automata/tests/lib.rs | 1 + testdata/word-boundary-special.toml | 653 ++++++++++++++ tests/lib.rs | 1 + 7 files changed, 1563 insertions(+), 64 deletions(-) create mode 100644 testdata/word-boundary-special.toml diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index a188017d8..2d2172957 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1557,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 30a82afb8..d320fabc3 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -145,9 +145,10 @@ pub(crate) fn next( } Some(_) => {} None => { - look_have = look_have.insert(Look::End); - look_have = look_have.insert(Look::EndLF); - look_have = look_have.insert(Look::EndCRLF); + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { @@ -160,11 +161,26 @@ pub(crate) fn next( look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { - look_have = look_have.insert(Look::WordUnicodeNegate); - look_have = look_have.insert(Look::WordAsciiNegate); + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); } else { - look_have = look_have.insert(Look::WordUnicode); - look_have = look_have.insert(Look::WordAscii); + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // assertions that exist in this state (that is, just because we added @@ -220,6 +236,14 @@ pub(crate) fn next( { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. 
+ if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } @@ -564,7 +588,12 @@ pub(crate) fn set_lookbehind_from_start( let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); match *start { - Start::NonWordByte => {} + Start::NonWordByte => { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } Start::WordByte => { builder.set_is_from_word(); } @@ -573,6 +602,8 @@ pub(crate) fn set_lookbehind_from_start( have.insert(Look::Start) .insert(Look::StartLF) .insert(Look::StartCRLF) + .insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) }); } Start::LineLF => { @@ -585,6 +616,10 @@ pub(crate) fn set_lookbehind_from_start( if lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } Start::LineCR => { if rev { @@ -595,6 +630,10 @@ pub(crate) fn set_lookbehind_from_start( if lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } Start::CustomLineTerminator => { builder.set_look_have(|have| have.insert(Look::StartLF)); @@ -604,6 +643,11 @@ pub(crate) fn set_lookbehind_from_start( // state as having come from a word byte. if utf8::is_word_byte(lineterm) { builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } } } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index f87b963ad..ddf8fb129 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -96,6 +96,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. 
That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -117,6 +153,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -137,16 +181,24 @@ impl Look { #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -171,6 +223,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -294,13 +354,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set 
contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -568,6 +637,23 @@ impl LookMatcher { } /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, @@ -588,6 +674,26 @@ impl LookMatcher { Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } } } @@ -682,6 +788,46 @@ impl LookMatcher { return false; } } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } true } @@ -705,7 +851,15 @@ impl LookMatcher { Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode - | Look::WordUnicodeNegate => { + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | 
Look::WordEndUnicode
+            | Look::WordStartHalfAscii
+            | Look::WordEndHalfAscii
+            | Look::WordStartHalfUnicode
+            | Look::WordEndHalfUnicode => {
                 // We need to mark all ranges of bytes whose pairs result in
                 // evaluating \b differently. This isn't technically correct
                 // for Unicode word boundaries, but DFAs can't handle those
@@ -933,6 +1087,177 @@ impl LookMatcher {
         };
         Ok(word_before == word_after)
     }
+
+    /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
+    /// position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_before && word_after
+    }
+
+    /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
+    /// position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        word_before && !word_after
+    }
+
+    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(!word_before && word_after)
+    }
+
+    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(word_before && !word_after)
+    }
+
+    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_half_ascii(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        !word_before
+    }
+
+    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_after
+    }
+
+    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_start_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the right side must be in \w.
+        let word_before = at > 0
+            && match utf8::decode_last(&haystack[..at]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
+            };
+        Ok(!word_before)
+    }
+
+    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_end_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the left side must be in \w.
+        let word_after = at < haystack.len()
+            && match utf8::decode(&haystack[at..]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
+            };
+        Ok(!word_after)
+    }
 }
 
 impl Default for LookMatcher {
@@ -1660,50 +1985,478 @@ mod tests {
     }
 
     #[test]
-    fn look_set() {
-        let mut f = LookSet::default();
-        assert!(!f.contains(Look::Start));
-        assert!(!f.contains(Look::End));
-        assert!(!f.contains(Look::StartLF));
-        assert!(!f.contains(Look::EndLF));
-        assert!(!f.contains(Look::WordUnicode));
-        assert!(!f.contains(Look::WordUnicodeNegate));
-        assert!(!f.contains(Look::WordAscii));
-        assert!(!f.contains(Look::WordAsciiNegate));
+    fn look_matches_word_start_ascii() {
+        let look = Look::WordStartAscii;
 
-        f = f.insert(Look::Start);
-        assert!(f.contains(Look::Start));
-        f = f.remove(Look::Start);
-        assert!(!f.contains(Look::Start));
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
 
-        f = f.insert(Look::End);
-        assert!(f.contains(Look::End));
-        f = f.remove(Look::End);
-        assert!(!f.contains(Look::End));
+        // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); - f = f.insert(Look::StartLF); - assert!(f.contains(Look::StartLF)); - f = f.remove(Look::StartLF); - assert!(!f.contains(Look::StartLF)); + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); - f = f.insert(Look::EndLF); - assert!(f.contains(Look::EndLF)); - f = f.remove(Look::EndLF); - assert!(!f.contains(Look::EndLF)); + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); - f = f.insert(Look::StartCRLF); - assert!(f.contains(Look::StartCRLF)); - f = f.remove(Look::StartCRLF); - assert!(!f.contains(Look::StartCRLF)); + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); - f = f.insert(Look::EndCRLF); - assert!(f.contains(Look::EndCRLF)); - f = f.remove(Look::EndCRLF); - assert!(!f.contains(Look::EndCRLF)); + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); - f = f.insert(Look::WordUnicode); - assert!(f.contains(Look::WordUnicode)); - f = f.remove(Look::WordUnicode); + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_start_half_ascii() { + let look = Look::WordStartHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_half_ascii() { + let look = Look::WordEndHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. 
+ assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. 
+ assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); @@ -1720,6 +2473,46 @@ mod tests { assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + 
assert!(f.contains(Look::WordStartHalfUnicode));
+        f = f.remove(Look::WordStartHalfUnicode);
+        assert!(!f.contains(Look::WordStartHalfUnicode));
+
+        f = f.insert(Look::WordEndHalfUnicode);
+        assert!(f.contains(Look::WordEndHalfUnicode));
+        f = f.remove(Look::WordEndHalfUnicode);
+        assert!(!f.contains(Look::WordEndHalfUnicode));
     }
 
     #[test]
@@ -1728,7 +2521,7 @@ mod tests {
         assert_eq!(0, set.iter().count());
 
         let set = LookSet::full();
-        assert_eq!(10, set.iter().count());
+        assert_eq!(18, set.iter().count());
 
         let set =
            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -1739,6 +2532,9 @@
 
         let set = LookSet::empty().insert(Look::WordAsciiNegate);
         assert_eq!(1, set.iter().count());
+
+        let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
+        assert_eq!(1, set.iter().count());
     }
 
     #[test]
@@ -1747,6 +2543,6 @@ mod tests {
         let res = alloc::format!("{:?}", LookSet::empty());
         assert_eq!("∅", res);
         let res = alloc::format!("{:?}", LookSet::full());
-        assert_eq!("Az^$rRbB𝛃𝚩", res);
+        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
     }
 }
diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs
index f3445e02a..8ed6dd007 100644
--- a/regex-automata/tests/dfa/suite.rs
+++ b/regex-automata/tests/dfa/suite.rs
@@ -9,7 +9,6 @@ use {
         util::{prefilter::Prefilter, syntax},
         Anchored, Input, PatternSet,
     },
-    regex_syntax::hir,
     regex_test::{
         CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
         TestRunner,
@@ -285,10 +284,7 @@ fn compiler(
     // That is, Unicode word boundaries when searching non-ASCII text.
     if !test.haystack().is_ascii() {
         for hir in hirs.iter() {
-            let looks = hir.properties().look_set();
-            if looks.contains(hir::Look::WordUnicode)
-                || looks.contains(hir::Look::WordUnicodeNegate)
-            {
+            if hir.properties().look_set().contains_word_unicode() {
                 return Ok(CompiledRegex::skip());
             }
         }
diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs
index 1465e51eb..67c979aa8 100644
--- a/regex-automata/tests/lib.rs
+++ b/regex-automata/tests/lib.rs
@@ -61,6 +61,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");
diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml
new file mode 100644
index 000000000..c1689f5cc
--- /dev/null
+++ b/testdata/word-boundary-special.toml
@@ -0,0 +1,653 @@
+# These tests are for the "special" word boundary assertions. That is,
+# \b{start}, \b{end}, \b{start-half}, \b{end-half}. These are specialty
+# assertions for more niche use cases, but hitting those cases without these
+# assertions is difficult. For example, \b{start-half} and \b{end-half} are
+# used to implement the -w/--word-regexp flag in a grep program.
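+#
+# As a rough illustration of that use case (a sketch, not one of the tests
+# below): a grep-style -w flag can be emulated by rewriting a user pattern
+# like `cat` into `\b{start-half}(?:cat)\b{end-half}`. Unlike wrapping the
+# pattern in \b{start} and \b{end}, this keeps working when the user's
+# pattern begins or ends with a non-word character, since each half
+# assertion only constrains the haystack on one side.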
+ +# Tests for (?-u:\b{start}) + +[[test]] +name = "word-start-ascii-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-start-ascii-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[4, 4]] +unicode = false + +[[test]] +name = "word-start-ascii-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = false + +# Tests for (?-u:\b{end}) + +[[test]] +name = "word-end-ascii-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = false + +[[test]] +name = "word-end-ascii-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[1, 1]] +unicode = false + +# Tests for \b{start} + +[[test]] +name = "word-start-unicode-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = 
[[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end} + +[[test]] +name = "word-end-unicode-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Tests for (?-u:\b{start-half}) + +[[test]] +name = "word-start-half-ascii-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = false + +[[test]] +name = "word-start-half-ascii-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = false + +[[test]] +name = "word-start-half-ascii-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060-noutf8" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +[[test]] +name = "word-start-half-ascii-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-half-ascii-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-start-half-ascii-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-start-half-ascii-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0], [5, 5]] +unicode = false + +# Tests for (?-u:\b{end-half}) + +[[test]] +name = "word-end-half-ascii-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode 
= false + +[[test]] +name = "word-end-half-ascii-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = false + +[[test]] +name = "word-end-half-ascii-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-end-half-ascii-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-half-ascii-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-end-half-ascii-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-end-half-ascii-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[0, 0], [5, 5]] +unicode = false + +[[test]] +name = "word-end-half-ascii-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[1, 1], [5, 5]] +unicode = false + +# Tests for \b{start-half} + +[[test]] +name = "word-start-half-unicode-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = true + +[[test]] +name = "word-start-half-unicode-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = true + +[[test]] +name = "word-start-half-unicode-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-half-unicode-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [6, 6]] +unicode = true + +[[test]] +name = "word-start-half-unicode-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [8, 8]] +unicode = true + +[[test]] +name = "word-start-half-unicode-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end-half} + +[[test]] +name = "word-end-half-unicode-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-half-unicode-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = true + +[[test]] +name = "word-end-half-unicode-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = true + +[[test]] +name = "word-end-half-unicode-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-end-half-unicode-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] 
+name = "word-end-half-unicode-060"
+regex = '\b{end-half}'
+haystack = "𝛃"
+matches = [[4, 4]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-060-bounds"
+regex = '\b{end-half}'
+haystack = "𝛃"
+bounds = [2, 3]
+matches = []
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-070"
+regex = '\b{end-half}'
+haystack = " 𝛃 "
+matches = [[0, 0], [5, 5], [6, 6]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-080"
+regex = '\b{end-half}'
+haystack = "𝛃𐆀"
+matches = [[4, 4], [8, 8]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-090"
+regex = '\b{end-half}'
+haystack = "𝛃b"
+matches = [[5, 5]]
+unicode = true
+
+[[test]]
+name = "word-end-half-unicode-110"
+regex = '\b{end-half}'
+haystack = "b𝛃"
+matches = [[5, 5]]
+unicode = true
diff --git a/tests/lib.rs b/tests/lib.rs
index badd57455..b3f69423d 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -49,6 +49,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");

From 2743a7a0181cf16069445ad11d56977b1b991674 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 8 Oct 2023 09:29:27 -0400
Subject: [PATCH 079/136] doc: explain the new word boundary assertions

Closes #469
---
 CHANGELOG.md |  7 ++++++
 src/lib.rs   | 70 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 265f5cd48..7f90e45a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,9 +3,16 @@ TBD
 
 New features:
 
+* [FEATURE #469](https://github.com/rust-lang/regex/issues/469):
+Add support for `\<` and `\>` word boundary assertions.
 * [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031):
 DFAs now have a `start_state` method that doesn't use an `Input`.
 
+Performance improvements:
+
+* [PERF #1051](https://github.com/rust-lang/regex/pull/1051):
+Unicode character class operations have been optimized in `regex-syntax`.
+
 Bug fixes:
 
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
diff --git a/src/lib.rs b/src/lib.rs
index 1e191b692..6dbd3c202 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -543,8 +543,10 @@ scalar value, even when it is encoded using multiple bytes. When Unicode mode
 is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases.
 * The character classes `\w`, `\d` and `\s` are all Unicode-aware by default.
 Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions.
-* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. To
-get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`.
+* Similarly, `\b` and `\B` use a Unicode definition of a "word" character.
+To get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. This also
+applies to the special word boundary assertions. (That is, `\b{start}`,
+`\b{end}`, `\b{start-half}`, `\b{end-half}`.)
 * `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only
 recognize `\n` (assuming CRLF mode is not enabled) and not any of the other
 forms of line terminators defined by Unicode.
@@ -723,12 +725,16 @@ x{n}? exactly n x
 
 ### Empty matches

-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-\B    not a Unicode word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B              not a Unicode word boundary
+\b{start}, \<   a Unicode start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \>     a Unicode end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of a Unicode start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of a Unicode end-of-word boundary (\W|\z on the right)
 
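+
+As a quick illustrative sketch (not exhaustive), the start-of-word assertion
+can be used to find every word along with its starting offset:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"\b{start}\w+").unwrap();
+let words: Vec<&str> =
+    re.find_iter("one, two; three").map(|m| m.as_str()).collect();
+assert_eq!(words, vec!["one", "two", "three"]);
+```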
The empty regex is valid and matches the empty string. For example, the @@ -856,28 +862,32 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\123        octal character code, up to three digits (when enabled)
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\p{Letter}  Unicode character class
-\P{Letter}  negated Unicode character class
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\123            octal character code, up to three digits (when enabled)
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\p{Letter}      Unicode character class
+\P{Letter}      negated Unicode character class
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
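+
+For example, a brief sketch of the `\<` and `\>` escapes (aliases for
+`\b{start}` and `\b{end}`):
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"\<cat\>").unwrap();
+assert!(re.is_match("the cat sat"));
+assert!(!re.is_match("concatenate"));
+```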
### Perl character classes (Unicode friendly) From dbc5e6d98ba731ddd7f5cddd8f96f5e147d14b51 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 10:20:24 -0400 Subject: [PATCH 080/136] lite: add special word boundaries to regex-lite This was substantially easier. Coupling, private abstractions and slow code are so much easier to deal with. Ref #469 --- regex-lite/src/hir/mod.rs | 42 +++++++++++++++++ regex-lite/src/hir/parse.rs | 89 +++++++++++++++++++++++++++++++++++-- regex-lite/src/lib.rs | 58 +++++++++++++----------- regex-lite/tests/lib.rs | 1 + 4 files changed, 162 insertions(+), 28 deletions(-) diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index f73a5420a..3d61ce8c9 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -592,6 +592,24 @@ pub(crate) enum Look { Word = 1 << 6, /// Match an ASCII-only negation of a word boundary. WordNegate = 1 << 7, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStart = 1 << 8, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEnd = 1 << 9, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalf = 1 << 10, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalf = 1 << 11, } impl Look { @@ -631,6 +649,30 @@ impl Look { at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before == word_after } + WordStart => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + WordEnd => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + WordStartHalf => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + WordEndHalf => { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } } } } diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index cc3c21fe6..33bb97a7d 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = "character class difference is not supported"; const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = "character class symmetric difference is not supported"; +const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = + "special word boundary assertion is unclosed or has an invalid character"; +const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = + "special word boundary assertion is unrecognized"; +const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = + "found start of special word boundary or repetition without an end"; /// A regular expression parser. 
///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
             'v' => special('\x0B'),
             'A' => Ok(Hir::look(hir::Look::Start)),
             'z' => Ok(Hir::look(hir::Look::End)),
-            'b' => Ok(Hir::look(hir::Look::Word)),
+            'b' => {
+                let mut hir = Hir::look(hir::Look::Word);
+                if !self.is_done() && self.char() == '{' {
+                    if let Some(special) =
+                        self.maybe_parse_special_word_boundary()?
+                    {
+                        hir = special;
+                    }
+                }
+                Ok(hir)
+            }
             'B' => Ok(Hir::look(hir::Look::WordNegate)),
+            '<' => Ok(Hir::look(hir::Look::WordStart)),
+            '>' => Ok(Hir::look(hir::Look::WordEnd)),
             _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
         }
     }
 
+    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
+    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
+    ///
+    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
+    /// if it fails it will just return `None` with no error. This is done
+    /// because `\b{5}` is a valid expression and we want to let that be parsed
+    /// by the existing counted repetition parsing code. (I thought about just
+    /// invoking the counted repetition code from here, but it seemed a little
+    /// ham-fisted.)
+    ///
+    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
+    /// Namely, if we definitely know it isn't a counted repetition, then we
+    /// return an error specific to the specialty word boundaries.
+    ///
+    /// This assumes the parser is positioned at a `{` immediately following
+    /// a `\b`. When `None` is returned, the parser is returned to the position
+    /// at which it started: pointing at a `{`.
+    ///
+    /// The position given should correspond to the start of the `\b`.
+    fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
+        assert_eq!(self.char(), '{');
+
+        let is_valid_char = |c| match c {
+            'A'..='Z' | 'a'..='z' | '-' => true,
+            _ => false,
+        };
+        let start = self.pos();
+        if !self.bump_and_bump_space() {
+            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
+        }
+        // This is one of the critical bits: if the first non-whitespace
+        // character isn't in [-A-Za-z] (i.e., this can't be a special word
+        // boundary), then we bail and let the counted repetition parser deal
+        // with this.
+        if !is_valid_char(self.char()) {
+            self.pos.set(start);
+            self.char.set(Some('{'));
+            return Ok(None);
+        }
+
+        // Now collect up our chars until we see a '}'.
+        let mut scratch = String::new();
+        while !self.is_done() && is_valid_char(self.char()) {
+            scratch.push(self.char());
+            self.bump_and_bump_space();
+        }
+        if self.is_done() || self.char() != '}' {
+            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
+        }
+        self.bump();
+        let kind = match scratch.as_str() {
+            "start" => hir::Look::WordStart,
+            "end" => hir::Look::WordEnd,
+            "start-half" => hir::Look::WordStartHalf,
+            "end-half" => hir::Look::WordEndHalf,
+            _ => {
+                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
+            }
+        };
+        Ok(Some(Hir::look(kind)))
+    }
+
     /// Parse a hex representation of a Unicode codepoint. This handles both
     /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
     /// be positioned at the `x`, `u` or `U` prefix.
The parser is advanced to @@ -1948,8 +2028,6 @@ bar assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); @@ -1983,6 +2061,11 @@ bar assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); } #[test] diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 8008b9e59..68d54824f 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -466,12 +466,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
-\B    not an ASCII word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B              not an ASCII word boundary
+\b{start}       an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}         an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
 
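+
+For example, a small sketch using the ASCII-only definitions above, which is
+roughly how a grep-style -w/--word-regexp flag can be emulated:
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"\b{start-half}\w+\b{end-half}").unwrap();
+let m = re.find("  hello  ").unwrap();
+assert_eq!(m.as_str(), "hello");
+```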
The empty regex is valid and matches the empty string. For example, the @@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
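+
+A brief sketch of the special word boundary escapes (ASCII-only in this
+crate):
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"\<cat\>").unwrap();
+assert!(re.is_match("a cat."));
+assert!(!re.is_match("concatenate"));
+```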
### Perl character classes (ASCII only) diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs index 757b39441..89635f2d7 100644 --- a/regex-lite/tests/lib.rs +++ b/regex-lite/tests/lib.rs @@ -38,6 +38,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); From 07dcf208ef201df907fcd1dc09c83cac61d1503b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 14:55:20 -0400 Subject: [PATCH 081/136] doc: remove HACKING document It is almost completely wrong now. Instead of rewriting it---which would be a huge endeavor---we just point folks toward my blog on regex internals. Closes #1058 --- HACKING.md | 341 ----------------------------------------------------- README.md | 15 +++ 2 files changed, 15 insertions(+), 341 deletions(-) delete mode 100644 HACKING.md diff --git a/HACKING.md b/HACKING.md deleted file mode 100644 index 34af5b517..000000000 --- a/HACKING.md +++ /dev/null @@ -1,341 +0,0 @@ -Your friendly guide to hacking and navigating the regex library. - -This guide assumes familiarity with Rust and Cargo, and at least a perusal of -the user facing documentation for this crate. - -If you're looking for background on the implementation in this library, then -you can do no better than Russ Cox's article series on implementing regular -expressions using finite automata: https://swtch.com/~rsc/regexp/ - - -## Architecture overview - -As you probably already know, this library executes regular expressions using -finite automata. In particular, a design goal is to make searching linear -with respect to both the regular expression and the text being searched. -Meeting that design goal on its own is not so hard and can be done with an -implementation of the Pike VM (similar to Thompson's construction, but supports -capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html ---- This library contains such an implementation in src/pikevm.rs. - -Making it fast is harder. One of the key problems with the Pike VM is that it -can be in more than one state at any point in time, and must shuffle capture -positions between them. The Pike VM also spends a lot of time following the -same epsilon transitions over and over again. We can employ one trick to -speed up the Pike VM: extract one or more literal prefixes from the regular -expression and execute specialized code to quickly find matches of those -prefixes in the search text. The Pike VM can then be avoided for most the -search, and instead only executed when a prefix is found. The code to find -prefixes is in the regex-syntax crate (in this repository). The code to search -for literals is in src/literals.rs. When more than one literal prefix is found, -we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one -literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and -Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this -library also uses elementary frequency analysis to choose the right byte to run -`memchr` with. - -Of course, detecting prefix literals can only take us so far. Not all regular -expressions have literal prefixes. To remedy this, we try another approach -to executing the Pike VM: backtracking, whose implementation can be found in -src/backtrack.rs. One reason why backtracking can be faster is that it avoids -excessive shuffling of capture groups. 
Of course, backtracking is susceptible -to exponential runtimes, so we keep track of every state we've visited to make -sure we never visit it again. This guarantees linear time execution, but we -pay for it with the memory required to track visited states. Because of the -memory requirement, we only use this engine on small search strings *and* small -regular expressions. - -Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. -It is distinct from the Pike VM in that the DFA is explicitly represented in -memory and is only ever in one state at a time. It is said to be "lazy" because -the DFA is computed as text is searched, where each byte in the search text -results in at most one new DFA state. It is made fast by caching states. DFAs -are susceptible to exponential state blow up (where the worst case is computing -a new state for every input byte, regardless of what's in the state cache). To -avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache -is full, it is wiped and state computation starts over again. If the cache is -wiped too frequently, then the DFA gives up and searching falls back to one of -the aforementioned algorithms. - -All of the above matching engines expose precisely the same matching semantics. -This is indeed tested. (See the section below about testing.) - -The following sub-sections describe the rest of the library and how each of the -matching engines are actually used. - -### Parsing - -Regular expressions are parsed using the regex-syntax crate, which is -maintained in this repository. The regex-syntax crate defines an abstract -syntax and provides very detailed error messages when a parse error is -encountered. Parsing is done in a separate crate so that others may benefit -from its existence, and because it is relatively divorced from the rest of the -regex library. - -The regex-syntax crate also provides sophisticated support for extracting -prefix and suffix literals from regular expressions. - -### Compilation - -The compiler is in src/compile.rs. The input to the compiler is some abstract -syntax for a regular expression and the output is a sequence of opcodes that -matching engines use to execute a search. (One can think of matching engines as -mini virtual machines.) The sequence of opcodes is a particular encoding of a -non-deterministic finite automaton. In particular, the opcodes explicitly rely -on epsilon transitions. - -Consider a simple regular expression like `a|b`. Its compiled form looks like -this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' (goto: 4) - 003 'b' - 004 Save(1) - 005 Match - -The first column is the instruction pointer and the second column is the -instruction. Save instructions indicate that the current position in the input -should be stored in a captured location. Split instructions represent a binary -branch in the program (i.e., epsilon transitions). The instructions `'a'` and -`'b'` indicate that the literal bytes `'a'` or `'b'` should match. - -In older versions of this library, the compilation looked like this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' - 003 Jump(5) - 004 'b' - 005 Save(1) - 006 Match - -In particular, empty instructions that merely served to move execution from one -point in the program to another were removed. Instead, every instruction has a -`goto` pointer embedded into it. This resulted in a small performance boost for -the Pike VM, because it was one fewer epsilon transition that it had to follow. 
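As an aside, here is a minimal sketch of the goto-embedded encoding described above. The type and field names are invented for illustration and are not the crate's actual types; the real (and much richer) instruction set lived in src/prog.rs:

```
// Illustrative only: a toy version of the opcode layout sketched above.
#[derive(Debug)]
enum Inst {
    /// Store the current input position in capture slot `slot`, then
    /// continue at `goto`.
    Save { slot: usize, goto: usize },
    /// An epsilon transition with two successors, tried in order.
    Split { goto1: usize, goto2: usize },
    /// Match one literal byte, then continue at `goto`.
    Byte { byte: u8, goto: usize },
    /// Report a match.
    Match,
}

fn main() {
    // The compiled form of `a|b` shown above, with a goto embedded in each
    // instruction rather than a separate Jump opcode.
    let prog = vec![
        Inst::Save { slot: 0, goto: 1 },    // 000 Save(0)
        Inst::Split { goto1: 2, goto2: 3 }, // 001 Split(2, 3)
        Inst::Byte { byte: b'a', goto: 4 }, // 002 'a' (goto: 4)
        Inst::Byte { byte: b'b', goto: 4 }, // 003 'b'
        Inst::Save { slot: 1, goto: 5 },    // 004 Save(1)
        Inst::Match,                        // 005 Match
    ];
    println!("{prog:?}");
}
```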
- -There exist more instructions and they are defined and documented in -src/prog.rs. - -Compilation has several knobs and a few unfortunately complicated invariants. -Namely, the output of compilation can be one of two types of programs: a -program that executes on Unicode scalar values or a program that executes -on raw bytes. In the former case, the matching engine is responsible for -performing UTF-8 decoding and executing instructions using Unicode codepoints. -In the latter case, the program handles UTF-8 decoding implicitly, so that the -matching engine can execute on raw bytes. All matching engines can execute -either Unicode or byte based programs except for the lazy DFA, which requires -byte based programs. In general, both representations were kept because (1) the -lazy DFA requires byte based programs so that states can be encoded in a memory -efficient manner and (2) the Pike VM benefits greatly from inlining Unicode -character classes into fewer instructions as it results in fewer epsilon -transitions. - -N.B. UTF-8 decoding is built into the compiled program by making use of the -utf8-ranges crate. The compiler in this library factors out common suffixes to -reduce the size of huge character classes (e.g., `\pL`). - -A regrettable consequence of this split in instruction sets is we generally -need to compile two programs; one for NFA execution and one for the lazy DFA. - -In fact, it is worse than that: the lazy DFA is not capable of finding the -starting location of a match in a single scan, and must instead execute a -backwards search after finding the end location. To execute a backwards search, -we must have compiled the regular expression *in reverse*. - -This means that every compilation of a regular expression generally results in -three distinct programs. It would be possible to lazily compile the Unicode -program, since it is never needed if (1) the regular expression uses no word -boundary assertions and (2) the caller never asks for sub-capture locations. - -### Execution - -At the time of writing, there are four matching engines in this library: - -1. The Pike VM (supports captures). -2. Bounded backtracking (supports captures). -3. Literal substring or multi-substring search. -4. Lazy DFA (no support for Unicode word boundary assertions). - -Only the first two matching engines are capable of executing every regular -expression program. They also happen to be the slowest, which means we need -some logic that (1) knows various facts about the regular expression and (2) -knows what the caller wants. Using this information, we can determine which -engine (or engines) to use. - -The logic for choosing which engine to execute is in src/exec.rs and is -documented on the Exec type. Exec values contain regular expression Programs -(defined in src/prog.rs), which contain all the necessary tidbits for actually -executing a regular expression on search text. - -For the most part, the execution logic is straight-forward and follows the -limitations of each engine described above pretty faithfully. The hairiest -part of src/exec.rs by far is the execution of the lazy DFA, since it requires -a forwards and backwards search, and then falls back to either the Pike VM or -backtracking if the caller requested capture locations. - -The Exec type also contains mutable scratch space for each type of matching -engine. This scratch space is used during search (for example, for the lazy -DFA, it contains compiled states that are reused on subsequent searches). 
- -### Programs - -A regular expression program is essentially a sequence of opcodes produced by -the compiler plus various facts about the regular expression (such as whether -it is anchored, its capture names, etc.). - -### The regex! macro - -The `regex!` macro no longer exists. It was developed in a bygone era as a -compiler plugin during the infancy of the regex crate. Back then, then only -matching engine in the crate was the Pike VM. The `regex!` macro was, itself, -also a Pike VM. The only advantages it offered over the dynamic Pike VM that -was built at runtime were the following: - - 1. Syntax checking was done at compile time. Your Rust program wouldn't - compile if your regex didn't compile. - 2. Reduction of overhead that was proportional to the size of the regex. - For the most part, this overhead consisted of heap allocation, which - was nearly eliminated in the compiler plugin. - -The main takeaway here is that the compiler plugin was a marginally faster -version of a slow regex engine. As the regex crate evolved, it grew other regex -engines (DFA, bounded backtracker) and sophisticated literal optimizations. -The regex macro didn't keep pace, and it therefore became (dramatically) slower -than the dynamic engines. The only reason left to use it was for the compile -time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint -tool) has a lint that checks your regular expression validity, which mostly -replaces that use case. - -Additionally, the regex compiler plugin stopped receiving maintenance. Nobody -complained. At that point, it seemed prudent to just remove it. - -Will a compiler plugin be brought back? The future is murky, but there is -definitely an opportunity there to build something that is faster than the -dynamic engines in some cases. But it will be challenging! As of now, there -are no plans to work on this. - - -## Testing - -A key aspect of any mature regex library is its test suite. A subset of the -tests in this library come from Glenn Fowler's AT&T test suite (its online -presence seems gone at the time of writing). The source of the test suite is -located in src/testdata. The scripts/regex-match-tests.py takes the test suite -in src/testdata and generates tests/matches.rs. - -There are also many other manually crafted tests and regression tests in -tests/tests.rs. Some of these tests were taken from RE2. - -The biggest source of complexity in the tests is related to answering this -question: how can we reuse the tests to check all of our matching engines? One -approach would have been to encode every test into some kind of format (like -the AT&T test suite) and code generate tests for each matching engine. The -approach we use in this library is to create a Cargo.toml entry point for each -matching engine we want to test. The entry points are: - -* `tests/test_default.rs` - tests `Regex::new` -* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` -* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex. -* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *arbitrary* byte based programs. -* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *UTF-8* byte based programs. -* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use - backtracking on every regex. 
-* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *arbitrary* byte based programs. -* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *UTF-8* byte based programs. -* `tests/test_crates_regex.rs` - tests to make sure that all of the - backends behave in the same way against a number of quickcheck - generated random inputs. These tests need to be enabled through - the `RUST_REGEX_RANDOM_TEST` environment variable (see - below). - -The lazy DFA and pure literal engines are absent from this list because -they cannot be used on every regular expression. Instead, we rely on -`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. - -Since the tests are repeated several times, and because `cargo test` runs all -entry points, it can take a while to compile everything. To reduce compile -times slightly, try using `cargo test --test default`, which will only use the -`tests/test_default.rs` entry point. - -The random testing takes quite a while, so it is not enabled by default. -In order to run the random testing you can set the -`RUST_REGEX_RANDOM_TEST` environment variable to anything before -invoking `cargo test`. Note that this variable is inspected at compile -time, so if the tests don't seem to be running, you may need to run -`cargo clean`. - -## Benchmarking - -The benchmarking in this crate is made up of many micro-benchmarks. Currently, -there are two primary sets of benchmarks: the benchmarks that were adopted -at this library's inception (in `bench/src/misc.rs`) and a newer set of -benchmarks meant to test various optimizations. Specifically, the latter set -contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter -set are all executed on the same lengthy input whereas the former benchmarks -are executed on strings of varying length. - -There is also a smattering of benchmarks for parsing and compilation. - -Benchmarks are in a separate crate so that its dependencies can be managed -separately from the main regex crate. - -Benchmarking follows a similarly wonky setup as tests. There are multiple entry -points: - -* `bench_rust.rs` - benchmarks `Regex::new` -* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` -* `bench_pcre.rs` - benchmarks PCRE -* `bench_onig.rs` - benchmarks Oniguruma - -The PCRE and Oniguruma benchmarks exist as a comparison point to a mature -regular expression library. In general, this regex library compares favorably -(there are even a few benchmarks that PCRE simply runs too slowly on or -outright can't execute at all). I would love to add other regular expression -library benchmarks (especially RE2). - -If you're hacking on one of the matching engines and just want to see -benchmarks, then all you need to run is: - - $ (cd bench && ./run rust) - -If you want to compare your results with older benchmarks, then try: - - $ (cd bench && ./run rust | tee old) - $ ... make it faster - $ (cd bench && ./run rust | tee new) - $ cargo benchcmp old new --improvements - -The `cargo-benchcmp` utility is available here: -https://github.com/BurntSushi/cargo-benchcmp - -The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See -`./bench/bench --help`. - -## Dev Docs - -When digging your teeth into the codebase for the first time, the -crate documentation can be a great resource. 
By default `rustdoc` -will strip out all documentation of private crate members in an -effort to help consumers of the crate focus on the *interface* -without having to concern themselves with the *implementation*. -Normally this is a great thing, but if you want to start hacking -on regex internals it is not what you want. Many of the private members -of this crate are well documented with rustdoc style comments, and -it would be a shame to miss out on the opportunity that presents. -You can generate the private docs with: - -``` -$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments -``` - -Then just point your browser at `target/doc/regex/index.html`. - -See https://github.com/rust-lang/rust/issues/15347 for more info -about generating developer docs for internal use. diff --git a/README.md b/README.md index 7454c166d..a23a266d3 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,21 @@ $ rebar cmp results.csv See the `rebar` documentation for more details on how it works and how to compare results with other regex engines. + +### Hacking + +The `regex` crate is, for the most part, a pretty thin wrapper around the +[`meta::Regex`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html) +from the +[`regex-automata` crate](https://docs.rs/regex-automata/latest/regex_automata/). +Therefore, if you're looking to work on the internals of this crate, you'll +likely either want to look in `regex-syntax` (for parsing) or `regex-automata` +(for construction of finite automata and the search routines). + +My [blog on regex internals](https://blog.burntsushi.net/regex-internals/) +goes into more depth. + + ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. From f9671471ea8d69242a5aac5f8edc66aabedf3901 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 14:57:53 -0400 Subject: [PATCH 082/136] changelog: add note about decreasing memory usage Ref #1090 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f90e45a8..a813c4fdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ Performance improvements: * [PERF #1051](https://github.com/rust-lang/regex/pull/1051): Unicode character class operations have been optimized in `regex-syntax`. +* [PERF #1090](https://github.com/rust-lang/regex/issues/1090): +Make patterns containing lots of literal characters use less memory. Bug fixes: From ed8032195119caf2d691862efc0f5ff0377c8275 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 15:05:16 -0400 Subject: [PATCH 083/136] test: disable some tests on non-64-bit Some doc tests make 64-bit assumptions and fail on 32-bit. I'd be open to perhaps refactoring the tests somehow to make them work on both, but I literally have no easy way to run doc tests in a 32-bit environment. Without being able to actually run them myself, I don't feel comfortable doing anything other than squashing the tests in that case. Closes #1041 --- regex-lite/src/string.rs | 1 + src/builders.rs | 4 ++++ src/regex/bytes.rs | 1 + src/regex/string.rs | 1 + 4 files changed, 7 insertions(+) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 1c6eb4ab9..af0a5b629 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -2063,6 +2063,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. 
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex_lite::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
diff --git a/src/builders.rs b/src/builders.rs
index 46c4824c5..c111a96c0 100644
--- a/src/builders.rs
+++ b/src/builders.rs
@@ -679,6 +679,7 @@ pub(crate) mod string {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::RegexBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -1246,6 +1247,7 @@ pub(crate) mod string {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::RegexSetBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -1856,6 +1858,7 @@ pub(crate) mod bytes {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::bytes::RegexBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -2428,6 +2431,7 @@ pub(crate) mod bytes {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::bytes::RegexSetBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs
index cc53482cb..c742b095a 100644
--- a/src/regex/bytes.rs
+++ b/src/regex/bytes.rs
@@ -2025,6 +2025,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
 /// This example shows how to create and use `CaptureLocations` in a search.
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex::bytes::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
diff --git a/src/regex/string.rs b/src/regex/string.rs
index d5908ae0d..177a2af34 100644
--- a/src/regex/string.rs
+++ b/src/regex/string.rs
@@ -2028,6 +2028,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
 /// This example shows how to create and use `CaptureLocations` in a search.
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();

From f9671471ea8d69242a5aac5f8edc66aabedf3901 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 15:23:17 -0400
Subject: [PATCH 084/136] syntax: fix panics that occur with non-sensical Ast
 values

These panics I do not believe can occur from an actual pattern, since
the parser will either never produce such things or will return an
error. But still, the Ast->Hir translator shouldn't panic in such cases.

Actually, the non-sensical Ast values are somewhat sensible, and they
don't map to invalid regexes. These panics were likely the result of
the regex crate not supporting empty patterns or "fail" patterns
particularly well in the past. But now that we do, we can just let the
Asts through and generate the Hir you'd expect.

Fixes #1047
---
 CHANGELOG.md                      |  3 ++
 regex-syntax/src/hir/translate.rs | 59 ++++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a813c4fdb..2c0d193a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,9 @@ Bug fixes:
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
 Fix a bug that could result in incorrect match spans when using a Unicode word
 boundary and searching non-ASCII strings.
+* [BUG(regex-syntax) #1047](https://github.com/rust-lang/regex/issues/1047):
+Fix panics that can occur in `Ast->Hir` translation (not reachable from `regex`
+crate).
 * [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088):
 Remove guarantees in the API that connect the `u` flag with a specific HIR
 representation.
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 55ca074fa..2b500cc2f 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -354,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                 .unwrap_or_else(|| self.flags());
             self.push(HirFrame::Group { old_flags });
         }
-        Ast::Concat(ref x) if x.asts.is_empty() => {}
         Ast::Concat(_) => {
             self.push(HirFrame::Concat);
         }
-        Ast::Alternation(ref x) if x.asts.is_empty() => {}
-        Ast::Alternation(_) => {
+        Ast::Alternation(ref x) => {
             self.push(HirFrame::Alternation);
-            self.push(HirFrame::AlternationBranch);
+            if !x.asts.is_empty() {
+                self.push(HirFrame::AlternationBranch);
+            }
         }
         _ => {}
     }
@@ -3652,4 +3652,55 @@ mod tests {
             ]),
         );
     }
+
+    #[test]
+    fn regression_alt_empty_concat() {
+        use crate::ast::{self, Ast};
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::alternation(ast::Alternation {
+            span,
+            asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_empty_alt() {
+        use crate::ast::{self, Ast};
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_singleton_alt() {
+        use crate::{
+            ast::{self, Ast},
+            hir::Dot,
+        };
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![Ast::dot(span)],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+    }
 }

From 674a952cf46318f07b6fde5f4fa14bca2159066a Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 16:06:14 -0400
Subject: [PATCH 085/136] changelog: start filling out the 1.10 release

---
 CHANGELOG.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c0d193a1..b51142218 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,30 @@
-TBD
-===
+1.10.0 (2023-10-09)
+===================
+This is a new minor release of `regex` that adds support for start and end
+word boundary assertions. That is, `\<` and `\>`. The minimum supported Rust
+version has also been raised to 1.65, which was released about one year ago.
+
+The new word boundary assertions are:
+
+* `\<` or `\b{start}`: a Unicode start-of-word boundary (`\W|\A` on the left,
+`\w` on the right).
+* `\>` or `\b{end}`: a Unicode end-of-word boundary (`\w` on the left, `\W|\z`
+on the right).
+* `\b{start-half}`: half of a Unicode start-of-word boundary (`\W|\A` on the
+left).
+* `\b{end-half}`: half of a Unicode end-of-word boundary (`\W|\z` on the
+right).
+
+The `\<` and `\>` are GNU extensions to POSIX regexes. They have been added
+to the `regex` crate because they enjoy somewhat broad support in other regex
+engines as well (for example, vim). The `\b{start}` and `\b{end}` assertions
+are aliases for `\<` and `\>`, respectively.
+
+The `\b{start-half}` and `\b{end-half}` assertions are not found in any
+other regex engine (although regex engines with general look-around support
+can certainly express them). They were added principally to support the
+implementation of word matching in grep programs, where one generally wants to
+be a bit more flexible in what is considered a word boundary.
 
 New features:
 
@@ -27,6 +52,29 @@ crate).
 Remove guarantees in the API that connect the `u` flag with a specific HIR
 representation.
 
+`regex-automata` breaking change release:
+
+This release includes a `regex-automata 0.4.0` breaking change release, which
+was necessary in order to support the new word boundary assertions. For
+example, the `Look` enum has new variants and the `LookSet` type now uses `u32`
+instead of `u16` to represent a bitset of look-around assertions. These are
+overall very minor changes, and most users of `regex-automata` should be able
+to move to `0.4` from `0.3` without any changes at all.
+
+`regex-syntax` breaking change release:
+
+This release also includes a `regex-syntax 0.8.0` breaking change release,
+which, like `regex-automata`, was necessary in order to support the new word
+boundary assertions. This release also includes some changes to the `Ast`
+type to reduce heap usage in some cases. If you are using the `Ast` type
+directly, your code may require some minor modifications. Otherwise, users of
+`regex-syntax 0.7` should be able to migrate to `0.8` without any code changes.
+
+`regex-lite` release:
+
+The `regex-lite 0.1.1` release contains support for the new word boundary
+assertions. There are no breaking changes.
+
 1.9.6 (2023-09-30)
 ==================

From 356d3c950414abfb5ba67124cdbc7ef3d9a018dc Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 21:31:16 -0400
Subject: [PATCH 086/136] automata: fix subtle DFA performance bug

This commit fixes a subtle *performance* bug in the start state
computation. The issue here is rather tricky, but it boils down to the
fact that the way the look-behind assertions are computed in the start
state is not quite precisely equivalent to how they're computed during
normal state generation. Namely, in normal state generation, we only
compute look-behind assertions if the NFA actually has one (or one
similar to it) in its graph somewhere. If it doesn't, then there's no
point in saving whether the assertion is satisfied or not.

Logically speaking, this doesn't matter too much, because if the
look-around assertions don't match up with how they're computed in the
start state, a new state will simply be created. Not a huge deal, but
wasteful.

The real problem is that the new state will no longer be considered a
start state. It will just be like any other normal state. We rely on
being able to detect start states at search time to know when to
trigger the prefilter. So if we re-generate start states as non-start
states, then we may end up not triggering the prefilter. That's bad.

rebar actually caught this bug via the
`imported/sherlock/line-boundary-sherlock-holmes` benchmark, which
recorded a 20x slowdown due to the prefilter not running. Owch! This
specifically was caused by the start states unconditionally attaching
half-starting word boundary assertions whenever they were satisfied,
whereas normal state generation only does this when there is actually a
half-starting word boundary assertion in the NFA.
So this led to re-generating start states needlessly. Interestingly, the start state computation was unconditionally attaching all different types of look-behind assertions, and thus in theory, this problem already existed under different circumstances. My hypothesis is that it wasn't "as bad" because it was mostly limited to line terminators. But the half-starting word boundary assertion is much more broadly applicable. We remedy this not only for the half-starting word boundary assertion, but for all others as well. I also did manual mutation testing in this start state computation and found a few branches not covered by tests. We add those tests here. Thanks rebar! --- regex-automata/src/util/determinize/mod.rs | 102 +++++++++++++-------- testdata/line-terminator.toml | 12 +++ testdata/word-boundary-special.toml | 34 +++++++ 3 files changed, 111 insertions(+), 37 deletions(-) diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index d320fabc3..ba32991d0 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -587,67 +587,95 @@ pub(crate) fn set_lookbehind_from_start( ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); match *start { Start::NonWordByte => { - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::WordByte => { - builder.set_is_from_word(); + if lookset.contains_word() { + builder.set_is_from_word(); + } } Start::Text => { - builder.set_look_have(|have| { - have.insert(Look::Start) - .insert(Look::StartLF) - .insert(Look::StartCRLF) - .insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineLF => { if rev { - builder.set_is_half_crlf(); - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } } else { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } } - if lineterm == b'\n' { + if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineCR => { - if rev { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); - } else { - builder.set_is_half_crlf(); + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } } - if lineterm == b'\r' { + if 
lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::CustomLineTerminator => { - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } // This is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. - if utf8::is_word_byte(lineterm) { - builder.set_is_from_word(); - } else { - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } } } diff --git a/testdata/line-terminator.toml b/testdata/line-terminator.toml index 4de72de31..a398dafa2 100644 --- a/testdata/line-terminator.toml +++ b/testdata/line-terminator.toml @@ -38,6 +38,18 @@ unescape = true line-terminator = '\xFF' utf8 = false +# This tests a tricky case where the line terminator is set to \r. This ensures +# that the StartLF look-behind assertion is tracked when computing the start +# state. +[[test]] +name = "carriage" +regex = '(?m)^[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '\r' + # This tests that we can set the line terminator to a byte corresponding to a # word character, and things work as expected. [[test]] diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml index c1689f5cc..2b5a2a0ac 100644 --- a/testdata/word-boundary-special.toml +++ b/testdata/word-boundary-special.toml @@ -651,3 +651,37 @@ regex = '\b{end-half}' haystack = "b𝛃" matches = [[5, 5]] unicode = true + +# Specialty tests. + +# Since \r is special cased in the start state computation (to deal with CRLF +# mode), this test ensures that the correct start state is computed when the +# pattern starts with a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-carriage" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Since \n is also special cased in the start state computation, this test +# ensures that the correct start state is computed when the pattern starts with +# a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-linefeed" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\nabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Like the carriage return test above, but with a custom line terminator. +[[test]] +name = "word-start-half-ascii-customlineterm" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC!abc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '!' 
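For an end-to-end picture of the case these tests pin down, here is a sketch against the public `regex` API. It is not part of the patch and assumes `RegexBuilder::line_terminator`, which the crate exposes for this configuration:

```
use regex::RegexBuilder;

fn main() {
    // With the line terminator set to \r, (?m)^ must match right after the
    // \r. The bug fixed here was about performance rather than correctness:
    // the start state computed at such positions failed to record
    // look-behind assertions like StartLF, so equivalent non-start states
    // were regenerated and the prefilter stopped being triggered.
    let re = RegexBuilder::new(r"(?m)^[a-z]+")
        .line_terminator(b'\r')
        .build()
        .unwrap();
    let m = re.find("ABC\rabc").unwrap();
    assert_eq!((m.start(), m.end()), (4, 7));
}
```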
From 8e13494bc898db42c64ef6a750203b1e6ce47214 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 22:15:37 -0400 Subject: [PATCH 087/136] msrv: bump to Rust 1.65 This MSRV bump is mostly motivated by "good sense," and in particular, Rust 1.65 means we can use 'let ... else'. We don't actually start peppering the code with 'let ... else' just yet, but we fix a few outstanding small issues and update our Rust version everywhere. Also, Rust 1.65 is about a year old at time of writing. Let's keep the trains moving. --- .github/workflows/ci.yml | 2 +- Cargo.toml | 2 +- README.md | 2 +- regex-automata/Cargo.toml | 1 + regex-automata/src/util/lazy.rs | 6 +---- regex-automata/src/util/look.rs | 3 +-- regex-automata/src/util/pool.rs | 43 +++++++++++++++++++++++++++++---- regex-cli/Cargo.toml | 1 + regex-lite/Cargo.toml | 2 +- regex-lite/README.md | 2 +- regex-syntax/Cargo.toml | 2 +- regex-syntax/src/hir/literal.rs | 21 ++++++---------- regex-syntax/src/lib.rs | 12 --------- 13 files changed, 56 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08cc60d9a..2813a1676 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,7 +141,7 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: 1.60.0 + toolchain: 1.65.0 # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it # turned out that on aarch64, it was using something that wasn't stabilized # until Rust 1.61[1]. (This was an oversight on my part. I had previously diff --git a/Cargo.toml b/Cargo.toml index 46664f669..6f94dc4ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" [workspace] members = [ diff --git a/README.md b/README.md index a23a266d3..f1e4c404a 100644 --- a/README.md +++ b/README.md @@ -307,7 +307,7 @@ goes into more depth. ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if regex 1.0 requires Rust diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 7d47140b0..2d08cec75 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -11,6 +11,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] edition = "2021" autoexamples = false +rust-version = "1.65" [lib] bench = false diff --git a/regex-automata/src/util/lazy.rs b/regex-automata/src/util/lazy.rs index de27a2a6e..0d0b4fb2a 100644 --- a/regex-automata/src/util/lazy.rs +++ b/regex-automata/src/util/lazy.rs @@ -384,11 +384,7 @@ mod lazy { // SAFETY: state is DONE if and only if data has been fully // initialized. At which point, it is safe to drop. unsafe { - // MSRV(1.60): Use assume_init_drop. The below is how - // assume_init_drop is implemented. 
-                core::ptr::drop_in_place(
-                    (*self.data.as_ptr()).as_mut_ptr(),
-                )
+                self.data.get_mut().assume_init_drop();
             }
         }
     }
diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs
index ddf8fb129..73e51c0f6 100644
--- a/regex-automata/src/util/look.rs
+++ b/regex-automata/src/util/look.rs
@@ -1651,8 +1651,7 @@ mod is_word_char {
     fn is_word_character(c: char) -> bool {
         use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
 
-        // MSRV(1.59): Use 'u8::try_from(c)' instead.
-        if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) {
+        if u8::try_from(c).map_or(false, utf8::is_word_byte) {
             return true;
         }
         PERL_WORD
diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs
index 95afa4a0d..d90d4ecff 100644
--- a/regex-automata/src/util/pool.rs
+++ b/regex-automata/src/util/pool.rs
@@ -455,11 +455,44 @@ mod inner {
         /// Create a new pool. The given closure is used to create values in
         /// the pool when necessary.
         pub(super) fn new(create: F) -> Pool<T, F> {
-            // MSRV(1.63): Mark this function as 'const'. I've arranged the
-            // code such that it should "just work." Then mark the public
-            // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
-            // is already 'const', so that should "just work" too.) The only
-            // thing we're waiting for is Mutex::new to be const.
+            // FIXME: Now that we require 1.65+, Mutex::new is available as
+            // const... So we can almost mark this function as const. But of
+            // course, we're creating a Vec of stacks below (we didn't when I
+            // originally wrote this code). It seems like the best way to work
+            // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead
+            // of a `Vec<Stack>`. I refrained from making this change at time
+            // of writing (2023/10/08) because I was making a lot of other
+            // changes at the same time and wanted to do this more carefully.
+            // Namely, because of the cache line optimization, that `[Stack;
+            // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if
+            // at all) that would be.
+            //
+            // Another choice would be to lazily allocate the stacks, but...
+            // I'm not so sure about that. Seems like a fair bit of complexity?
+            //
+            // Maybe there's a simple solution I'm missing.
+            //
+            // ... OK, I tried to fix this. First, I did it by putting `stacks`
+            // in an `UnsafeCell` and using a `Once` to lazily initialize it.
+            // I benchmarked it and everything looked okay. I then made this
+            // function `const` and thought I was just about done. But the
+            // public pool type wraps its inner pool in a `Box` to keep its
+            // size down. Blech.
+            //
+            // So then I thought that I could push the box down into this
+            // type (and leave the non-std version unboxed) and use the same
+            // `UnsafeCell` technique to lazily initialize it. This has the
+            // downside of the `Once` now needing to get hit in the owner fast
+            // path, but maybe that's OK? However, I then realized that we can
+            // only lazily initialize `stacks`, `owner` and `owner_val`. The
+            // `create` function needs to be put somewhere outside of the box.
+            // So now the pool is a `Box`, `Once` and a function. Now we're
+            // starting to defeat the point of boxing in the first place. So I
+            // backed out that change too.
+            //
+            // Back to square one. Maybe we just don't make a pool's
+            // constructor const and live with it. It's probably not a huge
+            // deal.
             let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
             for _ in 0..stacks.capacity() {
                 stacks.push(CacheLine(Mutex::new(vec![])));
             }
diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml
index f9dec0024..b5de2b5e7 100644
--- a/regex-cli/Cargo.toml
+++ b/regex-cli/Cargo.toml
@@ -12,6 +12,7 @@ license = "MIT OR Apache-2.0"
 categories = ["text-processing"]
 autotests = false
 edition = "2021"
+rust-version = "1.65"
 
 [[bin]]
 name = "regex-cli"
diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml
index 1dc144b31..21330fd4e 100644
--- a/regex-lite/Cargo.toml
+++ b/regex-lite/Cargo.toml
@@ -10,7 +10,7 @@ A lightweight regex engine that optimizes for binary size and compilation time.
 """
 workspace = ".."
 edition = "2021"
-rust-version = "1.60.0"
+rust-version = "1.65"
 autotests = false
 
 # Features are documented in the "Crate features" section of the crate docs:
diff --git a/regex-lite/README.md b/regex-lite/README.md
index 34c749b21..758fac6ae 100644
--- a/regex-lite/README.md
+++ b/regex-lite/README.md
@@ -78,7 +78,7 @@ year: 2014, month: 10, day: 14
 
 ### Minimum Rust version policy
 
-This crate's minimum supported `rustc` version is `1.60.0`.
+This crate's minimum supported `rustc` version is `1.65.0`.
 
 The policy is that the minimum Rust version required to use this crate
 can be increased in semver compatible updates.
diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml
index aaceeee7f..e5e541302 100644
--- a/regex-syntax/Cargo.toml
+++ b/regex-syntax/Cargo.toml
@@ -8,7 +8,7 @@ documentation = "https://docs.rs/regex-syntax"
 description = "A regular expression parser."
 workspace = ".."
 edition = "2021"
-rust-version = "1.60.0"
+rust-version = "1.65"
 
 # Features are documented in the "Crate features" section of the crate docs:
 # https://docs.rs/regex-syntax/*/#crate-features
diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs
index afcd506e0..a5a3737f6 100644
--- a/regex-syntax/src/hir/literal.rs
+++ b/regex-syntax/src/hir/literal.rs
@@ -2235,24 +2235,19 @@ impl PreferenceTrie {
     /// after them and because any removed literals are guaranteed to never
     /// match.
     fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
-        use core::cell::RefCell;
-
-        // MSRV(1.61): Use retain_mut here to avoid interior mutability.
-        let trie = RefCell::new(PreferenceTrie {
+        let mut trie = PreferenceTrie {
             states: vec![],
             matches: vec![],
             next_literal_index: 1,
-        });
+        };
         let mut make_inexact = vec![];
-        literals.retain(|lit| {
-            match trie.borrow_mut().insert(lit.as_bytes()) {
-                Ok(_) => true,
-                Err(i) => {
-                    if !keep_exact {
-                        make_inexact.push(i.checked_sub(1).unwrap());
-                    }
-                    false
+        literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+            Ok(_) => true,
+            Err(i) => {
+                if !keep_exact {
+                    make_inexact.push(i.checked_sub(1).unwrap());
                 }
+                false
             }
         });
         for i in make_inexact {
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index 38c8d88d4..20f25db71 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -168,18 +168,6 @@ The following features are available:
 #![forbid(unsafe_code)]
 #![deny(missing_docs, rustdoc::broken_intra_doc_links)]
 #![warn(missing_debug_implementations)]
-// MSRV(1.62): Allow unused warnings. Needed for the 'allow' below,
-// since the warning is no longer triggered in newer Rust releases.
-// Once the 'allow(mutable_borrow_reservation_conflict)' can be
-// removed, we can remove the 'allow(renamed_and_removed_lints)' too.
-#![allow(renamed_and_removed_lints)] -// MSRV(1.62): This gets triggered on Rust <1.62, and since our MSRV -// is Rust 1.60 at the time of writing, a warning is displayed. But -// the lang team decided the code pattern flagged by this warning is -// OK, so the warning is innocuous. We can remove this explicit allow -// once we get to a Rust release where the warning is no longer -// triggered. I believe that's Rust 1.62. -#![allow(mutable_borrow_reservation_conflict)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #[cfg(any(test, feature = "std"))] From 0689353e43ef958f7a547d3c796f59f03c08b531 Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Sat, 15 Jul 2023 16:00:21 +0200 Subject: [PATCH 088/136] fuzz: institute sane limits for arbitrary-based fuzzers Closes #1043 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61570 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62436 --- fuzz/ast-fuzzers.options | 2 ++ fuzz/oss-fuzz-build.sh | 5 ++++- ...e-minimized-ast_fuzz_match-5990349284442112 | Bin 0 -> 169710 bytes ...e-minimized-ast_fuzz_match-6114393576046592 | Bin 0 -> 51466 bytes ...mized-ast_fuzz_match_bytes-4820641084473344 | Bin 0 -> 47681 bytes 5 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 fuzz/ast-fuzzers.options create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 diff --git a/fuzz/ast-fuzzers.options b/fuzz/ast-fuzzers.options new file mode 100644 index 000000000..678d526b1 --- /dev/null +++ b/fuzz/ast-fuzzers.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index f96474739..81f619dcb 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -14,5 +14,8 @@ targets=( ast_fuzz_match_bytes ) for target in "${targets[@]}"; do - cp fuzz/target/x86_64-unknown-linux-gnu/release/$target $OUT/ + cp "fuzz/target/x86_64-unknown-linux-gnu/release/${target}" "${OUT}/" + if [[ "$target" == ast_* ]]; then + cp fuzz/ast-fuzzers.options "${OUT}/${target}.options" + fi done diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 new file mode 100644 index 0000000000000000000000000000000000000000..8de974975d4227f6da75038a92caa08dc6feac23 GIT binary patch literal 169710 zcmeI5F^=Rm7KRJh#20YS8C(koK0yPUj2C+e!*FE4FyO?Q53n875h|J!IGAH7oowR? 
zeE_4EeAFZPC_VK^WYx~Pzk(sY$C5bG&yT-IRdxCF_U-fE-abA)o*v))=ed48{^Q^A zzsE~@{e3Q1i*Vd({Z{3;2IE=xpX!r;va6?ho%grb9?zwIo$BOPv7AY0+`ocp{jJpt z(_J?k)Sa}9m-a#*s9=|i+w$H%sQz^gLmAKI?l1ZG-*3PE`R%v=(mkF(emp%sKm2sc zuXi6FPt{eFaJ83I&2XV*&&8#KUlvCN1HXtM_+2im<4Sq4<4S9tGP(`!r`G0_xSyp5 z{Nk_~@)BVk?6eyEz;6g_%SOPjo`YJn}wDekxA=wOW~sy648xSz!k{0_plJPiDB zzcjd(Y}}y+{J;gCF=ECv5Xb@MG?m2G=Gg zw|W!&zz_VI(Z$juArZDLJ(&At7$jin(YVF&RP@v}hEswc_#G!~^GNVx?w1DFCMLIf z^A+Q_kW{n?wW&MSw$oZAKYOrH@_2;^?A(R5n!B;Zkjdb2-kKx95BE!hs{!LaDfodO z_%&mV2Z0HRu)XFeXq00UJekZSJLU;Z7u+Y6lhsL{Q!@86UCJ-6jYAXsRxD+QXIL;7 zS?nl&(a*6pt;XIgH{dtS->|n8dsEsb4QvU1;McE!Fcq4yY}MIRCnW~Ie(~oPL{z=Y zIwd#Xf?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp z)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7 zANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCu zf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&i zmaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%f zf?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rL zuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchv zAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4 zo>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M80D{q~=f{Ob^)WbYSnz!4(} z&Nj zYc5wWUDe#abnwess$k$3aeV&x@$~ro@Y5;3-hFsHWwGi2{Ax8; zoz-!ryI5P-TBiiRD2#)hR)e3V2mIo&8S)Zgl{2RVKkyp@+p-bxd$w%V*;FSb#{HHY z9jp;i9oN%p+^?V)Bk(&2+ww5*!~Nu$&JvB=)&L&h2?OzY_WfVr2Y%q!j5|K>NJxb3 zHAg|C9Gl?DWG2}$L2bI=KB=6nt}dLCMThB979Hcz1V6`8qA-)Fv)Jsgl*O^OBK9L6 zw>hq-)i1l>!^4N)^>upq&;hx=Ff@0i&4nGNHt$eQH`u`&gb*5KmYQ>_var& zq_1Vo`M0y#Bj)GxU2foVwt|$3<&-aLXu+JbRnql-9RBg@ph`Fi?{fX}@7wdJPAP^{ zlh5huh3T%F4eCx>#*1YLJJ@M8_$e-_1iv_JhP*^j<;*F;5B!F}wrm9a>PdjbjGOP0 zm%%mQ2YzYL4F>Dengc)Z1HWdhu<}hvgzYs)L8Ba-;K^hr*|AD$y5K&koUF6t&}x<* zrb~HrKMqasb1WqaGl@Ej%??Xh9BV6LKl0p9ZO$oiKMM``4YNNH*1=Az!S7Y#S6%cl z-1amCn%tNZgL|kMwlDaBANV!HiH9-?iLkxqC}@;p6Yx;R$T26Yvv*a~YT3~D41QTq z6^x%RiOUpzzJ$LaY|Wx0mre{Ml^b#A&^ z&CS=)$G2gI(U*q4a^{ra2Y%y-GcDnM+g!RLIj6+^ zEHvOZ%>G1J2Rp3>Kkyp{oNWmFxcR2RWx2Rh27cfNe$8Ox!An9SY_B;A8s*poJa{p3 z%*pBuUDdQ&HuUjj`=X}`#+U8mQiLzt^JV+iEIM)-X1c_6N>;wbjAFEE>$;1zb***E z*l|4fbFkBD@U!%QUx(k>Lq%BS%qhVS{46!YGVpVjlTnyS)LCqBKk(~UK(GqUEbz(| zQ;tjke*NOlEr_npO;@YA`5O9IdJLm44SnUzDZvl?#t~;)!u?+F&G$NwssKbaz6>5A z;S;DVjta&{0P#k}M*uv7ZOx)1=T20+aQ2)OOiezgECV}$vxkbJ&iq@enTJ~p!Ot?( zk%8a;#{D|FOb?Z=3yB>YbvD&Wi5WgBVRtE_s!cm3?p@G}5%_gurwxD~?w1Ca5^>iO z{J;MS-pEM;-5t%&`|SC`f1oD%o5(171C`x9Xu?6eyEz;76EwjuE2=9>nW<>F2m_<C#y4bRnuzO(8u=`i=HYN-&c%F5x%d;XTGgjbmTJ3 zbcySftbB_Z#c0*mbr);vTI-at<9P1pV5imKXXydI4!^U9im=L=Q-UA(S!#x5;O8tS zqcD@Gv)JH%;McE!U=^BK;FT+;9GL+8`o*7H5M7;{u2ysNHT1Fc7)DiLkxqC}@;p6FiyBBs*3~O&8oJm6LVmd|J)i z&vYqEk8x;%pJORem`T)GY<5`6;#gY|`;iab)aINL_p{J|-!S_VVIAzW8vMX-7;v^B z@Z;v22AAdHP8s-tANVzcjR!9YiLkxqC}@;p6Y${0$T26YGjvtcYT3}om+gz5Di~k3 zk4q80Y|poyTeIlMWtiy_*C|=~7Bh;`s;%oT*4DMwDPza++|R*ItHICG1AZNTXAc!& zl{2RVKk&2E49mdJSx!b_CQ)ax!TrFmUje}?G_$}fS4=rF0r>TcKer&dIyYUd=H_eY zdu>Y(d-3uBi1pG%2ZvKKW-K;XHkRG5fG`!Bv24}ZR3{|{zkc!O7DQCX^|TuIE9k`t z{5~`M9v(jYuCLR>AMd_&UvIwp^Q5ouVi58#QpML&R9E)R`F#HM=U;yK{`_Nz^tG%x z|8_Qe#Qc0dS!P1Uhi({&g#sg~hFf1K!s-pnDZ%friJuy4AgH>)mM6Y_l*LiO`1(h5 z3cmiql(sdCj+{GD?ZVk}QZP07oH7yW0L~sNiaHY@t!5%*F$6!$P)7!SxL+Dv9S|Kq z6#T#s{F?E@$~Peqwyb=a`(+p;VD8tr#qm`1)HQ}vf*<%DCv5Xb@MG?m2G=Ggw|W!& zzz_VI(Z$juArZDLJ(&At7$jin(YVF&RP=O~99j*2;CGy`%_G5&xnCMwo0#0{P4ELh z@M}gFOOJ#^*s}Cs?w4VZfTc&{7ROW3Q`ZCL8eE&0-0ICS`2BzU z0iX3S$)c?M+MILO(`xR1mK-Ku!`x_${n+lAv(;YGl~lNZ>Y(;=SbAo0R51S3EV?0o zYQ~?MwPw+g%P7;ktW$FH&0ZwIs;%oT*4DMwDPzYm_j9n*YVfo4fM18-*+WHG<;*F; 
z5Bw}O!!q!zmp~RXerb3ZW?nM_&Jsmg_%U1#b$@4ERMAmu^;)o!*M;W#{Dcb;5W?vL|Em_DZvl?h5=_A z0zYoPX>eIC?v#Na_<>(D*m&@gkOYCcNnnk1HsQ(Mn+*KQD?Ei{kECf@vdm=xSm$yewIM++X2}2uHg52-A|w81VXCu z)$+o_uPlxV#>208XX4=(Pa#^f=*YPf)h?VpCk0cJ&nXk34&dyeqNq-|osxOD#Sr`~ zLme6T;eKgwbwG6dQ1Amk@N32oE8m1f*s}6v?w4VZfVp4e7ROW3(^+z8HTZ$wal$r_ z1V83}X>e^~a;rDN5B$Kd8C@(r5)xs{(u28QhCu?B9*tWZPeo5%V>l)Ff!}e$Hje~9 z=6-2#ZDMk(H^C45z^@ryEIkquVaw8kxnG7s0+t?)TO3bCPiM)Y)!+wy#|hg!68xC^ zrNOm{$*tZTgWqTQ|C4=5vN4yQ`mWio-kx<97tX8Ny**2&y;&R;j6ZXU^AP^bg+Fs? z&7vchQKs`-r)1ijy-0#pTi0Ezt!u4Q#*X8+UL5SS8vHCh;Md`I_D~U4Ide+z`)lH- z)5<_Zb%BXN8}J+Ej3TUqomPV%_zeTjHUxh4BEe$DuM%2TZjgZ=_@zO21Grvp z68yjq{F-sXC)5dvu;s}ZOOFhL1gv};w>X}Pp1Q6P960r1W+~RmDdg>a(DPP9#v;2eX{{n8r{}2EG literal 0 HcmV?d00001 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 new file mode 100644 index 0000000000000000000000000000000000000000..a34eeaf2c0d2ba02b5f35ae85b06cfec4f1089b0 GIT binary patch literal 51466 zcmeIzv2Gi60LSrbDi(x?A|#eFAdnbH!Eqc1q*^DsWbKp=nM~z~LNJv8DH0M=pCE5w z1`ln=NGv=;o}v;9D?Gt*=S#x*n{Px!1ysuKKREt+ei}o6-qvq3B&eq}PMt{5i zdv#vdZZsF#FBawEhs9!k-WQAan-})?o5$|?>ftwEKc0N|=<{w|{`4m&Cr3v|H|u^} zpI?rT$Fur;G<#9KRkznZKD^V&H0QVWOXsnus`7qyx2h&`ewxqA?+@qm`3KJ?v)Oq3 z$LSmE&w2OD$?Q@0O0NEVttjqn4n|v_)^91_C~g(&r>FTAH*OVgo`0*`opZN-TXXjH zw69(9aX1_nwbsGG!A5_p*PN9%{`K-;`Qn9#d;j39{q1`$d$7t2SzmAC^6~7CKpQXpGqAjEH@CFYV^`X{{0DuRSM;M0 zg9H$pK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y* zW{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ? zkW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd z3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P z5*Y*W{^w;El2>d86;Cd3lczVUU#HZlk3iK z&we-9fLY_p0A!?e||C<)DNGn f=FS>-p3BoI6|S`Rdhg-iyUo>x&9asVEJiu`sPrYNauaQXt@)zBjvj`?=lw z?)f{pb-|dnAAYX*q}F;lx%~f83^Ia z)560l(1J&Yhr_+0%gh~{p>v}^BMQ&$%X zk&slGBeiHw4~1@ZB9s$m#6FfYrwYlV2%mn2YT*8X}+z0e%pI0Y;Rq1qGrgMqBwW(B}d^0A>MK zsWESolIzFjo=G%P^N5OTnQU1kkwalwj1MrzB0rpced0`WXl(U#%$!3CNiY~KToI`-2OE%aI_D#P^|#8!x_cJ#qJh&hpXxSZ;@%CoH{wE*VXLm zkRn|#2d^u}k;wB%pxa#!xEOE77l1u7)IPo^gn!Gls7$de#b`z};Dl`PE9e&krfJ+^ z2PkERjIPIIg;iJ^=7U`9Uc7j5^_{Cup2Pv($&*43^@7Dub7>@(Wl*ZUQeHLDj1e9? 
zboTC!*_Y}L%z5#%+((A*8QA`5{ku!fTz$)~Cm*@%qwgGfs&M1_W{2yVy$|1Z{Jx>9 zcYosU{9xg8e_1}a^X&I7ugl+F=>OGse{;vVdme49e|Kf~GoSX%-TA@nm#*5oa`j5j zsc8!~{o(xA&OLF9yX}p7qj6)0+qH$#j~;n4p~Os~N6#k2ip%$;3IrppmcW}B z8uUTrh=}7^fVQnI4L-D}06uG1-UKE3pSyCq%sovb;ZRptC=G%Ot#nq%i zfL7>=&*U)qlh7h`7z}ixLF?Hwy$35IdniRnXeO6XdK@GJ{&Df}uAZc_mXM27VEc{> zHgXZj_R;aqNd^PIOp3CaKv8Cr0xucB$ym+gkfyzsm6nU;RgTJX(Si$HJRCR@H01Lk%vOf{8NR#t+HzmicdW@_pQ4g5n7YP&VsfL|%_=P?B(7rnj6QZ(GM(cRYJ zZgBbBjWV}9tEo7ORXk>d_k~#!Z(7)iW*H%uDb4HIcfbfsbq?C8POOy@P^d5`=sflU z8HFX8ARS<7uOKv+(4YY{)PL~72OeY)kPP;0`%!zl$KU8{Z*Ok%w6wQ-+dA6drLi&A z!*AlDlp0OO=V6S>IuORJw7$vMSFe_2?Y$ECFpi(zMMg-Z;WY*)Btv-~mn@;2>f1n6 z5I+e7n7H8bxP7f156IGi`~|5ZTU1wn_~G?9^j{VVJ&po_f(=YcqAvw(7UE-NQbKP6 z{jFzPH>5n}E?V@HLFUR~?KL$>?3}qhoD=IJdXAuSf5b zQF7*yr}nDsXq0~QXe45D8h=b4Y)+Y@;}l4q0t!#lvq}A%#7bk_l&E%_nuKs91@qjc zZsUY?$}`*$;YN8}rc8~?(yOV2HQb{SUZ2p2PiQsDtm?Z)dR0q-$;#3P&bGzKk7o4C zKz`JTj|WUk7fwZfEE_!j6pE7?B5)Z}ks>FI3<+`p6A#QP8*`gLGUVX#plxC#@wQ_7 zZSZ(XdrWiSs;Pw)}cP)lrsQp^3Ehg}UaFC`{jT=TsXYX2?8_*`- zWWQROkR8VxE}b>0RO4v4G>L{vt*wjLet!0VH3c9%E(IrJ?iI*ZB^nl9CG)TbAg~V&3t(_S<@H>;o03R3*x~d+^pFYt4{h#-UD0{+5fdc_xT`rg zv)NKGrGRFpNhQtPNo^!-QHt#V%7$d5$!VNTL+Qw1itJ^TLS5<`nKIb+l#LQt6o9~9 zwrHJBny8au^L;i)X_V~&9-UDdV@GE5{7Dv2Q{^SAZrMX#Q&=c3|=b7*K{R@m&tViF3TMjrQ^6OyC+^$*&js%z6hoqM&V~p)G<0VRDR|pdM0Qr&zz{O6_xhD zl)N6Sc(L|mtrBKx4-9$fM~MO((@F=<_`aA{#0eW|>XmrHeyY*R_y!n|jaFJ&NeA_Y zg+iMzNnMV-8{sf}bB4pOu<89ZUNY-uzkh?D`u!tx06a&e;P6}%wI#4Tr`Cp%)@pY2dIUfystDp2iY8 z>E*=gF{V{xvVA!_zYc4={OiQk-y_kRQ_7&}h&Lw7?2B;$I@tgc6p6t_FUnYw=_VFM*gCgRvZ^+b zN}7@39tQZA;T;K-=(xw8xhSUWuQg7eW^af9^8-2L3QP63S(vIRVm%vB)v4NcH_jy8zQI+7a+RT7UDd0K zaHNBy90t&a;%bAOKpJ0cQ}@EPhAMhqWD{yiqI$v)8NoPhr#x0m>8diWH55DY@dla& z>KH5Jb#%^F+r4_n@fj(=wT2og!rwUs?4f|d(qs6i#zA7G8V9f{@v>lQ62h*UYYk-= zP$Yfp1lJl0b&}saAy1Xd5@Z3_8X9{iL{Cpc6W1D=nHb5nhVmP)qaAh@1_Psjk{_5% zYPaMhopHB&xWu)Fq7qH6H8dNHE6Ql4tL&!gfegLUO|DVQysGiZ{&Xk%jr-SA)lxBENf%I&)1);Vt~C_YH6hRfcmVlfjCS!b5X|LumzN3EOG<;N zQ{e!GV{JRc3JSLxGgvHKq$_+uw;MM2CwqmjZO6K})=;h)Jf@MS?7;hYMl#;VV~+r& zh`aHF_wn}j@m8u)xzOD=lVM$!_E)$|Qpq0uos>4j?^HzA6t6W=4FZFV%#<(H<{Tn%U_ ztDD{i-2^A;Ng;=NVLHukah0VeGP6b(45JoQz#zS7!m{aOZ=0*ZEjwDa8&nV}Uy4Wt z-I}H`OE?#Z;B-3Az)u5^e?sc&;O;tmY)M{#dV^mf-pQClkw;Q*@5p^-@znH!vWw>ridtQ*M0kN zxDXkc`l!wE-Qj9!EOBja2!dc-QBh9C{5^%Wu1%h;P2w>zPo(4+Dk$8B4+-Y%DRi-K ze}m|(6RUhGHngFPsZBxeD Date: Mon, 9 Oct 2023 13:31:11 -0400 Subject: [PATCH 089/136] automata: remove 'is_quit_state' debug assertions It's not feasible for us to check such things when deserializing a DFA, so we just have no real choice but to remove the assert and let the search proceed with incorrect results. I had previously wrote these as real asserts and then swapped them to debug_asserts, but of course, the fuzzer still trips over them. 
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60652 --- ...tomata_deserialize_sparse_dfa-5415338693754880 | Bin 0 -> 992 bytes regex-automata/src/dfa/search.rs | 10 ---------- 2 files changed, 10 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 new file mode 100644 index 0000000000000000000000000000000000000000..cac835c53eda6ae94833f300257aec639d5d6b3c GIT binary patch literal 992 zcmb_Y!3~2j40Hfh;@J&O?7)K!I#&1S7L~dge>ewBDFzw=CpmWR>`Uaw8&{ztfLuht z9OEkMlRm@GPMN61D$F&Cn6+>ZrBLm@n~)%A$SW)_>nMgNphd}`#b1aGREYySj+v-< z*XI>_Ifh~0ogs77L89<;!BdWzdZxm)BoN!fKgw;ivK4+3tmiE)FJRP+7h?(Y-O4>9 COL@rv literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index 8c012a594..5a82261f9 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -176,7 +176,6 @@ fn find_fwd_imp( // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -297,7 +296,6 @@ fn find_rev_imp( } else if dfa.is_dead_state(sid) { return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -600,9 +596,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) @@ -631,9 +624,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } From 39d8b45d0f485376f77fdde316210d7d3fd0e587 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 13:50:42 -0400 Subject: [PATCH 090/136] automata: fix invalid accelerators It's possible for DFA deserialization to result in an otherwise valid DFA, but one that records accelerated DFA states without any actual accelerator. We remedy that by checking for it at deserialization time. 
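For context, here is a round-trip sketch of the serialization API that this validation guards. It is not part of the patch and closely mirrors the dense DFA documentation's example:

```
use regex_automata::{
    dfa::{dense::DFA, Automaton},
    HalfMatch, Input,
};

fn main() {
    // Serialize a dense DFA and deserialize it again. `from_bytes` is the
    // validation point: after this patch, an accelerated state that points
    // at a missing or empty accelerator is rejected with a DeserializeError
    // instead of slipping through and misbehaving at search time.
    let dfa = DFA::new(r"foo[0-9]+").unwrap();
    let (bytes, pad) = dfa.to_bytes_little_endian();
    let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..]).unwrap().0;

    let expected = HalfMatch::must(0, 8);
    let got = dfa.try_search_fwd(&Input::new("foo12345")).unwrap();
    assert_eq!(Some(expected), got);
}
```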
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60739 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61255 fixup --- ...ata_deserialize_dense_dfa-5883983265923072 | Bin 0 -> 2734 bytes ...ata_deserialize_dense_dfa-6363062083649536 | Bin 0 -> 2735 bytes regex-automata/src/dfa/dense.rs | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 new file mode 100644 index 0000000000000000000000000000000000000000..233fcbc950a61bc614dc0e0a7418724fa0c36c56 GIT binary patch literal 2734 zcmZQ%VEF(4KNFB(WMp6fqESE~1R4m) DFA<&'a [u32]> { dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. + for state in dfa.states() { + // If the state is an accel state, then it must have a non-empty + // accelerator. + if dfa.is_accel_state(state.id()) { + let index = dfa.accelerator_index(state.id()); + if index >= dfa.accels.len() { + return Err(DeserializeError::generic( + "found DFA state with invalid accelerator index", + )); + } + let needles = dfa.accels.needles(index); + if !(1 <= needles.len() && needles.len() <= 3) { + return Err(DeserializeError::generic( + "accelerator needles has invalid length", + )); + } + } + } Ok((dfa, nread)) } From fc9a11a452adbd262d63990d6be813b577b96687 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 13:54:25 -0400 Subject: [PATCH 091/136] lite: reduce size limit to avoid timeouts Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60779 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61434 --- fuzz/fuzz_targets/fuzz_regex_lite_match.rs | 7 +++++-- ...ized-fuzz_regex_lite_match-5690981331369984 | Bin 0 -> 133532 bytes ...ized-fuzz_regex_lite_match-5888324890656768 | Bin 0 -> 233677 bytes 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs index 579078c71..155fa6d8d 100644 --- a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -57,8 +57,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .dot_matches_new_line(case.dot_matches_new_line) .swap_greed(case.swap_greed) .ignore_whitespace(case.ignore_whitespace) - .size_limit(1<<20) - .build() else { return Corpus::Reject }; + .size_limit(1 << 16) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 new file mode 100644 index 0000000000000000000000000000000000000000..d892bc31c496d70f689adcd583a32f42896f5fb5 GIT binary patch literal 133532 zcmeHQ&8j5HbsnLO+@|;1*$jx0rgcTTJv~3*VGxZPc{@U55YlXfAHX1kp_#|<6Zi!T 
z-dHf&NPE43Hrn_#TyaiDeDR$;aUwFSvTok0yo=weC!VMfg{6gWy9s_U%3(U{Y6CC_ak>g{ zNT7vG6vjcUo^TE&0f0XlJ%Ng4^qhb`a7s~8?vy?|<5c;kG=aBMRX=vqFlN`+Y2*vK zS*?^sb1B;|A0SteW#}7z>NS-X@`Da?VH7k~oo*c0@iVPXWE%-63X)LIx&=txPhCL5 zrY$nBvXMzF33Hv65JA5QD;vOBEt`SoY%uA;3`V3XWTJ2^#OetLo~KMKY$Slz5mEsT zA!P7e&QbKnhI$Hs`v5wn_fzGY5(K_NBMrS!$xlx>@Vu_7fkDJ$M!pWkfx1CAtAUqB zeb~O?r(RQ;K1hy?`nfO)s)d>8t}$QzfHNK_`Khk3MFI+gBowZ0f%4x99uh88^+!A* z5yePP)(urXKj^>0t6Il5SqT_b+0Bide1h+r3KSZuLJo`(t0(M3RZ!|bNRA4mv~79I z=FQue(_N!=x0r_RtK9Nk7`fvX78d(A98<%G$Clpjj7;0EIOkLFy%?oM-tVxu61Nnk(Rs%1M`mlY&LpA+pbBOUx3B-&pX#g`uq$*^ha8ShZg$*(h z>gNJ@FzO+PBu2gt!RCk*S|lI={k)b?%2l%2i%j_I(hJi%nxZ=W1E-D}R2dz|7)e8v zA!M{lsU}a8`Jn$wM^1$-`-W|IVIQ0b5Aj5WDC{=RBtunC0dU`BC$j7;MKl%*Vy_e$ zs=ZJNa!=TYDq2&JL2>{Pqrf19%*fZFI8Zm}W;O8As1MsW{M2hIErjIAsGkd?pjw@7 z9M|zPT|tWk6a`7BXWarM@Q|=+i_EKRWD-llT&E>O&~L)Z25?qWJMf$hCOw$Z(es3q zbcIe74u)7g;lTB}PO|us3vg!!&VdX0>dX*)Lzu5 zwcC8UmyfhP{bl_13H)G>Cu<|YMNap>oML0MRLgd%_gk<^%8qo3x?_wUe~TXA8M!O%x>7saOvqC4Q{V|*GsPC zZWkA^X)$P?p)N`Y`eBjU5@EOqPIVTn7)yM!tRP|1CH5{IpnHzst^Y4icg6ji%V#c$ zK~}C6YyjbG*7-3!se7|{Dj}oJ^B@0lN&P&-+QajY*`u8;p6rM_i$V7vd@?^gf{Y(3 z_}<9ZiP9rhA_2`GNvLPtBE7&v!W>?Cl2+LW#6_yp5-Ky{U~ho5JTVAt);G8QNDyh$ z+){yrL=?_>&a|Pb&0q@nCOfsy&Qh^}+Bc=K1Z7Q#NwpX1B4Hn@f>P)~a#X;iv=hph zj(k@4aZejfSRU>+4M~6u)x}sRo~RIorDy!(ZRtjqXPJVWjOy+;Io`B_*)RK?R+V7F z`Au=GMy9OT&52Ic1T`M{I)3V>4~nEZX0S`B)Tc&f68&4DH%RX$+33!_z+~V6-UIMZ!5Rj_3YC5&$s@ z3_{3^d>x7d`GRg%122vGuzkbs@-}zIiSkW}Rz}APFi5IGCJF~dEMGYJvCY%Z1@K_h zLk>xdd>w+#5h=7tKmwvq%}nP2>Q&hsbYzZrVuC0J-DKTR)$;?wmQmF?zR60!sH#3f zsS4EDhk;5M}WHcQzhmIyt z4GvEbG9zDyVDkY}p+y3EJg+6DBfN1fyS3PjU%@U4RgyEc0T;kzF&D5Ul=-O!P$ev^ z>MJx4B#2anOq7Ca z&YDwLIPko#s)0eoV@AGCBSh$CH4~8t96$Mxva-_>0$QWSBB1L5NoehLyLP##Y>;j4 zB9ZD$bCsEJIO@btV(L0c2A*?QvqD2v$bm6p^@Ib@>lz85bu^`FI1{VTb0PY`$w^V} zl-^I3Z%ShK6_6lO6*5r@t~qN?Vd21YKRE!1QD6{4X5{NMLWFKsGZCr41LR~>mCQF> z2^dw?M|;o^4>Duc&e)G8SZheNj{kGrVpM`(k0?zc-& zsn79!5b~+bv^jlXql{DLk1ql+`B=CH>_xovdNkFJo9SGx=<`cM2|aV|nw}&wsAYRY zFt@0CT1Cpbepv_0GSHJA%FvmG>|wYrKZ^LeNIY*5x{Vr2j(F86NP%_Uc{$F|gA;uS znUSyKUlNI`UKd3IdSyfs>QuMNPFp}BVRQXfUchCQhO&?YbHwTi`^gcl>GE*s$_Y>r z8fe)GcxlwPPmvVmNw4g~kHVJhAMY2RGHfzarAKl!*qx&0m!h^J3do=ao=^tbWA%o{99HZ!s z4fXT@hcc?>q7R@FFsiDLPzGah6=0fB2$?7x6tQ~37q4ny5b>Ci=onga*LOfcBOkDm z)xb+5E#&bi7rFQg3UCM^gFg-jjZf!N z&aPh7piNL#GxBvP4%8mHSq;22>NlfrxHFxzGoE<9DS_xK06nP+nJ63-v3y~fJ}@t% zelCoHYGEe2Ys|O~!RCk*S|p$-NJ2g979fF#gpEJqi3(AS^km&o)zbqU%Bbob-()3V zR8=3LRQdKz1qv`tszN3T2Su!&aM09KkU??)u-tjtU~oLjU7ysUo09vwNjwBOyfJ@# zAZC5TY@vC6MUM}}oZdCa&$n;!TdA+~fW&^Q!l>pL-lzTkNt$IR-51`1X(@92p+aDAk29%dGUPFL2^V1>(sy~&~ zDVD2Bp`j||z!-kg63~^7B-E*HmCYDQ*tA9FRW>q- zB~<_zv7blpSMxsy-n0Rn)q)*(&IUO$b$#pKqOBL+_LhnFO9{s}N0rO9D-O=TN9LLvOwx;sJld>3?vIyRF8%oEtJnB-C^0fr#Qi^RniBE^ zqZxu6zBaJ(*ty{>k|(~^=ez8=_`dpSr~(&*JmhVQdLIYLvT9y&aDKTzdiB{ypMCtv ztIu9fUlpJeYdD7U5v*Kbo=U?ZL9%Y0HhX+i1t@Q8VGWnMtMtBiXK1T~-OJks51`1Xx*Kme%69L- zdy=5}H{qkZ_M#gJC<>BLf4T)opdew>7MWMs$Rw78xlW!$FbCeW0i4I<4CDey4vKJm z+H9AfHj^q&eUCcEqdZNLu1fCUM?r(b%OchW$8pI~XZ@Myw6zGKHoi_Y2Sqm$&`C`a z>QA@IM%@#}B>X%1Yc%-t@~#3aB9OU1qHrdNH5icKhCdli1$pqv?9<3pyRD40aEfDZ^?t|U45WU;EuP{LS;V6(ngpe8eIur*&gl<*?FOB-JeZx<^rrw&+&jqrHQO+y3FZkZb*YPtg z`U)))kbo|~#7nmbgue5iAFX9FphY>i*ZgaQH*a6j!wdJf9gmxOZt{jMq@Lg%V%Bf< z`1Q01tuLUbIW!^mb#V5=sA==2pd(w_1ukeK|DzxQMe&z{6~DS=^(qp1PhWw ztXZbenKG)Onn;#*^D4jWv%lPulcGH7mHi+{uzXXqtUv)HQWY{$7zeR>!gW31Pe%RZ z7zNcr8Qr7kxDLgE5Mdx=2QQ8KuzkZrH9fwj=#*hp0Ha)sRDDqWObcJFL;~7KwZzm~ zu#ZtKcH_S|qv@M+k^UhI7~N4m7KNJo^onbSR;p1NgX4CpniJh>P~-z6Ux(_8e9hIs zOQTJX13Rv%Ktpn5)X#-cP_0fkj_Wi^h%!Y2ih?B6vu*(rcu3f^1@T0MC`Nj+Zm8<% 
z0S;v}WjV?ECMyA>s`?0}TGM?~fdWjEs*s7oK@qDb?9WsMrT&BDs6g_*{gnp4Sr75#G3#-CFF%uV5F2D#@AJfD2%< zmxynp99ChL+F?F3J1JAjuS)rjSf+;k=6m`N$Lc`N7>Vcq1*t7-lM1?3u zda`b)>gfRvWmI(zo2(dCC8IZVwU?Qv{~n*5hyYO-GqHSOac3M&(1TEog6!rQnJjvI zkLK$TY>r5wMFJ9#^LfAJuGErjJ?sQRGay?};U+VByw)|96)lRELB~*^Ks1f`WKUEA z=^6ict*epcS*9Q-qk1jiifsEUOxZ8{>@N;}Q_Q;0m_;*I>#=WYmKBpQ*(KFEg}pBl z&Us0Qjpu%HjDqY220igWhvGnpu##PYmqvZqzTu&owqNV2)H5#5sqZz2AAyppkcq
WzFI zKlRi3OsZp6+n}ajfAO2|eDB4(*FXN`n|B|7^Ww#eXYXE~#vV=ix;+V2e>3&AXm$WI z&$6NSPJh2lo0spo`rn@f6(J0KHyfDSKIP7oA*XV;9V?521+DwR>@^Cq>xb$|g68Wq z8dO?SbRz*>?MOoX=~mg}<(L~HikTvlbwgFpK{|btwwKgQvAsNZh~1uNh-pGGrxeF? z)||q^K2*_~f(())VzSxd#f5D1cmpYCV)_4}mp}IP2GS7Z@O_oz4W#9bvZcoP6@8>P zkiOh9D`U1LcW?4owyKKtVC zdS7@uRG)3HWG@`-4g5l7IHQPpd*!3>Hr$HRj<}x>J0B6+(f#+CEt`fbo-!zLdbzJV)cZ5s0vE`2gy+ZlhRHoV>-9@>%&u85I&U&IfOti4R~pE zuPrP2Lvm!)&xKJ?t&4PMszsxOC{rY$C`dxl>J}go0|}e9$h^u%Cb1;UwN|nBD~8{M zl?~v0>?2ruRsD*8i=GZ3VKQ3RxW-8LDbeXN=X8_I-udw^irQBt+g%j3E;Lrshptcd z70nRj@UrkpT~bXqY*SD6UY84kUfCYkQ9Hk)1IfAN3A~Ftg=hE5(FW^NIF!3vTgO+< z`s3Bw(r2gB966}ujh_3ok29}c<3_L;d4hdcf}anbU^GLJgA;=xeec-0;Vf$RpPq3) z!FB9nkcaBEMZJ%Uo%wpPJ>~pzfAs3Jk3ReOlUJX;rVEH+=;tF?d6<4G4Tl7kqv&vQ zuO|qU!K3kutx^|>YY1|<)Lo_buX%8M_3@_zv*`(z@nVqAZ<>JwdnJD=%T+KnQ|_i4 z(|q&0f8*7^XsHy^_DT46^4IuG(DQrv zym5VWrSzEu6HkqXi#r*TwfDvYQTWnQaRl~CL2*e14QLonwQ20kDx?qFUYI< znO#8UX-}P2IdTd$PT6KsGZDy1bCi?SwBfcv;5i%QMpOUCDb7IRxDx#-;G68M5`L9@ zQ&Kx$0c}O9LM93aMXa81;JKe10K_OT2q81_btn#m2;Hm(UK;gb`-Y!-O^+qsTnd=B zpsGs~5yePP3$vkW?>(ReWmI*JZ?Y0Fs;Ym}QHzpn{w zUenpQTw9qx+F*TVC3kOrlKxWb|Mt`L5X!!#erxOa%2~hRyGiQ{=xGj3h`Joh;T8Q)|I1MzvV?OAZzZ z`;jR`^&hkv0JcY%FpwdHUVjIu#0nXhQ`qDp;lT6M8CTB7D)gLyK5%kUlqbEij{?fb zPnB;fP=HlZ6*5s62eEp>K2!yz{ufp?FsSwzta`#!s18OH2ee=%y80dO1l2bs zd|#oVD&)Wzv3kOR=PAe_IRIF07H%*&e*1HMQitv%@9V+HAAbMmZ@>NHU;bKdb9)cy z!du_RTMO$OW(&=sCJZaO^7-YBe3sCDxyTkZ@@-;)$7ta2BF9>2$Y{yY1ggQ|2|{M% z>kw@3EFcollZRSDDb>*(AZoE2zk*#9s${5h=A<%Ha=A)K@bUsr2Gc*7!Dvw!i-dis z8m0OlI=uo^J3?gO5JFDkanjtl;ZHe}R1(Ex5m~qWw9{n{&5=9OL?i-V2F>WFqcfee zGoE<9DM8>X06nP+nJ63-v3y}EB+ScbrzK3WDKv_UL!RZ3jPIa39nLB4Kc1~c8*$F8sAXIk?q+eknYP7>-*x2T8i zr!F92(-y=VbZ#E__tQyEK5w^FNf)qVRG6m_LB)ui2$?7xH?e%-yXhQ1v6LZBp05y9 zLQ8%(89iR>n$7V|*QMD6Zkz+~N;uO(2V*n~gr|ZC=K0{Th@Wx*%NLzsfiYrU#Ah`T+KwJ0^8KI{aRPm5{tq~;C%%IN|_KcQ8<2L^@Ib@>&g$%I!g0g z|G*)H41U=Hn$a5@>S>z>SQ1N){PZ8eV?=2XGEq1vV)cY`UJ?NKlhL}W1_q6N?D{$s z2SS{yftN;o*uLROPmO*3#c#gzy%+CZ|M-({-hKSdix)4Ry?c2Ydo<`zK-8VqObXl1T=pnp`LY%^a2kFb9l#M*f`CmoJlH)VzP*=pR*YH53nyoJ>EcC zpU0t>Klb$o(h%hEeU;-4r1cH6h35GceWW*#zT6@k%WO*Sj_*x5eHTXWSDo;>nPhmV z>Grkqd7a^WJqmvl#l}A~G-cI;HcA_+M7W@@oc_lbsEn5A>T*#}Tpi5JVT7){7`;Wu znl&r|JDs27a!5wgh3s(+c)Ia%4>t#ct2bh;yVUcq^G(Z{-J)aYvRjz*&K3S5%mc!5 z0c(KVew8C!9k`h}q2XpjfMjDqyrI#fyaNdcdEI zc6uxV%8Z@+s1KVGRFpdtQJu9P6c0Y80eV6qWTF&YbJm=agr$36UPk>~7zNeRM)VAN z;(<;OkwS|E6h$pD;qpM#V#lYaOolqGO$Fq>gP}axQVl?*w6m(O&`=d}V2oHj;lT5{ z2?5YLn)+}!6RXghY^vu61Es1gIVsAOfKioQKUKb|Kmn#nRmemsxaO=mg@t{nqP4E7 zfkCnAZG4?Zh%k`VOhhVhcQuvigXG9)rzK1UQ)rDEi-4{NB%$HycI|Re*&y58MRocd zjvA?8+SBf$lbbZbg;wC%S5Vb+fAs2u&*RS@Ag2}9Iu*gJLU8R7>_etb#p$8WV?W`T z-TmE+FmzjaUyr1QAcuFtkB9M__0uZD=T~&)(>Ofa*!S-}1X-2b9S`I0_F??g8bed) z*4FWrv;KGnAFL-EvyLAz**3w^IOTyl^Q8ycO<;yx(lX+GqNESqp~hGu9nD3APPmUv0KxGR;+PuYB_u zx1vbyc5w|G-uYV;KKfyi+7e;7c-x+83s#IJzFAg~FzFI|mk!W9NATAFm!~`D{%zpx zR<3IWdx^rdY@8p%le#sF=La&H9(0wbE|!f457mQ>`pLtP0)$9a$V6cr#PWp&AI!_> zcz9RRSl$5C6!|&y(^qU5w(RC-Z7re9PvwC=)st0y1yw!w;a&2y&h@%ZLclu090Di8 zLp)I-ik~<`RZqcFN8jXAUk;GqprO`PH82RTW;EB44`J`(w6S$O9HyUt;4HTW@~NPt zZ&eSP#GBoD9eQhm81Uzag49V(66#d9tvUZY`D?weeGebQ89gA(Q%e+H@3bbGdWQo> z?u3?5=BN5WEAZ?qG*pEg7$a6sIPkn~g#&0EwR*VKy^P+|wjQ_&en4c^C=8~EOels+ zM3}&kX@hRcfT61A2mM!g)uQK{tOSgz>@Lq8>Qgm3OcM$r6Quy3v*r{Q_MwW_y1oqz zidApp>oh`yfvjdCQh_H}Q%$(E0)mv$PD_yzuh1H`5&`XPlF;yUi`GZbBy7ri4~OL- zequ?+h+>k3t$}nA3I-7F3AsLs5)1M%}t&_j=I{Ar9?L{5_rxAlL{0t zB2^(1g>ew8CtQ1GGsmZ6Z2_-c5BQVOPLD-EnR|)*s1KVOwkS_}WgmVt8&aL))3NzD z1;J#p4)ZIzjl+d`cuTM|PAb#*`` zgRB4{QWY{$7zeR>!ah_5rT+I^v5cM*FqJR>PAMwNoziD#TqNI=CNN1(f11nUN_I(g zPB}x^hpM0yNst^BVDEuJPh36a*iZin+a literal 0 HcmV?d00001 diff 
--git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 new file mode 100644 index 0000000000000000000000000000000000000000..86126585263977b28dc771444064df04f8961bdf GIT binary patch literal 233677 zcmeI*&2Agl6$fx;6M%~X-t+;yC?Ewb%n>P34ip&3(tUw0gf4>W4T5IT&dP7$7c$TS z>DG_a`JRj6mFA92i5&7LAh9Axwj|Cy|Bv%O=idKgQM~=7c>DXWZ}0Q_>RmSSCTrA2DzWrtJf&2V#JHKCk(oY|Y|NGy0@ykDdxc}|P`|rxnzs{G7{NdvAU#on1 zoxl6Tn@#s$)q8pM8~q3Kcgx^^X+HVqyYiFjFX^HC!Lp>2yt z+7mPO+&P=hCO|396x@829*naLRR*8CxI|k^IzNRu<8%1jv?oTTRcTe)=^lkjtJ12p zB|RAk9e@H*017~9ni0eD05{?03~{sThiR^LZALxBvRSHCsw1rNJ;|jc5l93QfkYq? zB&h_(%BY;|%}!>q2@*v_M)Ny6pP^l>U0elVc}}9a637W*ZbFO@Bg6LY5S!`5zqtl06owFJcOcJuXApsHN96i_{J!pw%q=_YkSQ(X-lY?zxTkA|Vumadns-+!_ zF)^c}@2{fU-(`Q7{ar4kjoU^ltx9W0P~jB~@&_D54k8DUgIEb1o>P$MwF;^8$fnS0 zfC5ke3P1rU1EyVjftm~Dg8@(oF)8dTw=@Fd?AS}p|E=#&B>57PCR$5gYxd#`Pbfvj@^d`MYZ_=CeCcSArMY*@< zrE&Aq=dX!>2RquzH7nO@O(#|*RwY&?R#hVOSl9<{!cDk2Yux<)Ia$C&TD?koyRySm zrEQbDBg(jGJIL&D-Cx>0NOvfXh6TASS+_dXRy&%9R>Z!Cf3h((==NSd$eh@QT< zzPP@)zPP@)zIX=;@%?)wUtoN|O}Gg+Um7=e1HO{alFu#}k`j>;krI&-krL@OOaESe zu*&lXjDKQv@F9TDq35rEBS0x|Xh`YkM`@zjruY>ta3^^G(5K z@kWzLCX;48nG|Fl>eI}?WMDEd8JG-A1||bDnA>fxLNXj#;Nyr`6(J31)!ZYb-+m>H z2&o9^OGili^#(}>hAV0MA6P6#qk^AM1y5WgGIyZzZH75|uO`}ob{rY)Xo928rnBj6 zI-91ki)SDtEm}~$EKAa{042M*-L6+_fC5ke3P1rUr_!5&go}r6&~Q0;t|>@>5AXp# zz#kMw3KDL@P4oagKo8IZ^x$cFz&>T479EJ`6Fg%HSSjJ6^qh(l)m{S=M8ftiGmCLCwsHG;dJ4X=B=$Hl~effA){r$NCe^u5>WzA%-ip&OASu)v_14zNG!{B$L#wnZtxBuX?nj7a6@?ps zmJ(!7rahT8&>CyQX<1%pVtK7seHvNHrghYZnif4k56}bjplP@b%LB`c<;C(UJ0oOu zY*!-#lYz;=+$|5;wi}z9TM?pi-5Hti0DH|NIDuRZfA!ta9hPYPIwTe^P zUi@HwFh7_d%n#mVF5+_$pNse+OR^ie6&cPV#0W7$j1b#Tpf!3IFoi;)P$(1%g+ig+ zZYY#vP$=%{4ZHlh)hE%dc0E)OppR&M(T#6j#3Ium(;(9jjn?Ls!Wr2!sRSuO>ukHW zN!ljq$1kkn7;+3bh8#nVA;*woSc>{I-V8V4CftOZa1(CAO}H8L>FV-0yP>R4pTA;3 zVbc=XL-vq8WDnUx_K-bf@07@%SM&H@&3Rrr+T{^s{7xd{*KBF>YI-$g@d}Wz=Fonj zU03KRvUnW3<0El!6K=vyxGBRPt<6V)o6L{sYctcnrB~Cd`9!n*`!}6KRX23^Sc2y1 z*ws5ks@`r|>Jg{KX>nSd_TZfM!`<2%rR|yAR_EI+neC!_Gs)MJICk`r+R^mF|#|#gLt4&>P z>S|LXJFWzexP0hA*@2i+9H{mAS?50Gbmc2`)!!AGEJPF)%iFD77MJq?fTb=onIP;dux{ud= zypLM~`lkA(`lkA(`lkA(`lkA(QDe+troqhR7*jNuX)tr0ZP)ZBz4>gt*&CGN;ycqM zEJ>E6_PzGK_I*^P7_%~FRd1LdJRWcpZo*Bt2{#9Y@jT<=A^|KK7R|FQnx<12<|EIu zj;>*-d{P;4sdd}&GfMrxcMC1%*rye zjGnZHxh`FO*B`^`J!!q_p{b^iQFqjx7sLxX4KHZD>Yd|v^@}=Xm}kuA;05u5Tqd&1 zH&OTJQ1|G-fD&XWj-@!$p<4-3g1GLypo6;Z?&)Q%vDTuz7Cysg_za&9iqEbJcU3q) zSi86w!PONHVgb71opPH}9&lQm7N^B&aoTA9j2FZU;ss^5>y44StM`IA7njw&Uq?xEuaorD)uIK#m_NKI7eSUQ%DrDLp(3Fop(3F&=Y&dPv8OlbO?s2wq&Ht$Z@P3;rB!KFT9x*t zD{akbKiru;li5o%Nm*VjFP0a}Yp<496`N1nwbONa*fB%`kw7F639lm(4E7l8xkzl4 z5D7%Wd5eVIF{fHGq>)?$#eE{&rp28_tfUX=L;8?Dqz~!S`C;*8)oG@Fw(S-h?-Cb# zP0DkIhM*z#bJ}ZSuSt(M{d@MBz)gEi65UbqoqV50DEaC9^z-)9`N2=@g9j)81)%VQ z`N90)5lx=z2fG5*6{u4zZwiG%p-_wk>~$;vr^RVsg44Q^ChS&l4^F~8a3xLe5yn!s zl#d;N={o?+XDRK8p?oMG%7^lyeCzDM3n*xcG?t zOlyGyl;Z&uz9L_dugF&%DeSM{E4sz$cu>J^4%%4e`%3<^sr&UWQ(^~Vu41lYuG7yA zZHxt-iP<{g7AN69S%6p;pn8O+&=TcUc|DWz>f1c#mYq9vjlLLtG5Yc*Fy3n-^9oiw z7K8;&P+#36>bddUpq0l=E8V(^R-%8#f^v@0hp-)Mgl%Uv0oR)!eOjAlkNquI!@k>lRTi{2Uy zPy)UIH{mASgqv^^Zog6wwXQjpS&RGk+3i-oji9b1lrZFn7czGR0qrrIDYJ=!WQq6{}69m_DmBcD$4z zB`CKXFJBH!D?*GA_nC|8$P7@wMrR@z$NC*jUR-2pmu61bV^JtDo_5+Ys zuQvw0%B(1`q98qU=4?8fR{Knc?js=ME$;|y?^DzqzRhyE@cA@p|8*04=}@sM9HTNXe3V-}0W zp9j{PyGhcZ-LgieKuF_5WRc_~GNnzkd8b DTKo+f literal 0 HcmV?d00001 From 914198fd288329cfb67290076286e32296496be0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 14:02:40 -0400 Subject: [PATCH 
092/136] regex: reject large patterns when fuzzing Otherwise we risk timeouts. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61484 --- fuzz/fuzz_targets/fuzz_regex_match.rs | 10 ++++++++-- ...minimized-fuzz_regex_match-6659953212129280 | Bin 0 -> 399135 bytes 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index 6c375510d..a5dda53d6 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -54,6 +54,9 @@ re.is_match({haystack:?}); fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); + if case.pattern.len() > (16 * (1 << 10)) { + return Corpus::Reject; + } if case.haystack.len() > (16 * (1 << 10)) { return Corpus::Reject; } @@ -65,8 +68,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .ignore_whitespace(case.ignore_whitespace) .unicode(case.unicode) .octal(case.octal) - .size_limit(1<<18) - .build() else { return Corpus::Reject }; + .size_limit(1 << 18) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 new file mode 100644 index 0000000000000000000000000000000000000000..b8cdc138a42791040d908484abc77478ca1defec GIT binary patch literal 399135 zcmeI*Ps^tFS;q1ENi$g5B8ZEMB5fgc^ogStUDTkYbnCKEg)o>bvXDv^QlW)OX|OM3 z-hiv_Y)jsX@1WqU+yrmtAV(g)lRV~{oWJ|;H<)ss(<|royFS0;d|H?$-rPL7;s5=3 z{|gWKi+Sv3x7*$Q3+{Q;$L{Le-2?DX%fC-kyPk>bQaetP$K_n*x@q`+OfeSJS)ANH|Qq!45i z2vaEECcZwbTPacqvI)dJq`)S=K5XntkwTD7AWWfroA~;$Zly>e$R-f?kOG_d`mnJp zMG8SSfiQ*gZQ|?0x|Je@Ae%tkLkeu->%+#b6e$GR1i}=`w~4P0>sE>sf@}hD4=J#T zuMZo$Qlt=M69`i%-zL63tXnBk2(k&pJ*2=UzCLX1N|8d4O(0C6e4F_Cux_PDA;=~W z_mBdc`1-K1D@6)HHi0mO@@?Yl!@89sg&>;@s2O54rd|bq^`fyEnw35M&()Q+V+| zcuMJ)mcj)C#oADQUFYDjvx>~JCYEl zaAar?>au+Jic+6e$D|1OjQ@LkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4L zBMD&&M~3#GPAf$UK?H$7TKA9wU~+Q=fdJZ(gfN98LwiuCl_G^8fmE`7Om2=K5I{SU5T5D26_#vVR<`oV8*j`3n&`hoyL;m?U9g18bC zIO1jW)viP-Qd~R;Q+V+o+za+gOK~Mx@AsbI)w_oj=xv!!PzbUPgeg419~J$0&sGXo z0u<*C;vP}}7wqy20s*um31JFHhW4OND@6)H1c5+W_mBc$a&rWM0NRm+Foh#Sdr+s9 zB84D=Kp?GqNC7apIf6g{?MOnH!jYjpsMAW3LJ&b9kk&n<0GQkyK_Gy3Bq2=U$j~0t zX{AUZh#(M1>mE`7Om2=K5I{SU5T5D27o4=Dg9H%AZ%pdCpFQ#dlT z2X$I0QV1dl1k#?^9zJ{e^IzWk_p2IYT%#Zt19D=!=pH^7aKSES!TKTh->G{@fx@M@ zpRW)5SSfS^#peNG3e6J`1rY=SXh#y_9#UY1a%Ya*nbRtO;(U3T!t?!-UWuPjiWGvJ zK8Slrfz!XUp77`kcJu?3;>E(V!DIx4+cGk=2X$I0QV1dl1k$>P6abT(BM1c0jwFOB z92we!I;|8b1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$?jZ%h(97s;r4U3A2%sHFh_RS*cE9Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np(8QOz7 ztrRH)5d;Eh-9rk1$;}Z20%%7P!W51S?LnPZiWGtf0)e#dAqBwX<_H1-v?B>&3P*ADQUFYDjvx>~JCYEl zaAar?>au+Jic+6e$D|1OjQ@LkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4L zBMD&&M~3#GPAf$UK?H$7TKA9wU~+Q=fdJZ(gfN98LwiuCl_G^8ffJ*M z^tMbVC|4=J!hxid%Z z%xM)salX7v;rV_^uf$I%MG8SqAH+SR!0F#vPw0Y`BE|WFFooy)CA|_qp%mwy&VIG{@f!@6#28AH&K$yad|G`sAzqAxC7$^=8;vP}}SK_(~0s*um31JFHhW4OND@6)H z1c5+W_mBc$a&rWM0NRm+Foh#Sdr+s9B84D=Kp?GqNC7apIf6g{?MOnH!jYjpsMAW3 zLJ&b9kk&n<0GQkyK_Gy3Bq2=U$j~0tX{AUZh#(M1>mE`7Om2=K5I{SU5T5D27o4=Dg9H%AZ%pdCpFQ#dlT2X$I0QV1dl1k$>P6abT(BM1c0jwFOB92we! 
zI;|8b1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$7r2N0&O?U&0v!*@ca%Hx zr4>MNzE|%aQs82)wjXl7>Aap#U<$pn-hB!|1c3nBk%YL16j-6$nIm_0TGJkUI^YF< z>1X^uu^IVNB906_t)Q&r^Ghp$!XRw|aSti5iLVbEyHcbOWD^KeDBmW&KCD|QQV6mM z#66_ICcZvw>`IYBkWC;=p?sV8`mk=LNFm535ciM*oA~;$u`5LiK{kOfh4O9U>%+R0 zB84EEK-@zLY~t&~#;z181la__6w0@WuMg{1iWGuu0&x#1u!*k^8@p1Z5M&bwQz+ji zzCNs5DN+cs3B*05z$U&vZ0t&rLXb@$Ord<6`1-JJrAQ&jCJ^_K0-N~yu(2ye3PCo3 zFop7M;_Jh@l_G^8n?T${3T)!*!^W-@DFoRB!W7E4iLVdqR*DpYYyxo)DX@vJ4;#Bu zq!45i2vaEECcZwbTPacqvI)dJq`)S=K5XntkwTD7AWY$P@_pko{uwrV%IgF{hVHh4 z+|`&$hA)pfw>YK-@zLtWfSuLtKg!g2aI-)Wx7c0IdmG z0pcD~V1;sL8sbu<5F`#vp)Ljm0%%Rh3J~{@0xOg|(-4;;g&=WY3Ux6k5I}1}R)DyN z6j-6$nTEI&DFlfFQ>cqUfdE<)vI4|Cq`(U0&NRfONFhiZm_l6)3Ix!akQE^AAq7?_ zccvjOMG8UUz!d6YP#}QTgscE@4=J!hxibxMDN+a$2c}RLg8~7xCS(POdq{y5%AIM5 zOOZm5I535}7!(MgH6bfN+(QbiQ0`1aT#6Ke#DOW)#h^d{tqEBH;vQ09g>q*a;!>m# zBo0iWE(QexXidlp5ciM*E0jCa5SJo_AaP&{bulOqKx;x)fVhVgSfSjRhPV_d1c?Jv zsEa{?09q5W0>nL}zzXHgG{mJyAxIpULR}0B1kjq06(H^*1y(3`rXemx3PIw)6zXD7 zAb{3{tN?KjDX>DhGYxSmQV0?Urcf7y0s*upWCe(ONP!i~ooR?mkwTC-Fon7p6bPU- zAuB-KLkg@=?o2~miWGvxfhp9*pg;hv30VQ+9#UY1a%URiQlt5so2dvf!69@`(p$L=r=--5h%Z1)j8BrkXDknip``qEOQ5ailG z+(QanTLSvME`KRf91?^nJmkCk*}k+ChfhFX?Jbnz@VDR|Qs9V}(O0_?r8xXb^wo#F z1*Y(j?>@>0`_c-a@am5U;vQ1qh?mhxqrtk!RRP^ILTPa)#P@Fr6dq@FXu*)w91kjEogee>u+Jic+6e$D|1OjQ@ zLkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4LBMD&&M~3#GPAf$UK?H$7 zTKA9wU~+Q=fdJZ(gfN98LwnE*n3iw3fDbw3CzvVqxjrAQ%&AP`9F9#Q~IZjK-jKs%BUrf_6v z59+j1q!2_92&8onDF7xnM-T|09Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np( z8QOz7trRH)5d;Eh-9rk1$;}Z20%%7P!W51S?Lm(+?I>@*K*yt8o-b{DQwXvN#66_I zCcZvw>`IYBkWC;=p?sV8`mk=LNFm535ciM*oA~;@s2O54rd|bq^`fyEnw35M&()Q+V+|cuMJ)mcj)C#oADQUFYDjvx>~JCYElaAar?>au+Jic+6e$D|1OjQ@LkfV& z%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4LBMD&&M~3#GPAf$UK?H$7TKA9w zU~+Q=fdJZ(gfN98LwnG3nf8s(_-Fg0krog#66_ICcZv=zp{^20EKP=VG4CIC=ftvLRNsdhZI<$ z+?j^B6e$FW15>DrL4g2T6S4xtJ*2=2<<2z3rAQ%29GF5~3yaVb&=5(lPG7lQ%;v?gQ)hA)pfw>YK-@zLtWfSuLtKg!g2aI-)Wx7c0IdmG0pcD~V1;sL8sbu<5F`#v zp)Ljm0%%Rh3J~{@0xOg|(-4;;g&=WY3Ux6k5I}1}R)DyN6j-6$nTEI&DFlfFQ>cqU zfdE<)vI4|Cq`(U0&NRfONFhiZm_l6)3Ix!akQE^AAq7?_ccvjOMG8UUz!d6YP#}QT zgscE@4=J!hxibxMDN+a$2c}RLg8~7xCS(POdq{y5%AIM5OOZm5I535}7!(MgH6bfN z+(QbiQ0`1aT#6Ke#DOW)#h^d{tqEBH;vQ09g>q*a;!>m#Bo0iWE(QexXidlp5ciM* zE0jCa5SJo_AaP&{bulOqKx;x)fVhVgSfSjRhPV_d1c?JvsEa{?09q5W0>nL}zzXHg zG{mJyAxIpULR}0B1kjq06(H^*1y(3`rXemx3PIw)6zXD7Ab{3{tN?KjDX>DhGYxSm zQV0?Urcf7y0s*upWCe(ONP!i~ooR?mkzxbLv!@^Y>fy(?^~!HQ{P20b|J7gLeEh3- ziVcH1^v++-WA2{Ze4fYl2l25xjKjAe?;YEHL=VZ!9XsT^yN$lI6e$F`HW2ra0@s#+ zey_`4iWG+gVG0lV?tZo}Eydvz&{um4r8xX8xQ7%t;$`&Ju0$ygzY=}*A#Z^xJmkBN z^1;5e0w}!tBZ9bx6gc8#^wq9JDNmE`7Om2=K5I{SU5T z5D27o4=Dg9H%AZ%pdCpFQ#dlT2X$I0QV1dl1k$>P6abT(BM1c0jwFOB92we!I;|8b z1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$?jZ%hqxjrAQ%&AP`9F9#Q~IZjK-jKs%BUrf_6v z59+j1q!2_92&8onDF7xnM-T|09Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np( z8QOz7trRH)5d;Eh-9rk1$;}Z20%%7P!W51S?LnPZiWGtf0)e#dAqBwX<_H1-v?fFl z2%t3~fDl~JM(8}It{lbh#(L^YeE2mRv$qifYyYZCdfZe{DDtP>y3j& z5J4b-)`SQG0kkGW5D1_(A%Z{vtqBnX0%%Q$AP_)nLIi;TS`%`rApe7d?OQJkCV~h8 z0kkGW5D1_(AukWId;EvH-R|o@-2L(HOZ#H~$M@d*(HoC{^soQ)#>4;Tf8Mx%sq!x` zFYnDK0W(MjDFPQ`U=%lCfz?%#j;@i%{b^VY-NpM2$OyMOzK#}D?u3G?r}x4-nYCvUy}r*FQq`?Gg; z55N7|n~xsv-gxvo|MLCU9z6N(`?r6-|MSuJ-ul7Me)jmiJAeG{?)P@j+vmTP^LKx- z+r7T8zWk57-A{IpU;Xy(o6mp!lYiL{`K$e(kN)<%fBl`mdH=s}-@8kZdIiXfZ^5tl zQb8bq)`VOI$p7x}nfvyOT>URT`shDC`sf!A{H@1N1R(rHcroe=Ue?7-yYV;s&*eA! 
eU)$Z@zJ7aq`=9@{|Cjvn`t94dx9|Mm5B?9F4sLb; literal 0 HcmV?d00001 From 3feff9e10e028eed26336ff1934d6b89fc6c74e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 14:55:59 -0400 Subject: [PATCH 093/136] automata: improve sparse DFA validation This rejiggers some code so that we can more reliably check whether start state IDs are valid or not. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62726 --- ...ta_deserialize_sparse_dfa-4903112680538112 | Bin 0 -> 953 bytes regex-automata/src/dfa/sparse.rs | 159 +++++++++--------- 2 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 new file mode 100644 index 0000000000000000000000000000000000000000..3056bca2f335559837ff22c307040e7d200693b5 GIT binary patch literal 953 zcmcgrzb^z)5dPkCdx%D_SrUavRGOo46+(5LlBjgl;_fdHC6V|8G}v1iCd3X&#o?ZHHI$Q{9vE`2)E1aXc z4dV*ibs}S9a$sF5@Ts&8Nm;m!h_LYvQgx@O&0-~*MqvXc*1mdu8%`|_4(ibjUcp(+ zUaNf^t&($zmCZ!j&`hg&V+sEZeFMdJm&UDZ#tk}9nN`%b_4cnaUQ`}Jy`sc(1@{;A cVu5+$3H6G4Prat{2Q-<>jd&&eHq)$ literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 7862d48a2..38096d994 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -992,8 +992,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.special, &dfa.tt)?; + let seen = dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &seen)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1388,63 +1388,8 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { - // In order to validate everything, we not only need to make sure we - // can decode every state, but that every transition in every state - // points to a valid state. There are many duplicative transitions, so - // we record state IDs that we've verified so that we don't redo the - // decoding work. - // - // Except, when in no_std mode, we don't have dynamic memory allocation - // available to us, so we skip this optimization. It's not clear - // whether doing something more clever is worth it just yet. If you're - // profiling this code and need it to run faster, please file an issue. - // - // OK, so we also use this to record the set of valid state IDs. Since - // it is possible for a transition to point to an invalid state ID that - // still (somehow) deserializes to a valid state. So we need to make - // sure our transitions are limited to actually correct state IDs. - // The problem is, I'm not sure how to do this verification step in - // no-std no-alloc mode. I think we'd *have* to store the set of valid - // state IDs in the DFA itself. For now, we don't do this verification - // in no-std no-alloc mode. The worst thing that can happen is an - // incorrect result. But no panics or memory safety problems should - // result. 
Because we still do validate that the state itself is - // "valid" in the sense that everything it points to actually exists. - // - // ---AG - struct Seen { - #[cfg(feature = "alloc")] - set: alloc::collections::BTreeSet, - #[cfg(not(feature = "alloc"))] - set: core::marker::PhantomData, - } - - #[cfg(feature = "alloc")] - impl Seen { - fn new() -> Seen { - Seen { set: alloc::collections::BTreeSet::new() } - } - fn insert(&mut self, id: StateID) { - self.set.insert(id); - } - fn contains(&self, id: &StateID) -> bool { - self.set.contains(id) - } - } - - #[cfg(not(feature = "alloc"))] - impl Seen { - fn new() -> Seen { - Seen { set: core::marker::PhantomData } - } - fn insert(&mut self, _id: StateID) {} - fn contains(&self, _id: &StateID) -> bool { - false - } - } - - let mut verified: Seen = Seen::new(); + fn validate(&self, sp: &Special) -> Result { + let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. @@ -1521,7 +1466,7 @@ impl> Transitions { "mismatching sparse state length", )); } - Ok(()) + Ok(verified) } /// Converts these transitions to a borrowed value. @@ -1659,7 +1604,7 @@ impl> Transitions { let state = &state[nr..]; if npats == 0 { return Err(DeserializeError::generic( - "state marked as a match, but has no pattern IDs", + "state marked as a match, but pattern length is zero", )); } @@ -1681,6 +1626,21 @@ impl> Transitions { } else { (&[][..], state) }; + if is_match && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked special as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) != is_match { + return Err(DeserializeError::generic( + "whether state is a match or not is inconsistent", + )); + } // Now read this state's accelerator info. The first byte is the length // of the accelerator, which is typically 0 (for no acceleration) but @@ -2061,28 +2021,19 @@ impl> StartTable { fn validate( &self, sp: &Special, - trans: &Transitions, + seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { + if !seen.contains(&id) { + return Err(DeserializeError::generic( + "found invalid start state ID", + )); + } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } - // Confirm that the start state points to a valid state. - let state = trans.try_state(sp, id)?; - // And like for the transition table, confirm that the transitions - // on all start states themselves point to a valid state. - // - // It'd probably be better to integrate this validation with the - // transition table, or otherwise store a sorted sequence of all - // valid state IDs in the sparse DFA itself. That way, we could - // check that every pointer to a state corresponds precisely to a - // correct and valid state. - for i in 0..state.ntrans { - let to = state.next_at(i); - let _ = trans.try_state(sp, to)?; - } } Ok(()) } @@ -2537,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +// In order to validate everything, we not only need to make sure we +// can decode every state, but that every transition in every state +// points to a valid state. There are many duplicative transitions, so +// we record state IDs that we've verified so that we don't redo the +// decoding work. 
+//
+// Except, when in no_std mode, we don't have dynamic memory allocation
+// available to us, so we skip this optimization. It's not clear
+// whether doing something more clever is worth it just yet. If you're
+// profiling this code and need it to run faster, please file an issue.
+//
+// OK, so we also use this to record the set of valid state IDs. Since
+// it is possible for a transition to point to an invalid state ID that
+// still (somehow) deserializes to a valid state. So we need to make
+// sure our transitions are limited to actually correct state IDs.
+// The problem is, I'm not sure how to do this verification step in
+// no-std no-alloc mode. I think we'd *have* to store the set of valid
+// state IDs in the DFA itself. For now, we don't do this verification
+// in no-std no-alloc mode. The worst thing that can happen is an
+// incorrect result. But no panics or memory safety problems should
+// result. Because we still do validate that the state itself is
+// "valid" in the sense that everything it points to actually exists.
+//
+// ---AG
+#[derive(Debug)]
+struct Seen {
+    #[cfg(feature = "alloc")]
+    set: alloc::collections::BTreeSet<StateID>,
+    #[cfg(not(feature = "alloc"))]
+    set: core::marker::PhantomData<()>,
+}
+
+#[cfg(feature = "alloc")]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: alloc::collections::BTreeSet::new() }
+    }
+    fn insert(&mut self, id: StateID) {
+        self.set.insert(id);
+    }
+    fn contains(&self, id: &StateID) -> bool {
+        self.set.contains(id)
+    }
+}
+
+#[cfg(not(feature = "alloc"))]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: core::marker::PhantomData }
+    }
+    fn insert(&mut self, _id: StateID) {}
+    fn contains(&self, _id: &StateID) -> bool {
+        false
+    }
+}
+
 /*
 /// A binary search routine specialized specifically to a sparse DFA state's
 /// transitions. Specifically, the transitions are defined as a set of pairs

From 2c44e2a6b63920bf1752a61231ee1349154ae717 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Mon, 9 Oct 2023 15:12:22 -0400
Subject: [PATCH 094/136] fuzz: add regression test for AST roundtripping

I couldn't get this to reproduce. Maybe some of my recent changes to
regex-syntax fixed this? Not sure. I'm not a huge fan of this fuzzer in
general because it isn't really testing a rock-solid guarantee that we
provide. And the positions are tough to deal with.
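
For reference, the property this fuzzer pokes at is roughly the sketch
below. (The real target is fancier and also has to reason about span
positions, which is the painful part.)

    use regex_syntax::ast::{parse::Parser, print::Printer};

    // Parse, print the AST back to a pattern, and check that reparsing
    // the printed form prints identically. Comparing printed forms
    // sidesteps the span mismatches mentioned above.
    fn roundtrips(pattern: &str) -> bool {
        let ast = match Parser::new().parse(pattern) {
            Ok(ast) => ast,
            Err(_) => return true, // unparseable input is out of scope
        };
        let mut printed = String::new();
        Printer::new().print(&ast, &mut printed).unwrap();
        let reparsed = Parser::new().parse(&printed).unwrap();
        let mut reprinted = String::new();
        Printer::new().print(&reparsed, &mut reprinted).unwrap();
        printed == reprinted
    }
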
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62382 --- ...tcase-minimized-ast_roundtrip-5633607856947200 | Bin 0 -> 491 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 new file mode 100644 index 0000000000000000000000000000000000000000..726609cf21cf2933ef0fa2a49301f10f675f091a GIT binary patch literal 491 zcma)0!4ZHk2=nO}S-}n1%X1I6tpU6uwYBnSNdhE5i9(c-gjGZd5jX?* Date: Mon, 9 Oct 2023 16:51:39 -0400 Subject: [PATCH 095/136] regex-lite-0.1.1 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 21330fd4e..225193c38 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.0" #:version +version = "0.1.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From ea8f6c05f30e5148cea40194db1646de460869cd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:52:00 -0400 Subject: [PATCH 096/136] regex-syntax-0.8.0 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e5e541302..f14298299 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.5" #:version +version = "0.8.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 68b701808a1694e53d3aae8a2390eaa7a8ba9403 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:54:45 -0400 Subject: [PATCH 097/136] deps: bump regex-syntax to 0.8.0 --- Cargo.toml | 2 +- regex-automata/Cargo.toml | 2 +- regex-cli/Cargo.toml | 2 +- regex-lite/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6f94dc4ae..17120a0a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,7 +181,7 @@ features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.7.5" +version = "0.8.0" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 2d08cec75..719f68c66 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.7.4", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.8.0", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index b5de2b5e7..571191721 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -31,6 +31,6 @@ memmap2 = "0.5.10" regex = { version = "1.9.0", path = ".." 
} regex-automata = { version = "0.3.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } -regex-syntax = { version = "0.7.3", path = "../regex-syntax" } +regex-syntax = { version = "0.8.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } textwrap = { version = "0.16.0", default-features = false } diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 225193c38..b378018c2 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -14,7 +14,7 @@ rust-version = "1.65" autotests = false # Features are documented in the "Crate features" section of the crate docs: -# https://docs.rs/regex-syntax/*/#crate-features +# https://docs.rs/regex-lite/*/#crate-features # # (Currently there are no supported features. 'std' is technically one, but it # is currently required.) From dc0d79e97e16dba1558a44aa5f68d1da4932bc33 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:55:17 -0400 Subject: [PATCH 098/136] regex-automata-0.4.0 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 719f68c66..3792f53e6 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.9" #:version +version = "0.4.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 26d8e3ad1ffe3ab88679d185103f6a7fe5a562b5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:56:44 -0400 Subject: [PATCH 099/136] deps: bump regex-automata to 0.4.0 --- Cargo.toml | 2 +- regex-cli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 17120a0a4..9bc90d0e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.9" +version = "0.4.0" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 571191721..ab570a30f 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -29,7 +29,7 @@ lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } memmap2 = "0.5.10" regex = { version = "1.9.0", path = ".." 
} -regex-automata = { version = "0.3.0", path = "../regex-automata", features = ["logging"] } +regex-automata = { version = "0.4.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } regex-syntax = { version = "0.8.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } From 2cbd34215d1df6415aeac8ed93018ca8ada0cfca Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 16:56:54 -0400 Subject: [PATCH 100/136] 1.10.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9bc90d0e3..88f96b0b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.6" #:version +version = "1.10.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 951eebd20781671a7aa6f5ceb6b9f284923b425d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 17:14:50 -0400 Subject: [PATCH 101/136] regex-cli-0.1.1 --- regex-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index ab570a30f..3fe5390aa 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.0" #:version +version = "0.1.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular From f01f71b66940279835de25ee8687a7e0d30e854d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 17:15:35 -0400 Subject: [PATCH 102/136] lite: add \< and \> to the syntax docs This was probably a copy & paste error. 
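
As a quick illustration of the aliases being documented (assuming
regex-lite resolves `\<` and `\>` exactly like `\b{start}` and
`\b{end}`):

    use regex_lite::Regex;

    // `cat` must begin and end at ASCII word boundaries.
    let re = Regex::new(r"\<cat\>").unwrap();
    assert!(re.is_match("the cat sat"));
    assert!(!re.is_match("concatenate"));
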
--- regex-lite/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 68d54824f..9b394a480 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -472,8 +472,8 @@ $ the end of a haystack (or end-of-line with multi-line mode) \z only the end of a haystack (even with multi-line mode enabled) \b an ASCII word boundary (\w on one side and \W, \A, or \z on other) \B not an ASCII word boundary -\b{start} an ASCII start-of-word boundary (\W|\A on the left, \w on the right) -\b{end} an ASCII end-of-word boundary (\w on the left, \W|\z on the right)) +\b{start}, \< an ASCII start-of-word boundary (\W|\A on the left, \w on the right) +\b{end}, \> an ASCII end-of-word boundary (\w on the left, \W|\z on the right)) \b{start-half} half of an ASCII start-of-word boundary (\W|\A on the left) \b{end-half} half of an ASCII end-of-word boundary (\W|\z on the right) From 452bc3211635a38a89190da993d4c87a6eeaaf9f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 17:15:59 -0400 Subject: [PATCH 103/136] regex-lite-0.1.2 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index b378018c2..5a6c2ac8a 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.1" #:version +version = "0.1.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From a2a1986b13aebbafc54ef4b7d9a76626270a0a24 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 18:15:08 -0400 Subject: [PATCH 104/136] automata: fix sparse DFA state validation in no-std The verified set isn't tracked in no-std/no-alloc because it is probably not worth doing (and bloating the size of the sparse DFA itself to store the state IDs). So when we deserialize a DFA without std enabled, the verified set of states was always reporting `false`, and this now trips an error 100% of the time in the new start state validation code. We fix this by always reporting `true`, thus treating every possible state ID as possibly valid on its own. Not great, but maintains the status quo. --- regex-automata/src/dfa/sparse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 38096d994..d461e0a0f 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -2540,7 +2540,7 @@ impl Seen { } fn insert(&mut self, _id: StateID) {} fn contains(&self, _id: &StateID) -> bool { - false + true } } From dd04a57e1db3099fdcee337b8219eddc58ce4eb4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 18:17:15 -0400 Subject: [PATCH 105/136] regex-automata-0.4.1 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 3792f53e6..63554314f 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.0" #:version +version = "0.4.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." 
documentation = "https://docs.rs/regex-automata" From f5b8cb4d52ca0fa01a9c4e8e70bcd3e4b6673368 Mon Sep 17 00:00:00 2001 From: Fabio Valentini Date: Tue, 10 Oct 2023 15:42:29 +0200 Subject: [PATCH 106/136] lite: fix doctests on 32-bit Returning early on non-64-bit architectures is not enough, since the doctest failed to compile due to two numeric literals being too large for usize on 32-bit architectures. PR #1101 --- regex-lite/src/string.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index af0a5b629..4e4de9068 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -2063,7 +2063,6 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` -/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); @@ -2076,7 +2075,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] From 6ec0a00c0046d0fbbf64276cc28258ad7a4a7317 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 10 Oct 2023 09:42:40 -0400 Subject: [PATCH 107/136] regex-lite-0.1.3 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 5a6c2ac8a..e09229723 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.2" #:version +version = "0.1.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From d5144b2f5fcf931a4fdf9e247ba20c93000391c3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 11 Oct 2023 11:32:12 -0400 Subject: [PATCH 108/136] syntax: add regression tests for new bugs in internal set ops Ref https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 Ref https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 Ref https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 Ref https://github.com/rust-lang/regex/pull/1051 fixup --- ...-minimized-ast_fuzz_regex-6345245270605824 | Bin 0 -> 3933 bytes ...inimized-fuzz_regex_match-5736465767989248 | Bin 0 -> 452 bytes ...inimized-fuzz_regex_match-6413499984904192 | Bin 0 -> 27 bytes regex-syntax/src/hir/translate.rs | 44 ++++++++++++++++++ 4 files changed, 44 insertions(+) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6413499984904192 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 new file mode 100644 index 0000000000000000000000000000000000000000..312767e97b7e1cff042e7a32d26ea481adeaf6c6 GIT binary patch literal 3933 zcmeyoaKVEGjT0IsH1s!2*a)J5(9^}3g^{6|fr%kB*e8U6fq@Z-S^oe3uL`7jfiwZY 
z$Z$;QP6Y!40cGS#f5)Zmz7<5~@#F6Q|93Y3|ND59g(D0`!-wKDf-`a51cO*GSqUU} zLIEfb0f7*Z3roQ;DIlE>0xMUpwAZZ8g@Ysv0F#24#lV}J>%G?-sK?ve+iK-#q6cPt zYpfX`tKbkSA*fKC4fP;ED$4LAbUxBEHj=@xOwEl&44ztGir@ldgzSL>`UkFGzkcA> zfddDCdHE2K`u`uu{QnmQ6kJZ(t4QfjFrqaF4Wr`=dW0jJvkm;USGmZVt5l429E8E_v} zFtM~GA5{b^50Y*cEIf^xfHWC~T3eKX!M!N4AR{$5HQsrDX)&-IU@A#1g7|6`8UTte zfQQB)N*HiAk<5S=W$Xxz?_u2vgb*^z7ny@Mz2i3r#TYaWFb))8LPVLO22zb0AF$#O z+JMApSSI0ZSpplTAYiXq32JDj)YjG}d^-^HpNoM({WmnoG8h&v1Q(gq_wmBVkN<=6Jpg;~t&9Kw literal 0 HcmV?d00001 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 new file mode 100644 index 0000000000000000000000000000000000000000..30a3a3ba0ef9fbdaff1b01233c44da7977d4bb7a GIT binary patch literal 452 zcmXSqE{~2Yx92VAh0+in4w3;c02za(k%57cA(nwbR|`af04f*EgGexFAxWZ{f@v;9 z1W5s7v1_fpY1%+=43jhEASKtQ#bkPY$ literal 0 HcmV?d00001 diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2b500cc2f..2eff6318c 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3703,4 +3703,48 @@ mod tests { let mut t = Translator::new(); assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 + #[test] + fn regression_fuzz_match() { + let pat = "[(\u{6} \0-\u{afdf5}] \0 "; + let ast = ParserBuilder::new() + .octal(false) + .ignore_whitespace(true) + .build() + .parse(pat) + .unwrap(); + let hir = TranslatorBuilder::new() + .utf8(true) + .case_insensitive(false) + .multi_line(false) + .dot_matches_new_line(false) + .swap_greed(true) + .unicode(true) + .build() + .translate(pat, &ast) + .unwrap(); + assert_eq!( + hir, + Hir::concat(vec![ + hir_uclass(&[('\0', '\u{afdf5}')]), + hir_lit("\0"), + ]) + ); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 + #[cfg(feature = "unicode")] + #[test] + fn regression_fuzz_difference1() { + let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; + let _ = t(pat); // shouldn't panic + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 + #[test] + fn regression_fuzz_char_decrement1() { + let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0] Date: Wed, 11 Oct 2023 11:34:47 -0400 Subject: [PATCH 109/136] syntax: revert interval set optimizations This reverts commit 6d2b09ed6fbc136cca007ce0c57ec9cbae16f3b4. Sadly I just don't have the time to fix this code myself. It's too subtle. So I'm just reverting it entirely for now. 
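
For posterity, the canonical format invariant that the reverted
optimizations kept subtly violating is: ranges sorted, non-overlapping
and non-adjacent. A toy checker over u32 bounds (the real code is
generic over the Interval trait):

    // Each range is an inclusive (lower, upper) pair. Canonical means
    // every range is well formed and separated from the next by at
    // least one missing value.
    fn is_canonical(ranges: &[(u32, u32)]) -> bool {
        ranges.iter().all(|&(lo, up)| lo <= up)
            && ranges.windows(2).all(|w| {
                let ((_, up1), (lo2, _)) = (w[0], w[1]);
                // "up1 + 1 < lo2" gives sorted, disjoint and
                // non-contiguous all at once.
                up1 < u32::MAX && up1 + 1 < lo2
            })
    }
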
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 Ref https://github.com/rust-lang/regex/pull/1051 --- regex-syntax/src/hir/interval.rs | 282 +++++++++++-------------------- 1 file changed, 97 insertions(+), 185 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e3051bf31..e063390a8 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -19,7 +19,7 @@ use crate::unicode; // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. -// In some cases, we do use linear extra memory, but it is at most 2x and it +// In many cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite @@ -81,45 +81,14 @@ impl IntervalSet { /// Add a new interval to this set. pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; - - if self.ranges.is_empty() { - self.ranges.push(interval); - return; - } - - // Find the first range that is not greater than the new interval. - // This is the first range that could possibly be unioned with the - // new interval. - let mut drain_end = self.ranges.len(); - while drain_end > 0 - && self.ranges[drain_end - 1].lower() > interval.upper() - && !self.ranges[drain_end - 1].is_contiguous(&interval) - { - drain_end -= 1; - } - - // Try to union the new interval with old intervals backwards. - if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval) - { - self.ranges[drain_end - 1] = - self.ranges[drain_end - 1].union(&interval).unwrap(); - for i in (0..drain_end - 1).rev() { - if let Some(union) = - self.ranges[drain_end - 1].union(&self.ranges[i]) - { - self.ranges[drain_end - 1] = union; - } else { - self.ranges.drain(i + 1..drain_end - 1); - break; - } - } - } else { - self.ranges.insert(drain_end, interval); - } } /// Return an iterator over all intervals in this set. @@ -223,13 +192,34 @@ impl IntervalSet { // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... 
similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); - let mut b = 0; - for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could @@ -241,34 +231,47 @@ impl IntervalSet { // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - self.ranges.push(self.ranges[a]); - // Only when `b` is not above `a`, `b` might apply to current - // `a` range. + let mut range = self.ranges[a]; while b < other.ranges.len() - && other.ranges[b].lower() <= self.ranges[a].upper() + && !range.is_intersection_empty(&other.ranges[b]) { - match self.ranges.pop().unwrap().difference(&other.ranges[b]) { - (Some(range1), None) | (None, Some(range1)) => { - self.ranges.push(range1); + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; } + (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - self.ranges.push(range2); + range2 } - (None, None) => {} + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; } - // The next `b` range might apply to the current + // Otherwise, the next `b` range might apply to the current // `a` range. b += 1; } - // It's possible that the last `b` range has more to - // contribute to the next `a`. We don't bump the last - // `b` so that the next `a` range can apply it. - b = b.saturating_sub(1); + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; } - self.ranges.drain(..drain_end); - self.folded = self.ranges.is_empty() || (self.folded && other.folded); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -279,83 +282,11 @@ impl IntervalSet { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { - if self.ranges.is_empty() { - self.ranges.extend(&other.ranges); - self.folded = other.folded; - return; - } - if other.ranges.is_empty() { - return; - } - - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the symmetric difference to the end of this range, and then drain - // it before we're done. 
- let drain_end = self.ranges.len(); - let mut b = 0; - let mut b_range = Some(other.ranges[b]); - for a in 0..drain_end { - self.ranges.push(self.ranges[a]); - while b_range - .map_or(false, |r| r.lower() <= self.ranges[a].upper()) - { - let (range1, range2) = match self - .ranges - .pop() - .unwrap() - .symmetric_difference(&b_range.as_ref().unwrap()) - { - (Some(range1), None) | (None, Some(range1)) => { - (Some(range1), None) - } - (Some(range1), Some(range2)) => { - (Some(range1), Some(range2)) - } - (None, None) => (None, None), - }; - if let Some(range) = range1 { - if self.ranges.len() > drain_end - && self.ranges.last().unwrap().is_contiguous(&range) - { - self.ranges - .last_mut() - .map(|last| *last = last.union(&range).unwrap()); - } else { - self.ranges.push(range); - } - } - if let Some(range) = range2 { - self.ranges.push(range); - } - - b_range = if self.ranges.len() > drain_end - && self.ranges.last().unwrap().upper() - > self.ranges[a].upper() - { - Some(*self.ranges.last().unwrap()) - } else { - b += 1; - other.ranges.get(b).cloned() - }; - } - } - while let Some(range) = b_range { - if self.ranges.len() > drain_end - && self.ranges.last().unwrap().is_contiguous(&range) - { - self.ranges - .last_mut() - .map(|last| *last = last.union(&range).unwrap()); - } else { - self.ranges.push(range); - } - b += 1; - b_range = other.ranges.get(b).cloned(); - } - - self.ranges.drain(..drain_end); - self.folded = self.ranges.is_empty() || (self.folded && other.folded); + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); } /// Negate this interval set. @@ -371,44 +302,28 @@ impl IntervalSet { return; } + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + // We do checked arithmetic below because of the canonical ordering // invariant. 
if self.ranges[0].lower() > I::Bound::min_value() { - let mut pre_upper = self.ranges[0].upper(); - self.ranges[0] = I::create( - I::Bound::min_value(), - self.ranges[0].lower().decrement(), - ); - for i in 1..self.ranges.len() { - let lower = pre_upper.increment(); - pre_upper = self.ranges[i].upper(); - self.ranges[i] = - I::create(lower, self.ranges[i].lower().decrement()); - } - if pre_upper < I::Bound::max_value() { - self.ranges.push(I::create( - pre_upper.increment(), - I::Bound::max_value(), - )); - } - } else { - for i in 1..self.ranges.len() { - self.ranges[i - 1] = I::create( - self.ranges[i - 1].upper().increment(), - self.ranges[i].lower().decrement(), - ); - } - if self.ranges.last().unwrap().upper() < I::Bound::max_value() { - self.ranges.last_mut().map(|range| { - *range = I::create( - range.upper().increment(), - I::Bound::max_value(), - ) - }); - } else { - self.ranges.pop(); - } + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); } + self.ranges.drain(..drain_end); // We don't need to update whether this set is folded or not, because // it is conservatively preserved through negation. Namely, if a set // is not folded, then it is possible that its negation is folded, for @@ -422,7 +337,6 @@ impl IntervalSet { // of case folded characters. Negating it in turn means that all // equivalence classes in the set are negated, and any equivalence // class that was previously not in the set is now entirely in the set. - self.folded = self.ranges.is_empty() || self.folded; } /// Converts this set into a canonical ordering. @@ -433,20 +347,24 @@ impl IntervalSet { self.ranges.sort(); assert!(!self.ranges.is_empty()); - // We maintain the canonicalization results in-place at `0..newi`. - // `newi` will keep track of the end of the canonicalized ranges. - let mut newi = 0; - for oldi in 1..self.ranges.len() { - // The last new range gets merged with currnet old range when - // unionable. If not, we update `newi` and store it as a new range. - if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) { - self.ranges[newi] = union; - } else { - newi += 1; - self.ranges[newi] = self.ranges[oldi]; + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } } + let range = self.ranges[oldi]; + self.ranges.push(range); } - self.ranges.truncate(newi + 1); + self.ranges.drain(..drain_end); } /// Returns true if and only if this class is in a canonical ordering. 
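As an aside, the `canonicalize` rewrite above is at heart the classic merge of sorted, possibly overlapping ranges. A minimal standalone sketch of that idea, using plain `(u32, u32)` pairs in place of the crate's `Interval` trait (purely illustrative, not the crate's actual code):

```
// Merge overlapping or contiguous ranges after sorting. This mirrors the
// invariant that `canonicalize` establishes: sorted, non-overlapping,
// non-adjacent ranges.
fn canonicalize(mut ranges: Vec<(u32, u32)>) -> Vec<(u32, u32)> {
    ranges.sort();
    let mut out: Vec<(u32, u32)> = Vec::new();
    for (lo, hi) in ranges {
        match out.last_mut() {
            // Overlapping or contiguous with the previous range: union.
            Some(last) if lo <= last.1.saturating_add(1) => {
                last.1 = last.1.max(hi);
            }
            _ => out.push((lo, hi)),
        }
    }
    out
}

fn main() {
    let got = canonicalize(vec![(5, 10), (0, 3), (4, 6), (20, 30)]);
    // (0,3) is contiguous with (4,6), which in turn overlaps (5,10).
    assert_eq!(vec![(0, 10), (20, 30)], got);
}
```

The versions restored here append their results to the end of `self.ranges` and then `drain` the old prefix; the in-place variants they replace were part of the errant 0.8.0 optimizations that could leave ranges out of order, as the regression tests below describe.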
@@ -568,13 +486,7 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => { - return if self.upper() < other.lower() { - (Some(self.clone()), Some(other.clone())) - } else { - (Some(other.clone()), Some(self.clone())) - } - } + None => return (Some(self.clone()), Some(other.clone())), Some(union) => union, }; let intersection = match self.intersect(other) { From b99cff05449ba7f61e38e0efb18d4c95c8bc28e3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 11 Oct 2023 12:43:42 -0400 Subject: [PATCH 110/136] regex-syntax-0.8.1 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index f14298299..b0ba658b8 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.0" #:version +version = "0.8.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From ef3e01be4aa73afbc4a3b61d4f08c417a57e0c61 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 09:28:15 -0400 Subject: [PATCH 111/136] syntax: add regression test for the errant HIR interval set optimizations Fixes #1103 Ref #1051, Ref #1102 --- testdata/regression.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/testdata/regression.toml b/testdata/regression.toml index 09b2b1d1c..2954c9118 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -800,3 +800,16 @@ name = "non-prefix-literal-quit-state" regex = '.+\b\n' haystack = "β77\n" matches = [[0, 5]] + +# This is a regression test for some errant HIR interval set operations that +# were made in the regex-syntax 0.8.0 release and then reverted in 0.8.1. The +# issue here is that the HIR produced from the regex had out-of-order ranges. +# +# See: https://github.com/rust-lang/regex/issues/1103 +# Ref: https://github.com/rust-lang/regex/pull/1051 +# Ref: https://github.com/rust-lang/regex/pull/1102 +[[test]] +name = "hir-optimization-out-of-order-class" +regex = '^[[:alnum:]./-]+$' +haystack = "a-b" +matches = [[0, 3]] From 69051b797ba3065663564b382b024c3cb3484bf4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 10:16:36 -0400 Subject: [PATCH 112/136] fuzz: add another HIR interval set regression This is a new test case revealed by OSS-fuzz. It passes in the current releases. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63203 --- ...case-minimized-ast_fuzz_regex-4596093180313600 | Bin 0 -> 329 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 new file mode 100644 index 0000000000000000000000000000000000000000..711817e4ed98c89f3eac4def9acfbea0451dbbf3 GIT binary patch literal 329 zcmeybps-MZMDQQ1{yzjLfGLOx{~ Date: Fri, 13 Oct 2023 09:51:09 -0400 Subject: [PATCH 113/136] bench: add a redirect This directory was linked in a fair number of places, so we re-introduce it with a README pointing folks toward rebar. 
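Similarly, the new `symmetric_difference` fallback in the revert above leans on the identity that the symmetric difference of two sets equals their union minus their intersection. That identity is easy to sanity-check in isolation; a small demonstration with std's `BTreeSet` standing in for the crate's interval sets:

```
use std::collections::BTreeSet;

fn main() {
    let a: BTreeSet<u32> = [1, 2, 3, 4].into_iter().collect();
    let b: BTreeSet<u32> = [3, 4, 5, 6].into_iter().collect();

    // (A union B) minus (A intersect B) ...
    let union: BTreeSet<u32> = a.union(&b).copied().collect();
    let intersection: BTreeSet<u32> = a.intersection(&b).copied().collect();
    let via_identity: BTreeSet<u32> =
        union.difference(&intersection).copied().collect();

    // ... equals the symmetric difference computed directly.
    let direct: BTreeSet<u32> = a.symmetric_difference(&b).copied().collect();
    assert_eq!(via_identity, direct); // both are {1, 2, 5, 6}
}
```

Computing it this way costs extra allocations, hence the `TODO(burntsushi)` in the patch, but it is straightforwardly correct, which is the point of the revert.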
---
 bench/README.md | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 bench/README.md

diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 000000000..3cc6a1a7a
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,2 @@
+Benchmarks for this crate have been moved into the rebar project:
+https://github.com/BurntSushi/rebar

From cfd0ca2428c986777e21542a531b450178bc0cf2 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Thu, 12 Oct 2023 14:15:48 -0400
Subject: [PATCH 114/136] automata/meta: force some prefilter inlining

In some ad hoc profiling, I noticed an extra function call that really
didn't need to be there.

---
 regex-automata/src/meta/strategy.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index ea6c6ab57..5b96d888a 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -353,6 +353,7 @@ impl Pre<()> {
 // strategy when len(patterns)==1 if the number of literals is large. In that
 // case, literal extraction gives up and will return an infinite set.)
 impl<P: PrefilterI> Strategy for Pre<P> {
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn group_info(&self) -> &GroupInfo {
         &self.group_info
     }
@@ -378,6 +379,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         self.pre.memory_usage()
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
         if input.is_done() {
             return None;
@@ -393,6 +395,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
             .map(|sp| Match::new(PatternID::ZERO, sp))
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search_half(
         &self,
         cache: &mut Cache,
@@ -401,10 +404,12 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
         self.search(cache, input).is_some()
     }

+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search_slots(
         &self,
         cache: &mut Cache,
@@ -421,6 +426,7 @@ impl<P: PrefilterI> Strategy for Pre<P>
{ Some(m.pattern()) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, From 04f5d7be4efc542864cc400f5d43fbea4eb9bab6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 14:16:20 -0400 Subject: [PATCH 115/136] syntax: loosen ASCII compatible rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, patterns like `(?-u:☃)` were banned under the logic that Unicode scalar values shouldn't be available unless Unicode mode is enabled. But since patterns are required to be UTF-8, there really isn't any difficulty in just interpreting Unicode literals as their corresponding UTF-8 encoding. Note though that Unicode character classes, even things like `(?-u:[☃])`, remain banned. We probably could make character classes work too, but it's unclear how that plays with ASCII compatible mode requiring that a single byte is the fundamental atom of matching (where as Unicode mode requires that Unicode scalar values are the fundamental atom of matching). --- regex-syntax/src/hir/translate.rs | 46 +++++++------------------------ src/bytes.rs | 4 +-- 2 files changed, 12 insertions(+), 38 deletions(-) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2eff6318c..313a1e9e8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -388,17 +388,10 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err( - self.error(x.span, ErrorKind::UnicodeNotAllowed) - ); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } - } + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, }, Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); @@ -872,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { - if c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + if !c.is_ascii() { + return Ok(None); } // If case folding won't do anything, then don't bother trying. match c { @@ -1211,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { - let cp = u32::from(ch); - if cp <= 0x7F { - Ok(u8::try_from(cp).unwrap()) + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. 
Byte classes don't @@ -1661,16 +1653,7 @@ mod tests { assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?-u)☃"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7) - ), - } - ); + assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { @@ -1748,16 +1731,7 @@ mod tests { ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?i-u)β"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(6, 1, 7), - Position::new(8, 1, 8), - ), - } - ); + assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] diff --git a/src/bytes.rs b/src/bytes.rs index 3f53a3ea5..383ac4a5b 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -68,8 +68,8 @@ bytes: 1. The `u` flag can be disabled even when disabling it might cause the regex to match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in "ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode scalar values nor Unicode -character classes are allowed. +2. In ASCII compatible mode, Unicode character classes are not allowed. Literal +Unicode scalar values outside of character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps to `[[:digit:]]` and `\s` maps to `[[:space:]]`. From 8a8d599f9d2f2d78e9ad84e4084788c2d563afa5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 13 Oct 2023 14:52:09 -0400 Subject: [PATCH 116/136] automata/meta: tweak reverse suffix prefilter strategy Previously, we were only use the reverse suffix optimization if it found a non-empty longest common suffix *and* if the prefilter thought itself was fast. This was a heuristic used in the old regex crate before we grew the "is prefilter fast" heuristic. We change this optimization to just use the "is prefilter fast" heuristic instead of requiring a non-empty longest common suffix. This is, after all, what the inner literal optimization does. And in the inner literal case, one should probably be even more conservative because of the extra work that needs to be done. So if things are going okay with the inner literal optimization, then we should be fine with the reverse suffix optimization doing essentially the same thing. 
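For readers following along: the "longest common suffix" in question is the longest string that every extracted literal ends with. A rough standalone sketch of that computation (a hypothetical helper, not regex-syntax's actual `Seq::longest_common_suffix`):

```
// Returns the longest byte string that every literal in `lits` ends with.
fn longest_common_suffix<'a>(lits: &[&'a [u8]]) -> &'a [u8] {
    let Some(&first) = lits.first() else { return &[] };
    let mut len = first.len();
    for lit in lits.iter().skip(1) {
        // Count how many trailing bytes the current candidate suffix
        // shares with this literal, then shrink the candidate to that.
        let common = first[first.len() - len..]
            .iter()
            .rev()
            .zip(lit.iter().rev())
            .take_while(|(x, y)| x == y)
            .count();
        len = len.min(common);
    }
    &first[first.len() - len..]
}

fn main() {
    let lits: &[&[u8]] = &[b"foobar", b"quuxbar", b"bar"];
    assert_eq!(b"bar".as_slice(), longest_common_suffix(lits));

    // An empty result here is what made the old code skip the
    // optimization entirely.
    let lits: &[&[u8]] = &[b"abc", b"xyz"];
    assert_eq!(b"".as_slice(), longest_common_suffix(lits));
}
```

The commit above drops the requirement that this string be non-empty in favor of the `is_fast` heuristic alone; a later patch in this series reverts that broadening.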
--- regex-automata/src/meta/strategy.rs | 37 ++++++++++------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 5b96d888a..4cb3b29b9 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1167,34 +1167,21 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixes = crate::util::prefilter::suffixes(kind, hirs); - let lcs = match suffixes.longest_common_suffix() { - None => { - debug!( - "skipping reverse suffix optimization because \ - a longest common suffix could not be found", - ); - return Err(core); - } - Some(lcs) if lcs.is_empty() => { - debug!( - "skipping reverse suffix optimization because \ - the longest common suffix is the empty string", - ); - return Err(core); - } - Some(lcs) => lcs, + let suffixseq = crate::util::prefilter::suffixes(kind, hirs); + let Some(suffixes) = suffixseq.literals() else { + debug!( + "skipping reverse suffix optimization because \ + the extract suffix sequence is not finite", + ); + return Err(core); }; - let pre = match Prefilter::new(kind, &[lcs]) { - Some(pre) => pre, - None => { - debug!( - "skipping reverse suffix optimization because \ + let Some(pre) = Prefilter::new(kind, suffixes) else { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); - } + ); + return Err(core); }; if !pre.is_fast() { debug!( From 049d063ba1e8cb2f7203684865b58d6af44357e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:07:52 -0400 Subject: [PATCH 117/136] changelog: 1.10.1 --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b51142218..b5f31bec0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +1.10.1 (2023-10-14) +=================== +This is a new patch release with a minor increase in the number of valid +patterns and a broadening of some literal optimizations. + +New features: + +* [FEATURE 04f5d7be](https://github.com/rust-lang/regex/commit/04f5d7be4efc542864cc400f5d43fbea4eb9bab6): +Loosen ASCII-compatible rules such that regexes like `(?-u:☃)` are now allowed. + +Performance improvements: + +* [PERF 8a8d599f](https://github.com/rust-lang/regex/commit/8a8d599f9d2f2d78e9ad84e4084788c2d563afa5): +Broader the reverse suffix optimization to apply in more cases. 
+ + 1.10.0 (2023-10-09) =================== This is a new minor release of `regex` that adds support for start and end From 1dbeee73b9fcde708502d3d5f799b198fe3a6cf5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:08:27 -0400 Subject: [PATCH 118/136] regex-syntax-0.8.2 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index b0ba658b8..c9ce87da7 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.1" #:version +version = "0.8.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From ee01ec2725279273630d2b1ebc99775b932131b2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:04 -0400 Subject: [PATCH 119/136] deps: bump regex-syntax to 0.8.2 --- Cargo.toml | 2 +- regex-automata/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 88f96b0b1..f3eaf7961 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,7 +181,7 @@ features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.8.0" +version = "0.8.2" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 63554314f..99f9a9220 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.8.0", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.8.2", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" From 488604dd6f053104b008a22b9808e383f283992d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:15 -0400 Subject: [PATCH 120/136] regex-automata-0.4.2 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 99f9a9220..f9f59feb3 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.1" #:version +version = "0.4.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From d242ede2ab07df6b32b9ee86f9ae2ae43252ebfa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:51 -0400 Subject: [PATCH 121/136] deps: bump regex-automata to 0.4.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f3eaf7961..7d5a210b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.4.0" +version = "0.4.2" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 5dff4bd7e3bf8b87a272e31c23f1b64417e4c5de Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 11:09:54 -0400 Subject: [PATCH 122/136] 1.10.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7d5a210b0..45132a906 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.0" #:version +version = "1.10.1" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 466e42ca2bea2480ff367e0e26e3967435ac3e30 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 12:48:09 -0400 Subject: [PATCH 123/136] lite: fix stack overflow in NFA compiler This commit fixes a bug where the parser could produce a very deeply nested Hir value beyond the configured nested limit. This was caused by the fact that the Hir can have some of its nested structures added to it without a corresponding recursive call in the parser. For example, repetition operators. This means that even if we don't blow the nest limit in the parser, the Hir itself can still become nested beyond the limit. This in turn will make it possible to unintentionally overflow the stack in subsequent recursion over the Hir value, such as in the Thompson NFA compiler. We fix this by checking the nesting limit both on every recursive parse call and also on the depth of the final Hir value once parsing is finished but before it has returned to the caller. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 --- ...zed-fuzz_regex_lite_match-4692452983046144 | Bin 0 -> 5437 bytes regex-lite/src/hir/parse.rs | 60 ++++++++++++++++-- regex-lite/tests/fuzz/mod.rs | 17 +++++ 3 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 new file mode 100644 index 0000000000000000000000000000000000000000..184b6ed7019033ef791366a8dfd3287d7d347965 GIT binary patch literal 5437 zcmdPk(umX40wR6=va~WS2!Jy6pj;v;b0XD^Dx+}-aM{z?f>Gl~Lx7YJ;37rwsM^sG z7?~lUOf_$FQN<+cm_ik&j2be+Ltr#5j_?F9>U~HEj8+oxl)wdNjWS3Ifzf~?DY%GK sJQ}J*2Go#K#>4;uYz#;aze`hqWDyORTr@Bfi;-4X#ON% Parser<'a> { /// own routine. impl<'a> Parser<'a> { pub(super) fn parse(&self) -> Result { + let hir = self.parse_inner()?; + // While we also check nesting during parsing, that only checks the + // number of recursive parse calls. It does not necessarily cover + // all possible recursive nestings of the Hir itself. For example, + // repetition operators don't require recursive parse calls. So one + // can stack them arbitrarily without overflowing the stack in the + // *parser*. But then if one recurses over the resulting Hir, a stack + // overflow is possible. So here we check the Hir nesting level + // thoroughly to ensure it isn't nested too deeply. + // + // Note that we do still need the nesting limit check in the parser as + // well, since that will avoid overflowing the stack during parse time + // before the complete Hir value is constructed. 
+ check_hir_nesting(&hir, self.config.nest_limit)?; + Ok(hir) + } + + fn parse_inner(&self) -> Result { let depth = self.increment_depth()?; let mut alternates = vec![]; let mut concat = vec![]; @@ -806,7 +824,7 @@ impl<'a> Parser<'a> { if self.bump_if("?P<") || self.bump_if("?<") { let index = self.next_capture_index()?; let name = Some(Box::from(self.parse_capture_name()?)); - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name, sub }; Ok(Some(Hir::capture(cap))) } else if self.bump_if("?") { @@ -826,11 +844,11 @@ impl<'a> Parser<'a> { } else { assert_eq!(':', self.char()); self.bump(); - self.parse().map(Some) + self.parse_inner().map(Some) } } else { let index = self.next_capture_index()?; - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name: None, sub }; Ok(Some(Hir::capture(cap))) } @@ -1263,6 +1281,38 @@ impl<'a> Parser<'a> { } } +/// This checks the depth of the given `Hir` value, and if it exceeds the given +/// limit, then an error is returned. +fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { + fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { + if depth > limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } + let Some(next_depth) = depth.checked_add(1) else { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + }; + match *hir.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => Ok(()), + HirKind::Repetition(hir::Repetition { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Capture(hir::Capture { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { + for sub in subs.iter() { + recurse(sub, limit, next_depth)?; + } + Ok(()) + } + } + } + recurse(hir, limit, 0) +} + /// Converts the given Hir to a literal char if the Hir is just a single /// character. Otherwise this returns an error. /// @@ -1344,12 +1394,12 @@ mod tests { use super::*; fn p(pattern: &str) -> Hir { - Parser::new(Config::default(), pattern).parse().unwrap() + Parser::new(Config::default(), pattern).parse_inner().unwrap() } fn perr(pattern: &str) -> String { Parser::new(Config::default(), pattern) - .parse() + .parse_inner() .unwrap_err() .to_string() } diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs index 6eb37b50b..747aab040 100644 --- a/regex-lite/tests/fuzz/mod.rs +++ b/regex-lite/tests/fuzz/mod.rs @@ -14,6 +14,23 @@ fn captures_wrong_order_min() { let _ = run(data); } +// Simpler regression test from a failure found by OSS-fuzz[1]. This test, +// when it failed, caused a stack overflow. We fixed it by adding another nest +// check on the Hir value itself, since the Hir type can have depth added to +// it without recursive calls in the parser (which is where the existing nest +// check was). +// +// Many thanks to Addison Crump for coming up with this test case[2]. +// +// [1]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 +// [2]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608#c1 +#[test] +fn many_zero_to_many_reps() { + let pat = format!(".{}", "*".repeat(1 << 15)); + let Ok(re) = regex_lite::RegexBuilder::new(&pat).build() else { return }; + re.is_match(""); +} + // This is the fuzz target function. We duplicate it here since this is the // thing we use to interpret the data. It is ultimately what we want to // succeed. 
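The effect of the fix is easy to observe from regex-lite's public API, since stacked repetition operators nest the `Hir` one level per `*` without any recursive parse calls. A small sketch, assuming regex-lite 0.1.4 or newer:

```
fn main() {
    // 2^15 stacked `*` repetitions: each adds one layer of Hir nesting.
    let pat = format!(".{}", "*".repeat(1 << 15));
    // With the fix, this returns a nest-limit error quickly. Before the
    // fix, building (and later dropping) the deeply nested Hir could
    // overflow the stack.
    match regex_lite::Regex::new(&pat) {
        Ok(re) => {
            re.is_match("");
        }
        Err(err) => println!("rejected: {}", err),
    }
}
```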
From cd79881df40755707ad9f1944b5f34881e1172b0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 12:50:49 -0400 Subject: [PATCH 124/136] regex-lite-0.1.4 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index e09229723..704970f2f 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.3" #:version +version = "0.1.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From 4ae14720e9599830f653ca1a881b42e620eba11e Mon Sep 17 00:00:00 2001 From: Fabio Valentini Date: Sun, 15 Oct 2023 14:33:18 +0200 Subject: [PATCH 125/136] tests: fix compilation of doctests on 32-bit architectures PR #1107 --- regex-automata/src/util/captures.rs | 3 ++- src/regex/bytes.rs | 4 +++- src/regex/string.rs | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index cd3a5f8f7..05db6a993 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -433,7 +433,6 @@ impl Captures { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; @@ -445,6 +444,8 @@ impl Captures { /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); /// // Looking for a non-existent capturing group will return None: /// assert_eq!(None, caps.get_group(3)); + /// # // literals are too big for 32-bit usize: #1039 + /// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, caps.get_group(9944060567225171988)); /// /// # Ok::<(), Box>(()) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index c742b095a..19f5701af 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -2025,7 +2025,6 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` -/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); @@ -2038,7 +2037,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] diff --git a/src/regex/string.rs b/src/regex/string.rs index 177a2af34..880d6082a 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -2028,7 +2028,6 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` -/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); @@ -2041,7 +2040,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. 
/// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] From 0086dec69a77a9e1153e97cd050ab567b5c7f109 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 13:18:09 -0400 Subject: [PATCH 126/136] lite: fix stack overflow test It turns out that we missed another case where the stack could overflow: dropping a deeply nested Hir. Namely, since we permit deeply nested Hirs to be constructed and only reject them after determining they are too deeply nested, they still then need to be dropped. We fix this by implementing a custom a Drop impl that uses the heap to traverse the Hir and drop things without using unbounded stack space. An alternative way to fix this would be to adjust the parser somehow to avoid building deeply nested Hir values in the first place. But that seems trickier, so we just stick with this for now. --- regex-lite/src/hir/mod.rs | 60 ++++++++++++++++++++++++++++++++++++ regex-lite/src/hir/parse.rs | 6 ++-- regex-lite/tests/fuzz/mod.rs | 2 +- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index 3d61ce8c9..6e5348a5b 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -366,6 +366,24 @@ impl Hir { } } +impl HirKind { + /// Returns a slice of this kind's sub-expressions, if any. + fn subs(&self) -> &[Hir] { + use core::slice::from_ref; + + match *self { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct Class { pub(crate) ranges: Vec, @@ -747,3 +765,45 @@ fn prev_char(ch: char) -> Option { // and U+E000 yields a valid scalar value. 
Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) } + +impl Drop for Hir { + fn drop(&mut self) { + use core::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index 0dcccdd46..ca93b8838 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -1328,8 +1328,10 @@ fn into_class_item_range(hir: Hir) -> Result { } } -fn into_class_item_ranges(hir: Hir) -> Result, Error> { - match hir.kind { +fn into_class_item_ranges( + mut hir: Hir, +) -> Result, Error> { + match core::mem::replace(&mut hir.kind, HirKind::Empty) { HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), HirKind::Class(hir::Class { ranges }) => Ok(ranges), _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs index 747aab040..5a721f142 100644 --- a/regex-lite/tests/fuzz/mod.rs +++ b/regex-lite/tests/fuzz/mod.rs @@ -27,7 +27,7 @@ fn captures_wrong_order_min() { #[test] fn many_zero_to_many_reps() { let pat = format!(".{}", "*".repeat(1 << 15)); - let Ok(re) = regex_lite::RegexBuilder::new(&pat).build() else { return }; + let Ok(re) = regex_lite::Regex::new(&pat) else { return }; re.is_match(""); } From e7bd19dd3ebf4b1a861275f0353202bf93a39ab1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 15 Oct 2023 09:24:20 -0400 Subject: [PATCH 127/136] regex-lite-0.1.5 --- regex-lite/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 704970f2f..0ba53485b 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.4" #:version +version = "0.1.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" From eb950f65e660a45c7e123f3c6fba9f2c86b4a256 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:20:25 -0400 Subject: [PATCH 128/136] automata/meta: revert broadening of reverse suffix optimization This reverts commit 8a8d599f9d2f2d78e9ad84e4084788c2d563afa5 and includes a regression test, as well as a tweak to a log message. Essentially, the broadening was improper. We have to be careful when dealing with suffixes as opposed to prefixes. Namely, my logic previously was that the broadening was okay because we were already doing it for the reverse inner optimization. But the reverse inner optimization works with prefixes, not suffixes. So the comparison wasn't quite correct. 
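Concretely, the sort of case the broadened optimization mishandled is the one captured by the regression test added later in this patch, via the ruff report linked below. As a runnable sketch against the fixed crate, assuming regex 1.10.2 or newer:

```
fn main() {
    let re = regex::Regex::new(r"(\\N\{[^}]+})|([{}])").unwrap();
    // The suffix-based prefilter must not cause the leftmost match to be
    // missed or misreported; the correct match here is `\N{snowman}`.
    let m = re.find(r"hiya \N{snowman} bye").unwrap();
    assert_eq!((5, 16), (m.start(), m.end()));
}
```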
This goes back to only applying the reverse suffix optimization when there is a non-empty single common suffix. Fixes #1110 Ref https://github.com/astral-sh/ruff/pull/7980 --- regex-automata/src/meta/strategy.rs | 39 +++++++++++++++++++---------- testdata/regression.toml | 15 +++++++++++ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 4cb3b29b9..04f2ba3c3 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1167,21 +1167,34 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixseq = crate::util::prefilter::suffixes(kind, hirs); - let Some(suffixes) = suffixseq.literals() else { - debug!( - "skipping reverse suffix optimization because \ - the extract suffix sequence is not finite", - ); - return Err(core); + let suffixes = crate::util::prefilter::suffixes(kind, hirs); + let lcs = match suffixes.longest_common_suffix() { + None => { + debug!( + "skipping reverse suffix optimization because \ + a longest common suffix could not be found", + ); + return Err(core); + } + Some(lcs) if lcs.is_empty() => { + debug!( + "skipping reverse suffix optimization because \ + the longest common suffix is the empty string", + ); + return Err(core); + } + Some(lcs) => lcs, }; - let Some(pre) = Prefilter::new(kind, suffixes) else { - debug!( - "skipping reverse suffix optimization because \ + let pre = match Prefilter::new(kind, &[lcs]) { + Some(pre) => pre, + None => { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); + ); + return Err(core); + } }; if !pre.is_fast() { debug!( @@ -1268,7 +1281,7 @@ impl ReverseSuffix { e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( - "using lazy DFA for reverse inner search at {:?}, \ + "using lazy DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, diff --git a/testdata/regression.toml b/testdata/regression.toml index 2954c9118..53b0701a3 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -813,3 +813,18 @@ name = "hir-optimization-out-of-order-class" regex = '^[[:alnum:]./-]+$' haystack = "a-b" matches = [[0, 3]] + +# This is a regression test for an improper reverse suffix optimization. This +# occurred when I "broadened" the applicability of the optimization to include +# multiple possible literal suffixes instead of only sticking to a non-empty +# longest common suffix. It turns out that, at least given how the reverse +# suffix optimization works, we need to stick to the longest common suffix for +# now. 
+# +# See: https://github.com/rust-lang/regex/issues/1110 +# See also: https://github.com/astral-sh/ruff/pull/7980 +[[test]] +name = 'improper-reverse-suffix-optimization' +regex = '(\\N\{[^}]+})|([{}])' +haystack = 'hiya \N{snowman} bye' +matches = [[[5, 16], [5, 16], []]] From 50fe7d177db6854ea1a2b1d04d3db75ec544f39c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:45:26 -0400 Subject: [PATCH 129/136] changelog: 1.10.2 --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5f31bec0..420e08f74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +1.10.2 (2023-10-16) +=================== +This is a new patch release that fixes a search regression where incorrect +matches could be reported. + +Bug fixes: + +* [BUG #1110](https://github.com/rust-lang/regex/issues/1110): +Revert broadening of reverse suffix literal optimization introduced in 1.10.1. + + 1.10.1 (2023-10-14) =================== This is a new patch release with a minor increase in the number of valid From 61242b1e0e9941dadc5ec7c6cd7391db3cca5710 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:45:33 -0400 Subject: [PATCH 130/136] regex-automata-0.4.3 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index f9f59feb3..3cb3d7c8e 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.2" #:version +version = "0.4.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 1a54a829ba730257cbb8ed53521db11be318c43e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:46:01 -0400 Subject: [PATCH 131/136] deps: bump regex-automata to 0.4.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 45132a906..55108a968 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,7 +174,7 @@ optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.4.2" +version = "0.4.3" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 5f1f1c8b6db4d1fd373ef1ab4eab05a8f66c4235 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:46:02 -0400 Subject: [PATCH 132/136] 1.10.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 55108a968..3ba14c904 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.1" #:version +version = "1.10.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 20b5317f7a8accbf64ee21245b0a37f636017e13 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 Oct 2023 07:52:52 -0400 Subject: [PATCH 133/136] automata: fix panic in dense DFA deserialization This fixes a hole in the validation logic that accidentally permitted a dense DFA to contain a match state with zero pattern IDs. Since search code is permitted to assume that every match state has at least one corresponding pattern ID, this led to a panic. 
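For reference, the validation being hardened lives behind the checked deserialization entry point, so the new rejection surfaces as a `DeserializeError` rather than a panic during search. A round-trip sketch, assuming regex-automata with its default features (which include the `dfa` support used here):

```
use regex_automata::{
    dfa::{dense, Automaton},
    HalfMatch, Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new(r"foo[0-9]+")?;
    // Serialize, then deserialize through the checked API. `from_bytes`
    // validates the transition table, start table, match states and
    // accelerators before handing back a usable DFA.
    let (bytes, pad) = dfa.to_bytes_native_endian();
    let dfa2: dense::DFA<&[u32]> = dense::DFA::from_bytes(&bytes[pad..])?.0;
    let expected = Some(HalfMatch::must(0, 8));
    assert_eq!(expected, dfa2.try_search_fwd(&Input::new("foo12345"))?);
    Ok(())
}
```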
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63391 --- ...ata_deserialize_dense_dfa-5624222820728832 | Bin 0 -> 749 bytes regex-automata/src/dfa/dense.rs | 20 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 new file mode 100644 index 0000000000000000000000000000000000000000..e236ae735c7f413c90a0e9b61cc4add46ced15e7 GIT binary patch literal 749 zcmd5)TMED+469=gkFon-(j{nQDi!=03sh-q(;8}bMmo$a!2W|zr>V_O2(ZE8>!v1* z6U=#_h8}h#XfDExxv!cs^^Zrt{#L1#-lYZ{(nKt)H^)$Ct|9sII*#X6$oXD13{eTq K(N?9_h4%nug9@bp literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fd96bc878..6fc61dc4f 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -2340,8 +2340,8 @@ impl<'a> DFA<&'a [u32]> { // table, match states and accelerators below. If any validation fails, // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.tt)?; + dfa.tt.validate(&dfa)?; + dfa.st.validate(&dfa)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, @@ -3593,7 +3593,8 @@ impl> TransitionTable { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let sp = &dfa.special; for state in self.states() { // We check that the ID itself is well formed. That is, if it's // a special state then it must actually be a quit, dead, accel, @@ -3611,6 +3612,13 @@ impl> TransitionTable { wasn't actually special", )); } + if sp.is_match_state(state.id()) + && dfa.match_len(state.id()) == 0 + { + return Err(DeserializeError::generic( + "found match state with zero pattern IDs", + )); + } } for (_, to) in state.transitions() { if !self.is_valid(to) { @@ -4127,10 +4135,8 @@ impl> StartTable { /// it against the given transition table (which must be for the same DFA). /// /// That is, every state ID can be used to correctly index a state. - fn validate( - &self, - tt: &TransitionTable, - ) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let tt = &dfa.tt; if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal unanchored starting state ID", From 6b72eec64b428859702ae5ee811048112af5269e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 25 Oct 2023 09:37:53 -0400 Subject: [PATCH 134/136] syntax: add Hir::literal example for `char` The example shows a succinct way of creating an HIR literal from a `char` value by first encoding it to UTF-8. 
Closes #1114 --- regex-syntax/src/hir/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ce38ead7b..ae3ba318e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -322,6 +322,22 @@ impl Hir { /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` + /// + /// # Example: building a literal from a `char` + /// + /// This example shows how to build a single `Hir` literal from a `char` + /// value. Since a [`Literal`] is just bytes, we just need to UTF-8 + /// encode a `char` value: + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let ch = '☃'; + /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); + /// + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, got.kind()); + /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); From 662a8b93afa55b5c489f14bca83565ebe62ccf67 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Nov 2023 11:52:44 -0400 Subject: [PATCH 135/136] cli: change --no-captures to --captures (all|implicit|none) When we added the WhichCaptures type, we didn't update the CLI to expose the full functionality. This change does that. --- regex-automata/src/nfa/thompson/map.rs | 2 +- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-cli/args/flags.rs | 52 +++++++++++++++++++ regex-cli/args/thompson.rs | 24 +++------ 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c92d4c0b8..7f074a353 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 75c9b796b..cd77cc150 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-cli/args/flags.rs b/regex-cli/args/flags.rs index db8a847ef..61732a28e 100644 --- a/regex-cli/args/flags.rs +++ b/regex-cli/args/flags.rs @@ -152,3 +152,55 @@ impl std::str::FromStr for MatchKind { Ok(MatchKind { kind }) } } + +/// Provides an implementation of the --captures flag, for use with Thompson +/// NFA configuration. +#[derive(Debug)] +pub struct WhichCaptures { + pub which: regex_automata::nfa::thompson::WhichCaptures, +} + +impl WhichCaptures { + pub const USAGE: Usage = Usage::new( + "--captures ", + "One of: all, implicit or none.", + r#" +Selects which capture states should be included in the Thompson NFA. 
The +choices are 'all' (the default), 'implicit' or 'none'. + +'all' means that both explicit and implicit capture states are included. + +'implicit' means that only implicit capture states are included. That is, the +Thompson NFA will only be able to report the overall match offsets and not the +match offsets of each explicit capture group. + +'none' means that no capture states will be included. This is useful when +capture states aren't needed (like when building a DFA) or if they aren't +supported (like when building a reverse NFA). +"#, + ); +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures { + which: regex_automata::nfa::thompson::WhichCaptures::All, + } + } +} + +impl std::str::FromStr for WhichCaptures { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let which = match s { + "all" => regex_automata::nfa::thompson::WhichCaptures::All, + "implicit" => { + regex_automata::nfa::thompson::WhichCaptures::Implicit + } + "none" => regex_automata::nfa::thompson::WhichCaptures::None, + unk => anyhow::bail!("unrecognized captures option '{}'", unk), + }; + Ok(WhichCaptures { which }) + } +} diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 151fc6a0b..bd8388d11 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -70,11 +70,11 @@ impl Configurable for Config { Arg::Long("shrink") => { self.thompson = self.thompson.clone().shrink(true); } - Arg::Long("no-captures") => { - self.thompson = self - .thompson - .clone() - .which_captures(thompson::WhichCaptures::None); + Arg::Long("captures") => { + let which: flags::WhichCaptures = + args::parse(p, "--captures")?; + self.thompson = + self.thompson.clone().which_captures(which.which); } Arg::Long("line-terminator") => { let byte: flags::OneByte = @@ -136,19 +136,7 @@ spent shrinking the NFA can lead to far larger savings in the subsequent DFA determinization. "#, ), - Usage::new( - "--no-captures", - "Disable capture states.", - r#" -Disables capture states. By default, NFAs include special "capture" states that -instruct some regex engines (like the PikeVM) to record offset positions in -ancillary state. - -It can be useful to disable capture states in order to reduce "clutter" in the -automaton when debugging it. Also, at time of writing, reverse NFAs require -that capture groups are disabled. -"#, - ), + flags::WhichCaptures::USAGE, Usage::new( "--line-terminator", "Set the line terminator used by line anchors.", From 837fd85e79fac2a4ea64030411b9a4a7b17dfa42 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Nov 2023 11:53:34 -0400 Subject: [PATCH 136/136] regex-cli-0.2.0 --- regex-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 3fe5390aa..a107c09df 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.1" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular