diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25df2b3014..2813a16763 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,10 +54,14 @@ jobs: os: ubuntu-latest rust: stable target: i686-unknown-linux-gnu - - build: stable-mips + - build: stable-powerpc64 os: ubuntu-latest rust: stable - target: mips64-unknown-linux-gnuabi64 + target: powerpc64-unknown-linux-gnu + - build: stable-s390x + os: ubuntu-latest + rust: stable + target: s390x-unknown-linux-gnu - build: beta os: ubuntu-latest rust: beta @@ -77,7 +81,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} - name: Install and configure Cross @@ -92,12 +96,6 @@ jobs: cd "$dir" curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" tar xf cross-x86_64-unknown-linux-musl.tar.gz - - # We used to install 'cross' from master, but it kept failing. So now - # we build from a known-good version until 'cross' becomes more stable - # or we find an alternative. Notably, between v0.2.1 and current - # master (2022-06-14), the number of Cross's dependencies has doubled. - # cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1 echo "CARGO=cross" >> $GITHUB_ENV echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV - name: Show command used for Cargo @@ -141,9 +139,28 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: - toolchain: 1.60.0 + toolchain: 1.65.0 + # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it + # turned out that on aarch64, it was using something that wasn't stabilized + # until Rust 1.61[1]. (This was an oversight on my part. I had previously + # thought everything I needed was on Rust 1.60.) To resolve that, I just + # bumped memchr's MSRV to 1.61. Since it was so soon after the memchr 2.6 + # release, I treated this as a bugfix. + # + # But the regex crate's MSRV is at Rust 1.60, and it now depends on at + # least memchr 2.6 (to make use of its `alloc` feature). So we can't set + # a lower minimal version. And I can't just bump the MSRV in a patch + # release as a bug fix because regex 1.9 was released quite some time ago. + # I could just release regex 1.10 and bump the MSRV there, but eh, I don't + # want to put out another minor version release just for this. + # + # So... pin memchr to 2.6.2, which at least works on x86-64 on Rust 1.60. 
+ # + # [1]: https://github.com/BurntSushi/memchr/issues/136 + - name: Pin memchr to 2.6.2 + run: cargo update -p memchr --precise 2.6.2 - name: Basic build run: cargo build --verbose - name: Build docs run: cargo doc --verbose @@ -162,7 +179,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -175,7 +192,7 @@ - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -188,7 +205,7 @@ - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -201,7 +218,7 @@ - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable - name: Run full test suite @@ -216,7 +233,7 @@ - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: # We use nightly here so that we can use miri I guess? # It caught me by surprise that miri seems to only be @@ -233,7 +250,7 @@ - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust - uses: dtolnay/rust-toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable components: rustfmt diff --git a/CHANGELOG.md b/CHANGELOG.md index a6a2bcb411..420e08f741 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,198 @@ +1.10.2 (2023-10-16) +=================== +This is a new patch release that fixes a search regression where incorrect +matches could be reported. + +Bug fixes: + +* [BUG #1110](https://github.com/rust-lang/regex/issues/1110): +Revert broadening of reverse suffix literal optimization introduced in 1.10.1. + + +1.10.1 (2023-10-14) +=================== +This is a new patch release with a minor increase in the number of valid +patterns and a broadening of some literal optimizations. + +New features: + +* [FEATURE 04f5d7be](https://github.com/rust-lang/regex/commit/04f5d7be4efc542864cc400f5d43fbea4eb9bab6): +Loosen ASCII-compatible rules such that regexes like `(?-u:☃)` are now allowed. + +Performance improvements: + +* [PERF 8a8d599f](https://github.com/rust-lang/regex/commit/8a8d599f9d2f2d78e9ad84e4084788c2d563afa5): +Broaden the reverse suffix optimization to apply in more cases. + + +1.10.0 (2023-10-09) +=================== +This is a new minor release of `regex` that adds support for start and end +word boundary assertions. That is, `\<` and `\>`. The minimum supported Rust +version has also been raised to 1.65, which was released about one year ago. + +The new word boundary assertions are: + +* `\<` or `\b{start}`: a Unicode start-of-word boundary (`\W|\A` on the left, +`\w` on the right). +* `\>` or `\b{end}`: a Unicode end-of-word boundary (`\w` on the left, `\W|\z` +on the right). +* `\b{start-half}`: half of a Unicode start-of-word boundary (`\W|\A` on the +left). +* `\b{end-half}`: half of a Unicode end-of-word boundary (`\W|\z` on the +right). + +The `\<` and `\>` are GNU extensions to POSIX regexes. They have been added
to the `regex` crate because they enjoy somewhat broad support in other regex
engines as well (for example, vim).
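For example, a minimal sketch using the new assertions (this assumes regex 1.10+ as a dependency and is illustrative only):

```rust
use regex::Regex;

fn main() {
    // \< and \> only match at the start and end of a word, respectively.
    let re = Regex::new(r"\<cat\>").unwrap();
    assert!(re.is_match("the cat sat"));
    // 'cat' occurs inside 'concatenate', but not as a whole word.
    assert!(!re.is_match("concatenate"));
}
```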
The `\b{start}` and `\b{end}` assertions +are aliases for `\<` and `\>`, respectively. + +The `\b{start-half}` and `\b{end-half}` assertions are not found in any +other regex engine (although regex engines with general look-around support +can certainly express them). They were added principally to support the +implementation of word matching in grep programs, where one generally wants to +be a bit more flexible in what is considered a word boundary. + +New features: + +* [FEATURE #469](https://github.com/rust-lang/regex/issues/469): +Add support for `\<` and `\>` word boundary assertions. +* [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031): +DFAs now have a `start_state` method that doesn't use an `Input`. + +Performance improvements: + +* [PERF #1051](https://github.com/rust-lang/regex/pull/1051): +Unicode character class operations have been optimized in `regex-syntax`. +* [PERF #1090](https://github.com/rust-lang/regex/issues/1090): +Make patterns containing lots of literal characters use less memory. + +Bug fixes: + +* [BUG #1046](https://github.com/rust-lang/regex/issues/1046): +Fix a bug that could result in incorrect match spans when using a Unicode word +boundary and searching non-ASCII strings. +* [BUG(regex-syntax) #1047](https://github.com/rust-lang/regex/issues/1047): +Fix panics that can occur in `Ast->Hir` translation (not reachable from `regex` +crate). +* [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088): +Remove guarantees in the API that connect the `u` flag with a specific HIR +representation. + +`regex-automata` breaking change release: + +This release includes a `regex-automata 0.4.0` breaking change release, which +was necessary in order to support the new word boundary assertions. For +example, the `Look` enum has new variants and the `LookSet` type now uses `u32` +instead of `u16` to represent a bitset of look-around assertions. These are +overall very minor changes, and most users of `regex-automata` should be able +to move to `0.4` from `0.3` without any changes at all. + +`regex-syntax` breaking change release: + +This release also includes a `regex-syntax 0.8.0` breaking change release, +which, like `regex-automata`, was necessary in order to support the new word +boundary assertions. This release also includes some changes to the `Ast` +type to reduce heap usage in some cases. If you are using the `Ast` type +directly, your code may require some minor modifications. Otherwise, users of +`regex-syntax 0.7` should be able to migrate to `0.8` without any code changes. + +`regex-lite` release: + +The `regex-lite 0.1.1` release contains support for the new word boundary +assertions. There are no breaking changes. + + +1.9.6 (2023-09-30) +================== +This is a patch release that fixes a panic that can occur when the default +regex size limit is increased to a large number. + +* [BUG aa4e4c71](https://github.com/rust-lang/regex/commit/aa4e4c7120b0090ce0624e3c42a2ed06dd8b918a): +Fix a bug where computing the maximum haystack length for the bounded +backtracker could result in underflow and thus provoke a panic later in a search +due to a broken invariant. + + +1.9.5 (2023-09-02) +================== +This is a patch release that hopefully mostly fixes a performance bug that +occurs when sharing a regex across multiple threads. + +Issue [#934](https://github.com/rust-lang/regex/issues/934) +explains this in more detail.
It is [also noted in the crate +documentation](https://docs.rs/regex/latest/regex/#sharing-a-regex-across-threads-can-result-in-contention). +The bug can appear when sharing a regex across multiple threads simultaneously, +as might be the case when using a regex from a `OnceLock`, `lazy_static` or +similar primitive. Usually high contention only results when using many threads +to execute searches on small haystacks. + +One can avoid the contention problem entirely through one of two methods. +The first is to use lower level APIs from `regex-automata` that require passing +state explicitly, such as [`meta::Regex::search_with`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html#method.search_with). +The second is to clone a regex and send it to other threads explicitly. This +will not use any additional memory compared to sharing the regex. The +only downside of this approach is that it may be less convenient; for example, +it won't work with things like `OnceLock` or `lazy_static` or `once_cell`. + +With that said, as of this release, the contention performance problems have +been greatly reduced. This was achieved by changing the free-list so that it +was sharded across threads, and by ensuring that each sharded mutex occupies a +single cache line to mitigate false sharing. So while contention may still +impact performance in some cases, it should be a lot better now. + +Because of the changes to how the free-list works, please report any issues you +find with this release. That not only includes search time regressions but also +significant regressions in memory usage. Reporting improvements is welcome +as well! If possible, provide a reproduction. + +Bug fixes: + +* [BUG #934](https://github.com/rust-lang/regex/issues/934): +Fix a performance bug where high contention on a single regex led to massive +slow downs. + + +1.9.4 (2023-08-26) +================== +This is a patch release that fixes a bug where `RegexSet::is_match(..)` could +incorrectly return false (even when `RegexSet::matches(..).matched_any()` +returns true). + +Bug fixes: + +* [BUG #1070](https://github.com/rust-lang/regex/issues/1070): +Fix a bug where a prefilter was incorrectly configured for a `RegexSet`. + + +1.9.3 (2023-08-05) +================== +This is a patch release that fixes a bug where some searches could result in +incorrect match offsets being reported. It is difficult to characterize the +types of regexes susceptible to this bug. They generally involve patterns +that contain no prefix or suffix literals, but have an inner literal along with +a regex prefix that can conditionally match. + +Bug fixes: + +* [BUG #1060](https://github.com/rust-lang/regex/issues/1060): +Fix a bug with the reverse inner literal optimization reporting incorrect match +offsets. + + +1.9.2 (2023-08-05) +================== +This is a patch release that fixes another memory usage regression. This +particular regression occurred only when using a `RegexSet`. In some cases, +much more heap memory (by one or two orders of magnitude) was allocated than in +versions prior to 1.9.0. + +Bug fixes: + +* [BUG #1059](https://github.com/rust-lang/regex/issues/1059): +Fix a memory usage regression when using a `RegexSet`. + + 1.9.1 (2023-07-07) ================== This is a patch release which fixes a memory usage regression.
In the regex diff --git a/Cargo.toml b/Cargo.toml index bfd6aea615..3ba14c904c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.9.1" #:version +version = "1.10.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"] license = "MIT OR Apache-2.0" readme = "README.md" @@ -15,7 +15,7 @@ categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" [workspace] members = [ @@ -52,6 +52,7 @@ std = [ # to actually emit the log messages somewhere. logging = [ "aho-corasick?/logging", + "memchr?/logging", "regex-automata/logging", ] # The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until @@ -167,20 +168,20 @@ optional = true # For skipping along search text quickly when a leading byte is known. [dependencies.memchr] -version = "2.5.0" +version = "2.6.0" optional = true # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.3.1" +version = "0.4.3" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.7.3" +version = "0.8.2" default-features = false [dev-dependencies] diff --git a/HACKING.md b/HACKING.md deleted file mode 100644 index 34af5b517c..0000000000 --- a/HACKING.md +++ /dev/null @@ -1,341 +0,0 @@ -Your friendly guide to hacking and navigating the regex library. - -This guide assumes familiarity with Rust and Cargo, and at least a perusal of -the user facing documentation for this crate. - -If you're looking for background on the implementation in this library, then -you can do no better than Russ Cox's article series on implementing regular -expressions using finite automata: https://swtch.com/~rsc/regexp/ - - -## Architecture overview - -As you probably already know, this library executes regular expressions using -finite automata. In particular, a design goal is to make searching linear -with respect to both the regular expression and the text being searched. -Meeting that design goal on its own is not so hard and can be done with an -implementation of the Pike VM (similar to Thompson's construction, but supports -capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html --- This library contains such an implementation in src/pikevm.rs. - -Making it fast is harder. One of the key problems with the Pike VM is that it -can be in more than one state at any point in time, and must shuffle capture -positions between them. The Pike VM also spends a lot of time following the -same epsilon transitions over and over again. We can employ one trick to -speed up the Pike VM: extract one or more literal prefixes from the regular -expression and execute specialized code to quickly find matches of those -prefixes in the search text. The Pike VM can then be avoided for most of the -search, and instead only executed when a prefix is found. The code to find -prefixes is in the regex-syntax crate (in this repository). The code to search -for literals is in src/literals.rs. When more than one literal prefix is found, -we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one -literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and -Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this -library also uses elementary frequency analysis to choose the right byte to run -`memchr` with.
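To make the prefix-skipping trick concrete, here is a minimal sketch using the `memchr` crate to jump between candidate positions. The `run_engine_at` callback is hypothetical; in the old library this dispatch lived in src/exec.rs and src/literals.rs.

```rust
// Sketch only: find a match by skipping to occurrences of a known literal
// prefix byte, and run the (slow) full engine only at those candidates.
fn find_with_prefix_byte(
    haystack: &[u8],
    prefix: u8,
    run_engine_at: impl Fn(usize) -> Option<(usize, usize)>,
) -> Option<(usize, usize)> {
    let mut at = 0;
    while at < haystack.len() {
        // memchr quickly locates the next possible match start.
        let candidate = at + memchr::memchr(prefix, &haystack[at..])?;
        if let Some(m) = run_engine_at(candidate) {
            return Some(m);
        }
        // No match at this candidate; resume scanning just past it.
        at = candidate + 1;
    }
    None
}
```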
- -Of course, detecting prefix literals can only take us so far. Not all regular -expressions have literal prefixes. To remedy this, we try another approach -to executing the Pike VM: backtracking, whose implementation can be found in -src/backtrack.rs. One reason why backtracking can be faster is that it avoids -excessive shuffling of capture groups. Of course, backtracking is susceptible -to exponential runtimes, so we keep track of every state we've visited to make -sure we never visit it again. This guarantees linear time execution, but we -pay for it with the memory required to track visited states. Because of the -memory requirement, we only use this engine on small search strings *and* small -regular expressions. - -Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. -It is distinct from the Pike VM in that the DFA is explicitly represented in -memory and is only ever in one state at a time. It is said to be "lazy" because -the DFA is computed as text is searched, where each byte in the search text -results in at most one new DFA state. It is made fast by caching states. DFAs -are susceptible to exponential state blow up (where the worst case is computing -a new state for every input byte, regardless of what's in the state cache). To -avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache -is full, it is wiped and state computation starts over again. If the cache is -wiped too frequently, then the DFA gives up and searching falls back to one of -the aforementioned algorithms. - -All of the above matching engines expose precisely the same matching semantics. -This is indeed tested. (See the section below about testing.) - -The following sub-sections describe the rest of the library and how each of the -matching engines are actually used. - -### Parsing - -Regular expressions are parsed using the regex-syntax crate, which is -maintained in this repository. The regex-syntax crate defines an abstract -syntax and provides very detailed error messages when a parse error is -encountered. Parsing is done in a separate crate so that others may benefit -from its existence, and because it is relatively divorced from the rest of the -regex library. - -The regex-syntax crate also provides sophisticated support for extracting -prefix and suffix literals from regular expressions. - -### Compilation - -The compiler is in src/compile.rs. The input to the compiler is some abstract -syntax for a regular expression and the output is a sequence of opcodes that -matching engines use to execute a search. (One can think of matching engines as -mini virtual machines.) The sequence of opcodes is a particular encoding of a -non-deterministic finite automaton. In particular, the opcodes explicitly rely -on epsilon transitions. - -Consider a simple regular expression like `a|b`. Its compiled form looks like -this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' (goto: 4) - 003 'b' - 004 Save(1) - 005 Match - -The first column is the instruction pointer and the second column is the -instruction. Save instructions indicate that the current position in the input -should be stored in a captured location. Split instructions represent a binary -branch in the program (i.e., epsilon transitions). The instructions `'a'` and -`'b'` indicate that the literal bytes `'a'` or `'b'` should match. 
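For illustration, the opcode listing above might be modeled with a type like the following. This is a sketch with made-up names; the real instruction definitions lived in src/prog.rs.

```rust
// A made-up instruction type mirroring the listing above.
enum Inst {
    // Store the current input position in the given capture slot.
    Save(usize),
    // An epsilon transition that tries both branch targets.
    Split(usize, usize),
    // Match a literal char, then continue at the embedded goto target.
    Char(char, usize),
    // The regex has matched.
    Match,
}

// The compiled form of `a|b`, with goto pointers embedded into the
// instructions so that no separate Jump opcodes are needed.
fn program_for_a_or_b() -> Vec<Inst> {
    vec![
        Inst::Save(0),      // 000
        Inst::Split(2, 3),  // 001
        Inst::Char('a', 4), // 002 (goto: 4)
        Inst::Char('b', 4), // 003
        Inst::Save(1),      // 004
        Inst::Match,        // 005
    ]
}
```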
- -In older versions of this library, the compilation looked like this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' - 003 Jump(5) - 004 'b' - 005 Save(1) - 006 Match - -In particular, empty instructions that merely served to move execution from one -point in the program to another were removed. Instead, every instruction has a -`goto` pointer embedded into it. This resulted in a small performance boost for -the Pike VM, because it was one fewer epsilon transition that it had to follow. - -There exist more instructions and they are defined and documented in -src/prog.rs. - -Compilation has several knobs and a few unfortunately complicated invariants. -Namely, the output of compilation can be one of two types of programs: a -program that executes on Unicode scalar values or a program that executes -on raw bytes. In the former case, the matching engine is responsible for -performing UTF-8 decoding and executing instructions using Unicode codepoints. -In the latter case, the program handles UTF-8 decoding implicitly, so that the -matching engine can execute on raw bytes. All matching engines can execute -either Unicode or byte based programs except for the lazy DFA, which requires -byte based programs. In general, both representations were kept because (1) the -lazy DFA requires byte based programs so that states can be encoded in a memory -efficient manner and (2) the Pike VM benefits greatly from inlining Unicode -character classes into fewer instructions as it results in fewer epsilon -transitions. - -N.B. UTF-8 decoding is built into the compiled program by making use of the -utf8-ranges crate. The compiler in this library factors out common suffixes to -reduce the size of huge character classes (e.g., `\pL`). - -A regrettable consequence of this split in instruction sets is we generally -need to compile two programs; one for NFA execution and one for the lazy DFA. - -In fact, it is worse than that: the lazy DFA is not capable of finding the -starting location of a match in a single scan, and must instead execute a -backwards search after finding the end location. To execute a backwards search, -we must have compiled the regular expression *in reverse*. - -This means that every compilation of a regular expression generally results in -three distinct programs. It would be possible to lazily compile the Unicode -program, since it is never needed if (1) the regular expression uses no word -boundary assertions and (2) the caller never asks for sub-capture locations. - -### Execution - -At the time of writing, there are four matching engines in this library: - -1. The Pike VM (supports captures). -2. Bounded backtracking (supports captures). -3. Literal substring or multi-substring search. -4. Lazy DFA (no support for Unicode word boundary assertions). - -Only the first two matching engines are capable of executing every regular -expression program. They also happen to be the slowest, which means we need -some logic that (1) knows various facts about the regular expression and (2) -knows what the caller wants. Using this information, we can determine which -engine (or engines) to use. - -The logic for choosing which engine to execute is in src/exec.rs and is -documented on the Exec type. Exec values contain regular expression Programs -(defined in src/prog.rs), which contain all the necessary tidbits for actually -executing a regular expression on search text. 
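As a rough illustration of the kind of decision logic described above (the names and thresholds here are invented; the real logic was documented on the Exec type in src/exec.rs):

```rust
// Illustrative only: pick a matching engine from facts about the regex
// and the request, per the constraints described in this guide.
enum Engine {
    Literal,   // pure literal search (memchr / Aho-Corasick / Boyer-Moore)
    LazyDfa,   // fastest general engine, but no capture support
    Backtrack, // supports captures; safe only for small regex/input pairs
    PikeVm,    // supports everything; slowest
}

fn choose_engine(
    is_pure_literal: bool,
    has_unicode_word_boundary: bool,
    wants_captures: bool,
    program_len: usize,
    haystack_len: usize,
) -> Engine {
    if is_pure_literal {
        Engine::Literal
    } else if !wants_captures && !has_unicode_word_boundary {
        // The lazy DFA never reports capture locations and cannot handle
        // Unicode word boundaries, but it is by far the fastest engine.
        Engine::LazyDfa
    } else if program_len.saturating_mul(haystack_len) <= 1 << 16 {
        // Bounded backtracking tracks visited (instruction, position) pairs,
        // so its memory use scales with program_len * haystack_len.
        Engine::Backtrack
    } else {
        Engine::PikeVm
    }
}
```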
- -For the most part, the execution logic is straight-forward and follows the -limitations of each engine described above pretty faithfully. The hairiest -part of src/exec.rs by far is the execution of the lazy DFA, since it requires -a forwards and backwards search, and then falls back to either the Pike VM or -backtracking if the caller requested capture locations. - -The Exec type also contains mutable scratch space for each type of matching -engine. This scratch space is used during search (for example, for the lazy -DFA, it contains compiled states that are reused on subsequent searches). - -### Programs - -A regular expression program is essentially a sequence of opcodes produced by -the compiler plus various facts about the regular expression (such as whether -it is anchored, its capture names, etc.). - -### The regex! macro - -The `regex!` macro no longer exists. It was developed in a bygone era as a -compiler plugin during the infancy of the regex crate. Back then, the only -matching engine in the crate was the Pike VM. The `regex!` macro was, itself, -also a Pike VM. The only advantages it offered over the dynamic Pike VM that -was built at runtime were the following: - - 1. Syntax checking was done at compile time. Your Rust program wouldn't - compile if your regex didn't compile. - 2. Reduction of overhead that was proportional to the size of the regex. - For the most part, this overhead consisted of heap allocation, which - was nearly eliminated in the compiler plugin. - -The main takeaway here is that the compiler plugin was a marginally faster -version of a slow regex engine. As the regex crate evolved, it grew other regex -engines (DFA, bounded backtracker) and sophisticated literal optimizations. -The regex macro didn't keep pace, and it therefore became (dramatically) slower -than the dynamic engines. The only reason left to use it was for the compile -time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint -tool) has a lint that checks your regular expression validity, which mostly -replaces that use case. - -Additionally, the regex compiler plugin stopped receiving maintenance. Nobody -complained. At that point, it seemed prudent to just remove it. - -Will a compiler plugin be brought back? The future is murky, but there is -definitely an opportunity there to build something that is faster than the -dynamic engines in some cases. But it will be challenging! As of now, there -are no plans to work on this. - - -## Testing - -A key aspect of any mature regex library is its test suite. A subset of the -tests in this library come from Glenn Fowler's AT&T test suite (its online -presence seems gone at the time of writing). The source of the test suite is -located in src/testdata. The scripts/regex-match-tests.py takes the test suite -in src/testdata and generates tests/matches.rs. - -There are also many other manually crafted tests and regression tests in -tests/tests.rs. Some of these tests were taken from RE2. - -The biggest source of complexity in the tests is related to answering this -question: how can we reuse the tests to check all of our matching engines? One -approach would have been to encode every test into some kind of format (like -the AT&T test suite) and code generate tests for each matching engine. The -approach we use in this library is to create a Cargo.toml entry point for each -matching engine we want to test.
The entry points are: - -* `tests/test_default.rs` - tests `Regex::new` -* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` -* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex. -* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *arbitrary* byte based programs. -* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *UTF-8* byte based programs. -* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use - backtracking on every regex. -* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *arbitrary* byte based programs. -* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *UTF-8* byte based programs. -* `tests/test_crates_regex.rs` - tests to make sure that all of the - backends behave in the same way against a number of quickcheck - generated random inputs. These tests need to be enabled through - the `RUST_REGEX_RANDOM_TEST` environment variable (see - below). - -The lazy DFA and pure literal engines are absent from this list because -they cannot be used on every regular expression. Instead, we rely on -`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. - -Since the tests are repeated several times, and because `cargo test` runs all -entry points, it can take a while to compile everything. To reduce compile -times slightly, try using `cargo test --test default`, which will only use the -`tests/test_default.rs` entry point. - -The random testing takes quite a while, so it is not enabled by default. -In order to run the random testing you can set the -`RUST_REGEX_RANDOM_TEST` environment variable to anything before -invoking `cargo test`. Note that this variable is inspected at compile -time, so if the tests don't seem to be running, you may need to run -`cargo clean`. - -## Benchmarking - -The benchmarking in this crate is made up of many micro-benchmarks. Currently, -there are two primary sets of benchmarks: the benchmarks that were adopted -at this library's inception (in `bench/src/misc.rs`) and a newer set of -benchmarks meant to test various optimizations. Specifically, the latter set -contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter -set are all executed on the same lengthy input whereas the former benchmarks -are executed on strings of varying length. - -There is also a smattering of benchmarks for parsing and compilation. - -Benchmarks are in a separate crate so that its dependencies can be managed -separately from the main regex crate. - -Benchmarking follows a similarly wonky setup as tests. There are multiple entry -points: - -* `bench_rust.rs` - benchmarks `Regex::new` -* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` -* `bench_pcre.rs` - benchmarks PCRE -* `bench_onig.rs` - benchmarks Oniguruma - -The PCRE and Oniguruma benchmarks exist as a comparison point to a mature -regular expression library. In general, this regex library compares favorably -(there are even a few benchmarks that PCRE simply runs too slowly on or -outright can't execute at all). I would love to add other regular expression -library benchmarks (especially RE2). 
- -If you're hacking on one of the matching engines and just want to see -benchmarks, then all you need to run is: - - $ (cd bench && ./run rust) - -If you want to compare your results with older benchmarks, then try: - - $ (cd bench && ./run rust | tee old) - $ ... make it faster - $ (cd bench && ./run rust | tee new) - $ cargo benchcmp old new --improvements - -The `cargo-benchcmp` utility is available here: -https://github.com/BurntSushi/cargo-benchcmp - -The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See -`./bench/bench --help`. - -## Dev Docs - -When digging your teeth into the codebase for the first time, the -crate documentation can be a great resource. By default `rustdoc` -will strip out all documentation of private crate members in an -effort to help consumers of the crate focus on the *interface* -without having to concern themselves with the *implementation*. -Normally this is a great thing, but if you want to start hacking -on regex internals it is not what you want. Many of the private members -of this crate are well documented with rustdoc style comments, and -it would be a shame to miss out on the opportunity that presents. -You can generate the private docs with: - -``` -$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments -``` - -Then just point your browser at `target/doc/regex/index.html`. - -See https://github.com/rust-lang/rust/issues/15347 for more info -about generating developer docs for internal use. diff --git a/README.md b/README.md index a9d6fcd373..f1e4c404ad 100644 --- a/README.md +++ b/README.md @@ -219,9 +219,95 @@ The full set of features one can disable are [in the "Crate features" section of the documentation](https://docs.rs/regex/1.*/#crate-features). +### Performance + +One of the goals of this crate is for the regex engine to be "fast." While that +is a somewhat nebulous goal, it is usually interpreted in one of two ways. +First, it means that all searches take worst case `O(m * n)` time, where +`m` is proportional to `len(regex)` and `n` is proportional to `len(haystack)`. +Second, it means that even aside from the time complexity constraint, regex +searches are "fast" in practice. + +While the first interpretation is pretty unambiguous, the second one remains +nebulous. Though nebulous, it guides this crate's architecture and the sorts of +trade offs it makes. For example, here are some general architectural +statements that follow as a result of the goal to be "fast": + +* When given the choice between faster regex searches and faster _Rust compile +times_, this crate will generally choose faster regex searches. +* When given the choice between faster regex searches and faster _regex compile +times_, this crate will generally choose faster regex searches. That is, it is +generally acceptable for `Regex::new` to get a little slower if it means that +searches get faster. (This is a somewhat delicate balance to strike, because +the speed of `Regex::new` needs to remain somewhat reasonable. But this is why +one should avoid re-compiling the same regex over and over again.) +* When given the choice between faster regex searches and simpler API +design, this crate will generally choose faster regex searches. For example, +if one didn't care about performance, we could likely get rid of both of +the `Regex::is_match` and `Regex::find` APIs and instead just rely on +`Regex::captures`. (A brief example of this trade-off follows the list.)
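For illustration, here is a small example, using only the crate's public API, of why the distinction between `Regex::is_match` and `Regex::captures` matters for speed:

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"(\w+)@(\w+)\.com").unwrap();
    let haystack = "send mail to user@example.com";

    // A yes/no answer lets the regex engine use its fastest internal
    // strategy, since no capture group spans need to be resolved.
    assert!(re.is_match(haystack));

    // Resolving capture groups generally requires a slower engine pass,
    // so only ask for them when the group spans are actually needed.
    let caps = re.captures(haystack).unwrap();
    assert_eq!(&caps[1], "user");
    assert_eq!(&caps[2], "example");
}
```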
+ +There are perhaps more ways that being "fast" influences things. + +While this repository used to provide its own benchmark suite, it has since +been moved to [rebar](https://github.com/BurntSushi/rebar). The benchmarks are +quite extensive, and there are many more than what is shown in rebar's README +(which is just limited to a "curated" set meant to compare performance between +regex engines). To run all of this crate's benchmarks, first start by cloning +and installing `rebar`: + +```text +$ git clone https://github.com/BurntSushi/rebar +$ cd rebar +$ cargo install --path ./ +``` + +Then build the benchmark harness for just this crate: + +```text +$ rebar build -e '^rust/regex$' +``` + +Run all benchmarks for this crate as tests (each benchmark is executed once to +ensure it works): + +```text +$ rebar measure -e '^rust/regex$' -t +``` + +Record measurements for all benchmarks and save them to a CSV file: + +```text +$ rebar measure -e '^rust/regex$' | tee results.csv +``` + +Explore benchmark timings: + +```text +$ rebar cmp results.csv +``` + +See the `rebar` documentation for more details on how it works and how to +compare results with other regex engines. + + +### Hacking + +The `regex` crate is, for the most part, a pretty thin wrapper around the +[`meta::Regex`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html) +from the +[`regex-automata` crate](https://docs.rs/regex-automata/latest/regex_automata/). +Therefore, if you're looking to work on the internals of this crate, you'll +likely either want to look in `regex-syntax` (for parsing) or `regex-automata` +(for construction of finite automata and the search routines). + +My [blog on regex internals](https://blog.burntsushi.net/regex-internals/) +goes into more depth. + + ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if regex 1.0 requires Rust diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 0000000000..3cc6a1a7af --- /dev/null +++ b/bench/README.md @@ -0,0 +1,2 @@ +Benchmarks for this crate have been moved into the rebar project: +https://github.com/BurntSushi/rebar diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 8688e73e03..a7eec2c816 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -32,6 +32,9 @@ features = ["atty", "humantime", "termcolor"] [workspace] members = ["."] +# NOTE: If you add a new fuzzer below, please make sure to add it to the +# oss-fuzz-build.sh script, otherwise it won't get run in OSS-fuzz. 
+ [[bin]] name = "fuzz_regex_match" path = "fuzz_targets/fuzz_regex_match.rs" diff --git a/fuzz/ast-fuzzers.options b/fuzz/ast-fuzzers.options new file mode 100644 index 0000000000..678d526b1e --- /dev/null +++ b/fuzz/ast-fuzzers.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/fuzz/fuzz_targets/ast_fuzz_match.rs b/fuzz/fuzz_targets/ast_fuzz_match.rs index 58a8ebbf80..9ccb407dc0 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match.rs @@ -25,11 +25,12 @@ fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); - let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { + let Ok(re) = RegexBuilder::new(&pattern).size_limit(1 << 20).build() + else { return Corpus::Reject; }; - re.is_match(&data.haystack); - re.find(&data.haystack); - re.captures(&data.haystack).map_or(0, |c| c.len()); + let _ = re.is_match(&data.haystack); + let _ = re.find(&data.haystack); + let _ = re.captures(&data.haystack).map_or(0, |c| c.len()); Corpus::Keep }); diff --git a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs index a4fa0bd737..045c1fb18f 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs @@ -25,11 +25,12 @@ fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); - let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { + let Ok(re) = RegexBuilder::new(&pattern).size_limit(1 << 20).build() + else { return Corpus::Reject; }; - re.is_match(&data.haystack); - re.find(&data.haystack); - re.captures(&data.haystack).map_or(0, |c| c.len()); + let _ = re.is_match(&data.haystack); + let _ = re.find(&data.haystack); + let _ = re.captures(&data.haystack).map_or(0, |c| c.len()); Corpus::Keep }); diff --git a/fuzz/fuzz_targets/ast_roundtrip.rs b/fuzz/fuzz_targets/ast_roundtrip.rs index 040b59d631..c35ac962e0 100644 --- a/fuzz/fuzz_targets/ast_roundtrip.rs +++ b/fuzz/fuzz_targets/ast_roundtrip.rs @@ -3,7 +3,7 @@ use { libfuzzer_sys::{fuzz_target, Corpus}, regex_syntax::ast::{ - parse::Parser, visit, Ast, Flag, Group, GroupKind, SetFlags, Visitor, + parse::Parser, visit, Ast, Flag, Flags, GroupKind, Visitor, }, }; @@ -32,16 +32,17 @@ impl Visitor for VerboseVisitor { } fn visit_pre(&mut self, ast: &Ast) -> Result { + let reject_flags = |flags: &Flags| { + flags.flag_state(Flag::IgnoreWhitespace).unwrap_or(false) + }; match ast { - Ast::Flags(SetFlags { flags, .. }) - | Ast::Group(Group { - kind: GroupKind::NonCapturing(flags), .. 
- }) if flags - .flag_state(Flag::IgnoreWhitespace) - .unwrap_or(false) => - { - Err(()) - } + Ast::Flags(x) if reject_flags(&x.flags) => return Err(()), + Ast::Group(x) => match x.kind { + GroupKind::NonCapturing(ref flags) if reject_flags(flags) => { + return Err(()) + } + _ => Ok(()), + }, _ => Ok(()), } } diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs index 579078c71e..155fa6d8dc 100644 --- a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -57,8 +57,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .dot_matches_new_line(case.dot_matches_new_line) .swap_greed(case.swap_greed) .ignore_whitespace(case.ignore_whitespace) - .size_limit(1<<20) - .build() else { return Corpus::Reject }; + .size_limit(1 << 16) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index 6c375510d0..a5dda53d65 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -54,6 +54,9 @@ re.is_match({haystack:?}); fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); + if case.pattern.len() > (16 * (1 << 10)) { + return Corpus::Reject; + } if case.haystack.len() > (16 * (1 << 10)) { return Corpus::Reject; } @@ -65,8 +68,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .ignore_whitespace(case.ignore_whitespace) .unicode(case.unicode) .octal(case.octal) - .size_limit(1<<18) - .build() else { return Corpus::Reject }; + .size_limit(1 << 18) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index 38750250b6..81f619dcb5 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -1,4 +1,21 @@ #!/bin/bash -eu + cd $SRC/regex -cargo fuzz build -O --debug-assertions -cp fuzz/target/x86_64-unknown-linux-gnu/release/fuzz_regex_match $OUT/ +cargo fuzz build -O --debug-assertions + +targets=( + fuzz_regex_match + fuzz_regex_lite_match + fuzz_regex_automata_deserialize_dense_dfa + fuzz_regex_automata_deserialize_sparse_dfa + ast_roundtrip + ast_fuzz_match + ast_fuzz_regex + ast_fuzz_match_bytes +) +for target in "${targets[@]}"; do + cp "fuzz/target/x86_64-unknown-linux-gnu/release/${target}" "${OUT}/" + if [[ "$target" == ast_* ]]; then + cp fuzz/ast-fuzzers.options "${OUT}/${target}.options" + fi +done diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 new file mode 100644 index 0000000000..8de974975d Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 new file mode 100644 index 0000000000..a34eeaf2c0 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 new file mode 100644 index 0000000000..ce5b868b95 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 differ diff --git 
a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 new file mode 100644 index 0000000000..711817e4ed Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-4596093180313600 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 new file mode 100644 index 0000000000..312767e97b Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_regex-6345245270605824 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 new file mode 100644 index 0000000000..726609cf21 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 new file mode 100644 index 0000000000..e236ae735c Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 new file mode 100644 index 0000000000..233fcbc950 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 new file mode 100644 index 0000000000..d4a35d1d10 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 new file mode 100644 index 0000000000..3056bca2f3 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 new file mode 100644 index 0000000000..cac835c53e Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 new file mode 100644 index 0000000000..184b6ed701 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 differ diff --git 
a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 new file mode 100644 index 0000000000..d892bc31c4 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 new file mode 100644 index 0000000000..8612658526 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 new file mode 100644 index 0000000000..30a3a3ba0e Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-5736465767989248 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6413499984904192 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6413499984904192 new file mode 100644 index 0000000000..8b24e0a6e0 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6413499984904192 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 new file mode 100644 index 0000000000..b8cdc138a4 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 differ diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 86eb7d8f5a..3cb3d7c8e9 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.3.2" #:version +version = "0.4.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"] description = "Automata construction and matching using regular expressions."
documentation = "https://docs.rs/regex-automata" @@ -11,6 +11,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] edition = "2021" autoexamples = false +rust-version = "1.65" [lib] bench = false @@ -21,7 +22,7 @@ bench = false default = ["std", "syntax", "perf", "unicode", "meta", "nfa", "dfa", "hybrid"] std = ["regex-syntax?/std", "memchr?/std", "aho-corasick?/std", "alloc"] alloc = [] -logging = ["dep:log", "aho-corasick?/logging"] +logging = ["dep:log", "aho-corasick?/logging", "memchr?/logging"] syntax = ["dep:regex-syntax", "alloc"] @@ -84,8 +85,8 @@ internal-instrument-pikevm = ["logging", "std"] [dependencies] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } -memchr = { version = "2.5.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.7.0", optional = true, default-features = false } +memchr = { version = "2.6.0", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.8.2", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" diff --git a/regex-automata/src/dfa/accel.rs b/regex-automata/src/dfa/accel.rs index 5ea2423dd0..c0ba18ea89 100644 --- a/regex-automata/src/dfa/accel.rs +++ b/regex-automata/src/dfa/accel.rs @@ -6,15 +6,16 @@ // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its // DFA with regex-cli: // -// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC -// dense::DFA( +// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table // D 000000: // Q 000001: // *000002: -// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 -// >000004: \x00-` => 3, a => 4, b-\xFF => 3 -// 000005: \x00-\xFF => 2, EOI => 2 -// ) +// A 000003: \x00-` => 3, a => 8, b-\xFF => 3 +// A 000004: \x00-` => 4, a => 7, b-\xFF => 4 +// 000005: \x00-` => 4, b-\xFF => 4 +// 000006: \x00-` => 3, a => 6, b-\xFF => 3 +// 000007: \x00-\xFF => 2, EOI => 2 +// 000008: \x00-\xFF => 2, EOI => 2 // // In particular, state 3 is accelerated (shown via the 'A' indicator) since // the only way to leave that state once entered is to see an 'a' byte. If diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a151..fcfcf29975 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -7,6 +7,7 @@ use crate::{ prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, + start, }, }; @@ -226,8 +227,8 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this DFA for the given starting + /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -235,12 +236,41 @@ pub unsafe trait Automaton { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. 
This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it may + /// be more succinct to use [`Automaton::start_state_forward`] or + /// [`Automaton::start_state_reverse`]. Note, for example, that the + /// convenience routines return a [`MatchError`] on failure whereas this + /// routine returns a [`StartError`]. + /// + /// # Errors + /// + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte). + /// This can also return an error if the given configuration contains an + /// unsupported [`Anchored`] configuration. + fn start_state( + &self, + config: &start::Config, + ) -> Result<StateID, StartError>; + + /// Return the ID of the start state for this DFA when executing a forward + /// search. + /// + /// This is a convenience routine for calling [`Automaton::start_state`] + /// that converts the given [`Input`] to a [start + /// configuration](start::Config). Additionally, if an error occurs, it is + /// converted from a [`StartError`] to a [`MatchError`] using the offset + /// information in the given [`Input`]. /// /// # Errors /// @@ -251,23 +281,30 @@ fn start_state_forward( &self, input: &Input<'_>, - ) -> Result<StateID, MatchError>; + ) -> Result<StateID, MatchError> { + let config = start::Config::from_input_forward(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } - /// Return the ID of the start state for this lazy DFA when executing a - /// reverse search. + /// Return the ID of the start state for this DFA when executing a reverse + /// search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`Automaton::start_state`] + /// that converts the given [`Input`] to a [start + /// configuration](start::Config). Additionally, if an error occurs, it is + /// converted from a [`StartError`] to a [`MatchError`] using the offset + /// information in the given [`Input`].
/// /// # Errors /// @@ -278,7 +315,18 @@ fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result<StateID, MatchError>; + ) -> Result<StateID, MatchError> { + let config = start::Config::from_input_reverse(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that @@ -1084,7 +1132,7 @@ /// // implementation defined. /// // /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. - /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`. /// let id = StateID::new(3 * dfa.stride()).unwrap(); /// let accelerator = dfa.accelerator(id); /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. @@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).next_eoi_state(current) } + #[inline] + fn start_state( + &self, + config: &start::Config, + ) -> Result<StateID, StartError> { + (**self).start_state(config) + } + #[inline] fn start_state_forward( &self, @@ -2015,6 +2071,90 @@ impl OverlappingState { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either based on +/// incorrect configuration or even based on whether the look-behind byte +/// triggers a quit state. Typically one does not need to handle this error +/// if you're using [`Automaton::start_state_forward`] (or its reverse +/// counterpart), as that routine automatically converts `StartError` to a +/// [`MatchError`] for you. +/// +/// This error may be returned by the [`Automaton::start_state`] routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported.
+ mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError {} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. /// diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 35f037ca63..6fc61dc4f5 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -30,7 +30,7 @@ use crate::{ use crate::{ dfa::{ accel::Accels, - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, @@ -40,8 +40,8 @@ use crate::{ int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -66,8 +66,9 @@ const VERSION: u32 = 2; /// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if -/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by -/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is +/// not by default) and an [`Anchored::Pattern`] mode is requested via +/// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { @@ -113,8 +114,7 @@ impl Config { /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. /// - /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for - /// an example. + /// See [`Automaton::accelerator`] for an example. /// /// This is enabled by default. pub fn accelerate(mut self, yes: bool) -> Config { @@ -879,22 +879,23 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 600KB isn't enough! + /// // 700KB isn't enough! 
/// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(600_000)) + /// .determinize_size_limit(Some(700_000)) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 700KB probably is! + /// // ... but 800KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(700_000)) + /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); @@ -912,6 +913,7 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{ /// dfa::{dense, Automaton, StartKind}, /// Anchored, Input, @@ -1168,7 +1170,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(&nfa) @@ -1223,13 +1228,14 @@ impl Builder { } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other - // bytes. Otherwise, a non-quit byte may end up in the same class - // as a quit byte, and thus cause the DFA stop when it shouldn't. + // bytes. Otherwise, a non-quit byte may end up in the same + // class as a quit byte, and thus cause the DFA to stop when it + // shouldn't. // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match dense --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quitset.is_empty() { set.add_set(&quitset); } @@ -2334,12 +2340,30 @@ impl<'a> DFA<&'a [u32]> { // table, match states and accelerators below. If any validation fails, // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.tt)?; + dfa.tt.validate(&dfa)?; + dfa.st.validate(&dfa)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. + for state in dfa.states() { + // If the state is an accel state, then it must have a non-empty + // accelerator. + if dfa.is_accel_state(state.id()) { + let index = dfa.accelerator_index(state.id()); + if index >= dfa.accels.len() { + return Err(DeserializeError::generic( + "found DFA state with invalid accelerator index", + )); + } + let needles = dfa.accels.needles(index); + if !(1 <= needles.len() && needles.len() <= 3) { + return Err(DeserializeError::generic( + "accelerator needles has invalid length", + )); + } + } + } Ok((dfa, nread)) } @@ -2880,31 +2904,33 @@ impl OwnedDFA { fn set_universal_starts(&mut self) { assert_eq!(6, Start::len(), "expected 6 start configurations"); - let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + let start_id = |dfa: &mut OwnedDFA, + anchored: Anchored, + start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. 
-            dfa.st.start(inp, start).expect("valid Input configuration")
+            dfa.st.start(anchored, start).expect("valid Input configuration")
         };
         if self.start_kind().has_unanchored() {
-            let inp = Input::new("").anchored(Anchored::No);
-            let sid = start_id(self, &inp, Start::NonWordByte);
-            if sid == start_id(self, &inp, Start::WordByte)
-                && sid == start_id(self, &inp, Start::Text)
-                && sid == start_id(self, &inp, Start::LineLF)
-                && sid == start_id(self, &inp, Start::LineCR)
-                && sid == start_id(self, &inp, Start::CustomLineTerminator)
+            let anchor = Anchored::No;
+            let sid = start_id(self, anchor, Start::NonWordByte);
+            if sid == start_id(self, anchor, Start::WordByte)
+                && sid == start_id(self, anchor, Start::Text)
+                && sid == start_id(self, anchor, Start::LineLF)
+                && sid == start_id(self, anchor, Start::LineCR)
+                && sid == start_id(self, anchor, Start::CustomLineTerminator)
             {
                 self.st.universal_start_unanchored = Some(sid);
             }
         }
         if self.start_kind().has_anchored() {
-            let inp = Input::new("").anchored(Anchored::Yes);
-            let sid = start_id(self, &inp, Start::NonWordByte);
-            if sid == start_id(self, &inp, Start::WordByte)
-                && sid == start_id(self, &inp, Start::Text)
-                && sid == start_id(self, &inp, Start::LineLF)
-                && sid == start_id(self, &inp, Start::LineCR)
-                && sid == start_id(self, &inp, Start::CustomLineTerminator)
+            let anchor = Anchored::Yes;
+            let sid = start_id(self, anchor, Start::NonWordByte);
+            if sid == start_id(self, anchor, Start::WordByte)
+                && sid == start_id(self, anchor, Start::Text)
+                && sid == start_id(self, anchor, Start::LineLF)
+                && sid == start_id(self, anchor, Start::LineCR)
+                && sid == start_id(self, anchor, Start::CustomLineTerminator)
             {
                 self.st.universal_start_anchored = Some(sid);
             }
@@ -3211,35 +3237,21 @@ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
     }

     #[cfg_attr(feature = "perf-inline", inline(always))]
-    fn start_state_forward(
+    fn start_state(
         &self,
-        input: &Input<'_>,
-    ) -> Result<StateID, MatchError> {
-        if !self.quitset.is_empty() && input.start() > 0 {
-            let offset = input.start() - 1;
-            let byte = input.haystack()[offset];
-            if self.quitset.contains(byte) {
-                return Err(MatchError::quit(byte, offset));
-            }
-        }
-        let start = self.st.start_map.fwd(&input);
-        self.st.start(input, start)
-    }
-
-    #[cfg_attr(feature = "perf-inline", inline(always))]
-    fn start_state_reverse(
-        &self,
-        input: &Input<'_>,
-    ) -> Result<StateID, MatchError> {
-        if !self.quitset.is_empty() && input.end() < input.haystack().len() {
-            let offset = input.end();
-            let byte = input.haystack()[offset];
-            if self.quitset.contains(byte) {
-                return Err(MatchError::quit(byte, offset));
+        config: &start::Config,
+    ) -> Result<StateID, StartError> {
+        let anchored = config.get_anchored();
+        let start = match config.get_look_behind() {
+            None => Start::Text,
+            Some(byte) => {
+                if !self.quitset.is_empty() && self.quitset.contains(byte) {
+                    return Err(StartError::quit(byte));
+                }
+                self.st.start_map.get(byte)
             }
-        }
-        let start = self.st.start_map.rev(&input);
-        self.st.start(input, start)
+        };
+        self.st.start(anchored, start)
     }

     #[cfg_attr(feature = "perf-inline", inline(always))]
@@ -3581,7 +3593,8 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
     ///
     /// That is, every state ID can be used to correctly index a state in this
     /// table.
-    fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
+    fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
+        let sp = &dfa.special;
         for state in self.states() {
             // We check that the ID itself is well formed.
That is, if it's
             // a special state then it must actually be a quit, dead, accel,
             // match or start state.
                         wasn't actually special",
                     ));
                 }
+                if sp.is_match_state(state.id())
+                    && dfa.match_len(state.id()) == 0
+                {
+                    return Err(DeserializeError::generic(
+                        "found match state with zero pattern IDs",
+                    ));
+                }
             }
             for (_, to) in state.transitions() {
                 if !self.is_valid(to) {
@@ -4115,10 +4135,8 @@ impl<T: AsRef<[u32]>> StartTable<T> {
     /// it against the given transition table (which must be for the same DFA).
     ///
     /// That is, every state ID can be used to correctly index a state.
-    fn validate(
-        &self,
-        tt: &TransitionTable<T>,
-    ) -> Result<(), DeserializeError> {
+    fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
+        let tt = &dfa.tt;
         if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) {
             return Err(DeserializeError::generic(
                 "found invalid universal unanchored starting state ID",
             ));
         }
@@ -4175,28 +4193,27 @@ impl<T: AsRef<[u32]>> StartTable<T> {
     #[cfg_attr(feature = "perf-inline", inline(always))]
     fn start(
         &self,
-        input: &Input<'_>,
+        anchored: Anchored,
         start: Start,
-    ) -> Result<StateID, MatchError> {
+    ) -> Result<StateID, StartError> {
         let start_index = start.as_usize();
-        let mode = input.get_anchored();
-        let index = match mode {
+        let index = match anchored {
             Anchored::No => {
                 if !self.kind.has_unanchored() {
-                    return Err(MatchError::unsupported_anchored(mode));
+                    return Err(StartError::unsupported_anchored(anchored));
                 }
                 start_index
             }
             Anchored::Yes => {
                 if !self.kind.has_anchored() {
-                    return Err(MatchError::unsupported_anchored(mode));
+                    return Err(StartError::unsupported_anchored(anchored));
                 }
                 self.stride + start_index
             }
             Anchored::Pattern(pid) => {
                 let len = match self.pattern_len {
                     None => {
-                        return Err(MatchError::unsupported_anchored(mode))
+                        return Err(StartError::unsupported_anchored(anchored))
                     }
                     Some(len) => len,
                 };
@@ -5081,6 +5098,8 @@ impl core::fmt::Display for BuildError {

 #[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
 mod tests {
+    use crate::{Input, MatchError};
+
     use super::*;

     #[test]
diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs
index 4bb8704352..fd58cac23a 100644
--- a/regex-automata/src/dfa/mod.rs
+++ b/regex-automata/src/dfa/mod.rs
@@ -320,7 +320,7 @@ dramatically.

 #[cfg(feature = "dfa-search")]
 pub use crate::dfa::{
-    automaton::{Automaton, OverlappingState},
+    automaton::{Automaton, OverlappingState, StartError},
     start::StartKind,
 };

diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs
index 44691d0c8a..e62bbd383e 100644
--- a/regex-automata/src/dfa/onepass.rs
+++ b/regex-automata/src/dfa/onepass.rs
@@ -2581,10 +2581,11 @@ impl Cache {

 /// Represents a single transition in a one-pass DFA.
 ///
-/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds
-/// to the transition epsilons, which contains the slots that should be saved
-/// when this transition is followed and the conditional epsilon transitions
-/// that must be satisfied in order to follow this transition.
+/// The high 21 bits correspond to the state ID. The bit following corresponds
+/// to the special "match wins" flag. The remaining low 42 bits correspond to
+/// the transition epsilons, which contain the slots that should be saved when
+/// this transition is followed and the conditional epsilon transitions that
+/// must be satisfied in order to follow this transition.
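Because the bit budget above is easy to get wrong (21 + 1 + 42 = 64), a self-contained sketch of the packing scheme may help. All names here are illustrative; the crate's actual `Transition` representation is private:

```rust
// Hypothetical constants mirroring the 21/1/42 layout described above.
const STATE_ID_BITS: u32 = 21;
const MATCH_WINS_BITS: u32 = 1;
const EPSILONS_BITS: u32 = 42;

const EPSILONS_MASK: u64 = (1u64 << EPSILONS_BITS) - 1;

fn pack(state_id: u64, match_wins: bool, epsilons: u64) -> u64 {
    assert!(state_id < (1u64 << STATE_ID_BITS));
    assert!(epsilons <= EPSILONS_MASK);
    (state_id << (EPSILONS_BITS + MATCH_WINS_BITS))
        | (u64::from(match_wins) << EPSILONS_BITS)
        | epsilons
}

fn state_id(t: u64) -> u64 {
    t >> (EPSILONS_BITS + MATCH_WINS_BITS)
}

fn match_wins(t: u64) -> bool {
    (t >> EPSILONS_BITS) & 1 == 1
}

fn epsilons(t: u64) -> u64 {
    // Masking on read and on write is the point of the `set_epsilons` and
    // `set_looks` fixes below: without the mask, an over-wide value would
    // clobber the neighboring bit fields.
    t & EPSILONS_MASK
}

fn main() {
    let t = pack((1 << STATE_ID_BITS) - 1, true, 0x2A);
    assert_eq!((1 << STATE_ID_BITS) - 1, state_id(t));
    assert!(match_wins(t));
    assert_eq!(0x2A, epsilons(t));
}
```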
#[derive(Clone, Copy, Eq, PartialEq)] struct Transition(u64); @@ -2741,7 +2742,7 @@ impl PatternEpsilons { fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { PatternEpsilons( (self.0 & PatternEpsilons::PATTERN_ID_MASK) - | u64::from(epsilons.0), + | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK), ) } } @@ -2814,12 +2815,15 @@ impl Epsilons { /// Return the set of look-around assertions in these epsilon transitions. fn looks(self) -> LookSet { - LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { - Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + Epsilons( + (self.0 & Epsilons::SLOT_MASK) + | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), + ) } } diff --git a/regex-automata/src/dfa/regex.rs b/regex-automata/src/dfa/regex.rs index f39c1c055c..5e7e6e38ac 100644 --- a/regex-automata/src/dfa/regex.rs +++ b/regex-automata/src/dfa/regex.rs @@ -853,7 +853,7 @@ impl Builder { } /// Set the dense DFA compilation configuration for this builder using - /// [`dense::Config`](dense::Config). + /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index 8c012a5944..5a82261f97 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -176,7 +176,6 @@ fn find_fwd_imp( // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -297,7 +296,6 @@ fn find_rev_imp( } else if dfa.is_dead_state(sid) { return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -600,9 +596,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) @@ -631,9 +624,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 5d8ec23408..d461e0a0f3 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). -Unlike the [`dense`](super::dense) module, this module does not contain a -builder or configuration specific for sparse DFAs. 
Instead, the intended -way to build a sparse DFA is either by using a default configuration with -its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the -construction of a dense DFA with [`dense::Builder`](super::dense::Builder) -and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For -example, this configures a sparse DFA to do an overlapping search: +Unlike the [`dense`] module, this module does not contain a builder or +configuration specific for sparse DFAs. Instead, the intended way to build a +sparse DFA is either by using a default configuration with its constructor +[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a +dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. +For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ @@ -52,7 +51,7 @@ use alloc::{vec, vec::Vec}; use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, @@ -63,8 +62,8 @@ use crate::{ int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -74,18 +73,17 @@ const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// -/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses -/// a more space efficient representation for its transitions. Consequently, -/// sparse DFAs may use much less memory than dense DFAs, but this comes at a -/// price. In particular, reading the more space efficient transitions takes -/// more work, and consequently, searching using a sparse DFA is typically -/// slower than a dense DFA. +/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient +/// representation for its transitions. Consequently, sparse DFAs may use much +/// less memory than dense DFAs, but this comes at a price. In particular, +/// reading the more space efficient transitions takes more work, and +/// consequently, searching using a sparse DFA is typically slower than a dense +/// DFA. /// /// A sparse DFA can be built using the default configuration via the -/// [`DFA::new`] constructor. Otherwise, one can configure various aspects -/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), -/// and then convert a dense DFA to a sparse DFA using -/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a +/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse +/// DFA using [`dense::DFA::to_sparse`]. /// /// In general, a sparse DFA supports all the same search operations as a dense /// DFA. @@ -140,11 +138,9 @@ impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. 
+ /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -167,11 +163,9 @@ impl DFA> { /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -511,10 +505,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -553,10 +546,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -595,10 +587,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the @@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> { /// /// If any of the above are not true, then an error will be returned. /// - /// Note that unlike deserializing a - /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has - /// no alignment requirements. That is, an alignment of `1` is valid. + /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse + /// DFA has no alignment requirements. That is, an alignment of `1` is + /// valid. /// /// # Panics /// @@ -1001,8 +992,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? 
}; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.special, &dfa.tt)?; + let seen = dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &seen)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1207,35 +1198,21 @@ unsafe impl> Automaton for DFA { } #[inline] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[inline] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[inline] @@ -1411,63 +1388,8 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { - // In order to validate everything, we not only need to make sure we - // can decode every state, but that every transition in every state - // points to a valid state. There are many duplicative transitions, so - // we record state IDs that we've verified so that we don't redo the - // decoding work. - // - // Except, when in no_std mode, we don't have dynamic memory allocation - // available to us, so we skip this optimization. It's not clear - // whether doing something more clever is worth it just yet. If you're - // profiling this code and need it to run faster, please file an issue. - // - // OK, so we also use this to record the set of valid state IDs. Since - // it is possible for a transition to point to an invalid state ID that - // still (somehow) deserializes to a valid state. So we need to make - // sure our transitions are limited to actually correct state IDs. - // The problem is, I'm not sure how to do this verification step in - // no-std no-alloc mode. I think we'd *have* to store the set of valid - // state IDs in the DFA itself. For now, we don't do this verification - // in no-std no-alloc mode. The worst thing that can happen is an - // incorrect result. But no panics or memory safety problems should - // result. Because we still do validate that the state itself is - // "valid" in the sense that everything it points to actually exists. 
- // - // ---AG - struct Seen { - #[cfg(feature = "alloc")] - set: alloc::collections::BTreeSet, - #[cfg(not(feature = "alloc"))] - set: core::marker::PhantomData, - } - - #[cfg(feature = "alloc")] - impl Seen { - fn new() -> Seen { - Seen { set: alloc::collections::BTreeSet::new() } - } - fn insert(&mut self, id: StateID) { - self.set.insert(id); - } - fn contains(&self, id: &StateID) -> bool { - self.set.contains(id) - } - } - - #[cfg(not(feature = "alloc"))] - impl Seen { - fn new() -> Seen { - Seen { set: core::marker::PhantomData } - } - fn insert(&mut self, _id: StateID) {} - fn contains(&self, _id: &StateID) -> bool { - false - } - } - - let mut verified: Seen = Seen::new(); + fn validate(&self, sp: &Special) -> Result { + let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. @@ -1544,7 +1466,7 @@ impl> Transitions { "mismatching sparse state length", )); } - Ok(()) + Ok(verified) } /// Converts these transitions to a borrowed value. @@ -1682,7 +1604,7 @@ impl> Transitions { let state = &state[nr..]; if npats == 0 { return Err(DeserializeError::generic( - "state marked as a match, but has no pattern IDs", + "state marked as a match, but pattern length is zero", )); } @@ -1704,6 +1626,21 @@ impl> Transitions { } else { (&[][..], state) }; + if is_match && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked special as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) != is_match { + return Err(DeserializeError::generic( + "whether state is a match or not is inconsistent", + )); + } // Now read this state's accelerator info. The first byte is the length // of the accelerator, which is typically 0 (for no acceleration) but @@ -2084,28 +2021,19 @@ impl> StartTable { fn validate( &self, sp: &Special, - trans: &Transitions, + seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { + if !seen.contains(&id) { + return Err(DeserializeError::generic( + "found invalid start state ID", + )); + } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } - // Confirm that the start state points to a valid state. - let state = trans.try_state(sp, id)?; - // And like for the transition table, confirm that the transitions - // on all start states themselves point to a valid state. - // - // It'd probably be better to integrate this validation with the - // transition table, or otherwise store a sorted sequence of all - // valid state IDs in the sparse DFA itself. That way, we could - // check that every pointer to a state corresponds precisely to a - // correct and valid state. - for i in 0..state.ntrans { - let to = state.next_at(i); - let _ = trans.try_state(sp, to)?; - } } Ok(()) } @@ -2145,28 +2073,27 @@ impl> StartTable { /// panics. 
fn start(
         &self,
-        input: &Input<'_>,
+        anchored: Anchored,
         start: Start,
-    ) -> Result<StateID, MatchError> {
+    ) -> Result<StateID, StartError> {
         let start_index = start.as_usize();
-        let mode = input.get_anchored();
-        let index = match mode {
+        let index = match anchored {
             Anchored::No => {
                 if !self.kind.has_unanchored() {
-                    return Err(MatchError::unsupported_anchored(mode));
+                    return Err(StartError::unsupported_anchored(anchored));
                 }
                 start_index
             }
             Anchored::Yes => {
                 if !self.kind.has_anchored() {
-                    return Err(MatchError::unsupported_anchored(mode));
+                    return Err(StartError::unsupported_anchored(anchored));
                 }
                 self.stride + start_index
             }
             Anchored::Pattern(pid) => {
                 let len = match self.pattern_len {
                     None => {
-                        return Err(MatchError::unsupported_anchored(mode))
+                        return Err(StartError::unsupported_anchored(anchored))
                     }
                     Some(len) => len,
                 };
@@ -2561,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> {
     }
 }

+// In order to validate everything, we not only need to make sure we
+// can decode every state, but that every transition in every state
+// points to a valid state. There are many duplicative transitions, so
+// we record state IDs that we've verified so that we don't redo the
+// decoding work.
+//
+// Except, when in no_std mode, we don't have dynamic memory allocation
+// available to us, so we skip this optimization. It's not clear
+// whether doing something more clever is worth it just yet. If you're
+// profiling this code and need it to run faster, please file an issue.
+//
+// OK, so we also use this to record the set of valid state IDs. Since
+// it is possible for a transition to point to an invalid state ID that
+// still (somehow) deserializes to a valid state. So we need to make
+// sure our transitions are limited to actually correct state IDs.
+// The problem is, I'm not sure how to do this verification step in
+// no-std no-alloc mode. I think we'd *have* to store the set of valid
+// state IDs in the DFA itself. For now, we don't do this verification
+// in no-std no-alloc mode. The worst thing that can happen is an
+// incorrect result. But no panics or memory safety problems should
+// result. Because we still do validate that the state itself is
+// "valid" in the sense that everything it points to actually exists.
+//
+// ---AG
+#[derive(Debug)]
+struct Seen {
+    #[cfg(feature = "alloc")]
+    set: alloc::collections::BTreeSet<StateID>,
+    #[cfg(not(feature = "alloc"))]
+    set: core::marker::PhantomData<()>,
+}
+
+#[cfg(feature = "alloc")]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: alloc::collections::BTreeSet::new() }
+    }
+    fn insert(&mut self, id: StateID) {
+        self.set.insert(id);
+    }
+    fn contains(&self, id: &StateID) -> bool {
+        self.set.contains(id)
+    }
+}
+
+#[cfg(not(feature = "alloc"))]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: core::marker::PhantomData }
+    }
+    fn insert(&mut self, _id: StateID) {}
+    fn contains(&self, _id: &StateID) -> bool {
+        true
+    }
+}
+
/*
/// A binary search routine specialized specifically to a sparse DFA state's
/// transitions.
Specifically, the transitions are defined as a set of pairs
diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs
index 86963248f7..bd9179b194 100644
--- a/regex-automata/src/hybrid/dfa.rs
+++ b/regex-automata/src/hybrid/dfa.rs
@@ -13,7 +13,7 @@ use alloc::vec::Vec;

 use crate::{
     hybrid::{
-        error::{BuildError, CacheError},
+        error::{BuildError, CacheError, StartError},
         id::{LazyStateID, LazyStateIDError},
         search,
     },
@@ -28,7 +28,7 @@ use crate::{
             Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
         },
         sparse_set::SparseSets,
-        start::{Start, StartByteMap},
+        start::{self, Start, StartByteMap},
     },
 };

@@ -1518,8 +1518,8 @@ impl DFA {
         Lazy::new(self, cache).cache_next_state(current, unit)
     }

-    /// Return the ID of the start state for this lazy DFA when executing a
-    /// forward search.
+    /// Return the ID of the start state for this lazy DFA for the given
+    /// starting configuration.
     ///
     /// Unlike typical DFA implementations, the start state for DFAs in this
     /// crate is dependent on a few different factors:
@@ -1527,85 +1527,122 @@ impl DFA {
     /// * The [`Anchored`] mode of the search. Unanchored, anchored and
     /// anchored searches for a specific [`PatternID`] all use different start
     /// states.
-    /// * The position at which the search begins, via [`Input::start`]. This
-    /// and the byte immediately preceding the start of the search (if one
-    /// exists) influence which look-behind assertions are true at the start
-    /// of the search. This in turn influences which start state is selected.
-    /// * Whether the search is a forward or reverse search. This routine can
-    /// only be used for forward searches.
+    /// * Whether a "look-behind" byte exists. For example, the `^` anchor
+    /// matches if and only if there is no look-behind byte.
+    /// * The specific value of that look-behind byte. For example, a `(?m:^)`
+    /// assertion only matches when there is either no look-behind byte, or
+    /// when the look-behind byte is a line terminator.
+    ///
+    /// The [starting configuration](start::Config) provides the above
+    /// information.
+    ///
+    /// This routine can be used for either forward or reverse searches.
+    /// Although, as a convenience, if you have an [`Input`], then it
+    /// may be more succinct to use [`DFA::start_state_forward`] or
+    /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
+    /// routines return a [`MatchError`] on failure whereas this routine
+    /// returns a [`StartError`].
     ///
     /// # Errors
     ///
-    /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
-    /// needs to give up when determining the start state (for example, if
-    /// it sees a "quit" byte or if the cache has been cleared too many
-    /// times). This can also return an error if the given `Input` contains an
-    /// unsupported [`Anchored`] configuration.
+    /// This may return a [`StartError`] if the search needs to give up when
+    /// determining the start state (for example, if it sees a "quit" byte
+    /// or if the cache has become inefficient). This can also return an
+    /// error if the given configuration contains an unsupported [`Anchored`]
+    /// configuration.
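As with the dense DFA, a small usage sketch may help. It assumes the same builder-style `util::start::Config` constructors as the earlier dense example (only the getters appear in this diff), so treat that part as illustrative:

```rust
use regex_automata::{hybrid::dfa::DFA, util::start, Anchored};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = DFA::new(r"(?-u)\b\w+\b")?;
    let mut cache = dfa.create_cache();

    // A word byte as the look-behind means `\b` cannot hold at the search
    // start, which may select a different start state than having no
    // look-behind byte at all.
    let config = start::Config::new()
        .anchored(Anchored::No)
        .look_behind(Some(b'a')); // assumed API
    let sid = dfa.start_state(&mut cache, &config)?;
    assert!(!sid.is_unknown());
    Ok(())
}
```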
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub fn start_state(
+        &self,
+        cache: &mut Cache,
+        config: &start::Config,
+    ) -> Result<LazyStateID, StartError> {
+        let lazy = LazyRef::new(self, cache);
+        let anchored = config.get_anchored();
+        let start = match config.get_look_behind() {
+            None => Start::Text,
+            Some(byte) => {
+                if !self.quitset.is_empty() && self.quitset.contains(byte) {
+                    return Err(StartError::quit(byte));
+                }
+                self.start_map.get(byte)
+            }
+        };
+        let start_id = lazy.get_cached_start_id(anchored, start)?;
+        if !start_id.is_unknown() {
+            return Ok(start_id);
+        }
+        Lazy::new(self, cache).cache_start_group(anchored, start)
+    }
+
+    /// Return the ID of the start state for this lazy DFA when executing a
+    /// forward search.
+    ///
+    /// This is a convenience routine for calling [`DFA::start_state`] that
+    /// converts the given [`Input`] to a [start configuration](start::Config).
+    /// Additionally, if an error occurs, it is converted from a [`StartError`]
+    /// to a [`MatchError`] using the offset information in the given
+    /// [`Input`].
+    ///
+    /// # Errors
+    ///
+    /// This may return a [`MatchError`] if the search needs to give up when
+    /// determining the start state (for example, if it sees a "quit" byte or
+    /// if the cache has become inefficient). This can also return an error if
+    /// the given `Input` contains an unsupported [`Anchored`] configuration.
     #[cfg_attr(feature = "perf-inline", inline(always))]
     pub fn start_state_forward(
         &self,
         cache: &mut Cache,
         input: &Input<'_>,
     ) -> Result<LazyStateID, MatchError> {
-        if !self.quitset.is_empty() && input.start() > 0 {
-            let offset = input.start() - 1;
-            let byte = input.haystack()[offset];
-            if self.quitset.contains(byte) {
-                return Err(MatchError::quit(byte, offset));
+        let config = start::Config::from_input_forward(input);
+        self.start_state(cache, &config).map_err(|err| match err {
+            StartError::Cache { .. } => MatchError::gave_up(input.start()),
+            StartError::Quit { byte } => {
+                let offset = input
+                    .start()
+                    .checked_sub(1)
+                    .expect("no quit in start without look-behind");
+                MatchError::quit(byte, offset)
             }
-        }
-        let start_type = self.start_map.fwd(input);
-        let start = LazyRef::new(self, cache)
-            .get_cached_start_id(input, start_type)?;
-        if !start.is_unknown() {
-            return Ok(start);
-        }
-        Lazy::new(self, cache).cache_start_group(input, start_type)
+            StartError::UnsupportedAnchored { mode } => {
+                MatchError::unsupported_anchored(mode)
+            }
+        })
     }

     /// Return the ID of the start state for this lazy DFA when executing a
     /// reverse search.
     ///
-    /// Unlike typical DFA implementations, the start state for DFAs in this
-    /// crate is dependent on a few different factors:
-    ///
-    /// * The [`Anchored`] mode of the search. Unanchored, anchored and
-    /// anchored searches for a specific [`PatternID`] all use different start
-    /// states.
-    /// * The position at which the search begins, via [`Input::start`]. This
-    /// and the byte immediately preceding the start of the search (if one
-    /// exists) influence which look-behind assertions are true at the start
-    /// of the search. This in turn influences which start state is selected.
-    /// * Whether the search is a forward or reverse search. This routine can
-    /// only be used for reverse searches.
+    /// This is a convenience routine for calling [`DFA::start_state`] that
+    /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2066,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -2122,16 +2161,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2180,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2612,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. 
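The quit-byte arm of this conversion can be exercised end to end. A sketch using the existing `Config::quit` knob, assuming `MatchError` supports `==` comparison (it is compared this way elsewhere in the crate's docs):

```rust
use regex_automata::{hybrid::dfa::DFA, Input, MatchError};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Make 0xFF a quit byte so that start state computation can fail.
    let dfa = DFA::builder()
        .configure(DFA::config().quit(b'\xFF', true))
        .build(r"[a-z]+")?;
    let mut cache = dfa.create_cache();

    // Searching the suffix starting at offset 2 makes the look-behind
    // byte (offset 1) the quit byte. The convenience routine converts
    // `StartError::Quit` into `MatchError::quit` at that offset.
    let haystack = b"a\xFFbcd";
    let input = Input::new(&haystack[..]).range(2..);
    let err = dfa.start_state_forward(&mut cache, &input).unwrap_err();
    assert_eq!(MatchError::quit(0xFF, 1), err);
    Ok(())
}
```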
fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2680,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); @@ -3178,12 +3215,12 @@ impl Config { /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// - /// When enabling this option, callers _must_ be prepared to handle - /// a [`MatchError`](crate::MatchError) error during search. - /// When using a [`Regex`](crate::hybrid::regex::Regex), this - /// corresponds to using the `try_` suite of methods. Alternatively, - /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::quit`] error will never be returned while searching. + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. Alternatively, if callers can guarantee that + /// their input is ASCII only, then a [`MatchError::quit`] error will never + /// be returned while searching. /// /// This is disabled by default. /// @@ -3269,8 +3306,8 @@ impl Config { /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// - /// When enabling this option, callers _must_ be prepared to handle a - /// [`MatchError`](crate::MatchError) error during search. When using a + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// @@ -3795,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } @@ -3973,7 +4010,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. 
- .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(nfa) diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index 604daf3c38..d134e7ec90 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either +/// based on incorrect configuration or even based on whether +/// the look-behind byte triggers a quit state. Typically +/// one does not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache inefficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. 
} => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/regex-automata/src/hybrid/mod.rs b/regex-automata/src/hybrid/mod.rs index 44e67e1299..2feb839d16 100644 --- a/regex-automata/src/hybrid/mod.rs +++ b/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/regex-automata/src/hybrid/regex.rs b/regex-automata/src/hybrid/regex.rs index 75667daf91..b3b1fe317d 100644 --- a/regex-automata/src/hybrid/regex.rs +++ b/regex-automata/src/hybrid/regex.rs @@ -878,7 +878,7 @@ impl Builder { } /// Set the lazy DFA compilation configuration for this builder using - /// [`dfa::Config`](dfa::Config). + /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or settings how the behavior of the cache. diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index f232836854..1f4a505db4 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. 
Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs index 005878acdb..5653adc9aa 100644 --- a/regex-automata/src/meta/limited.rs +++ b/regex-automata/src/meta/limited.rs @@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev( } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -88,7 +85,41 @@ pub(crate) fn dfa_try_search_half_rev( return Err(RetryError::Quadratic(RetryQuadraticError::new())); } } + let was_dead = dfa.is_dead_state(sid); dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?; + // If we reach the beginning of the search and we could otherwise still + // potentially keep matching if there was more to match, then we actually + // return an error to indicate giving up on this optimization. Why? Because + // we can't prove that the real match begins at where we would report it. + // + // This only happens when all of the following are true: + // + // 1) We reach the starting point of our search span. + // 2) The match we found is before the starting point. + // 3) The FSM reports we could possibly find a longer match. + // + // We need (1) because otherwise the search stopped before the starting + // point and there is no possible way to find a more leftmost position. + // + // We need (2) because if the match found has an offset equal to the minimum + // possible offset, then there is no possible more leftmost match. + // + // We need (3) because if the FSM couldn't continue anyway (i.e., it's in + // a dead state), then we know we couldn't find anything more leftmost + // than what we have. (We have to check the state we were in prior to the + // EOI transition since the EOI transition will usually bring us to a dead + // state by virtue of it represents the end-of-input.) + if at == input.start() + && mat.map_or(false, |m| m.offset() > input.start()) + && !was_dead + { + trace!( + "reached beginning of search at offset {} without hitting \ + a dead state, quitting to avoid potential false positive match", + at, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } Ok(mat) } @@ -121,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev( } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -140,7 +168,20 @@ pub(crate) fn hybrid_try_search_half_rev( return Err(RetryError::Quadratic(RetryQuadraticError::new())); } } + let was_dead = sid.is_dead(); hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + // See the comments in the full DFA routine above for why we need this. 
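Restated as a standalone predicate with illustrative names (this helper is not part of the crate), the bail-out condition shared by the dense and hybrid routines is:

```rust
/// True if the reverse scan must give up because it cannot prove that the
/// reported match start is actually leftmost.
fn must_give_up(
    at: usize,                  // offset where the reverse scan stopped
    search_start: usize,        // start of the search span
    match_start: Option<usize>, // start offset of the match found, if any
    fsm_was_dead: bool,         // was the FSM dead before the EOI transition?
) -> bool {
    // (1) the scan reached the span's starting point...
    at == search_start
        // (2) ...the match does not already start at the minimal offset...
        && match_start.map_or(false, |m| m > search_start)
        // (3) ...and the FSM could still have kept matching.
        && !fsm_was_dead
}

fn main() {
    // A match at the minimal offset can always be trusted.
    assert!(!must_give_up(0, 0, Some(0), false));
    // Stopping at the span start while still "alive" means leftmost-ness
    // cannot be proven, so the optimization must be retried another way.
    assert!(must_give_up(0, 0, Some(3), false));
    // A dead FSM proves nothing more leftmost was possible.
    assert!(!must_give_up(0, 0, Some(3), true));
}
```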
+ if at == input.start() + && mat.map_or(false, |m| m.offset() > input.start()) + && !was_dead + { + trace!( + "reached beginning of search at offset {} without hitting \ + a dead state, quitting to avoid potential false positive match", + at, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } Ok(mat) } @@ -162,9 +203,6 @@ fn dfa_eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { @@ -199,9 +237,6 @@ fn hybrid_eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 6e16ceedb6..a06d2bb48c 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -16,6 +16,7 @@ use crate::{ strategy::{self, Strategy}, wrappers, }, + nfa::thompson::WhichCaptures, util::{ captures::{Captures, GroupInfo}, iter, @@ -528,7 +529,14 @@ impl Regex { #[inline] pub fn is_match<'h, I: Into>>(&self, input: I) -> bool { let input = input.into().earliest(true); - self.search_half(&input).is_some() + if self.imp.info.is_impossible(&input) { + return false; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.is_match(&mut guard, &input); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result } /// Executes a leftmost search and returns the first match that is found, @@ -2429,6 +2437,7 @@ pub struct Config { utf8_empty: Option, autopre: Option, pre: Option>, + which_captures: Option, nfa_size_limit: Option>, onepass_size_limit: Option>, hybrid_cache_capacity: Option, @@ -2619,6 +2628,77 @@ impl Config { Config { pre: Some(pre), ..self } } + /// Configures what kinds of groups are compiled as "capturing" in the + /// underlying regex engine. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the + /// overhead of capture states for explicit groups. + /// + /// Note that another approach to avoiding the overhead of capture groups + /// is by using non-capturing groups in the regex pattern. That is, + /// `(?:a)` instead of `(a)`. This option is useful when you can't control + /// the concrete syntax but know that you don't need the underlying capture + /// states. For example, using `WhichCaptures::Implicit` will behave as if + /// all explicit capturing groups in the pattern were non-capturing. + /// + /// Setting this to `WhichCaptures::None` is usually not the right thing to + /// do. When no capture states are compiled, some regex engines (such as + /// the `PikeVM`) won't be able to report match offsets. This will manifest + /// as no match being found. + /// + /// # Example + /// + /// This example demonstrates how the results of capture groups can change + /// based on this option. 
First we show the default (all capture groups in + /// the pattern are capturing): + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, Span}; + /// + /// let re = Regex::new(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); + /// + /// Ok::<(), Box>(()) + /// ``` + /// + /// And now we show the behavior when we only include implicit capture + /// groups. In this case, we can only find the overall match span, but the + /// spans of any other explicit group don't exist because they are treated + /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, + /// there is no real point in using [`Regex::captures`] since it will never + /// be able to report more information than [`Regex::find`].) + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(None, caps.get_group(1)); + /// + /// Ok::<(), Box>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } + /// Sets the size limit, in bytes, to enforce on the construction of every /// NFA build by the meta regex engine. /// @@ -2626,7 +2706,7 @@ impl Config { /// you're compiling untrusted patterns. /// /// Note that this limit is applied to _each_ NFA built, and if any of - /// them excceed the limit, then construction will fail. This limit does + /// them exceed the limit, then construction will fail. This limit does /// _not_ correspond to the total memory used by all NFAs in the meta regex /// engine. /// @@ -2983,6 +3063,14 @@ impl Config { self.pre.as_ref().unwrap_or(&None).as_ref() } + /// Returns the capture configuration, as set by + /// [`Config::which_captures`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. @@ -3126,6 +3214,7 @@ impl Config { utf8_empty: o.utf8_empty.or(self.utf8_empty), autopre: o.autopre.or(self.autopre), pre: o.pre.or_else(|| self.pre.clone()), + which_captures: o.which_captures.or(self.which_captures), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), onepass_size_limit: o .onepass_size_limit @@ -3551,8 +3640,8 @@ mod tests { // I found this in the course of building out the benchmark suite for // rebar. 
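// (For `[a-zA-Z]+ing` against "tingling", leftmost-first semantics require
// a single match spanning the whole haystack; the regression presumably
// surfaced as the iterator reporting shorter matches such as "ting" and
// "ling", which is what the `count()` assertion below guards against.)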
#[test] - fn regression() { - env_logger::init(); + fn regression_suffix_literal_count() { + let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); diff --git a/regex-automata/src/meta/stopat.rs b/regex-automata/src/meta/stopat.rs index e8d716689c..c4dcd797a0 100644 --- a/regex-automata/src/meta/stopat.rs +++ b/regex-automata/src/meta/stopat.rs @@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd( } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't use a DFA that specialized start states @@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd( } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from @@ -162,9 +156,6 @@ fn dfa_eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -201,9 +192,6 @@ fn hybrid_eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 2de2c385ec..04f2ba3c3e 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -13,7 +13,7 @@ use crate::{ regex::{Cache, RegexInfo}, reverse_inner, wrappers, }, - nfa::thompson::{self, NFA}, + nfa::thompson::{self, WhichCaptures, NFA}, util::{ captures::{Captures, GroupInfo}, look::LookMatcher, @@ -58,6 +58,8 @@ pub(super) trait Strategy: input: &Input<'_>, ) -> Option; + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; + fn search_slots( &self, cache: &mut Cache, @@ -351,6 +353,7 @@ impl Pre<()> { // strategy when len(patterns)==1 if the number of literals is large. In that // case, literal extraction gives up and will return an infinite set.) impl Strategy for Pre
<P>
{ + #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { &self.group_info } @@ -376,6 +379,7 @@ impl Strategy for Pre<P>
{ self.pre.memory_usage() } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> { if input.is_done() { return None; } @@ -391,6 +395,7 @@ impl Strategy for Pre<P>
{ .map(|sp| Match::new(PatternID::ZERO, sp)) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, @@ -399,6 +404,12 @@ impl Strategy for Pre<P>
{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, @@ -415,6 +426,7 @@ impl Strategy for Pre<P>
{ Some(m.pattern()) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, @@ -452,7 +464,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .captures(true) + .which_captures(info.config().get_which_captures()) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) @@ -499,7 +511,10 @@ impl Core { // useful with capturing groups in reverse. And of course, // the lazy DFA ignores capturing groups in all cases. .configure( - thompson_config.clone().captures(false).reverse(true), + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), ) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; @@ -620,6 +635,29 @@ impl Core { } } + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } @@ -700,7 +738,7 @@ impl Strategy for Core { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. - return if let Some(e) = self.dfa.get(input) { + if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, @@ -720,7 +758,38 @@ impl Strategy for Core { } } else { self.search_half_nofail(cache, input) - }; + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -845,6 +914,14 @@ impl ReverseAnchored { ); return Err(core); } + // Note that the caller can still request an anchored search even when + // the regex isn't anchored at the start. We detect that case in the + // search routines below and just fallback to the core engine. This + // is fine because both searches are anchored. It's just a matter of + // picking one. 
Falling back to the core engine is a little simpler, + // since if we used the reverse anchored approach, we'd have to add an + // extra check to ensure the match reported starts at the place where + // the caller requested the search to start. if core.info.is_always_anchored_start() { debug!( "skipping reverse anchored optimization because \ @@ -930,6 +1007,9 @@ impl Strategy for ReverseAnchored { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -948,6 +1028,9 @@ impl Strategy for ReverseAnchored { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -966,6 +1049,21 @@ impl Strategy for ReverseAnchored { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -973,6 +1071,9 @@ impl Strategy for ReverseAnchored { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); @@ -1034,6 +1135,13 @@ impl ReverseSuffix { // requires a reverse scan after a literal match to confirm or reject // the match. (Although, in the case of confirmation, it then needs to // do another forward scan to find the end position.) + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. 
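+ //
+ // Concretely, the fallback in each search routine below is just an early
+ // return of this shape (shown for `search`; the other routines delegate
+ // to their core counterparts analogously):
+ //
+ //     if input.get_anchored().is_anchored() {
+ //         return self.core.search(cache, input);
+ //     }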
if core.info.is_always_anchored_start() { debug!( "skipping reverse suffix optimization because \ @@ -1173,7 +1281,7 @@ impl ReverseSuffix { e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( - "using lazy DFA for reverse inner search at {:?}, \ + "using lazy DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, @@ -1211,6 +1319,9 @@ impl Strategy for ReverseSuffix { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix optimization failed: {}", _err); @@ -1255,6 +1366,9 @@ impl Strategy for ReverseSuffix { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix half optimization failed: {}", _err); @@ -1302,6 +1416,28 @@ impl Strategy for ReverseSuffix { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + ); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1309,6 +1445,9 @@ impl Strategy for ReverseSuffix { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; @@ -1396,6 +1535,13 @@ impl ReverseInner { // or when the literal scan matches. If it matches, then confirming the // match requires a reverse scan followed by a forward scan to confirm // or reject, which is a fair bit of work. + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. 
if core.info.is_always_anchored_start() { debug!( "skipping reverse inner optimization because \ @@ -1440,7 +1586,7 @@ impl ReverseInner { .utf8(core.info.config().get_utf8_empty()) .nfa_size_limit(core.info.config().get_nfa_size_limit()) .shrink(false) - .captures(false) + .which_captures(WhichCaptures::None) .look_matcher(lookm); let result = thompson::Compiler::new() .configure(thompson_config) @@ -1635,6 +1781,9 @@ impl Strategy for ReverseInner { #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner optimization failed: {}", _err); @@ -1654,6 +1803,9 @@ impl Strategy for ReverseInner { cache: &mut Cache, input: &Input<'_>, ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner half optimization failed: {}", _err); @@ -1668,6 +1820,25 @@ impl Strategy for ReverseInner { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1675,6 +1846,9 @@ impl Strategy for ReverseInner { input: &Input<'_>, slots: &mut [Option], ) -> Option { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 8f58363a17..6cb19ba0d2 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -87,6 +87,15 @@ impl PikeVMEngine { Ok(PikeVMEngine(engine)) } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, @@ -203,7 +212,10 @@ impl BoundedBacktrackerEngine { .configure(backtrack_config) .build_from_nfa(nfa.clone()) .map_err(BuildError::nfa)?; - debug!("BoundedBacktracker built"); + debug!( + "BoundedBacktracker built (max haystack length: {:?})", + engine.max_haystack_len() + ); Ok(Some(BoundedBacktrackerEngine(engine))) } #[cfg(not(feature = "nfa-backtrack"))] @@ -212,6 +224,29 @@ impl BoundedBacktrackerEngine { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the 
backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 75b6c096b2..df99e456df 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -19,7 +19,7 @@ use crate::{ empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, - search::{Anchored, Input, Match, MatchError, Span}, + search::{Anchored, HalfMatch, Input, Match, MatchError, Span}, }, }; @@ -300,15 +300,6 @@ impl Builder { &self, nfa: NFA, ) -> Result { - // If the NFA has no captures, then the backtracker doesn't work since - // it relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the backtracker to work with regexes with zero patterns. - if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -829,8 +820,11 @@ impl BoundedBacktracker { // bytes to the capacity in bits. let capacity = 8 * self.get_config().get_visited_capacity(); let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); - let real_capacity = blocks * Visited::BLOCK_SIZE; - (real_capacity / self.nfa.states().len()) - 1 + let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE); + // It's possible for `real_capacity` to be smaller than the number of + // NFA states for particularly large regexes, so we saturate towards + // zero. + (real_capacity / self.nfa.states().len()).saturating_sub(1) } } @@ -954,8 +948,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; return Ok(Some(Match::new(pid, Span { start, end }))); } let ginfo = self.get_nfa().group_info(); @@ -965,8 +965,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; Ok(Some(Match::new(pid, Span { start, end }))) } @@ -1292,12 +1298,14 @@ impl BoundedBacktracker { ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { - return self.try_search_slots_imp(cache, input, slots); + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); } // See PikeVM::try_search_slots for why we do this. 
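// (In brief: when the regex can match the empty string and UTF-8 mode is
// enabled, the search needs enough slots to see the end offset of the
// overall match so that empty matches splitting a codepoint can be
// skipped. That is what the minimum-length check below enforces.)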
let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { - return self.try_search_slots_imp(cache, input, slots); + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; @@ -1305,14 +1313,14 @@ impl BoundedBacktracker { // This is OK because we know `enough_slots` is strictly bigger // than `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - return Ok(got); + return Ok(got.map(|hm| hm.pattern())); } let mut enough = vec![None; min]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger than // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - Ok(got) + Ok(got.map(|hm| hm.pattern())) } /// This is the actual implementation of `try_search_slots_imp` that @@ -1325,30 +1333,17 @@ impl BoundedBacktracker { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Result, MatchError> { + ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); - let (pid, end) = match self.search_imp(cache, input, slots)? { + let hm = match self.search_imp(cache, input, slots)? { None => return Ok(None), - Some(pid) if !utf8empty => return Ok(Some(pid)), - Some(pid) => { - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - // OK because we know we have a match and we know our caller - // provided slots are big enough (which we make true above if - // the caller didn't). Namely, we're only here when 'utf8empty' - // is true, and when that's true, we require slots for every - // pattern. - (pid, slots[slot_end].unwrap().get()) - } + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, }; - empty::skip_splits_fwd(input, pid, end, |input| { - let pid = match self.search_imp(cache, input, slots)? { - None => return Ok(None), - Some(pid) => pid, - }; - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - Ok(Some((pid, slots[slot_end].unwrap().get()))) + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots)? + .map(|hm| (hm, hm.offset()))) }) } @@ -1364,7 +1359,7 @@ impl BoundedBacktracker { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Result, MatchError> { + ) -> Result, MatchError> { // Unlike in the PikeVM, we write our capturing group spans directly // into the caller's captures groups. So we have to make sure we're // starting with a blank slate first. 
In the PikeVM, we avoid this @@ -1411,10 +1406,9 @@ impl BoundedBacktracker { Some(ref span) => at = span.start, } } - if let Some(pid) = - self.backtrack(cache, input, at, start_id, slots) + if let Some(hm) = self.backtrack(cache, input, at, start_id, slots) { - return Ok(Some(pid)); + return Ok(Some(hm)); } at += 1; } @@ -1435,14 +1429,13 @@ impl BoundedBacktracker { at: usize, start_id: StateID, slots: &mut [Option], - ) -> Option { + ) -> Option { cache.stack.push(Frame::Step { sid: start_id, at }); while let Some(frame) = cache.stack.pop() { match frame { Frame::Step { sid, at } => { - if let Some(pid) = self.step(cache, input, sid, at, slots) - { - return Some(pid); + if let Some(hm) = self.step(cache, input, sid, at, slots) { + return Some(hm); } } Frame::RestoreCapture { slot, offset } => { @@ -1472,7 +1465,7 @@ impl BoundedBacktracker { mut sid: StateID, mut at: usize, slots: &mut [Option], - ) -> Option { + ) -> Option { loop { if !cache.visited.insert(sid, at - input.start()) { return None; @@ -1555,7 +1548,7 @@ impl BoundedBacktracker { } State::Fail => return None, State::Match { pattern_id } => { - return Some(pattern_id); + return Some(HalfMatch::new(pattern_id, at)); } } } @@ -1892,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize { (lhs / rhs) + 1 } } + +#[cfg(test)] +mod tests { + use super::*; + + // This is a regression test for the maximum haystack length computation. + // Previously, it assumed that the total capacity of the backtracker's + // bitset would always be greater than the number of NFA states. But there + // is of course no guarantee that this is true. This regression test + // ensures that not only does `max_haystack_len` not panic, but that it + // should return `0`. + #[cfg(feature = "syntax")] + #[test] + fn max_haystack_len_overflow() { + let re = BoundedBacktracker::builder() + .configure(BoundedBacktracker::config().visited_capacity(10)) + .build(r"[0-9A-Za-z]{100}") + .unwrap(); + assert_eq!(0, re.max_haystack_len()); + } +} diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index b57e5bc0f3..6b69e8784d 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -61,7 +61,7 @@ enum State { Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureStart { /// The ID of the pattern that this capture was defined. @@ -77,7 +77,7 @@ enum State { }, /// An empty state that records the end of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureEnd { /// The ID of the pattern that this capture was defined. @@ -128,7 +128,7 @@ enum State { } impl State { - /// If this state is an unconditional espilon transition, then this returns + /// If this state is an unconditional epsilon transition, then this returns /// the target of the transition. 
fn goto(&self) -> Option { match *self { diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 497fc62b47..2d2172957f 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -30,7 +30,7 @@ pub struct Config { reverse: Option, nfa_size_limit: Option>, shrink: Option, - captures: Option, + which_captures: Option, look_matcher: Option, #[cfg(test)] unanchored_prefix: Option, @@ -178,12 +178,15 @@ impl Config { /// ``` /// use regex_automata::{ /// dfa::{self, Automaton}, - /// nfa::thompson::NFA, + /// nfa::thompson::{NFA, WhichCaptures}, /// HalfMatch, Input, /// }; /// /// let dfa = dfa::dense::Builder::new() - /// .thompson(NFA::config().captures(false).reverse(true)) + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) /// .build("baz[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!( @@ -277,10 +280,12 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Currently we have to disable captures when enabling reverse NFA. - /// let config = NFA::config().captures(false).reverse(true); + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); /// let not_shrunk = NFA::compiler() /// .configure(config.clone().shrink(false)) /// .build(r"\w")?; @@ -311,21 +316,99 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. + /// + /// (Note that since this method is deprecated, the example below uses + /// [`Config::which_captures`] to disable capture states.) /// /// ``` - /// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA}; + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; /// - /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[deprecated(since = "0.3.5", note = "use which_captures instead")] + pub fn captures(self, yes: bool) -> Config { + self.which_captures(if yes { + WhichCaptures::All + } else { + WhichCaptures::None + }) + } + + /// Configures what kinds of capture groups are compiled into + /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a + /// Thompson NFA. + /// + /// Currently, using any option except for [`WhichCaptures::None`] requires + /// disabling the [`reverse`](Config::reverse) setting. If both are + /// enabled, then the compiler will return an error. It is expected that + /// this limitation will be lifted in the future. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the + /// overhead of capture states for explicit groups. 
Usually this occurs + /// when one wants to use the `PikeVM` only for determining the overall + /// match. Otherwise, the `PikeVM` could use much more memory than is + /// necessary. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing states to be present in the NFA to report match + /// offsets. + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// The same applies to the bounded backtracker: + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// backtrack::BoundedBacktracker, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; - /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, "abc")?); + /// assert_eq!(None, re.try_find(&mut cache, "abc")?); /// /// # Ok::<(), Box>(()) /// ``` - pub fn captures(mut self, yes: bool) -> Config { - self.captures = Some(yes); + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); self } @@ -405,8 +488,14 @@ impl Config { } /// Return whether NFA compilation is configured to produce capture states. + #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] pub fn get_captures(&self) -> bool { - self.captures.unwrap_or(true) + self.get_which_captures().is_any() + } + + /// Return what kinds of capture states will be compiled into an NFA. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) } /// Return the look-around matcher for this NFA. @@ -439,7 +528,7 @@ impl Config { reverse: o.reverse.or(self.reverse), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), shrink: o.shrink.or(self.shrink), - captures: o.captures.or(self.captures), + which_captures: o.which_captures.or(self.which_captures), look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), #[cfg(test)] unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), @@ -447,6 +536,57 @@ impl Config { } } +/// A configuration indicating which kinds of +/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. +/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. 
By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA). + None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + /* This compiler below uses Thompson's construction algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph @@ -800,7 +940,9 @@ impl Compiler { if exprs.len() > PatternID::LIMIT { return Err(BuildError::too_many_patterns(exprs.len())); } - if self.config.get_reverse() && self.config.get_captures() { + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { return Err(BuildError::unsupported_captures()); } @@ -978,8 +1120,13 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if !self.config.get_captures() { - return self.c(expr); + match self.config.get_which_captures() { + // No capture states means we always skip them. + WhichCaptures::None => return self.c(expr), + // Implicit captures states means we only add when index==0 since + // index==0 implies the group is implicit. + WhichCaptures::Implicit if index > 0 => return self.c(expr), + _ => {} } let start = self.add_capture_start(index, name)?; @@ -1319,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // - // regex-cli find nfa thompson pikevm -b @$smallishru '(?m)^\w{20}' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. 
But with the compressed version above, the search takes @@ -1338,7 +1485,7 @@ impl Compiler { .map(|rng| self.c_range(rng.start, rng.end)); self.c_concat(it) }); - self.c_alt(it) + self.c_alt_iter(it) */ } @@ -1410,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) @@ -1725,12 +1880,18 @@ mod tests { use crate::{ nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, StateID}, + util::primitives::{PatternID, SmallIndex, StateID}, }; + use super::*; + fn build(pattern: &str) -> NFA { NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build(pattern) .unwrap() } @@ -1781,6 +1942,15 @@ mod tests { } } + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + fn s_fail() -> State { State::Fail } @@ -1794,7 +1964,7 @@ mod tests { #[test] fn compile_unanchored_prefix() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false)) + .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( @@ -1827,7 +1997,11 @@ mod tests { // Check that non-UTF-8 literals work. 
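// (`(?-u)\xFF` matches the raw byte `\xFF`, which is not valid UTF-8 on its
// own, so the syntax config below must also set `utf8(false)` for the
// pattern to compile at all.)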
let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); @@ -1937,7 +2111,7 @@ mod tests { let nfa = NFA::compiler() .configure( NFA::config() - .captures(false) + .which_captures(WhichCaptures::None) .reverse(true) .shrink(false) .unanchored_prefix(false), @@ -1965,7 +2139,11 @@ mod tests { #[test] fn compile_many_start_pattern() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( @@ -1993,7 +2171,9 @@ mod tests { use regex_syntax::hir::{Class, ClassBytes, Hir}; let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); @@ -2005,9 +2185,81 @@ mod tests { use regex_syntax::hir::{Class, ClassUnicode, Hir}; let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } + + #[test] + fn compile_captures_all() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::All), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_cap(3, 0, 1, 2), + s_byte(b'b', 4), + s_cap(5, 0, 1, 3), + s_byte(b'c', 6), + s_cap(7, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(2, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_implicit() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::Implicit), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_byte(b'b', 3), + s_byte(b'c', 4), + s_cap(5, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(1, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_none() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::None), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] + ); + let ginfo = nfa.group_info(); + assert_eq!(0, ginfo.all_group_len()); + } } diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index 82648813ba..3c2fa8a215 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -68,9 +68,6 @@ enum BuildErrorKind { /// The invalid index that was given. index: u32, }, - /// An error that occurs when one tries to build an NFA simulation (such as - /// the PikeVM) without any capturing groups. - MissingCaptures, /// An error that occurs when one tries to build a reverse NFA with /// captures enabled. 
Currently, this isn't supported, but we probably /// should support it at some point. @@ -126,10 +123,6 @@ impl BuildError { BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } } - pub(crate) fn missing_captures() -> BuildError { - BuildError { kind: BuildErrorKind::MissingCaptures } - } - #[cfg(feature = "syntax")] pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } @@ -181,11 +174,6 @@ impl core::fmt::Display for BuildError { "capture group index {} is invalid (too big or discontinuous)", index, ), - BuildErrorKind::MissingCaptures => write!( - f, - "operation requires the NFA to have capturing groups, \ - but the NFA given contains none", - ), #[cfg(feature = "syntax")] BuildErrorKind::UnsupportedCaptures => write!( f, diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c36ce53866..7f074a353b 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/mod.rs b/regex-automata/src/nfa/thompson/mod.rs index 3581d738c2..cf426736dc 100644 --- a/regex-automata/src/nfa/thompson/mod.rs +++ b/regex-automata/src/nfa/thompson/mod.rs @@ -78,4 +78,4 @@ pub use self::{ }, }; #[cfg(feature = "syntax")] -pub use compiler::{Compiler, Config}; +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 86131406ca..1f57f8ebd9 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -453,10 +453,10 @@ impl NFA { /// predict the anchored starting state. /// /// ``` - /// use regex_automata::nfa::thompson::{NFA, State}; + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("a")?; /// let state = nfa.state(nfa.start_anchored()); /// match *state { @@ -711,7 +711,7 @@ impl NFA { /// or not. /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Obviously has capture states. /// let nfa = NFA::new("(a)")?; @@ -733,7 +733,7 @@ impl NFA { /// // Notice that 'has_capture' is false here even when we have an /// // explicit capture group in the pattern. /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("(a)")?; /// assert!(!nfa.has_capture()); /// @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. 
I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d737fb71e9..0128c151ae 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -17,7 +17,9 @@ use crate::{ empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, - search::{Anchored, Input, Match, MatchKind, PatternSet, Span}, + search::{ + Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, + }, sparse_set::SparseSet, }, }; @@ -275,15 +277,6 @@ impl Builder { /// construction of the NFA itself will of course be ignored, since the NFA /// given here is already built. pub fn build_from_nfa(&self, nfa: NFA) -> Result { - // If the NFA has no captures, then the PikeVM doesn't work since it - // relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the PikeVM to work with regexes with zero patterns. - if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } @@ -828,16 +821,16 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = slots[0]?.get(); + let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } @@ -1103,7 +1096,8 @@ impl PikeVM { ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { - return self.search_slots_imp(cache, input, slots); + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); } // There is an unfortunate special case where if the regex can // match the empty string and UTF-8 mode is enabled, the search @@ -1118,22 +1112,23 @@ impl PikeVM { // this case. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { - return self.search_slots_imp(cache, input, slots); + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger - // than `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. 
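+ // (Concretely: this branch is only reached when `slots.len() < min` and
+ // `enough.len() == min`, so the slicing below is always in bounds.)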
slots.copy_from_slice(&enough[..slots.len()]); - return got; + return got.map(|hm| hm.pattern()); } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger than - // `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - got + got.map(|hm| hm.pattern()) } /// This is the actual implementation of `search_slots_imp` that @@ -1146,30 +1141,17 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Option { + ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); - let (pid, end) = match self.search_imp(cache, input, slots) { + let hm = match self.search_imp(cache, input, slots) { None => return None, - Some(pid) if !utf8empty => return Some(pid), - Some(pid) => { - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - // OK because we know we have a match and we know our caller - // provided slots are big enough (which we make true above if - // the caller didn't). Namely, we're only here when 'utf8empty' - // is true, and when that's true, we require slots for every - // pattern. - (pid, slots[slot_end].unwrap().get()) - } + Some(hm) if !utf8empty => return Some(hm), + Some(hm) => hm, }; - empty::skip_splits_fwd(input, pid, end, |input| { - let pid = match self.search_imp(cache, input, slots) { - None => return Ok(None), - Some(pid) => pid, - }; - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - Ok(Some((pid, slots[slot_end].unwrap().get()))) + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots) + .map(|hm| (hm, hm.offset()))) }) // OK because the PikeVM never errors. .unwrap() @@ -1244,7 +1226,7 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Option { + ) -> Option { cache.setup_search(slots.len()); if input.is_done() { return None; @@ -1273,7 +1255,7 @@ impl PikeVM { let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; - let mut pid = None; + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like // how the DFA engines work. The delay is used to handle look-behind @@ -1292,7 +1274,7 @@ impl PikeVM { if curr.set.is_empty() { // We have a match and we haven't been instructed to continue // on even after finding a match, so we can quit. - if pid.is_some() && !allmatches { + if hm.is_some() && !allmatches { break; } // If we're running an anchored search and we've advanced @@ -1356,7 +1338,15 @@ impl PikeVM { // matches their behavior. (Generally, 'allmatches' is useful for // overlapping searches or leftmost anchored searches to find the // longest possible match by ignoring match priority.) - if !pid.is_some() || allmatches { + // + // Additionally, when we're running an anchored search, this + // epsilon closure should only be computed at the beginning of the + // search. If we re-computed it at every position, we would be + // simulating an unanchored search when we were tasked to perform + // an anchored search. 
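+ // (For example, an anchored search for `a` in "ba" must fail; re-seeding
+ // the start state at offset 1 would incorrectly report the `a` found
+ // there.)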
+ if (!hm.is_some() || allmatches) + && (!anchored || at == input.start()) + { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is // guaranteed to have the right length but where every element @@ -1373,14 +1363,15 @@ impl PikeVM { let slots = next.slot_table.all_absent(); self.epsilon_closure(stack, slots, curr, input, at, start_id); } - if let Some(x) = self.nexts(stack, curr, next, input, at, slots) { - pid = Some(x); + if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + { + hm = Some(HalfMatch::new(pid, at)); } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will // quit right after seeing a match when match_kind==LeftmostFirst, // as is consistent with leftmost-first match priority.) - if input.get_earliest() && pid.is_some() { + if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); @@ -1388,7 +1379,7 @@ impl PikeVM { at += 1; } instrument!(|c| c.eprint(&self.nfa)); - pid + hm } /// The implementation for the 'which_overlapping_matches' API. Basically, @@ -2100,15 +2091,16 @@ impl SlotTable { // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. - self.slots_for_captures = nfa.group_info().slot_len(); + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); let len = nfa .states() .len() - // We add 1 so that our last row is always empty. We use it as - // "scratch" space for computing the epsilon closure off of the - // starting state. - .checked_add(1) - .and_then(|x| x.checked_mul(self.slots_per_state)) + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. Should we // somehow convert this to an error? What about something similar @@ -2162,7 +2154,7 @@ impl SlotTable { /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. fn all_absent(&mut self) -> &mut [Option] { - let i = self.table.len() - self.slots_per_state; + let i = self.table.len() - self.slots_for_captures; &mut self.table[i..i + self.slots_for_captures] } } diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f7..cd77cc1507 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. 
Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index 60b6df7e25..05db6a9936 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -444,6 +444,8 @@ impl Captures { /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); /// // Looking for a non-existent capturing group will return None: /// assert_eq!(None, caps.get_group(3)); + /// # // literals are too big for 32-bit usize: #1039 + /// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, caps.get_group(9944060567225171988)); /// /// # Ok::<(), Box>(()) @@ -1809,10 +1811,10 @@ impl GroupInfo { /// panic even if captures aren't enabled on this NFA: /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build_many(&[ /// r"(?Pa)", /// r"a", @@ -1957,7 +1959,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -1969,13 +1971,13 @@ impl GroupInfo { /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. @@ -1999,7 +2001,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -2016,13 +2018,13 @@ impl GroupInfo { /// assert_eq!(5, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. 
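Pulling the capture-configuration changes together, the following sketch (it uses only APIs that appear in this patch) shows the effect of `WhichCaptures::Implicit` on an NFA's `GroupInfo`: the explicit group in `a(b)c` is treated as non-capturing, leaving only the implicit group for the overall match.

```
use regex_automata::nfa::thompson::{NFA, WhichCaptures};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::compiler()
        .configure(NFA::config().which_captures(WhichCaptures::Implicit))
        .build("a(b)c")?;
    // Only the implicit group (the overall match) is compiled in.
    assert_eq!(1, nfa.group_info().all_group_len());
    Ok(())
}
```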
diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 30a82afb81..ba32991d06 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -145,9 +145,10 @@ pub(crate) fn next( } Some(_) => {} None => { - look_have = look_have.insert(Look::End); - look_have = look_have.insert(Look::EndLF); - look_have = look_have.insert(Look::EndCRLF); + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { @@ -160,11 +161,26 @@ pub(crate) fn next( look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { - look_have = look_have.insert(Look::WordUnicodeNegate); - look_have = look_have.insert(Look::WordAsciiNegate); + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); } else { - look_have = look_have.insert(Look::WordUnicode); - look_have = look_have.insert(Look::WordAscii); + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // assertions that exist in this state (that is, just because we added @@ -220,6 +236,14 @@ pub(crate) fn next( { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. + if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. 
} @@ -563,47 +587,95 @@ pub(crate) fn set_lookbehind_from_start( ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); match *start { - Start::NonWordByte => {} + Start::NonWordByte => { + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } Start::WordByte => { - builder.set_is_from_word(); + if lookset.contains_word() { + builder.set_is_from_word(); + } } Start::Text => { - builder.set_look_have(|have| { - have.insert(Look::Start) - .insert(Look::StartLF) - .insert(Look::StartCRLF) - }); + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineLF => { if rev { - builder.set_is_half_crlf(); - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } } else { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } } - if lineterm == b'\n' { + if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineCR => { - if rev { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); - } else { - builder.set_is_half_crlf(); + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } } - if lineterm == b'\r' { + if lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::CustomLineTerminator => { - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } // This is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. 
- if utf8::is_word_byte(lineterm) { - builder.set_is_from_word(); + if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } } } diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index e641235874..effa6f44d7 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -197,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -348,16 +348,17 @@ impl StateBuilderNFA { /// generated by a transition over a "word" byte. (Callers may not always set /// this. For example, if the NFA has no word boundary assertion, then needing /// to track whether a state came from a word byte or not is superfluous and -/// wasteful.) +/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition +/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is +/// enabled. /// -/// Byte 1 corresponds to the look-behind assertions that were satisfied by -/// the transition that created this state. This generally only includes the -/// StartLF and Start assertions. (Look-ahead assertions are not tracked as -/// part of states. Instead, these are applied by re-computing the epsilon -/// closure of a state when computing the transition function. See `next` in -/// the parent module.) +/// Bytes 1..5 correspond to the look-behind assertions that were satisfied +/// by the transition that created this state. (Look-ahead assertions are not +/// tracked as part of states. Instead, these are applied by re-computing the +/// epsilon closure of a state when computing the transition function. See +/// `next` in the parent module.) /// -/// Byte 2 corresponds to the set of look-around assertions (including both +/// Bytes 5..9 correspond to the set of look-around assertions (including both /// look-behind and look-ahead) that appear somewhere in this state's set of /// NFA state IDs. This is used to determine whether this state's epsilon /// closure should be re-computed when computing the transition function. @@ -366,7 +367,7 @@ impl StateBuilderNFA { /// function, we should only re-compute the epsilon closure if those new /// assertions are relevant to this particular state. /// -/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer /// corresponding to the number of patterns encoded in this state. If the state /// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is /// PatternID::ZERO, then no integer is encoded at this position. Instead, byte @@ -452,7 +453,7 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::read_repr(&self.0[3..]) + LookSet::read_repr(&self.0[5..]) } /// Returns the total number of match pattern IDs in this state. @@ -476,7 +477,7 @@ impl<'a> Repr<'a> { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 9 + index * PatternID::SIZE; + let offset = 13 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. 
wire::read_pattern_id_unchecked(&self.0[offset..]).0 @@ -507,7 +508,7 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[9..self.pattern_offset_end()]; + let mut pids = &self.0[13..self.pattern_offset_end()]; while !pids.is_empty() { let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; @@ -539,11 +540,11 @@ impl<'a> Repr<'a> { fn pattern_offset_end(&self) -> usize { let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 5; + return 9; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -557,7 +558,7 @@ impl<'a> Repr<'a> { } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. - usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() } } @@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> { /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { - set(self.look_need()).write_repr(&mut self.0[3..]); + set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least @@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 9; + let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - wire::NE::write_u32(count32, &mut self.0[5..9]); + wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are diff --git a/regex-automata/src/util/lazy.rs b/regex-automata/src/util/lazy.rs index de27a2a6e6..0d0b4fb2ae 100644 --- a/regex-automata/src/util/lazy.rs +++ b/regex-automata/src/util/lazy.rs @@ -384,11 +384,7 @@ mod lazy { // SAFETY: state is DONE if and only if data has been fully // initialized. At which point, it is safe to drop. unsafe { - // MSRV(1.60): Use assume_init_drop. The below is how - // assume_init_drop is implemented. - core::ptr::drop_in_place( - (*self.data.as_ptr()).as_mut_ptr(), - ) + self.data.get_mut().assume_init_drop(); } } } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index aee31b34e0..73e51c0f6e 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -96,6 +96,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. 
That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -117,6 +153,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -125,28 +169,36 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. 
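(Editor's sketch of the round-trip contract documented above: every variant is a distinct power of two, `as_repr` is a plain integer cast and `from_repr` inverts it. `Assertion` is an illustrative stand-in for `Look`, not the crate's type.)

#[derive(Clone, Copy, Debug, PartialEq)]
enum Assertion {
    Start = 1 << 0,
    End = 1 << 1,
    WordStartHalf = 1 << 2,
}

impl Assertion {
    const fn as_repr(self) -> u32 {
        // 'as' is the zero-cost way to convert an int enum to an int.
        self as u32
    }

    const fn from_repr(repr: u32) -> Option<Assertion> {
        match repr {
            0b001 => Some(Assertion::Start),
            0b010 => Some(Assertion::End),
            0b100 => Some(Assertion::WordStartHalf),
            _ => None,
        }
    }
}

#[test]
fn repr_round_trips() {
    for a in [Assertion::Start, Assertion::End, Assertion::WordStartHalf] {
        assert_eq!(Some(a), Assertion::from_repr(a.as_repr()));
    }
}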
#[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -171,6 +223,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -184,14 +244,14 @@ impl Look { pub struct LookSet { /// The underlying representation this set is exposed to make it possible /// to store it somewhere efficiently. The representation is that - /// of a bitset, where each assertion occupies bit `i` where `i = - /// Look::as_repr()`. + /// of a bitset, where each assertion occupies bit `i` where + /// `i = Look::as_repr()`. /// /// Note that users of this internal representation must permit the full /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -294,13 +354,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. 
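(Editor's note: the `LookSet` operations in this file all reduce to one or two bit instructions over the `bits` field documented above. A sketch of the scheme on a bare `u32`; the free functions are illustrative stand-ins for the set's methods.)

fn insert(bits: u32, look: u32) -> u32 {
    bits | look
}

fn remove(bits: u32, look: u32) -> u32 {
    bits & !look
}

fn contains(bits: u32, look: u32) -> bool {
    bits & look != 0
}

#[test]
fn bitset_ops() {
    let (start, end) = (1 << 0, 1 << 1);
    let mut bits = insert(0, start);
    assert!(contains(bits, start) && !contains(bits, end));
    bits = remove(bits, start);
    assert_eq!(0, bits);
}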
#[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -379,29 +448,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } /// Checks that all assertions in this set can be matched. @@ -456,9 +527,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } @@ -566,6 +637,23 @@ impl LookMatcher { } /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. 
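(Editor's note: the panic docs above recommend validating an assertion set when a matching engine is built, rather than panicking mid-search. A sketch of that pattern, assuming `LookSet::available` returns `Result<(), UnicodeWordBoundaryError>` as its mention above implies; `build_engine` is a hypothetical stand-in.)

use regex_automata::util::look::{LookSet, UnicodeWordBoundaryError};

fn build_engine(set: LookSet) -> Result<(), UnicodeWordBoundaryError> {
    // Fail at construction time if any assertion in the set (e.g., a
    // Unicode word boundary without its data tables) can't be evaluated.
    set.available()?;
    // ... build the engine here ...
    Ok(())
}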
#[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, @@ -586,6 +674,26 @@ impl LookMatcher { Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } } } @@ -680,6 +788,46 @@ impl LookMatcher { return false; } } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } true } @@ -703,7 +851,15 @@ impl LookMatcher { Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode - | Look::WordUnicodeNegate => { + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | Look::WordEndUnicode + | Look::WordStartHalfAscii + | Look::WordEndHalfAscii + | Look::WordStartHalfUnicode + | Look::WordEndHalfUnicode => { // We need to mark all ranges of bytes whose pairs result in // evaluating \b differently. This isn't technically correct // for Unicode word boundaries, but DFAs can't handle those @@ -931,6 +1087,177 @@ impl LookMatcher { }; Ok(word_before == word_after) } + + /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + + /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. 
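(Editor's note: a self-contained sketch of the ASCII start/end word boundary predicates defined above; `is_word_byte` is a local stand-in for the crate's `utf8::is_word_byte`, which tests for `[0-9A-Za-z_]`.)

fn is_word_byte(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}

fn is_word_start(haystack: &[u8], at: usize) -> bool {
    let before = at > 0 && is_word_byte(haystack[at - 1]);
    let after = at < haystack.len() && is_word_byte(haystack[at]);
    !before && after
}

fn is_word_end(haystack: &[u8], at: usize) -> bool {
    let before = at > 0 && is_word_byte(haystack[at - 1]);
    let after = at < haystack.len() && is_word_byte(haystack[at]);
    before && !after
}

#[test]
fn ascii_start_end() {
    assert!(is_word_start(b" a ", 1));
    assert!(is_word_end(b" a ", 2));
    assert!(!is_word_start(b"ab", 1));
    assert!(!is_word_end(b"ab", 1));
}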
+ #[inline] + pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + + /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_start_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(!word_before && word_after) + } + + /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_end_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(word_before && !word_after) + } + + /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_half_ascii( + &self, + haystack: &[u8], + at: usize, + ) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + + /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } + + /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_start_half_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + // See `is_word_unicode_negate` for why we need to do this. We don't + // need to do it for `is_word_start_unicode` because that guarantees + // that the position matched falls on a valid UTF-8 boundary given + // that the right side must be in \w. 
+ let word_before = at > 0 + && match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::rev(haystack, at)?, + }; + Ok(!word_before) + } + + /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_end_half_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + // See `is_word_unicode_negate` for why we need to do this. We don't + // need to do it for `is_word_end_unicode` because that guarantees + // that the position matched falls on a valid UTF-8 boundary given + // that the left side must be in \w. + let word_after = at < haystack.len() + && match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::fwd(haystack, at)?, + }; + Ok(!word_after) + } } impl Default for LookMatcher { @@ -1024,7 +1351,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError { // There are perhaps other choices as well. Why did I stop at these 4? Because // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA // approach eventually, as the benefits of the DFA approach are somewhat -// compelling. The 'boundary-words-holmes' benchmark tests this: +// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that +// the commands below no longer work. If necessary, we should re-capitulate +// the benchmark from whole cloth in rebar.) // // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv // @@ -1322,8 +1651,7 @@ mod is_word_char { fn is_word_character(c: char) -> bool { use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; - // MSRV(1.59): Use 'u8::try_from(c)' instead. - if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) { + if u8::try_from(c).map_or(false, utf8::is_word_byte) { return true; } PERL_WORD @@ -1656,50 +1984,478 @@ mod tests { } #[test] - fn look_set() { - let mut f = LookSet::default(); - assert!(!f.contains(Look::Start)); - assert!(!f.contains(Look::End)); - assert!(!f.contains(Look::StartLF)); - assert!(!f.contains(Look::EndLF)); - assert!(!f.contains(Look::WordUnicode)); - assert!(!f.contains(Look::WordUnicodeNegate)); - assert!(!f.contains(Look::WordAscii)); - assert!(!f.contains(Look::WordAsciiNegate)); + fn look_matches_word_start_ascii() { + let look = Look::WordStartAscii; - f = f.insert(Look::Start); - assert!(f.contains(Look::Start)); - f = f.remove(Look::Start); - assert!(!f.contains(Look::Start)); + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) - f = f.insert(Look::End); - assert!(f.contains(Look::End)); - f = f.remove(Look::End); - assert!(!f.contains(Look::End)); + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); - f = f.insert(Look::StartLF); - assert!(f.contains(Look::StartLF)); - f = f.remove(Look::StartLF); - assert!(!f.contains(Look::StartLF)); + // Unicode word boundaries with a non-ASCII codepoint. 
Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); - f = f.insert(Look::EndLF); - assert!(f.contains(Look::EndLF)); - f = f.remove(Look::EndLF); - assert!(!f.contains(Look::EndLF)); + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); - f = f.insert(Look::StartCRLF); - assert!(f.contains(Look::StartCRLF)); - f = f.remove(Look::StartCRLF); - assert!(!f.contains(Look::StartCRLF)); + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); - f = f.insert(Look::EndCRLF); - assert!(f.contains(Look::EndCRLF)); - f = f.remove(Look::EndCRLF); - assert!(!f.contains(Look::EndCRLF)); + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); - f = f.insert(Look::WordUnicode); - assert!(f.contains(Look::WordUnicode)); - f = f.remove(Look::WordUnicode); + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. 
+ assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. 
+ assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_start_half_ascii() { + let look = Look::WordStartHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_half_ascii() { + let look = Look::WordEndHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); @@ -1716,6 +2472,46 @@ mod tests { assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + assert!(f.contains(Look::WordStartHalfUnicode)); + f = f.remove(Look::WordStartHalfUnicode); + assert!(!f.contains(Look::WordStartHalfUnicode)); + + f = f.insert(Look::WordEndHalfUnicode); + assert!(f.contains(Look::WordEndHalfUnicode)); + f = 
f.remove(Look::WordEndHalfUnicode); + assert!(!f.contains(Look::WordEndHalfUnicode)); } #[test] @@ -1724,7 +2520,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -1735,6 +2531,9 @@ mod tests { let set = LookSet::empty().insert(Look::WordAsciiNegate); assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordEndHalfUnicode); + assert_eq!(1, set.iter().count()); } #[test] @@ -1743,6 +2542,6 @@ mod tests { let res = alloc::format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = alloc::format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index bb739df1df..b3eef64e64 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -40,6 +40,7 @@ pub mod look; pub mod pool; pub mod prefilter; pub mod primitives; +pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; @@ -52,6 +53,5 @@ pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; -pub(crate) mod start; pub(crate) mod unicode_data; pub(crate) mod utf8; diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index 7f4a1c21e2..d90d4ecffa 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -177,6 +177,7 @@ impl T> Pool { /// the value to go back into the pool) and then calling get again is /// *not* guaranteed to return the same value received in the first `get` /// call. + #[inline] pub fn get(&self) -> PoolGuard<'_, T, F> { PoolGuard(self.0.get()) } @@ -200,6 +201,7 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// This circumvents the guard's `Drop` implementation. This can be useful /// in circumstances where the automatic `Drop` results in poorer codegen, /// such as calling non-inlined functions. + #[inline] pub fn put(this: PoolGuard<'_, T, F>) { inner::PoolGuard::put(this.0); } @@ -208,12 +210,14 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { type Target = T; + #[inline] fn deref(&self) -> &T { self.0.value() } } impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + #[inline] fn deref_mut(&mut self) -> &mut T { self.0.value_mut() } @@ -268,6 +272,64 @@ mod inner { /// do. static THREAD_ID_DROPPED: usize = 2; + /// The number of stacks we use inside of the pool. These are only used for + /// non-owners. That is, these represent the "slow" path. + /// + /// In the original implementation of this pool, we only used a single + /// stack. While this might be okay for a couple threads, the prevalence of + /// 32, 64 and even 128 core CPUs has made it untenable. The contention + /// such an environment introduces when threads are doing a lot of searches + /// on short haystacks (a not uncommon use case) is palpable and leads to + /// huge slowdowns. + /// + /// This constant reflects a change from using one stack to the number of + /// stacks that this constant is set to. The stack for a particular thread + /// is simply chosen by `thread_id % MAX_POOL_STACKS`. 
The idea behind + /// this setup is that there should be a good chance that accesses to the + /// pool will be distributed over several stacks instead of all of them + /// converging to one. + /// + /// This is not a particularly smart or dynamic strategy. Fixing this to a + /// specific number has at least two downsides. First is that it will help, + /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially, + /// it will still help the 128 core case.) Second is that this may wind + /// up being a little wasteful with respect to memory usage. Namely, if a + /// regex is used on one thread and then moved to another thread, then it + /// could result in creating a new copy of the data in the pool even though + /// only one is actually needed. + /// + /// And that memory usage bit is why this is set to 8 and not, say, 64. + /// Keeping it at 8 limits, to an extent, how much unnecessary memory can + /// be allocated. + /// + /// In an ideal world, we'd be able to have something like this: + /// + /// * Grow the number of stacks as the number of concurrent callers + /// increases. I spent a little time trying this, but even just adding an + /// atomic addition/subtraction for each pop/push for tracking concurrent + /// callers led to a big perf hit. Since even more work would seemingly be + /// required than just an addition/subtraction, I abandoned this approach. + /// * The maximum amount of memory used should scale with respect to the + /// number of concurrent callers and *not* the total number of existing + /// threads. This is primarily why the `thread_local` crate isn't used, as + /// as some environments spin up a lot of threads. This led to multiple + /// reports of extremely high memory usage (often described as memory + /// leaks). + /// * Even more ideally, the pool should contract in size. That is, it + /// should grow with bursts and then shrink. But this is a pretty thorny + /// issue to tackle and it might be better to just not. + /// * It would be nice to explore the use of, say, a lock-free stack + /// instead of using a mutex to guard a `Vec` that is ultimately just + /// treated as a stack. The main thing preventing me from exploring this + /// is the ABA problem. The `crossbeam` crate has tools for dealing with + /// this sort of problem (via its epoch based memory reclamation strategy), + /// but I can't justify bringing in all of `crossbeam` as a dependency of + /// `regex` for this. + /// + /// See this issue for more context and discussion: + /// https://github.com/rust-lang/regex/issues/934 + const MAX_POOL_STACKS: usize = 8; + thread_local!( /// A thread local used to assign an ID to a thread. static THREAD_ID: usize = { @@ -291,6 +353,17 @@ mod inner { }; ); + /// This puts each stack in the pool below into its own cache line. This is + /// an absolutely critical optimization that tends to have the most impact + /// in high contention workloads. Without forcing each mutex protected + /// into its own cache line, high contention exacerbates the performance + /// problem by causing "false sharing." By putting each mutex in its own + /// cache-line, we avoid the false sharing problem and the affects of + /// contention are greatly reduced. + #[derive(Debug)] + #[repr(C, align(64))] + struct CacheLine(T); + /// A thread safe pool utilizing std-only features. 
/// /// The main difference between this and the simplistic alloc-only pool is @@ -299,12 +372,16 @@ mod inner { /// This makes the common case of running a regex within a single thread /// faster by avoiding mutex unlocking. pub(super) struct Pool { - /// A stack of T values to hand out. These are used when a Pool is - /// accessed by a thread that didn't create it. - stack: Mutex>>, /// A function to create more T values when stack is empty and a caller /// has requested a T. create: F, + /// Multiple stacks of T values to hand out. These are used when a Pool + /// is accessed by a thread that didn't create it. + /// + /// Conceptually this is `Mutex>>`, but sharded out to make + /// it scale better under high contention work-loads. We index into + /// this sequence via `thread_id % stacks.len()`. + stacks: Vec>>>>, /// The ID of the thread that owns this pool. The owner is the thread /// that makes the first call to 'get'. When the owner calls 'get', it /// gets 'owner_val' directly instead of returning a T from 'stack'. @@ -354,9 +431,17 @@ mod inner { unsafe impl Sync for Pool {} // If T is UnwindSafe, then since we provide exclusive access to any - // particular value in the pool, it should therefore also be considered - // RefUnwindSafe. Also, since we use std::sync::Mutex, we get poisoning - // from it if another thread panics while the lock is held. + // particular value in the pool, the pool should therefore also be + // considered UnwindSafe. + // + // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any + // point on demand, so it needs to be unwind safe on both dimensions for + // the entire Pool to be unwind safe. + impl UnwindSafe for Pool {} + + // If T is UnwindSafe, then since we provide exclusive access to any + // particular value in the pool, the pool should therefore also be + // considered RefUnwindSafe. // // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any // point on demand, so it needs to be unwind safe on both dimensions for @@ -370,20 +455,58 @@ mod inner { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub(super) fn new(create: F) -> Pool { - // MSRV(1.63): Mark this function as 'const'. I've arranged the - // code such that it should "just work." Then mark the public - // 'Pool::new' method as 'const' too. (The alloc-only Pool::new - // is already 'const', so that should "just work" too.) The only - // thing we're waiting for is Mutex::new to be const. + // FIXME: Now that we require 1.65+, Mutex::new is available as + // const... So we can almost mark this function as const. But of + // course, we're creating a Vec of stacks below (we didn't when I + // originally wrote this code). It seems like the best way to work + // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead + // of a `Vec`. I refrained from making this change at time + // of writing (2023/10/08) because I was making a lot of other + // changes at the same time and wanted to do this more carefully. + // Namely, because of the cache line optimization, that `[Stack; + // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if + // at all) that would be. + // + // Another choice would be to lazily allocate the stacks, but... + // I'm not so sure about that. Seems like a fair bit of complexity? + // + // Maybe there's a simple solution I'm missing. + // + // ... OK, I tried to fix this. First, I did it by putting `stacks` + // in an `UnsafeCell` and using a `Once` to lazily initialize it. 
+ // I benchmarked it and everything looked okay. I then made this + // function `const` and thought I was just about done. But the + // public pool type wraps its inner pool in a `Box` to keep its + // size down. Blech. + // + // So then I thought that I could push the box down into this + // type (and leave the non-std version unboxed) and use the same + // `UnsafeCell` technique to lazily initialize it. This has the + // downside of the `Once` now needing to get hit in the owner fast + // path, but maybe that's OK? However, I then realized that we can + // only lazily initialize `stacks`, `owner` and `owner_val`. The + // `create` function needs to be put somewhere outside of the box. + // So now the pool is a `Box`, `Once` and a function. Now we're + // starting to defeat the point of boxing in the first place. So I + // backed out that change too. + // + // Back to square one. Maybe we just don't make a pool's + // constructor const and live with it. It's probably not a huge + // deal. + let mut stacks = Vec::with_capacity(MAX_POOL_STACKS); + for _ in 0..stacks.capacity() { + stacks.push(CacheLine(Mutex::new(vec![]))); + } let owner = AtomicUsize::new(THREAD_ID_UNOWNED); let owner_val = UnsafeCell::new(None); // init'd on first access - Pool { stack: Mutex::new(vec![]), create, owner, owner_val } + Pool { create, stacks, owner, owner_val } } } impl<T: Send, F: Fn() -> T> Pool<T, F> { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. + #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { // Our fast path checks if the caller is the thread that "owns" // this pool. Or stated differently, whether it is the first thread @@ -401,6 +524,9 @@ mod inner { let caller = THREAD_ID.with(|id| *id); let owner = self.owner.load(Ordering::Acquire); if caller == owner { + // N.B. We could also do a CAS here instead of a load/store, + // but ad hoc benchmarking suggests it is slower. And a lot + // slower in the case where `get_slow` is common. self.owner.store(THREAD_ID_INUSE, Ordering::Release); return self.guard_owned(caller); } @@ -444,37 +570,86 @@ mod inner { return self.guard_owned(caller); } } - let mut stack = self.stack.lock().unwrap(); - let value = match stack.pop() { - None => Box::new((self.create)()), - Some(value) => value, - }; - self.guard_stack(value) + let stack_id = caller % self.stacks.len(); + // We try to acquire exclusive access to this thread's stack, and + // if so, grab a value from it if we can. We put this in a loop so + // that it's easy to tweak and experiment with a different number + // of tries. In the end, I couldn't see anything obviously better + // than one attempt in ad hoc testing. + for _ in 0..1 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + if let Some(value) = stack.pop() { + return self.guard_stack(value); + } + // Unlock the mutex guarding the stack before creating a fresh + // value since we no longer need the stack. + drop(stack); + let value = Box::new((self.create)()); + return self.guard_stack(value); + } + // We're only here if we couldn't get access to our stack, so just + // create a new value. This seems like it could be wasteful, but + // waiting for exclusive access to a stack when there's high + // contention is brutal for perf. + self.guard_stack_transient(Box::new((self.create)())) } /// Puts a value back into the pool. Callers don't need to call this.
/// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. + #[inline] fn put_value(&self, value: Box<T>) { - let mut stack = self.stack.lock().unwrap(); - stack.push(value); + let caller = THREAD_ID.with(|id| *id); + let stack_id = caller % self.stacks.len(); + // As with trying to pop a value from this thread's stack, we + // merely attempt to get access to push this value back on the + // stack. If there's too much contention, we just give up and throw + // the value away. + // + // Interestingly, in ad hoc benchmarking, it is beneficial to + // attempt to push the value back more than once, unlike when + // popping the value. I don't have a good theory for why this is. + // I guess if we drop too many values then that winds up forcing + // the pop operation to create new fresh values and thus leads to + // less reuse. There's definitely a balancing act here. + for _ in 0..10 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + stack.push(value); + return; + } } /// Create a guard that represents the special owned T. + #[inline] fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { - PoolGuard { pool: self, value: Err(caller) } + PoolGuard { pool: self, value: Err(caller), discard: false } } /// Create a guard that contains a value from the pool's stack. + #[inline] fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> { - PoolGuard { pool: self, value: Ok(value) } + PoolGuard { pool: self, value: Ok(value), discard: false } + } + + /// Create a guard that contains a value from the pool's stack with an + /// instruction to throw away the value instead of putting it back + /// into the pool. + #[inline] + fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: true } } } impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Pool") - .field("stack", &self.stack) + .field("stacks", &self.stacks) .field("owner", &self.owner) .field("owner_val", &self.owner_val) .finish() @@ -490,10 +665,17 @@ /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the /// guard has been put back into the pool and should no longer be used. value: Result<Box<T>, usize>, + /// When true, the value should be discarded instead of being pushed + /// back into the pool. We tend to use this under high contention, and + /// this allows us to avoid inflating the size of the pool. (Because + /// under contention, we tend to create more values instead of waiting + /// for access to a stack of existing values.) + discard: bool, } impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. + #[inline] pub(super) fn value(&self) -> &T { match self.value { Ok(ref v) => &**v, @@ -518,6 +700,7 @@ } /// Return the underlying value as a mutable borrow. + #[inline] pub(super) fn value_mut(&mut self) -> &mut T { match self.value { Ok(ref mut v) => &mut **v, @@ -542,6 +725,7 @@ } /// Consumes this guard and puts it back into the pool.
+ #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop @@ -557,7 +741,17 @@ #[inline(always)] fn put_imp(&mut self) { match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) { - Ok(value) => self.pool.put_value(value), + Ok(value) => { + // If we were told to discard this value then don't bother + // trying to put it back into the pool. This occurs when + // the pop operation failed to acquire a lock and we + // decided to create a new value in lieu of contending for + // the lock. + if self.discard { + return; + } + self.pool.put_value(value); + } // If this guard has a value "owned" by the thread, then // the Pool guarantees that this is the ONLY such guard. // Therefore, in order to place it back into the pool and make @@ -580,6 +774,7 @@ } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + #[inline] fn drop(&mut self) { self.put_imp(); } @@ -657,6 +852,7 @@ impl<T: Send, F: Fn() -> T> Pool<T, F> { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. + #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { let mut stack = self.stack.lock(); let value = match stack.pop() { @@ -666,6 +862,7 @@ PoolGuard { pool: self, value: Some(value) } } + #[inline] fn put(&self, guard: PoolGuard<'_, T, F>) { let mut guard = core::mem::ManuallyDrop::new(guard); if let Some(value) = guard.value.take() { @@ -676,6 +873,7 @@ /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. + #[inline] fn put_value(&self, value: Box<T>) { let mut stack = self.stack.lock(); stack.push(value); @@ -698,16 +896,19 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. + #[inline] pub(super) fn value(&self) -> &T { self.value.as_deref().unwrap() } /// Return the underlying value as a mutable borrow. + #[inline] pub(super) fn value_mut(&mut self) -> &mut T { self.value.as_deref_mut().unwrap() } /// Consumes this guard and puts it back into the pool. + #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop @@ -729,6 +930,7 @@ } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + #[inline] fn drop(&mut self) { self.put_imp(); } @@ -782,6 +984,7 @@ /// Lock this mutex and return a guard providing exclusive access to /// `T`. This blocks if some other thread has already locked this /// mutex. + #[inline] fn lock(&self) -> MutexGuard<'_, T> { while self .locked @@ -814,18 +1017,21 @@ impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { type Target = T; + #[inline] fn deref(&self) -> &T { self.data } } impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { + #[inline] fn deref_mut(&mut self) -> &mut T { self.data } } impl<'a, T> Drop for MutexGuard<'a, T> { + #[inline] fn drop(&mut self) { // Drop means 'data' is no longer accessible, so we can unlock // the mutex.
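
As an aside, the sharding technique in the pool changes above distills to a small, self-contained sketch. The `ShardedStack` type and `main` harness below are illustrative only (they are not part of the regex source); the cache-line padding and the `try_lock` fallbacks mirror what the patch does.

```rust
use std::sync::Mutex;

// A fixed shard count, analogous to MAX_POOL_STACKS above.
const SHARDS: usize = 8;

// Padding each mutex out to a cache line avoids false sharing between
// shards under high contention.
#[repr(C, align(64))]
struct CacheLine<T>(T);

struct ShardedStack<T> {
    stacks: Vec<CacheLine<Mutex<Vec<T>>>>,
}

impl<T> ShardedStack<T> {
    fn new() -> ShardedStack<T> {
        let mut stacks = Vec::with_capacity(SHARDS);
        for _ in 0..SHARDS {
            stacks.push(CacheLine(Mutex::new(vec![])));
        }
        ShardedStack { stacks }
    }

    /// Pop a value from this thread's shard. Under contention, give up
    /// immediately instead of blocking; the caller creates a fresh value.
    fn pop(&self, thread_id: usize) -> Option<T> {
        let shard = thread_id % self.stacks.len();
        self.stacks[shard].0.try_lock().ok()?.pop()
    }

    /// Push a value back onto this thread's shard. If the shard is
    /// contended, the value is simply dropped, which bounds pool growth.
    fn push(&self, thread_id: usize, value: T) {
        let shard = thread_id % self.stacks.len();
        if let Ok(mut stack) = self.stacks[shard].0.try_lock() {
            stack.push(value);
        }
    }
}

fn main() {
    let pool = ShardedStack::new();
    pool.push(0, String::from("scratch space"));
    assert_eq!(pool.pop(0), Some(String::from("scratch space")));
}
```

The key design choice, as the comments in the patch explain, is to never wait on a lock: a failed `try_lock` on pop falls back to fresh allocation, and a failed `try_lock` on push throws the value away.
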
diff --git a/regex-automata/src/util/prefilter/aho_corasick.rs b/regex-automata/src/util/prefilter/aho_corasick.rs index a7474d29ab..50cce827ee 100644 --- a/regex-automata/src/util/prefilter/aho_corasick.rs +++ b/regex-automata/src/util/prefilter/aho_corasick.rs @@ -22,11 +22,20 @@ impl AhoCorasick { } #[cfg(feature = "perf-literal-multisubstring")] { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc", it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. let ac_match_kind = match kind { - MatchKind::LeftmostFirst => { + MatchKind::LeftmostFirst | MatchKind::All => { aho_corasick::MatchKind::LeftmostFirst } - MatchKind::All => aho_corasick::MatchKind::Standard, }; // This is kind of just an arbitrary number, but basically, if we // have a small enough set of literals, then we try to use the VERY diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index ea3eb73d8c..51fc922337 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -195,15 +195,6 @@ impl Prefilter { /// Some(Span::from(6..9)), /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), /// ); - /// // Now we put 'samwise' back before 'sam', but change the match - /// // semantics to 'All'. In this case, there is no preference - /// // order semantics and the first match detected is returned. - /// let pre = Prefilter::new(MatchKind::All, &["samwise", "sam"]) - /// .expect("a prefilter"); - /// assert_eq!( - /// Some(Span::from(6..9)), - /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), - /// ); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` diff --git a/regex-automata/src/util/prefilter/teddy.rs b/regex-automata/src/util/prefilter/teddy.rs index 02210a5eca..fc79f2b2f3 100644 --- a/regex-automata/src/util/prefilter/teddy.rs +++ b/regex-automata/src/util/prefilter/teddy.rs @@ -50,12 +50,17 @@ impl Teddy { // theory we could at least support leftmost-longest, as the // aho-corasick crate does, but regex-automata doesn't know about // leftmost-longest currently. + // + // And like the aho-corasick prefilter, if we're using `All` + // semantics, then we can still use leftmost semantics for a + // prefilter. (This might be a suspicious choice for the literal + // engine, which uses a prefilter as a regex engine directly, but + // that only happens when using leftmost-first semantics.)
let (packed_match_kind, ac_match_kind) = match kind { - MatchKind::LeftmostFirst => ( + MatchKind::LeftmostFirst | MatchKind::All => ( aho_corasick::packed::MatchKind::LeftmostFirst, aho_corasick::MatchKind::LeftmostFirst, ), - _ => return None, }; let minimum_len = needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index b7bf934ea9..39aec522be 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -246,7 +246,7 @@ impl<'h> Input<'h> { /// When a search is anchored (so that's [`Anchored::Yes`] or /// [`Anchored::Pattern`]), a match must begin at the start of a search. /// When a search is not anchored (that's [`Anchored::No`]), regex engines - /// will behave as if the pattern started with a `(?:s-u.)*?`. This prefix + /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix /// permits a match to appear anywhere. /// /// By default, the anchored mode is [`Anchored::No`]. diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083a..27153780ec 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -1,17 +1,195 @@ /*! -Provides some helpers for dealing with start state configurations in DFAs. - -[`Start`] represents the possible starting configurations, while -[`StartByteMap`] represents a way to retrieve the `Start` configuration for a -given position in a haystack. +Provides helpers for dealing with start state configurations in DFAs. */ use crate::util::{ look::LookMatcher, - search::Input, + search::{Anchored, Input}, wire::{self, DeserializeError, SerializeError}, }; +/// The configuration used to determine a DFA's start state for a search. +/// +/// A DFA has a single starting state in the typical textbook description. That +/// is, it corresponds to the set of all starting states for the NFA that built +/// it, along with their epsilon closures. In this crate, however, DFAs have +/// many possible start states due to a few factors: +/// +/// * DFAs support the ability to run either anchored or unanchored searches. +/// Each type of search needs its own start state. For example, an unanchored +/// search requires starting at a state corresponding to a regex with a +/// `(?s-u:.)*?` prefix, which will match through anything. +/// * DFAs also optionally support starting an anchored search for any one +/// specific pattern. Each such pattern requires its own start state. +/// * If a look-behind assertion like `^` or `\b` is used in the regex, then +/// the DFA will need to inspect a single byte immediately before the start of +/// the search to choose the correct start state. +/// +/// Indeed, this configuration precisely encapsulates all of the above factors. +/// The [`Config::anchored`] method sets which kind of anchored search to +/// perform while the [`Config::look_behind`] method provides a way to set +/// the byte that occurs immediately before the start of the search. +/// +/// Generally speaking, this type is only useful when you want to run searches +/// without using an [`Input`]. In particular, an `Input` wants a haystack +/// slice, but callers may not have a contiguous sequence of bytes as a +/// haystack in all cases. This type provides a lower level of control such +/// that callers can provide their own anchored configuration and look-behind +/// byte explicitly.
+/// +/// # Example +/// +/// This shows basic usage that permits running a search with a DFA without +/// using the `Input` abstraction. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter() { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This example shows how to correctly run a search that doesn't begin at +/// the start of a haystack. Notice how we set the look-behind byte, and as +/// a result, the `\b` assertion does not match. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new() +/// .anchored(Anchored::Yes) +/// .look_behind(Some(b'q')); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // No match! +/// assert!(!dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// If we had instead not set a look-behind byte, then the DFA would assume +/// that it was starting at the beginning of the haystack, and thus `\b` should +/// match. This in turn would result in erroneously reporting a match: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// // Whoops, forgot the look-behind byte... +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // And now we get a match unexpectedly. +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + look_behind: Option<u8>, + anchored: Anchored, +} + +impl Config { + /// Create a new default start configuration. + /// + /// The default is an unanchored search that starts at the beginning of the + /// haystack. + pub fn new() -> Config { + Config { anchored: Anchored::No, look_behind: None } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a forward search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// preceding the start of the search. If the start of the search is at + /// offset `0`, then no look-behind byte is set. + pub fn from_input_forward(input: &Input<'_>) -> Config { + let look_behind = input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i).copied()); + Config { look_behind, anchored: input.get_anchored() } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a reverse search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// following the end of the search. If the end of the search is at + /// offset `haystack.len()`, then no look-behind byte is set.
+ pub fn from_input_reverse(input: &Input<'_>) -> Config { + let look_behind = input.haystack().get(input.end()).copied(); + Config { look_behind, anchored: input.get_anchored() } + } + + /// Set the look-behind byte at the start of a search. + /// + /// Unless the search is intended to logically start at the beginning of a + /// haystack, this should _always_ be set to the byte immediately preceding + /// the start of the search. If no look-behind byte is set, then the start + /// configuration will assume it is at the beginning of the haystack. For + /// example, the anchor `^` will match. + /// + /// The default is that no look-behind byte is set. + pub fn look_behind(mut self, byte: Option<u8>) -> Config { + self.look_behind = byte; + self + } + + /// Set the anchored mode of a search. + /// + /// The default is an unanchored search. + pub fn anchored(mut self, mode: Anchored) -> Config { + self.anchored = mode; + self + } + + /// Return the look-behind byte in this configuration, if one exists. + pub fn get_look_behind(&self) -> Option<u8> { + self.look_behind + } + + /// Return the anchored mode in this configuration. + pub fn get_anchored(&self) -> Anchored { + self.anchored + } +} + /// A map from every possible byte value to its corresponding starting /// configuration. /// @@ -71,30 +249,11 @@ impl StartByteMap { StartByteMap { map } } - /// Return the forward starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`.
#[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index f3445e02a4..8ed6dd0077 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -9,7 +9,6 @@ use { util::{prefilter::Prefilter, syntax}, Anchored, Input, PatternSet, }, - regex_syntax::hir, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, @@ -285,10 +284,7 @@ fn compiler( // That is, Unicode word boundaries when searching non-ASCII text. if !test.haystack().is_ascii() { for hir in hirs.iter() { - let looks = hir.properties().look_set(); - if looks.contains(hir::Look::WordUnicode) - || looks.contains(hir::Look::WordUnicodeNegate) - { + if hir.properties().look_set().contains_word_unicode() { return Ok(CompiledRegex::skip()); } } diff --git a/regex-automata/tests/hybrid/api.rs b/regex-automata/tests/hybrid/api.rs index e82d808e34..4b04c4f8fd 100644 --- a/regex-automata/tests/hybrid/api.rs +++ b/regex-automata/tests/hybrid/api.rs @@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); - let err = MatchError::gave_up(25); + let err = MatchError::gave_up(24); // Notice that we make the same amount of progress in each search! That's // because the cache is reused and already has states to handle the first // N bytes. @@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. 
cache.reset(&dfa); - let err = MatchError::gave_up(27); + let err = MatchError::gave_up(26); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs index 1465e51eb7..67c979aa8d 100644 --- a/regex-automata/tests/lib.rs +++ b/regex-automata/tests/lib.rs @@ -61,6 +61,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index f9dec00242..a107c09df2 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.0" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular @@ -12,6 +12,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] autotests = false edition = "2021" +rust-version = "1.65" [[bin]] name = "regex-cli" @@ -28,8 +29,8 @@ lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } memmap2 = "0.5.10" regex = { version = "1.9.0", path = ".." } -regex-automata = { version = "0.3.0", path = "../regex-automata", features = ["logging"] } +regex-automata = { version = "0.4.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } -regex-syntax = { version = "0.7.3", path = "../regex-syntax" } +regex-syntax = { version = "0.8.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } textwrap = { version = "0.16.0", default-features = false } diff --git a/regex-cli/README.md b/regex-cli/README.md index 36dc50e772..376d89091a 100644 --- a/regex-cli/README.md +++ b/regex-cli/README.md @@ -7,11 +7,10 @@ various regex development tasks such as generating tests. ### Installation -Currently `regex-cli` is not on crates.io and should be installed from this -git repository: +Simply use `cargo` to install from crates.io. ``` -$ cargo install --git https://github.com/rust-lang/regex regex-cli +$ cargo install regex-cli ``` diff --git a/regex-cli/args/flags.rs b/regex-cli/args/flags.rs index db8a847ef8..61732a28e7 100644 --- a/regex-cli/args/flags.rs +++ b/regex-cli/args/flags.rs @@ -152,3 +152,55 @@ impl std::str::FromStr for MatchKind { Ok(MatchKind { kind }) } } + +/// Provides an implementation of the --captures flag, for use with Thompson +/// NFA configuration. +#[derive(Debug)] +pub struct WhichCaptures { + pub which: regex_automata::nfa::thompson::WhichCaptures, +} + +impl WhichCaptures { + pub const USAGE: Usage = Usage::new( + "--captures <kind>", + "One of: all, implicit or none.", + r#" +Selects which capture states should be included in the Thompson NFA. The +choices are 'all' (the default), 'implicit' or 'none'. + +'all' means that both explicit and implicit capture states are included. + +'implicit' means that only implicit capture states are included. That is, the +Thompson NFA will only be able to report the overall match offsets and not the +match offsets of each explicit capture group. + +'none' means that no capture states will be included. This is useful when +capture states aren't needed (like when building a DFA) or if they aren't +supported (like when building a reverse NFA).
+"#, + ); +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures { + which: regex_automata::nfa::thompson::WhichCaptures::All, + } + } +} + +impl std::str::FromStr for WhichCaptures { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let which = match s { + "all" => regex_automata::nfa::thompson::WhichCaptures::All, + "implicit" => { + regex_automata::nfa::thompson::WhichCaptures::Implicit + } + "none" => regex_automata::nfa::thompson::WhichCaptures::None, + unk => anyhow::bail!("unrecognized captures option '{}'", unk), + }; + Ok(WhichCaptures { which }) + } +} diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 6e7b4afd80..bd8388d117 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -28,7 +28,11 @@ impl Config { pub fn reversed(&self) -> Config { // Reverse DFAs require that captures are disabled. In practice, there // is no current use case for a reverse NFA with capture groups. - let thompson = self.thompson.clone().reverse(true).captures(false); + let thompson = self + .thompson + .clone() + .reverse(true) + .which_captures(thompson::WhichCaptures::None); Config { thompson } } @@ -66,8 +70,11 @@ impl Configurable for Config { Arg::Long("shrink") => { self.thompson = self.thompson.clone().shrink(true); } - Arg::Long("no-captures") => { - self.thompson = self.thompson.clone().captures(false); + Arg::Long("captures") => { + let which: flags::WhichCaptures = + args::parse(p, "--captures")?; + self.thompson = + self.thompson.clone().which_captures(which.which); } Arg::Long("line-terminator") => { let byte: flags::OneByte = @@ -129,19 +136,7 @@ spent shrinking the NFA can lead to far larger savings in the subsequent DFA determinization. "#, ), - Usage::new( - "--no-captures", - "Disable capture states.", - r#" -Disables capture states. By default, NFAs include special "capture" states that -instruct some regex engines (like the PikeVM) to record offset positions in -ancillary state. - -It can be useful to disable capture states in order to reduce "clutter" in the -automaton when debugging it. Also, at time of writing, reverse NFAs require -that capture groups are disabled. -"#, - ), + flags::WhichCaptures::USAGE, Usage::new( "--line-terminator", "Set the line terminator used by line anchors.", diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index c0ab1b361c..c287f6f527 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -404,7 +404,9 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(_) => 0, + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + | Ast::ClassBracketed(_) => 0, Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast), Ast::Group(ref group) => { let this = if group.is_capturing() { 1 } else { 0 }; diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 1dc144b316..0ba53485b9 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-lite" -version = "0.1.0" #:version +version = "0.1.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" @@ -10,11 +10,11 @@ A lightweight regex engine that optimizes for binary size and compilation time. """ workspace = ".." 
edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" autotests = false # Features are documented in the "Crate features" section of the crate docs: -# https://docs.rs/regex-syntax/*/#crate-features +# https://docs.rs/regex-lite/*/#crate-features # # (Currently there are no supported features. 'std' is technically one, but it # is currently required.) diff --git a/regex-lite/README.md b/regex-lite/README.md index 34c749b216..758fac6aea 100644 --- a/regex-lite/README.md +++ b/regex-lite/README.md @@ -78,7 +78,7 @@ year: 2014, month: 10, day: 14 ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in semver compatible updates. diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index f73a5420ab..6e5348a5bc 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -366,6 +366,24 @@ impl Hir { } } +impl HirKind { + /// Returns a slice of this kind's sub-expressions, if any. + fn subs(&self) -> &[Hir] { + use core::slice::from_ref; + + match *self { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct Class { pub(crate) ranges: Vec, @@ -592,6 +610,24 @@ pub(crate) enum Look { Word = 1 << 6, /// Match an ASCII-only negation of a word boundary. WordNegate = 1 << 7, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStart = 1 << 8, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEnd = 1 << 9, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalf = 1 << 10, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. 
+ WordEndHalf = 1 << 11, } impl Look { @@ -631,6 +667,30 @@ impl Look { at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before == word_after } + WordStart => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + WordEnd => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + WordStartHalf => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + WordEndHalf => { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } } } } @@ -705,3 +765,45 @@ fn prev_char(ch: char) -> Option<char> { // and U+E000 yields a valid scalar value. Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) } + +impl Drop for Hir { + fn drop(&mut self) { + use core::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index cc3c21fe63..ca93b88387 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = "character class difference is not supported"; const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = "character class symmetric difference is not supported"; +const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = + "special word boundary assertion is unclosed or has an invalid character"; +const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = + "special word boundary assertion is unrecognized"; +const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = + "found start of special word boundary or repetition without an end"; /// A regular expression parser. /// @@ -371,6 +377,24 @@ impl<'a> Parser<'a> { /// own routine. impl<'a> Parser<'a> { pub(super) fn parse(&self) -> Result<Hir, Error> { + let hir = self.parse_inner()?; + // While we also check nesting during parsing, that only checks the + // number of recursive parse calls. It does not necessarily cover + // all possible recursive nestings of the Hir itself. For example, + // repetition operators don't require recursive parse calls. So one + // can stack them arbitrarily without overflowing the stack in the + // *parser*. But then if one recurses over the resulting Hir, a stack + // overflow is possible. So here we check the Hir nesting level + // thoroughly to ensure it isn't nested too deeply.
+ // + // Note that we do still need the nesting limit check in the parser as + // well, since that will avoid overflowing the stack during parse time + // before the complete Hir value is constructed. + check_hir_nesting(&hir, self.config.nest_limit)?; + Ok(hir) + } + + fn parse_inner(&self) -> Result<Hir, Error> { let depth = self.increment_depth()?; let mut alternates = vec![]; let mut concat = vec![]; @@ -479,12 +503,86 @@ impl<'a> Parser<'a> { 'v' => special('\x0B'), 'A' => Ok(Hir::look(hir::Look::Start)), 'z' => Ok(Hir::look(hir::Look::End)), - 'b' => Ok(Hir::look(hir::Look::Word)), + 'b' => { + let mut hir = Hir::look(hir::Look::Word); + if !self.is_done() && self.char() == '{' { + if let Some(special) = + self.maybe_parse_special_word_boundary()? + { + hir = special; + } + } + Ok(hir) + } 'B' => Ok(Hir::look(hir::Look::WordNegate)), + '<' => Ok(Hir::look(hir::Look::WordStart)), + '>' => Ok(Hir::look(hir::Look::WordEnd)), _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. + fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF)); + } + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.pos.set(start); + self.char.set(Some('{')); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = String::new(); + while !self.is_done() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_done() || self.char() != '}' { + return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED)); + } + self.bump(); + let kind = match scratch.as_str() { + "start" => hir::Look::WordStart, + "end" => hir::Look::WordEnd, + "start-half" => hir::Look::WordStartHalf, + "end-half" => hir::Look::WordEndHalf, + _ => { + return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED)) + } + }; + Ok(Some(Hir::look(kind))) + } + /// Parse a hex representation of a Unicode codepoint. This handles both /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to /// be positioned at the `x`, `u` or `U` prefix.
The parser is advanced to @@ -726,7 +824,7 @@ impl<'a> Parser<'a> { if self.bump_if("?P<") || self.bump_if("?<") { let index = self.next_capture_index()?; let name = Some(Box::from(self.parse_capture_name()?)); - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name, sub }; Ok(Some(Hir::capture(cap))) } else if self.bump_if("?") { @@ -746,11 +844,11 @@ impl<'a> Parser<'a> { } else { assert_eq!(':', self.char()); self.bump(); - self.parse().map(Some) + self.parse_inner().map(Some) } } else { let index = self.next_capture_index()?; - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name: None, sub }; Ok(Some(Hir::capture(cap))) } @@ -1183,6 +1281,38 @@ impl<'a> Parser<'a> { } } +/// This checks the depth of the given `Hir` value, and if it exceeds the given +/// limit, then an error is returned. +fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { + fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { + if depth > limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } + let Some(next_depth) = depth.checked_add(1) else { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + }; + match *hir.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => Ok(()), + HirKind::Repetition(hir::Repetition { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Capture(hir::Capture { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { + for sub in subs.iter() { + recurse(sub, limit, next_depth)?; + } + Ok(()) + } + } + } + recurse(hir, limit, 0) +} + /// Converts the given Hir to a literal char if the Hir is just a single /// character. Otherwise this returns an error. 
/// @@ -1198,8 +1328,10 @@ fn into_class_item_range(hir: Hir) -> Result<hir::ClassRange, Error> { } } -fn into_class_item_ranges(hir: Hir) -> Result<Vec<hir::ClassRange>, Error> { - match hir.kind { +fn into_class_item_ranges( + mut hir: Hir, +) -> Result<Vec<hir::ClassRange>, Error> { + match core::mem::replace(&mut hir.kind, HirKind::Empty) { HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), HirKind::Class(hir::Class { ranges }) => Ok(ranges), _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), @@ -1264,12 +1396,12 @@ mod tests { use super::*; fn p(pattern: &str) -> Hir { - Parser::new(Config::default(), pattern).parse().unwrap() + Parser::new(Config::default(), pattern).parse_inner().unwrap() } fn perr(pattern: &str) -> String { Parser::new(Config::default(), pattern) - .parse() + .parse_inner() .unwrap_err() .to_string() } @@ -1948,8 +2080,6 @@ bar assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); @@ -1983,6 +2113,11 @@ bar assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); } #[test] diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index d8e9016788..9b394a480b 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -107,7 +107,7 @@ fn main() { } ``` -Foruth, run it with `cargo run`: +Fourth, run it with `cargo run`: ```text $ cargo run @@ -466,12 +466,16 @@ x{n}? exactly n x ### Empty matches

-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
-\B    not an ASCII word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B              not an ASCII word boundary
+\b{start}, \<   an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \>     an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
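As an illustration of the four new assertions listed in the table above, here is a small sketch. It assumes a `regex-lite` release that includes this change (0.1.5 or newer, per this diff); the `main` harness is illustrative only, and the expectations follow the ASCII-only `Look` semantics implemented later in this patch.

```rust
use regex_lite::Regex;

fn main() {
    // \b{start} (or its alias \<) matches only at the start of a word,
    // so the "cat" inside "concat" is not a match.
    let re = Regex::new(r"\b{start}cat").unwrap();
    assert!(re.is_match("cat nap"));
    assert!(!re.is_match("concat"));

    // \b{end-half} requires only that the *following* character is not a
    // word character (or that the haystack ends there).
    let re = Regex::new(r"cat\b{end-half}").unwrap();
    assert!(re.is_match("cat!"));
    assert!(!re.is_match("catalog"));
}
```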
 
The empty regex is valid and matches the empty string. For example, the @@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
### Perl character classes (ASCII only) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 91b81d008a..4e4de90683 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -1186,8 +1186,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Panics /// @@ -2075,7 +2075,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs index 6eb37b50bb..5a721f142a 100644 --- a/regex-lite/tests/fuzz/mod.rs +++ b/regex-lite/tests/fuzz/mod.rs @@ -14,6 +14,23 @@ fn captures_wrong_order_min() { let _ = run(data); } +// Simpler regression test from a failure found by OSS-fuzz[1]. This test, +// when it failed, caused a stack overflow. We fixed it by adding another nest +// check on the Hir value itself, since the Hir type can have depth added to +// it without recursive calls in the parser (which is where the existing nest +// check was). +// +// Many thanks to Addison Crump for coming up with this test case[2]. +// +// [1]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 +// [2]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608#c1 +#[test] +fn many_zero_to_many_reps() { + let pat = format!(".{}", "*".repeat(1 << 15)); + let Ok(re) = regex_lite::Regex::new(&pat) else { return }; + re.is_match(""); +} + // This is the fuzz target function. We duplicate it here since this is the // thing we use to interpret the data. It is ultimately what we want to // succeed. diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs index 757b394411..89635f2d78 100644 --- a/regex-lite/tests/lib.rs +++ b/regex-lite/tests/lib.rs @@ -38,6 +38,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e6d7965be6..c9ce87da70 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.7.3" #:version +version = "0.8.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" @@ -8,7 +8,7 @@ documentation = "https://docs.rs/regex-syntax" description = "A regular expression parser." workspace = ".."
edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e4284fee8..6a77ee1343 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -162,6 +162,18 @@ pub enum ErrorKind { /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. RepetitionMissing, + /// The special word boundary syntax, `\b{something}`, was used, but + /// either EOF without `}` was seen, or an invalid character in the + /// braces was seen. + SpecialWordBoundaryUnclosed, + /// The special word boundary syntax, `\b{something}`, was used, but + /// `something` was not recognized as a valid word boundary kind. + SpecialWordBoundaryUnrecognized, + /// The syntax `\b{` was observed, but afterwards the end of the pattern + /// was observed without being able to tell whether it was meant to be a + /// bounded repetition on the `\b` or the beginning of a special word + /// boundary assertion. + SpecialWordOrRepetitionUnexpectedEof, /// The Unicode class is not valid. This typically occurs when a `\p` is /// followed by something other than a `{`. UnicodeClassInvalid, @@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } @@ -433,29 +468,94 @@ pub struct Comment { #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Ast { /// An empty regex that matches everything. - Empty(Span), + Empty(Box), /// A set of flags, e.g., `(?is)`. - Flags(SetFlags), + Flags(Box), /// A single character literal, which includes escape sequences. - Literal(Literal), + Literal(Box), /// The "any character" class. - Dot(Span), + Dot(Box), /// A single zero-width assertion. - Assertion(Assertion), - /// A single character class. This includes all forms of character classes - /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. - Class(Class), + Assertion(Box), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box), /// A repetition operator applied to an arbitrary regular expression. - Repetition(Repetition), + Repetition(Box), /// A grouped regular expression. - Group(Group), + Group(Box), /// An alternation of regular expressions. - Alternation(Alternation), + Alternation(Box), /// A concatenation of regular expressions. - Concat(Concat), + Concat(Box), } impl Ast { + /// Create an "empty" AST item. 
+ pub fn empty(span: Span) -> Ast { + Ast::Empty(Box::new(span)) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast::Flags(Box::new(e)) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast::Literal(Box::new(e)) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast::Dot(Box::new(span)) + } + + /// Create an "assertion" AST item. + pub fn assertion(e: Assertion) -> Ast { + Ast::Assertion(Box::new(e)) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast::ClassUnicode(Box::new(e)) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast::ClassPerl(Box::new(e)) + } + + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast::ClassBracketed(Box::new(e)) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast::Repetition(Box::new(e)) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast::Group(Box::new(e)) + } + + /// Create an "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast::Alternation(Box::new(e)) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast::Concat(Box::new(e)) + } + /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { match *self { @@ -464,7 +564,9 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, - Ast::Class(ref x) => x.span(), + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, @@ -488,8 +590,10 @@ impl Ast { | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) - | Ast::Assertion(_) => false, - Ast::Class(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) @@ -526,14 +630,14 @@ pub struct Alternation { impl Alternation { /// Return this alternation as an AST. /// - /// If this alternation contains zero ASTs, then Ast::Empty is - /// returned. If this alternation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Alternation(self), + _ => Ast::alternation(self), } } } @@ -551,14 +655,14 @@ pub struct Concat { impl Concat { /// Return this concatenation as an AST. /// - /// If this concatenation contains zero ASTs, then Ast::Empty is - /// returned. If this concatenation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + /// If this concatenation contains zero ASTs, then `Ast::empty` is returned. + /// If this concatenation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned.
pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Concat(self), + _ => Ast::concat(self), } } } @@ -675,31 +779,6 @@ impl HexLiteralKind { } } -/// A single character class expression. -#[derive(Clone, Debug, Eq, PartialEq)] -#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Class { - /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. - Unicode(ClassUnicode), - /// A perl character class, e.g., `\d` or `\W`. - Perl(ClassPerl), - /// A bracketed character class set, which may contain zero or more - /// character ranges and/or zero or more nested classes. e.g., - /// `[a-zA-Z\pL]`. - Bracketed(ClassBracketed), -} - -impl Class { - /// Return the span of this character class. - pub fn span(&self) -> &Span { - match *self { - Class::Perl(ref x) => &x.span, - Class::Unicode(ref x) => &x.span, - Class::Bracketed(ref x) => &x.span, - } - } -} - /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] @@ -1249,6 +1328,18 @@ pub enum AssertionKind { WordBoundary, /// `\B` NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. @@ -1550,8 +1641,10 @@ impl Drop for Ast { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => return, + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | Ast::ClassBracketed(_) => return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, @@ -1560,7 +1653,7 @@ impl Drop for Ast { } let empty_span = || Span::splat(Position::new(0, 0, 0)); - let empty_ast = || Ast::Empty(empty_span()); + let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { match ast { @@ -1569,8 +1662,11 @@ impl Drop for Ast { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => {} + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | Ast::ClassBracketed(_) => {} Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } @@ -1663,9 +1759,9 @@ mod tests { let run = || { let span = || Span::splat(Position::new(0, 0, 0)); - let mut ast = Ast::Empty(span()); + let mut ast = Ast::empty(span()); for i in 0..200 { - ast = Ast::Group(Group { + ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), @@ -1694,4 +1790,20 @@ mod tests { .join() .unwrap(); } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. 
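Two notes on this region: the thread-with-a-small-stack test above guards against stack overflow from recursive drop glue, and the size test just below pins the boxed representation. The `Drop` impl avoids recursion by draining children onto an explicit stack; the same technique on a toy type, as a standalone sketch (the `Chain` type is illustrative, not from the crate):

```rust
// A deeply recursive type: its default drop glue would recurse once per
// level and overflow the call stack for long chains.
struct Chain {
    next: Option<Box<Chain>>,
}

impl Drop for Chain {
    fn drop(&mut self) {
        // Detach each child before it is dropped, so every individual
        // drop call does O(1) work and nothing recurses.
        let mut next = self.next.take();
        while let Some(mut boxed) = next {
            next = boxed.next.take();
            // `boxed` is freed here with its child already detached.
        }
    }
}

fn main() {
    let mut chain = Chain { next: None };
    for _ in 0..1_000_000 {
        chain = Chain { next: Some(Box::new(chain)) };
    }
    drop(chain); // completes without blowing the stack
}
```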
+ #[test] + fn ast_size() { + let max = 2 * core::mem::size_of::(); + let size = core::mem::size_of::(); + assert!( + size <= max, + "Ast size of {} bytes is bigger than suggested max {}", + size, + max + ); + } } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 9cf64e9ec7..593b14fbc3 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -53,11 +53,11 @@ impl Primitive { /// Convert this primitive into a proper AST. fn into_ast(self) -> Ast { match self { - Primitive::Literal(lit) => Ast::Literal(lit), - Primitive::Assertion(assert) => Ast::Assertion(assert), - Primitive::Dot(span) => Ast::Dot(span), - Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), - Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class_perl(cls), + Primitive::Unicode(cls) => Ast::class_unicode(cls), } } @@ -383,7 +383,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// Return a reference to the pattern being parsed. fn pattern(&self) -> &str { - self.pattern.borrow() + self.pattern } /// Create a new error with the given span and error type. @@ -691,7 +691,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(v); } - concat.asts.push(Ast::Flags(set)); + concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { @@ -764,7 +764,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { group.ast = Box::new(group_concat.into_ast()); } } - prior_concat.asts.push(Ast::Group(group)); + prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } @@ -783,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); - Ok(Ast::Alternation(alt)) + Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( @@ -850,7 +850,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { fn pop_class( &self, nested_union: ast::ClassSetUnion, - ) -> Result> { + ) -> Result> { assert_eq!(self.char(), ']'); let item = ast::ClassSet::Item(nested_union.into_item()); @@ -882,7 +882,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { set.span.end = self.pos(); set.kind = prevset; if stack.is_empty() { - Ok(Either::Right(ast::Class::Bracketed(set))) + Ok(Either::Right(set)) } else { union.push(ast::ClassSetItem::Bracketed(Box::new(set))); Ok(Either::Left(union)) @@ -976,7 +976,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; - concat.asts.push(Ast::Class(class)); + concat.asts.push(Ast::class_bracketed(class)); } '?' 
=> { concat = self.parse_uncounted_repetition( @@ -1057,7 +1057,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { greedy = false; self.bump(); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: Span::new(op_start, self.pos()), @@ -1159,7 +1159,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) ); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: op_span, @@ -1212,7 +1212,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } else if self.bump_if("?") { if self.is_eof() { @@ -1241,7 +1241,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } else { @@ -1249,7 +1249,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } @@ -1528,18 +1528,115 @@ impl<'s, P: Borrow> ParserI<'s, P> { span, kind: ast::AssertionKind::EndText, })), - 'b' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundary, - })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } 'B' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::NotWordBoundary, })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. 
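Before the parser internals below: here is how that disambiguation surfaces at the public API level. A sketch, assuming `regex_syntax::ast::parse::Parser` as the entry point:

```rust
use regex_syntax::ast::parse::Parser;

fn main() {
    // A recognized name after `\b{` parses as one special assertion.
    assert!(Parser::new().parse(r"\b{start}").is_ok());

    // A digit after `\b{` is not in [-A-Za-z], so this is still parsed
    // as a counted repetition applied to `\b`.
    assert!(Parser::new().parse(r"\b{2,3}").is_ok());

    // A well-braced but unknown name is rejected outright.
    assert!(Parser::new().parse(r"\b{begin}").is_err());
}
```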
+ fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + /// Parse an octal representation of a Unicode codepoint up to 3 digits /// long. This expects the parser to be positioned at the first octal /// digit and advances the parser to the first character immediately @@ -1743,7 +1840,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// is successful, then the parser is advanced to the position immediately /// following the closing `]`. #[inline(never)] - fn parse_set_class(&self) -> Result { + fn parse_set_class(&self) -> Result { assert_eq!(self.char(), '['); let mut union = @@ -1967,9 +2064,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { // because parsing cannot fail with any interesting error. For example, // in order to use an ASCII character class, it must be enclosed in // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character characters have the syntax `[:NAME:]` - // which can only appear within character brackets." This means that - // things like `[[:lower:]A]` are legal constructs. + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. // // However, if one types an incorrect ASCII character class, e.g., // `[[:loower:]]`, then we treat that as a normal nested character @@ -2189,12 +2286,12 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { // These are all base cases, so we don't increment depth. 
return Ok(()); } - Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, @@ -2210,12 +2307,12 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { // These are all base cases, so we don't decrement depth. Ok(()) } - Ast::Class(ast::Class::Bracketed(_)) + Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) @@ -2426,12 +2523,12 @@ mod tests { /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, @@ -2445,17 +2542,17 @@ mod tests { /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec) -> Ast { - Ast::Concat(ast::Concat { span, asts }) + Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range, asts: Vec) -> Ast { - Ast::Alternation(ast::Alternation { span: span(range), asts }) + Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range, index: u32, ast: Ast) -> Ast { - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), @@ -2488,7 +2585,7 @@ mod tests { }, ); } - Ast::Flags(ast::SetFlags { + Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), @@ -2502,7 +2599,7 @@ mod tests { // A nest limit of 0 still allows some types of regexes. 
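The tests that follow drive this through an internal helper; for reference, the same behavior is reachable through the public builder. A sketch, on my reading of the nest accounting above (literals cost no depth, each repetition/group/class adds one level):

```rust
use regex_syntax::ast::parse::ParserBuilder;

fn main() {
    let parser = |limit: u32| ParserBuilder::new().nest_limit(limit).build();

    // Flat expressions never increment the depth counter.
    assert!(parser(0).parse("a").is_ok());

    // A repetition adds one level, so it needs a limit of at least 1.
    assert!(parser(0).parse("a+").is_err());
    assert!(parser(1).parse("a+").is_ok());
}
```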
assert_eq!( parser_nest_limit("", 0).parse(), - Ok(Ast::Empty(span(0..0))) + Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); @@ -2516,7 +2613,7 @@ mod tests { ); assert_eq!( parser_nest_limit("a+", 1).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2542,14 +2639,14 @@ mod tests { ); assert_eq!( parser_nest_limit("a+*", 2).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, - ast: Box::new(Ast::Repetition(ast::Repetition { + ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2606,7 +2703,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2616,7 +2713,7 @@ mod tests { c: 'a', } )), - }))) + })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), @@ -2776,7 +2873,7 @@ bar vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), @@ -2803,7 +2900,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -2825,7 +2922,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), @@ -2840,7 +2937,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), @@ -2858,7 +2955,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X @@ -2877,7 +2974,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', @@ -2895,9 +2992,9 @@ bar Ok(concat_with( span_range(pat, 0..3), vec![ - Ast::Dot(span_range(pat, 0..1)), + Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), - Ast::Dot(span_range(pat, 2..3)), + Ast::dot(span_range(pat, 2..3)), ] )) ); @@ -2933,7 +3030,7 @@ bar fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2945,7 +3042,7 @@ bar ); assert_eq!( parser(r"a+").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: 
span(1..2), @@ -2958,7 +3055,7 @@ bar assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2970,7 +3067,7 @@ bar ); assert_eq!( parser(r"a??").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -2982,7 +3079,7 @@ bar ); assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2997,7 +3094,7 @@ bar Ok(concat( 0..3, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -3015,7 +3112,7 @@ bar Ok(concat( 0..4, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -3034,7 +3131,7 @@ bar 0..3, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3048,7 +3145,7 @@ bar ); assert_eq!( parser(r"(ab)?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), @@ -3067,8 +3164,8 @@ bar Ok(alt( 0..3, vec![ - Ast::Empty(span(0..0)), - Ast::Repetition(ast::Repetition { + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3157,7 +3254,7 @@ bar fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), @@ -3171,7 +3268,7 @@ bar ); assert_eq!( parser(r"a{5,}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3185,7 +3282,7 @@ bar ); assert_eq!( parser(r"a{5,9}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3199,7 +3296,7 @@ bar ); assert_eq!( parser(r"a{5}?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3217,7 +3314,7 @@ bar 0..5, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3237,7 +3334,7 @@ bar 0..6, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3255,7 +3352,7 @@ bar assert_eq!( parser(r"a{ 5 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3269,7 +3366,7 @@ bar ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), @@ -3283,7 +3380,7 @@ bar ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), @@ -3295,6 +3392,23 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + 
parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), @@ -3414,7 +3528,7 @@ bar fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), lit('b', 2)], })) @@ -3424,7 +3538,7 @@ bar Ok(group( 0..5, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) @@ -3433,14 +3547,14 @@ bar assert_eq!( parser(r"a|b|c").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), @@ -3454,7 +3568,7 @@ bar Ok(group( 0..10, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), @@ -3503,7 +3617,7 @@ bar parser(r"|").parse(), Ok(alt( 0..1, - vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( @@ -3511,19 +3625,19 @@ bar Ok(alt( 0..2, vec![ - Ast::Empty(span(0..0)), - Ast::Empty(span(1..1)), - Ast::Empty(span(2..2)), + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( @@ -3533,7 +3647,7 @@ bar 1, alt( 1..2, - vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); @@ -3542,7 +3656,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( @@ -3550,7 +3664,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); @@ -3606,7 +3720,7 @@ bar fn parse_group() { assert_eq!( parser("(?i)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..4), flags: ast::Flags { span: span(2..3), @@ -3621,7 +3735,7 @@ bar ); assert_eq!( parser("(?iU)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..5), flags: ast::Flags { span: span(2..4), @@ -3644,7 +3758,7 @@ bar ); assert_eq!( parser("(?i-U)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..6), flags: ast::Flags { span: span(2..5), @@ -3672,15 +3786,15 @@ bar assert_eq!( parser("()").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..2), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Empty(span(1..1))), + ast: Box::new(Ast::empty(span(1..1))), })) ); assert_eq!( parser("(a)").parse(), - 
Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..3), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit('a', 1)), @@ -3688,20 +3802,20 @@ bar ); assert_eq!( parser("(())").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..4), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Group(ast::Group { + ast: Box::new(Ast::group(ast::Group { span: span(1..3), kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::Empty(span(2..2))), + ast: Box::new(Ast::empty(span(2..2))), })), })) ); assert_eq!( parser("(?:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..5), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..2), @@ -3713,7 +3827,7 @@ bar assert_eq!( parser("(?i:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..6), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..3), @@ -3729,7 +3843,7 @@ bar ); assert_eq!( parser("(?i-U:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..5), @@ -3818,7 +3932,7 @@ bar fn parse_capture_name() { assert_eq!( parser("(?z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, @@ -3833,7 +3947,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3848,7 +3962,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3864,7 +3978,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3880,7 +3994,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3896,7 +4010,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3912,7 +4026,7 @@ bar assert_eq!( parser("(?P)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), @@ -3928,7 +4042,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), @@ -3936,7 +4050,7 @@ bar ); assert_eq!( parser("(?P<名字>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), @@ -3952,7 +4066,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), @@ -4381,6 +4495,48 @@ bar kind: ast::AssertionKind::WordBoundary, })) ); + assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + 
parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { @@ -4418,20 +4574,60 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); - // But also, < and > are banned, so that we may evolve them into - // start/end word boundary assertions. (Not sure if we will...) + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. assert_eq!( - parser(r"\<").parse_escape().unwrap_err(), + parser(r"\b{").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( - parser(r"\>").parse_escape().unwrap_err(), + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. + assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. 
+ assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); @@ -4494,15 +4690,15 @@ bar ); assert_eq!( parser_octal(r"\778").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', @@ -4512,15 +4708,15 @@ bar ); assert_eq!( parser_octal(r"\7777").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', @@ -4965,15 +5161,15 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), - }))) + })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4981,11 +5177,11 @@ bar negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), - }))) + })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4993,11 +5189,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5005,11 +5201,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5017,20 +5213,20 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), - }))) + })) ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5044,11 +5240,11 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5063,44 +5259,44 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + 
Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), - }))) + })) ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), - }))) + })) ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), - }))) + })) ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5108,11 +5304,11 @@ bar negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), - }))) + })) ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5120,11 +5316,11 @@ bar kind: ast::ClassPerlKind::Word, negated: false, })), - }))) + })) ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5139,20 +5335,20 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[a-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), - }))) + })) ); assert_eq!( parser("[a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5162,11 +5358,11 @@ bar range(span(4..7), 'x', 'z'), ] ), - }))) + })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5184,11 +5380,11 @@ bar ] ), ), - }))) + })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5206,11 +5402,11 @@ bar negated: false, })), ), - }))) + })) ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5222,11 +5418,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5238,11 +5434,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5254,11 +5450,11 @@ bar })), itemset(lit(span(5..6), '^')), ), - }))) + })) ); assert_eq!( 
parser(r"[\&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5270,11 +5466,11 @@ bar })), itemset(lit(span(5..6), '&')), ), - }))) + })) ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( @@ -5286,13 +5482,13 @@ bar ), itemset(empty(span(5..5))), ), - }))) + })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5308,20 +5504,20 @@ bar c: '⛄', }, })), - }))) + })) ); assert_eq!( parser(r"[]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), - }))) + })) ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5335,14 +5531,14 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ - Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5352,8 +5548,8 @@ bar c: '[', } )), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', @@ -5914,15 +6110,15 @@ bar assert_eq!( parser(r"\pNz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -5932,15 +6128,15 @@ bar ); assert_eq!( parser(r"\p{Greek}z").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -6017,23 +6213,23 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - }))) + })) ); assert_eq!( parser(r"\dz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 86a87e1439..1ceb3c7faa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,27 +80,21 @@ 
impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), - Ast::Class(ast::Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_pre(x) - } + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - use crate::ast::Class; - match *ast { Ast::Empty(_) => Ok(()), Ast::Flags(ref x) => self.fmt_set_flags(x), Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), - Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - Ast::Class(Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_post(x) - } + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), Ast::Repetition(ref x) => self.fmt_repetition(x), Ast::Group(ref x) => self.fmt_group_post(x), Ast::Alternation(_) => Ok(()), @@ -267,6 +261,12 @@ impl Writer { EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 03d12a14db..c1bb24d971 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -264,7 +264,7 @@ impl<'a> HeapVisitor<'a> { visitor: &mut V, ) -> Result>, V::Err> { Ok(match *ast { - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 9461db9891..a5a3737f68 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -477,7 +477,7 @@ impl Extractor { } seq } - hir::Repetition { min, max: Some(max), .. } if min < max => { + hir::Repetition { min, .. } => { assert!(min > 0); // handled above let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); @@ -491,10 +491,6 @@ impl Extractor { seq.make_inexact(); seq } - hir::Repetition { .. } => { - subseq.make_inexact(); - subseq - } } } @@ -2239,24 +2235,19 @@ impl PreferenceTrie { /// after them and because any removed literals are guaranteed to never /// match. fn minimize(literals: &mut Vec, keep_exact: bool) { - use core::cell::RefCell; - - // MSRV(1.61): Use retain_mut here to avoid interior mutability. 
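Picking up the `MSRV(1.61)` note above: with the MSRV now at 1.65, the hunk below drops the `RefCell` workaround in favor of `Vec::retain_mut`, which hands the closure `&mut` access to each element while filtering. A minimal sketch of `retain_mut` itself:

```rust
fn main() {
    let mut v = vec![1, 2, 3, 4, 5];
    // Rewrite each element and decide whether to keep it in a single pass.
    v.retain_mut(|x| {
        *x *= 10;
        *x <= 30
    });
    assert_eq!(v, [10, 20, 30]);
}
```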
- let trie = RefCell::new(PreferenceTrie { + let mut trie = PreferenceTrie { states: vec![], matches: vec![], next_literal_index: 1, - }); + }; let mut make_inexact = vec![]; - literals.retain(|lit| { - match trie.borrow_mut().insert(lit.as_bytes()) { - Ok(_) => true, - Err(i) => { - if !keep_exact { - make_inexact.push(i.checked_sub(1).unwrap()); - } - false + literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i.checked_sub(1).unwrap()); } + false } }); for i in make_inexact { @@ -2655,6 +2646,12 @@ mod tests { ]), e(r"(ab|cd)(ef|gh)(ij|kl)") ); + + assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); } #[test] diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6c1d2745e0..ae3ba318ee 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -322,6 +322,22 @@ impl Hir { /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` + /// + /// # Example: building a literal from a `char` + /// + /// This example shows how to build a single `Hir` literal from a `char` + /// value. Since a [`Literal`] is just bytes, we just need to UTF-8 + /// encode a `char` value: + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let ch = '☃'; + /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); + /// + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, got.kind()); + /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); @@ -797,13 +813,18 @@ impl core::fmt::Debug for Literal { /// The high-level intermediate representation of a character class. /// /// A character class corresponds to a set of characters. A character is either -/// defined by a Unicode scalar value or a byte. Unicode characters are used -/// by default, while bytes are used when Unicode mode (via the `u` flag) is -/// disabled. +/// defined by a Unicode scalar value or a byte. /// /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// +/// There are no guarantees about which class variant is used. Generally +/// speaking, the Unicode variat is used whenever a class needs to contain +/// non-ASCII Unicode scalar values. But the Unicode variant can be used even +/// when Unicode mode is disabled. For example, at the time of writing, the +/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class +/// `[a\u00A0]` due to optimizations. +/// /// Note that `Bytes` variant may be produced even when it exclusively matches /// valid UTF-8. This is because a `Bytes` variant represents an intention by /// the author of the regular expression to disable Unicode mode, which in turn @@ -1326,8 +1347,9 @@ impl ClassUnicodeRange { } } -/// A set of characters represented by arbitrary bytes (where one byte -/// corresponds to one character). +/// A set of characters represented by arbitrary bytes. +/// +/// Each byte corresponds to one character. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassBytes { set: IntervalSet, @@ -1629,6 +1651,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. 
WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -1650,6 +1708,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -1658,28 +1724,36 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. 
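With 18 variants, the bit-per-variant encoding no longer fits in a `u16`, hence the `u32` bump in `as_repr`/`from_repr` (defined just below). A round-trip sketch using the `Look` API as this diff leaves it:

```rust
use regex_syntax::hir::Look;

fn main() {
    // Each assertion occupies one bit of the u32 representation.
    let look = Look::WordStartAscii;
    assert_eq!(look.as_repr(), 1 << 10);
    assert_eq!(Look::from_repr(1 << 10), Some(look));

    // Start and end assertions swap under `reversed`, matching the
    // mapping table added above.
    assert_eq!(look.reversed(), Look::WordEndAscii);
}
```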
#[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -1704,6 +1778,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -2594,7 +2676,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -2697,13 +2779,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -2782,29 +2873,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. 
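The widened representation also flows through `LookSet`, whose serialized form grows from 2 to 4 bytes. A sketch of the set API, including the round-trip implemented by `read_repr`/`write_repr` below:

```rust
use regex_syntax::hir::{Look, LookSet};

fn main() {
    let set = LookSet::empty()
        .insert(Look::Start)
        .insert(Look::WordEndHalfUnicode);

    assert!(set.contains(Look::Start));
    // Half word boundaries now count as Unicode word assertions.
    assert!(set.contains_word_unicode());

    // The set is just a u32 bitmask, so it round-trips through 4 bytes.
    let mut buf = [0u8; 4];
    set.write_repr(&mut buf);
    assert_eq!(LookSet::read_repr(&buf).iter().count(), 2);
}
```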
#[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } } @@ -2837,9 +2930,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } @@ -3761,7 +3854,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -3779,6 +3872,6 @@ mod tests { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index aa737a092d..dfa6d40322 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -202,6 +202,30 @@ impl Visitor for Writer { hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } }, HirKind::Capture(hir::Capture { ref name, .. 
}) => { self.wtr.write_str("(")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 5430b51b27..313a1e9e8b 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -337,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast { - Ast::Class(ast::Class::Bracketed(_)) => { + Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -354,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} Ast::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + Ast::Alternation(ref x) => { self.push(HirFrame::Alternation); - self.push(HirFrame::AlternationBranch); + if !x.asts.is_empty() { + self.push(HirFrame::AlternationBranch); + } } _ => {} } @@ -386,29 +386,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - Ast::Literal(ref x) => { - match self.ast_literal_to_scalar(x)? { - Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err(self - .error(x.span, ErrorKind::UnicodeNotAllowed)); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } - } - } - // self.push(HirFrame::Expr(self.hir_literal(x)?)); - } - Ast::Dot(span) => { - self.push(HirFrame::Expr(self.hir_dot(span)?)); + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, + }, + Ast::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); } Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - Ast::Class(ast::Class::Perl(ref x)) => { + Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +410,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - Ast::Class(ast::Class::Unicode(ref x)) => { + Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - Ast::Class(ast::Class::Bracketed(ref ast)) => { + Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -874,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { - if c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + if !c.is_ascii() { + return Ok(None); } // If case folding won't do anything, then don't bother trying. 
match c { @@ -964,6 +955,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAsciiNegate }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) + } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), }) } @@ -1185,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { - let cp = u32::from(ch); - if cp <= 0x7F { - Ok(u8::try_from(cp).unwrap()) + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't @@ -1635,16 +1653,7 @@ mod tests { assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?-u)☃"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7) - ), - } - ); + assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { @@ -1722,16 +1731,7 @@ mod tests { ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?i-u)β"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(6, 1, 7), - Position::new(8, 1, 8), - ), - } - ); + assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] @@ -3626,4 +3626,99 @@ mod tests { ]), ); } + + #[test] + fn regression_alt_empty_concat() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); + } + + #[test] + fn regression_empty_alt() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); + } + + #[test] + fn regression_singleton_alt() { + use crate::{ + ast::{self, Ast}, + hir::Dot, + }; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::dot(span)], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 + #[test] + fn regression_fuzz_match() { + let pat = "[(\u{6} \0-\u{afdf5}] \0 "; + let ast = ParserBuilder::new() + .octal(false) + .ignore_whitespace(true) + .build() + .parse(pat) + .unwrap(); + let hir = TranslatorBuilder::new() + .utf8(true) + .case_insensitive(false) + .multi_line(false) + .dot_matches_new_line(false) + .swap_greed(true) + .unicode(true) + .build() + .translate(pat, &ast) + .unwrap(); + assert_eq!( + hir, + 
Hir::concat(vec![ + hir_uclass(&[('\0', '\u{afdf5}')]), + hir_lit("\0"), + ]) + ); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 + #[cfg(feature = "unicode")] + #[test] + fn regression_fuzz_difference1() { + let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; + let _ = t(pat); // shouldn't panic + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 + #[test] + fn regression_fuzz_char_decrement1() { + let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0] bool { // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } @@ -381,7 +372,7 @@ pub fn try_is_word_character( /// Returns true if and only if the given character is an ASCII word character. /// /// An ASCII word character is defined by the following character class: -/// `[_0-9a-zA-Z]'. +/// `[_0-9a-zA-Z]`. 
pub fn is_word_byte(c: u8) -> bool { match c { b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, diff --git a/src/builders.rs b/src/builders.rs index d19a0ffe23..c111a96c0b 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -28,7 +28,9 @@ use alloc::{ vec::Vec, }; -use regex_automata::{meta, util::syntax, MatchKind}; +use regex_automata::{ + meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, +}; use crate::error::Error; @@ -100,8 +102,12 @@ impl Builder { } fn build_many_string(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(true); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(true) + .which_captures(WhichCaptures::None); let syntaxc = self.syntaxc.clone().utf8(true); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -113,8 +119,12 @@ impl Builder { } fn build_many_bytes(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(false); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(false) + .which_captures(WhichCaptures::None); let syntaxc = self.syntaxc.clone().utf8(false); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -669,6 +679,7 @@ pub(crate) mod string { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::RegexBuilder; /// /// // It may surprise you how big some seemingly small patterns can @@ -1236,6 +1247,7 @@ pub(crate) mod string { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::RegexSetBuilder; /// /// // It may surprise you how big some seemingly small patterns can @@ -1846,6 +1858,7 @@ pub(crate) mod bytes { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::RegexBuilder; /// /// // It may surprise you how big some seemingly small patterns can @@ -2418,6 +2431,7 @@ pub(crate) mod bytes { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::RegexSetBuilder; /// /// // It may surprise you how big some seemingly small patterns can diff --git a/src/bytes.rs b/src/bytes.rs index 3f53a3ea55..383ac4a5b5 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -68,8 +68,8 @@ bytes: 1. The `u` flag can be disabled even when disabling it might cause the regex to match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in "ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode scalar values nor Unicode -character classes are allowed. +2. In ASCII compatible mode, Unicode character classes are not allowed. Literal +Unicode scalar values outside of character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps to `[[:digit:]]` and `\s` maps to `[[:space:]]`. diff --git a/src/lib.rs b/src/lib.rs index e9c9c570fc..e4c67f4bc1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -543,8 +543,10 @@ scalar value, even when it is encoded using multiple bytes. When Unicode mode is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases. * The character classes `\w`, `\d` and `\s` are all Unicode-aware by default. Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions. -* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. 
To -get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. +* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. +To get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. This also +applies to the special word boundary assertions. (That is, `\b{start}`, +`\b{end}`, `\b{start-half}`, `\b{end-half}`.) * `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only recognize `\n` (assuming CRLF mode is not enabled) and not any of the other forms of line terminators defined by Unicode. @@ -665,8 +667,8 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). Any named character class may appear inside a bracketed `[...]` character -class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII -digit. `[\p{Greek}&&\pL]` matches Greek letters. +class. For example, `[\p{Greek}[:digit:]]` matches any ASCII digit or any +codepoint in the `Greek` script. `[\p{Greek}&&\pL]` matches Greek letters. Precedence in character classes, from most binding to least: @@ -723,12 +725,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-\B    not a Unicode word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B              not a Unicode word boundary
+\b{start}, \<   a Unicode start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \>     a Unicode end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of a Unicode start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of a Unicode end-of-word boundary (\W|\z on the right)
 
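As a quick illustration of the new one-sided assertions in the table above, here is a minimal sketch (assuming a regex release that includes them, i.e. 1.10+). Unlike `\b`, which matches on both edges of a word, `\b{start}` and `\b{end}` each match on only one:

use regex::Regex;

fn main() {
    // In "cat ", \b matches at offsets 0 and 3, but the one-sided
    // assertions split that: \b{start} matches only at 0 (\W|\A on the
    // left, \w on the right) and \b{end} only at 3 (\w on the left,
    // \W|\z on the right).
    let starts: Vec<usize> = Regex::new(r"\b{start}")
        .unwrap()
        .find_iter("cat ")
        .map(|m| m.start())
        .collect();
    assert_eq!(vec![0], starts);

    let ends: Vec<usize> = Regex::new(r"\b{end}")
        .unwrap()
        .find_iter("cat ")
        .map(|m| m.start())
        .collect();
    assert_eq!(vec![3], ends);
}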
The empty regex is valid and matches the empty string. For example, the @@ -856,28 +862,32 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\123        octal character code, up to three digits (when enabled)
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\p{Letter}  Unicode character class
-\P{Letter}  negated Unicode character class
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\123            octal character code, up to three digits (when enabled)
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\p{Letter}      Unicode character class
+\P{Letter}      negated Unicode character class
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
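Since `\<` and `\>` now denote assertions rather than escapeable punctuation (see the parser comment about escapeability earlier in this patch), here is a small hedged sketch of what that means in practice (again assuming regex 1.10+):

use regex::Regex;

fn main() {
    // \< and \> are aliases for \b{start} and \b{end}, so this matches
    // "word" only when it appears as a whole word.
    let re = Regex::new(r"\<word\>").unwrap();
    assert!(re.is_match("a word here"));
    assert!(!re.is_match("wordy"));
    // A literal '<' or '>' needs no escape at all.
    assert!(Regex::new(r"<word>").unwrap().is_match("see <word>"));
}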
### Perl character classes (Unicode friendly) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 6522ee7e3b..19f5701afd 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1154,7 +1154,7 @@ impl Regex { /// /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], /// but does *not* store a reference to the haystack. This makes its API - /// a bit lower level and less convenience. But in exchange, callers + /// a bit lower level and less convenient. But in exchange, callers /// may allocate their own `CaptureLocations` and reuse it for multiple /// searches. This may be helpful if allocating a `Captures` shows up in a /// profile as too costly. @@ -1162,8 +1162,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Example /// @@ -2037,7 +2037,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] diff --git a/src/regex/string.rs b/src/regex/string.rs index 65a76740ed..880d6082ad 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1145,7 +1145,7 @@ impl Regex { /// /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], /// but does *not* store a reference to the haystack. This makes its API - /// a bit lower level and less convenience. But in exchange, callers + /// a bit lower level and less convenient. But in exchange, callers /// may allocate their own `CaptureLocations` and reuse it for multiple /// searches. This may be helpful if allocating a `Captures` shows up in a /// profile as too costly. @@ -1153,8 +1153,8 @@ impl Regex { /// To create a `CaptureLocations` value, use the /// [`Regex::capture_locations`] method. /// - /// This also the overall match if one was found. When a match is found, - /// its offsets are also always stored in `locs` at index `0`. + /// This also returns the overall match if one was found. When a match is + /// found, its offsets are also always stored in `locs` at index `0`. /// /// # Panics /// @@ -2040,7 +2040,10 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// /// // Asking for an invalid capture group always returns None. /// assert_eq!(None, locs.get(3)); +/// # // literals are too big for 32-bit usize: #1041 +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(34973498648)); +/// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, locs.get(9944060567225171988)); /// ``` #[derive(Clone, Debug)] diff --git a/testdata/anchored.toml b/testdata/anchored.toml index cca561de10..0f2248d098 100644 --- a/testdata/anchored.toml +++ b/testdata/anchored.toml @@ -69,3 +69,59 @@ haystack = 'abcβ' matches = [[0, 3]] anchored = true unicode = false + +# Tests that '.c' doesn't match 'abc' when performing an anchored search from +# the beginning of the haystack. This test found two different bugs in the +# PikeVM and the meta engine. 
+[[test]] +name = "no-match-at-start" +regex = '.c' +haystack = 'abc' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-bounds" +regex = '.c' +haystack = 'aabc' +bounds = [1, 4] +matches = [] +anchored = true + +# This is like no-match-at-start, but hits the "reverse inner" optimization +# inside the meta engine. (no-match-at-start hits the "reverse suffix" +# optimization.) +[[test]] +name = "no-match-at-start-reverse-inner" +regex = '.c[a-z]' +haystack = 'abcz' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-reverse-inner-bounds" +regex = '.c[a-z]' +haystack = 'aabcz' +bounds = [1, 5] +matches = [] +anchored = true + +# Same as no-match-at-start, but applies to the meta engine's "reverse +# anchored" optimization. +[[test]] +name = "no-match-at-start-reverse-anchored" +regex = '.c[a-z]$' +haystack = 'abcz' +matches = [] +anchored = true + +# Like above, but at a non-zero start offset. +[[test]] +name = "no-match-at-start-reverse-anchored-bounds" +regex = '.c[a-z]$' +haystack = 'aabcz' +bounds = [1, 5] +matches = [] +anchored = true diff --git a/testdata/line-terminator.toml b/testdata/line-terminator.toml index 4de72de31e..a398dafa2e 100644 --- a/testdata/line-terminator.toml +++ b/testdata/line-terminator.toml @@ -38,6 +38,18 @@ unescape = true line-terminator = '\xFF' utf8 = false +# This tests a tricky case where the line terminator is set to \r. This ensures +# that the StartLF look-behind assertion is tracked when computing the start +# state. +[[test]] +name = "carriage" +regex = '(?m)^[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '\r' + # This tests that we can set the line terminator to a byte corresponding to a # word character, and things work as expected. [[test]] diff --git a/testdata/regression.toml b/testdata/regression.toml index bb5e4fd46f..53b0701a3c 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -739,3 +739,92 @@ matches = [[0, 9]] utf8 = false match-kind = "all" search-kind = "overlapping" + +# See: https://github.com/rust-lang/regex/issues/1060 +[[test]] +name = "reverse-inner-plus-shorter-than-expected" +regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})' +haystack = '102:12:39' +matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]] + +# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex +# to demonstrate the extent of the rot. Sigh. +# +# See: https://github.com/rust-lang/regex/issues/1060 +[[test]] +name = "reverse-inner-short" +regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])' +haystack = '102:12:39' +matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]] + +# This regression test was found via the RegexSet APIs. It triggered a +# particular code path where a regex was compiled with 'All' match semantics +# (to support overlapping search), but got funneled down into a standard +# leftmost search when calling 'is_match'. This is fine on its own, but the +# leftmost search will use a prefilter and that's where this went awry. +# +# Namely, since 'All' semantics were used, the aho-corasick prefilter was +# incorrectly compiled with 'Standard' semantics. This was wrong because +# 'Standard' immediately attempts to report a match at every position, even if +# that would mean reporting a match past the leftmost match before reporting +# the leftmost match. 
This breaks the prefilter contract of never having false +# negatives and leads overall to the engine not finding a match. +# +# See: https://github.com/rust-lang/regex/issues/1070 +[[test]] +name = "prefilter-with-aho-corasick-standard-semantics" +regex = '(?m)^ *v [0-9]' +haystack = 'v 0' +matches = [ + { id = 0, spans = [[0, 3]] }, +] +match-kind = "all" +search-kind = "overlapping" +unicode = true +utf8 = true + +# This tests that the PikeVM and the meta regex agree on a particular regex. +# This test previously failed when the ad hoc engines inside the meta engine +# did not handle quit states correctly. Namely, the Unicode word boundary here +# combined with a non-ASCII codepoint provokes the quit state. The ad hoc +# engines were previously returning a match even after entering the quit state +# if a match had been previously detected, but this is incorrect. The reason +# is that if a quit state is found, then the search must give up *immediately* +# because it prevents the search from finding the "proper" leftmost-first +# match. If it instead returns a match that has been found, it risks reporting +# an improper match, as it did in this case. +# +# See: https://github.com/rust-lang/regex/issues/1046 +[[test]] +name = "non-prefix-literal-quit-state" +regex = '.+\b\n' +haystack = "β77\n" +matches = [[0, 5]] + +# This is a regression test for some errant HIR interval set operations that +# were made in the regex-syntax 0.8.0 release and then reverted in 0.8.1. The +# issue here is that the HIR produced from the regex had out-of-order ranges. +# +# See: https://github.com/rust-lang/regex/issues/1103 +# Ref: https://github.com/rust-lang/regex/pull/1051 +# Ref: https://github.com/rust-lang/regex/pull/1102 +[[test]] +name = "hir-optimization-out-of-order-class" +regex = '^[[:alnum:]./-]+$' +haystack = "a-b" +matches = [[0, 3]] + +# This is a regression test for an improper reverse suffix optimization. This +# occurred when I "broadened" the applicability of the optimization to include +# multiple possible literal suffixes instead of only sticking to a non-empty +# longest common suffix. It turns out that, at least given how the reverse +# suffix optimization works, we need to stick to the longest common suffix for +# now. +# +# See: https://github.com/rust-lang/regex/issues/1110 +# See also: https://github.com/astral-sh/ruff/pull/7980 +[[test]] +name = 'improper-reverse-suffix-optimization' +regex = '(\\N\{[^}]+})|([{}])' +haystack = 'hiya \N{snowman} bye' +matches = [[[5, 16], [5, 16], []]] diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml new file mode 100644 index 0000000000..2b5a2a0acf --- /dev/null +++ b/testdata/word-boundary-special.toml @@ -0,0 +1,687 @@ +# These tests are for the "special" word boundary assertions. That is, +# \b{start}, \b{end}, \b{start-half}, \b{end-half}. These are specialty +# assertions for more niche use cases, but hitting those cases without these +# assertions is difficult. For example, \b{start-half} and \b{end-half} are +# used to implement the -w/--word-regexp flag in a grep program. 
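The comment above mentions -w/--word-regexp; the following sketch shows roughly how a grep-style tool might use the half assertions for that flag. This is hypothetical wrapper code, not part of this patch, and the pattern `-foo` is only an illustration:

use regex::Regex;

fn main() {
    // Wrap the user's pattern in half word boundary assertions. Unlike
    // wrapping with \b...\b, this never rejects a match just because the
    // user's pattern itself starts or ends with a non-word character.
    let user_pattern = "-foo";
    let word_re = Regex::new(&format!(
        r"\b{{start-half}}(?:{})\b{{end-half}}",
        user_pattern
    ))
    .unwrap();
    assert!(word_re.is_match("bar -foo baz"));

    // The full \b fails here: there is no word boundary between the
    // space and '-', since neither is a word character.
    let strict = Regex::new(&format!(r"\b(?:{})\b", user_pattern)).unwrap();
    assert!(!strict.is_match("bar -foo baz"));
}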
+ +# Tests for (?-u:\b{start}) + +[[test]] +name = "word-start-ascii-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-start-ascii-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[4, 4]] +unicode = false + +[[test]] +name = "word-start-ascii-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = false + +# Tests for (?-u:\b{end}) + +[[test]] +name = "word-end-ascii-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = false + +[[test]] +name = "word-end-ascii-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[1, 1]] +unicode = false + +# Tests for \b{start} + +[[test]] +name = "word-start-unicode-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = 
[[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end} + +[[test]] +name = "word-end-unicode-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Tests for (?-u:\b{start-half}) + +[[test]] +name = "word-start-half-ascii-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = false + +[[test]] +name = "word-start-half-ascii-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = false + +[[test]] +name = "word-start-half-ascii-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060-noutf8" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +[[test]] +name = "word-start-half-ascii-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-half-ascii-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-start-half-ascii-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-start-half-ascii-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0], [5, 5]] +unicode = false + +# Tests for (?-u:\b{end-half}) + +[[test]] +name = "word-end-half-ascii-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode 
= false + +[[test]] +name = "word-end-half-ascii-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = false + +[[test]] +name = "word-end-half-ascii-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-end-half-ascii-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-half-ascii-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-end-half-ascii-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-end-half-ascii-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[0, 0], [5, 5]] +unicode = false + +[[test]] +name = "word-end-half-ascii-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[1, 1], [5, 5]] +unicode = false + +# Tests for \b{start-half} + +[[test]] +name = "word-start-half-unicode-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = true + +[[test]] +name = "word-start-half-unicode-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = true + +[[test]] +name = "word-start-half-unicode-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-half-unicode-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [6, 6]] +unicode = true + +[[test]] +name = "word-start-half-unicode-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [8, 8]] +unicode = true + +[[test]] +name = "word-start-half-unicode-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end-half} + +[[test]] +name = "word-end-half-unicode-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-half-unicode-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = true + +[[test]] +name = "word-end-half-unicode-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = true + +[[test]] +name = "word-end-half-unicode-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-end-half-unicode-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] 
+name = "word-end-half-unicode-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-half-unicode-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-half-unicode-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [5, 5], [6, 6]] +unicode = true + +[[test]] +name = "word-end-half-unicode-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[4, 4], [8, 8]] +unicode = true + +[[test]] +name = "word-end-half-unicode-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-half-unicode-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Specialty tests. + +# Since \r is special cased in the start state computation (to deal with CRLF +# mode), this test ensures that the correct start state is computed when the +# pattern starts with a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-carriage" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Since \n is also special cased in the start state computation, this test +# ensures that the correct start state is computed when the pattern starts with +# a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-linefeed" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\nabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Like the carriage return test above, but with a custom line terminator. +[[test]] +name = "word-start-half-ascii-customlineterm" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC!abc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '!' diff --git a/tests/lib.rs b/tests/lib.rs index badd57455d..b3f69423d9 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -49,6 +49,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition");