Skip to content

Commit

Permalink
impl: cut over to regex-automata
Browse files Browse the repository at this point in the history
  • Loading branch information
BurntSushi committed Apr 20, 2023
1 parent 28678d0 commit 4176b6c
Show file tree
Hide file tree
Showing 57 changed files with 722 additions and 12,664 deletions.
23 changes: 19 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@ jobs:
# systems.
CARGO: cargo
# When CARGO is set to CROSS, TARGET is set to `--target matrix.target`.
# Note that we only use cross on Linux, so setting a target on a
# different OS will just use normal cargo.
TARGET:
# Bump this as appropriate. We pin to a version to make sure CI
# continues to work as cross releases in the past have broken things
# in subtle ways.
CROSS_VERSION: v0.2.5
runs-on: ${{ matrix.os }}
strategy:
matrix:
Expand Down Expand Up @@ -90,20 +96,29 @@ jobs:
toolchain: ${{ matrix.rust }}

- name: Install and configure Cross
if: matrix.target != ''
if: matrix.os == 'ubuntu-latest' && matrix.target != ''
run: |
# In the past, new releases of 'cross' have broken CI. So for now, we
# pin it. We also use their pre-compiled binary releases because cross
# has over 100 dependencies and takes a bit to compile.
mkdir "$RUNNER_TEMP/cross-download"
echo "$RUNNER_TEMP" >> $GITHUB_PATH
cd "$RUNNER_TEMP/cross-download"
curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz"
tar xf cross-x86_64-unknown-linux-musl.tar.gz
# We used to install 'cross' from master, but it kept failing. So now
# we build from a known-good version until 'cross' becomes more stable
# or we find an alternative. Notably, between v0.2.1 and current
# master (2022-06-14), the number of Cross's dependencies has doubled.
cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1
# cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1
echo "CARGO=cross" >> $GITHUB_ENV
echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV
- name: Show command used for Cargo
run: |
echo "cargo command is: ${{ env.CARGO }}"
echo "target flag is: ${{ env.TARGET }}"
echo "cargo command is: $CARGO"
echo "target flag is: $TARGET"
- name: Show CPU info for debugging
if: matrix.os == 'ubuntu-latest'
Expand Down
37 changes: 0 additions & 37 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -238,43 +238,6 @@ name = "default"
path = "tests/test_default_bytes.rs"
name = "default-bytes"

# Run the test suite on the NFA algorithm over Unicode codepoints.
[[test]]
path = "tests/test_nfa.rs"
name = "nfa"

# Run the test suite on the NFA algorithm over bytes that match UTF-8 only.
[[test]]
path = "tests/test_nfa_utf8bytes.rs"
name = "nfa-utf8bytes"

# Run the test suite on the NFA algorithm over arbitrary bytes.
[[test]]
path = "tests/test_nfa_bytes.rs"
name = "nfa-bytes"

# Run the test suite on the backtracking engine over Unicode codepoints.
[[test]]
path = "tests/test_backtrack.rs"
name = "backtrack"

# Run the test suite on the backtracking engine over bytes that match UTF-8
# only.
[[test]]
path = "tests/test_backtrack_utf8bytes.rs"
name = "backtrack-utf8bytes"

# Run the test suite on the backtracking engine over arbitrary bytes.
[[test]]
path = "tests/test_backtrack_bytes.rs"
name = "backtrack-bytes"

# Run all backends against each regex found on crates.io and make sure
# that they all do the same thing.
[[test]]
path = "tests/test_crates_regex.rs"
name = "crates-regex"

[package.metadata.docs.rs]
# We want to document all features.
all-features = true
Expand Down
7 changes: 7 additions & 0 deletions Cross.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Environment variables named in `passthrough` are forwarded from the host
# environment into the environment that `cross` runs the build and tests in.
[build.env]
passthrough = [
"RUST_BACKTRACE",
"RUST_LOG",
"REGEX_TEST",
"REGEX_TEST_VERBOSE",
]
3 changes: 0 additions & 3 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,3 @@ cfg_if! {
mod sherlock;
}
}

#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
mod rust_compile;
67 changes: 0 additions & 67 deletions bench/src/rust_compile.rs

This file was deleted.

17 changes: 0 additions & 17 deletions examples/shootout-regex-dna-replace.rs

This file was deleted.

1 change: 1 addition & 0 deletions regex-automata/src/dfa/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,7 @@ impl Builder {
.configure(
dense::Config::new()
.prefilter(None)
.specialize_start_states(false)
.start_kind(StartKind::Anchored)
.match_kind(MatchKind::All),
)
Expand Down
5 changes: 4 additions & 1 deletion regex-automata/src/hybrid/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,10 @@ impl Builder {
.dfa
.clone()
.configure(
DFA::config().prefilter(None).match_kind(MatchKind::All),
DFA::config()
.prefilter(None)
.specialize_start_states(false)
.match_kind(MatchKind::All),
)
.thompson(thompson::Config::new().reverse(true))
.build_many(patterns)?;
Expand Down
4 changes: 2 additions & 2 deletions regex-automata/src/hybrid/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ fn find_rev_imp(
}
if sid.is_tagged() {
if sid.is_start() {
continue;
// do nothing
} else if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
// Since reverse searches report the beginning of a match
Expand Down Expand Up @@ -639,7 +639,7 @@ pub(crate) fn find_overlapping_rev(
if sid.is_tagged() {
state.id = Some(sid);
if sid.is_start() {
continue;
// do nothing
} else if sid.is_match() {
state.next_match_index = Some(1);
let pattern = dfa.match_pattern(cache, sid, 0);
Expand Down
12 changes: 12 additions & 0 deletions regex-automata/src/meta/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,18 @@ impl BuildError {
}
}

/// Reports the configured size limit when this error was caused by the
/// regex exceeding that limit before it could be built.
///
/// Only an error that wraps an NFA build error can carry a limit; in that
/// case the answer is whatever the wrapped error's own `size_limit` method
/// reports. Every other kind of error yields `None`.
///
/// The value returned is the limit as configured, and corresponds to the
/// maximum amount of heap usage in bytes.
pub fn size_limit(&self) -> Option<usize> {
    if let BuildErrorKind::NFA(ref err) = self.kind {
        err.size_limit()
    } else {
        None
    }
}

/// Builds a `BuildError` of the `Ast` kind, recording the given pattern ID
/// together with the given AST error.
pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError {
    let kind = BuildErrorKind::Ast { pid, err };
    BuildError { kind }
}
Expand Down
44 changes: 44 additions & 0 deletions regex-automata/src/meta/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1995,6 +1995,17 @@ pub struct FindMatches<'r, 'h> {
it: iter::Searcher<'h>,
}

// Inherent accessors for `FindMatches`.
impl<'r, 'h> FindMatches<'r, 'h> {
/// Returns the current `Input` associated with this iterator.
///
/// The `start` position on the given `Input` may change during iteration,
/// but all other values are guaranteed to remain invariant.
///
/// The returned reference borrows from this iterator (lifetime `'s`),
/// while the `Input` itself keeps the iterator's `'h` lifetime parameter.
#[inline]
pub fn input<'s>(&'s self) -> &'s Input<'h> {
// Forwarded directly to the searcher stored in `self.it`.
self.it.input()
}
}

impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
type Item = Match;

Expand Down Expand Up @@ -2042,6 +2053,17 @@ pub struct CapturesMatches<'r, 'h> {
it: iter::Searcher<'h>,
}

// Inherent accessors for `CapturesMatches`.
impl<'r, 'h> CapturesMatches<'r, 'h> {
/// Returns the current `Input` associated with this iterator.
///
/// The `start` position on the given `Input` may change during iteration,
/// but all other values are guaranteed to remain invariant.
///
/// The returned reference borrows from this iterator (lifetime `'s`),
/// while the `Input` itself keeps the iterator's `'h` lifetime parameter.
#[inline]
pub fn input<'s>(&'s self) -> &'s Input<'h> {
// Forwarded directly to the searcher stored in `self.it`.
self.it.input()
}
}

impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
type Item = Captures;

Expand Down Expand Up @@ -2091,6 +2113,17 @@ pub struct Split<'r, 'h> {
last: usize,
}

// Inherent accessors for `Split`.
impl<'r, 'h> Split<'r, 'h> {
/// Returns the current `Input` associated with this iterator.
///
/// The `start` position on the given `Input` may change during iteration,
/// but all other values are guaranteed to remain invariant.
///
/// The returned reference borrows from this iterator (lifetime `'s`),
/// while the `Input` itself keeps the iterator's `'h` lifetime parameter.
#[inline]
pub fn input<'s>(&'s self) -> &'s Input<'h> {
// Forwarded directly to the `input` accessor of `self.finder`.
self.finder.input()
}
}

impl<'r, 'h> Iterator for Split<'r, 'h> {
type Item = Span;

Expand Down Expand Up @@ -2134,6 +2167,17 @@ pub struct SplitN<'r, 'h> {
limit: usize,
}

// Inherent accessors for `SplitN`.
impl<'r, 'h> SplitN<'r, 'h> {
/// Returns the current `Input` associated with this iterator.
///
/// The `start` position on the given `Input` may change during iteration,
/// but all other values are guaranteed to remain invariant.
///
/// The returned reference borrows from this iterator (lifetime `'s`),
/// while the `Input` itself keeps the iterator's `'h` lifetime parameter.
#[inline]
pub fn input<'s>(&'s self) -> &'s Input<'h> {
// Forwarded directly to the `input` accessor of `self.splits`.
self.splits.input()
}
}

impl<'r, 'h> Iterator for SplitN<'r, 'h> {
type Item = Span;

Expand Down
4 changes: 3 additions & 1 deletion regex-automata/src/meta/strategy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,9 @@ impl Strategy for Core {
input,
patset,
) {
Ok(()) => return,
Ok(()) => {
return;
}
Err(err) => err,
};
trace!("fast overlapping search failed: {}", _err);
Expand Down
6 changes: 4 additions & 2 deletions regex-automata/src/meta/wrappers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,8 @@ impl HybridEngine {
dfa_config
.clone()
.match_kind(MatchKind::All)
.prefilter(None),
.prefilter(None)
.specialize_start_states(false),
)
.build_from_nfa(nfarev.clone());
let rev = match result {
Expand Down Expand Up @@ -881,7 +882,8 @@ impl DFAEngine {
// don't.)
.start_kind(dfa::StartKind::Anchored)
.match_kind(MatchKind::All)
.prefilter(None),
.prefilter(None)
.specialize_start_states(false),
)
.build_from_nfa(&nfarev);
let rev = match result {
Expand Down
3 changes: 3 additions & 0 deletions regex-automata/src/nfa/thompson/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1309,6 +1309,9 @@ mod tests {
// is built.
#[test]
fn state_has_small_size() {
// Pin the in-memory size of `State` for each pointer width, so that any
// change to its size shows up as a test failure. Targets whose pointer
// width is neither 64 nor 32 bits make no assertion here.
#[cfg(target_pointer_width = "64")]
assert_eq!(32, core::mem::size_of::<State>());
#[cfg(target_pointer_width = "32")]
assert_eq!(16, core::mem::size_of::<State>());
}
}
12 changes: 12 additions & 0 deletions regex-automata/src/nfa/thompson/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,18 @@ enum BuildErrorKind {
}

impl BuildError {
/// Reports the configured size limit when this error was caused by the NFA
/// exceeding that limit before it could be built.
///
/// `Some(limit)` is returned only for the `ExceededSizeLimit` kind of
/// error; every other kind yields `None`.
///
/// The value returned is the limit as configured, and corresponds to the
/// maximum amount of heap usage in bytes.
pub fn size_limit(&self) -> Option<usize> {
    if let BuildErrorKind::ExceededSizeLimit { limit } = self.kind {
        Some(limit)
    } else {
        None
    }
}

/// Returns a reference to the kind of this error.
fn kind(&self) -> &BuildErrorKind {
&self.kind
}
Expand Down

0 comments on commit 4176b6c

Please sign in to comment.