Bad(?) performance with a specific regex with backreferences #101

intelfx · 2022-09-04T01:34:00Z

Consider this regular expression:

(..)\1\1\1$

This regular expression should match 4 identical groups of 2 characters at the end of the input string (e. g. 123456789ABABABAB should match).

When using fancy-regex to match this regular expression against a 40-character string, a single iteration takes almost 6 µs if using the normal Regex.is_match() interface whereas manually compiling and running an Expr (as it is done in benchmarks) takes ~300 ns.

For comparison, an equivalent RE using only regex syntax ((00000000|...|FFFFFFFF)$) takes ~170 ns to run against the same 40-character string.

I understand that Regex wraps the input RE in a super-expression implementing the "match anywhere" semantics, which probably explains the 20x performance difference, but I'm wondering if this is expected (and optimizing further is a non-goal or a wishlist-level goal) or if this is unexpected/a bug.

bench.rs

#[macro_use]
extern crate criterion;
extern crate rand;

use criterion::Criterion;
use std::time::Duration;

use fancy_regex::internal::{analyze, compile, run_default};
use fancy_regex::Expr;

use rand::Rng;

struct Hexadecimal;
impl rand::distributions::Distribution<u8> for Hexadecimal {
    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u8 {
        const RANGE: u32 = 16;
        const GEN_ASCII_STR_CHARSET: &[u8] = b"0123456789ABCDEF";
        // We can pick from 62 characters. This is so close to a power of 2, 64,
        // that we can do better than `Uniform`. Use a simple bitshift and
        // rejection sampling. We do not use a bitmask, because for small RNGs
        // the most significant bits are usually of higher quality.
        loop {
            let var = rng.next_u32() >> (32 - 4);
            if var < RANGE {
                return GEN_ASCII_STR_CHARSET[var as usize];
            }
        }
    }
}

fn check_normal_expr(c: &mut Criterion) {
    let re_str = "(00000000|01010101|02020202|03030303|04040404|05050505|06060606|07070707|08080808|09090909|0A0A0A0A|0B0B0B0B|0C0C0C0C|0D0D0D0D|0E0E0E0E|0F0F0F0F|10101010|11111111|12121212|13131313|14141414|15151515|16161616|17171717|18181818|19191919|1A1A1A1A|1B1B1B1B|1C1C1C1C|1D1D1D1D|1E1E1E1E|1F1F1F1F|20202020|21212121|22222222|23232323|24242424|25252525|26262626|27272727|28282828|29292929|2A2A2A2A|2B2B2B2B|2C2C2C2C|2D2D2D2D|2E2E2E2E|2F2F2F2F|30303030|31313131|32323232|33333333|34343434|35353535|36363636|37373737|38383838|39393939|3A3A3A3A|3B3B3B3B|3C3C3C3C|3D3D3D3D|3E3E3E3E|3F3F3F3F|40404040|41414141|42424242|43434343|44444444|45454545|46464646|47474747|48484848|49494949|4A4A4A4A|4B4B4B4B|4C4C4C4C|4D4D4D4D|4E4E4E4E|4F4F4F4F|50505050|51515151|52525252|53535353|54545454|55555555|56565656|57575757|58585858|59595959|5A5A5A5A|5B5B5B5B|5C5C5C5C|5D5D5D5D|5E5E5E5E|5F5F5F5F|60606060|61616161|62626262|63636363|64646464|65656565|66666666|67676767|68686868|69696969|6A6A6A6A|6B6B6B6B|6C6C6C6C|6D6D6D6D|6E6E6E6E|6F6F6F6F|70707070|71717171|72727272|73737373|74747474|75757575|76767676|77777777|78787878|79797979|7A7A7A7A|7B7B7B7B|7C7C7C7C|7D7D7D7D|7E7E7E7E|7F7F7F7F|80808080|81818181|82828282|83838383|84848484|85858585|86868686|87878787|88888888|89898989|8A8A8A8A|8B8B8B8B|8C8C8C8C|8D8D8D8D|8E8E8E8E|8F8F8F8F|90909090|91919191|92929292|93939393|94949494|95959595|96969696|97979797|98989898|99999999|9A9A9A9A|9B9B9B9B|9C9C9C9C|9D9D9D9D|9E9E9E9E|9F9F9F9F|A0A0A0A0|A1A1A1A1|A2A2A2A2|A3A3A3A3|A4A4A4A4|A5A5A5A5|A6A6A6A6|A7A7A7A7|A8A8A8A8|A9A9A9A9|AAAAAAAA|ABABABAB|ACACACAC|ADADADAD|AEAEAEAE|AFAFAFAF|B0B0B0B0|B1B1B1B1|B2B2B2B2|B3B3B3B3|B4B4B4B4|B5B5B5B5|B6B6B6B6|B7B7B7B7|B8B8B8B8|B9B9B9B9|BABABABA|BBBBBBBB|BCBCBCBC|BDBDBDBD|BEBEBEBE|BFBFBFBF|C0C0C0C0|C1C1C1C1|C2C2C2C2|C3C3C3C3|C4C4C4C4|C5C5C5C5|C6C6C6C6|C7C7C7C7|C8C8C8C8|C9C9C9C9|CACACACA|CBCBCBCB|CCCCCCCC|CDCDCDCD|CECECECE|CFCFCFCF|D0D0D0D0|D1D1D1D1|D2D2D2D2|D3D3D3D3|D4D4D4D4|D5D5D5D5|D6D6D6D6|D7D7D7D7|D8D8D8D8|D9D9D9D9|DADADADA|DBDBDBDB|DCDCDCDC|DDDDDDDD|DEDEDEDE|DFDFDFDF|E0E0E0E0|E1E1E1E1|E2E2E2E2|E3E3E3E3|E4E4E4E4|E5E5E5E5|E6E6E6E6|E7E7E7E7|E8E8E8E8|E9E9E9E9|EAEAEAEA|EBEBEBEB|ECECECEC|EDEDEDED|EEEEEEEE|EFEFEFEF|F0F0F0F0|F1F1F1F1|F2F2F2F2|F3F3F3F3|F4F4F4F4|F5F5F5F5|F6F6F6F6|F7F7F7F7|F8F8F8F8|F9F9F9F9|FAFAFAFA|FBFBFBFB|FCFCFCFC|FDFDFDFD|FEFEFEFE|FFFFFFFF)$";
    let tree = Expr::parse_tree(re_str).unwrap();
    let a = analyze(&tree).unwrap();
    let p = compile(&a).unwrap();
    c.bench_function("check_normal_expr", |b| {
        b.iter(|| {
            let rng = rand::thread_rng();
            let string: String = rng.sample_iter(Hexadecimal).take(40).map(char::from).collect();
            run_default(&p, &string, 0).unwrap()
        });
    });
}

fn check_normal_regex(c: &mut Criterion) {
    let re_str = "(00000000|01010101|02020202|03030303|04040404|05050505|06060606|07070707|08080808|09090909|0A0A0A0A|0B0B0B0B|0C0C0C0C|0D0D0D0D|0E0E0E0E|0F0F0F0F|10101010|11111111|12121212|13131313|14141414|15151515|16161616|17171717|18181818|19191919|1A1A1A1A|1B1B1B1B|1C1C1C1C|1D1D1D1D|1E1E1E1E|1F1F1F1F|20202020|21212121|22222222|23232323|24242424|25252525|26262626|27272727|28282828|29292929|2A2A2A2A|2B2B2B2B|2C2C2C2C|2D2D2D2D|2E2E2E2E|2F2F2F2F|30303030|31313131|32323232|33333333|34343434|35353535|36363636|37373737|38383838|39393939|3A3A3A3A|3B3B3B3B|3C3C3C3C|3D3D3D3D|3E3E3E3E|3F3F3F3F|40404040|41414141|42424242|43434343|44444444|45454545|46464646|47474747|48484848|49494949|4A4A4A4A|4B4B4B4B|4C4C4C4C|4D4D4D4D|4E4E4E4E|4F4F4F4F|50505050|51515151|52525252|53535353|54545454|55555555|56565656|57575757|58585858|59595959|5A5A5A5A|5B5B5B5B|5C5C5C5C|5D5D5D5D|5E5E5E5E|5F5F5F5F|60606060|61616161|62626262|63636363|64646464|65656565|66666666|67676767|68686868|69696969|6A6A6A6A|6B6B6B6B|6C6C6C6C|6D6D6D6D|6E6E6E6E|6F6F6F6F|70707070|71717171|72727272|73737373|74747474|75757575|76767676|77777777|78787878|79797979|7A7A7A7A|7B7B7B7B|7C7C7C7C|7D7D7D7D|7E7E7E7E|7F7F7F7F|80808080|81818181|82828282|83838383|84848484|85858585|86868686|87878787|88888888|89898989|8A8A8A8A|8B8B8B8B|8C8C8C8C|8D8D8D8D|8E8E8E8E|8F8F8F8F|90909090|91919191|92929292|93939393|94949494|95959595|96969696|97979797|98989898|99999999|9A9A9A9A|9B9B9B9B|9C9C9C9C|9D9D9D9D|9E9E9E9E|9F9F9F9F|A0A0A0A0|A1A1A1A1|A2A2A2A2|A3A3A3A3|A4A4A4A4|A5A5A5A5|A6A6A6A6|A7A7A7A7|A8A8A8A8|A9A9A9A9|AAAAAAAA|ABABABAB|ACACACAC|ADADADAD|AEAEAEAE|AFAFAFAF|B0B0B0B0|B1B1B1B1|B2B2B2B2|B3B3B3B3|B4B4B4B4|B5B5B5B5|B6B6B6B6|B7B7B7B7|B8B8B8B8|B9B9B9B9|BABABABA|BBBBBBBB|BCBCBCBC|BDBDBDBD|BEBEBEBE|BFBFBFBF|C0C0C0C0|C1C1C1C1|C2C2C2C2|C3C3C3C3|C4C4C4C4|C5C5C5C5|C6C6C6C6|C7C7C7C7|C8C8C8C8|C9C9C9C9|CACACACA|CBCBCBCB|CCCCCCCC|CDCDCDCD|CECECECE|CFCFCFCF|D0D0D0D0|D1D1D1D1|D2D2D2D2|D3D3D3D3|D4D4D4D4|D5D5D5D5|D6D6D6D6|D7D7D7D7|D8D8D8D8|D9D9D9D9|DADADADA|DBDBDBDB|DCDCDCDC|DDDDDDDD|DEDEDEDE|DFDFDFDF|E0E0E0E0|E1E1E1E1|E2E2E2E2|E3E3E3E3|E4E4E4E4|E5E5E5E5|E6E6E6E6|E7E7E7E7|E8E8E8E8|E9E9E9E9|EAEAEAEA|EBEBEBEB|ECECECEC|EDEDEDED|EEEEEEEE|EFEFEFEF|F0F0F0F0|F1F1F1F1|F2F2F2F2|F3F3F3F3|F4F4F4F4|F5F5F5F5|F6F6F6F6|F7F7F7F7|F8F8F8F8|F9F9F9F9|FAFAFAFA|FBFBFBFB|FCFCFCFC|FDFDFDFD|FEFEFEFE|FFFFFFFF)$";
    let regex = fancy_regex::Regex::new(re_str).unwrap();
    c.bench_function("check_normal_regex", |b| {
        b.iter(|| {
            let rng = rand::thread_rng();
            let string: String = rng.sample_iter(Hexadecimal).take(40).map(char::from).collect();
            regex.is_match(&string).unwrap()
        });
    });
}

// The following regex is a pathological case for backtracking
// implementations, see README.md:
fn check_backref_expr(c: &mut Criterion) {
    let tree = Expr::parse_tree("((..)\\1\\1\\1$)").unwrap();
    let a = analyze(&tree).unwrap();
    let p = compile(&a).unwrap();
    c.bench_function("check_backref_expr", |b| {
        b.iter(|| {
            let rng = rand::thread_rng();
            let string: String = rng.sample_iter(Hexadecimal).take(40).map(char::from).collect();
            run_default(&p, &string, 0).unwrap()
        });
    });
}

fn check_backref_regex(c: &mut Criterion) {
    let regex = fancy_regex::Regex::new("(..)\\1\\1\\1$").unwrap();
    c.bench_function("check_backref_regex", |b| {
        b.iter(|| {
            let rng = rand::thread_rng();
            let string: String = rng.sample_iter(Hexadecimal).take(40).map(char::from).collect();
            regex.is_match(&string).unwrap()
        });
    });
}

criterion_group!(
    name = checks;
    config = Criterion::default().warm_up_time(Duration::from_secs(10));
    targets =
    check_normal_expr,
    check_normal_regex,
    check_backref_expr,
    check_backref_regex
);

criterion_main!(checks);

`cargo bench` output

     Running benches/bench.rs (target/release/deps/bench-8117c8c31bf52a35)
WARNING: HTML report generation will become a non-default optional feature in Criterion.rs 0.4.0.
This feature is being moved to cargo-criterion (https://github.com/bheisler/cargo-criterion) and will be optional in a future version of Criterion.rs. To silence this warning, either switch to cargo-criterion or enable the 'html_reports' feature in your Cargo.toml.

Gnuplot not found, using plotters backend
check_normal_expr       time:   [2.6787 us 2.7315 us 2.7891 us]
Found 5 outliers among 100 measurements (5.00%)
  4 (4.00%) high mild
  1 (1.00%) high severe

check_normal_regex      time:   [168.07 ns 169.70 ns 171.54 ns]
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

check_backref_expr      time:   [304.59 ns 306.39 ns 308.37 ns]
Found 5 outliers among 100 measurements (5.00%)
  5 (5.00%) high mild

check_backref_regex     time:   [6.4816 us 6.8658 us 7.3006 us]
Found 15 outliers among 100 measurements (15.00%)
  4 (4.00%) high mild
  11 (11.00%) high severe

The text was updated successfully, but these errors were encountered:

robinst · 2023-01-12T03:30:13Z

When using fancy-regex to match this regular expression against a 40-character string, a single iteration takes almost 6 µs if using the normal Regex.is_match() interface whereas manually compiling and running an Expr (as it is done in benchmarks) takes ~300 ns.

For the latter case (check_backref_expr), can you assert that the result is actually what you expect? I think the way it's written it would never actually match (e.g. against 123456789ABABABAB).

Apart from that, the runtime sounds about right with the current state of optimizations. The reason it's slow is that the pattern has to be tried 32 times before it succeeds; starting at position 0, then 1, until it finally succeeds at position 32. The non-backref regex with the regex crate might be able to figure out that it's a constant length and so can only ever potentially match the last 8 characters, or run the search backwards.

I wonder what other engines do, for example Oniguruma. It seems to be pretty fast with 364.14 ns here. I had a look through the code and I think it might have optimizations for a fixed size match anchored at the end, e.g. see:

https://github.com/kkos/oniguruma/blob/master/src/regcomp.c#L7024-L7027

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Bad(?) performance with a specific regex with backreferences #101

Bad(?) performance with a specific regex with backreferences #101

intelfx commented Sep 4, 2022

robinst commented Jan 12, 2023

Bad(?) performance with a specific regex with backreferences #101

Bad(?) performance with a specific regex with backreferences #101

Comments

intelfx commented Sep 4, 2022

robinst commented Jan 12, 2023