Skip to content

Commit

Permalink
Permit use of (?-u) in byte-regex strategies (#336)
Browse files Browse the repository at this point in the history
It is desirable to be able to generate, from a regex, byte sequences
that are not necessarily valid UTF-8.  For example, suppose you have a
parser that accepts any string generated by the regex

```
[0-9]+(\.[0-9]*)?
```

Then, in your test suite, you might want to generate strings from the
complementary regular language, which you could do with the regex

```
(?s:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*)
```

However, this will still only generate valid UTF-8 strings.  Maybe you
are parsing directly from byte sequences read from disk, in which case
you want to test the parser’s ability to reject invalid UTF-8 _as well
as_ valid UTF-8 but not within the accepted language.  Then you want
this slight variation:

```
(?s-u:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*)
```

But this regex will be rejected by `bytes_regex`, because by default
`regex_syntax::Parser` errors out on any regex that potentially
matches invalid UTF-8.  The application — i.e. proptest — must opt
into use of such regexes.  This patch makes proptest do just that, for
`bytes_regex` only.

There should be no change to the behavior of any existing test suite,
because opting to allow use of `(?-u)` does not change the semantics
of any regex that _doesn’t_ contain `(?-u)`, and any existing regex
that _does_ contain `(?-u)` must be incapable of generating invalid
UTF-8 for other reasons, or `regex_syntax::Parser` would be rejecting it.
(For example, `(?-u:[a-z])` cannot generate invalid UTF-8.)

This patch also adds a bunch of tests for `bytes_regex`, which AFAICT
was not being tested at all.  Some of these use the new functionality
and others don’t.  There is quite a bit of code duplication in the
test helper functions — `do_test` and `do_test_bytes` are almost
identical, as are `generate_values_matching_regex` and
`generate_byte_values_matching_regex`.  I am not good enough at
generic metaprogramming in Rust to factor out the duplication.
  • Loading branch information
zackw committed Jun 29, 2023
1 parent 7d840ca commit 51ef282
Showing 1 changed file with 103 additions and 8 deletions.
111 changes: 103 additions & 8 deletions proptest/src/string.rs
Expand Up @@ -23,7 +23,7 @@ use regex_syntax::hir::{
RepetitionKind::{self, *},
RepetitionRange::*,
};
use regex_syntax::{Error as ParseError, Parser};
use regex_syntax::{Error as ParseError, ParserBuilder};

use crate::bool;
use crate::char;
Expand Down Expand Up @@ -150,7 +150,8 @@ impl StrategyFromRegex for Vec<u8> {
/// If you don't need error handling and aren't limited by setup time, it is
/// also possible to directly use a `&str` as a strategy with the same effect.
pub fn string_regex(regex: &str) -> ParseResult<String> {
string_regex_parsed(&regex_to_hir(regex)?)
let hir = ParserBuilder::new().build().parse(regex)?;
string_regex_parsed(&hir)
}

/// Like `string_regex()`, but allows providing a pre-parsed expression.
Expand All @@ -167,8 +168,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {

/// Creates a strategy which generates byte strings matching the given regular
/// expression.
///
/// By default, the byte strings generated by this strategy _will_ be valid
/// UTF-8. If you wish to generate byte strings that aren't (necessarily)
/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`.
/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.`
/// will generate newline characters (byte value `0x0A`). See the
/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support)
/// for more information.
pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
bytes_regex_parsed(&regex_to_hir(regex)?)
let hir = ParserBuilder::new()
.allow_invalid_utf8(true)
.build()
.parse(regex)?;
bytes_regex_parsed(&hir)
}

/// Like `bytes_regex()`, but allows providing a pre-parsed expression.
Expand Down Expand Up @@ -357,10 +370,6 @@ fn to_bytes(khar: char) -> Vec<u8> {
khar.encode_utf8(&mut buf).as_bytes().to_owned()
}

fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
Ok(Parser::new().parse(pattern)?)
}

fn unsupported<T>(error: &'static str) -> Result<T, Error> {
Err(Error::UnsupportedRegex(error))
}
Expand All @@ -370,9 +379,17 @@ mod test {
use std::collections::HashSet;

use regex::Regex;
use regex::bytes::Regex as BytesRegex;

use super::*;

fn printable_ascii(v: &[u8]) -> String {
v.iter()
.flat_map(|c| std::ascii::escape_default(*c))
.map(|c| char::from_u32(c.into()).unwrap())
.collect()
}

fn do_test(
pattern: &str,
min_distinct: usize,
Expand All @@ -396,6 +413,29 @@ mod test {
);
}

fn do_test_bytes(
pattern: &str,
min_distinct: usize,
max_distinct: usize,
iterations: usize,
) {
let generated = generate_byte_values_matching_regex(pattern, iterations);
assert!(
generated.len() >= min_distinct,
"Expected to generate at least {} strings, but only \
generated {}",
min_distinct,
generated.len()
);
assert!(
generated.len() <= max_distinct,
"Expected to generate at most {} strings, but \
generated {}",
max_distinct,
generated.len()
);
}

fn generate_values_matching_regex(
pattern: &str,
iterations: usize,
Expand Down Expand Up @@ -432,6 +472,42 @@ mod test {
generated
}

fn generate_byte_values_matching_regex(
pattern: &str,
iterations: usize,
) -> HashSet<Vec<u8>> {
let rx = BytesRegex::new(pattern).unwrap();
let mut generated = HashSet::new();

let strategy = bytes_regex(pattern).unwrap();
let mut runner = TestRunner::deterministic();
for _ in 0..iterations {
let mut value = strategy.new_tree(&mut runner).unwrap();

loop {
let s = value.current();
let ok = if let Some(matsch) = rx.find(&s) {
0 == matsch.start() && s.len() == matsch.end()
} else {
false
};
if !ok {
panic!(
"Generated string {:?} which does not match {:?}",
printable_ascii(&s), pattern
);
}

generated.insert(s);

if !value.simplify() {
break;
}
}
}
generated
}

#[test]
fn test_case_insensitive_produces_all_available_values() {
let mut expected: HashSet<String> = HashSet::new();
Expand All @@ -445,6 +521,7 @@ mod test {
#[test]
fn test_literal() {
do_test("foo", 1, 1, 8);
do_test_bytes("foo", 1, 1, 8);
}

#[test]
Expand All @@ -455,36 +532,43 @@ mod test {
#[test]
fn test_alternation() {
do_test("foo|bar|baz", 3, 3, 16);
do_test_bytes("foo|bar|baz", 3, 3, 16);
}

#[test]
fn test_repitition() {
fn test_repetition() {
do_test("a{0,8}", 9, 9, 64);
do_test_bytes("a{0,8}", 9, 9, 64);
}

#[test]
fn test_question() {
do_test("a?", 2, 2, 16);
do_test_bytes("a?", 2, 2, 16);
}

#[test]
fn test_star() {
do_test("a*", 33, 33, 256);
do_test_bytes("a*", 33, 33, 256);
}

#[test]
fn test_plus() {
do_test("a+", 32, 32, 256);
do_test_bytes("a+", 32, 32, 256);
}

#[test]
fn test_n_to_range() {
do_test("a{4,}", 4, 4, 64);
do_test_bytes("a{4,}", 4, 4, 64);
}

#[test]
fn test_concatenation() {
do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
}

#[test]
Expand All @@ -505,13 +589,24 @@ mod test {
#[test]
fn test_dot_s() {
do_test("(?s).", 200, 65536, 256);
do_test_bytes("(?s-u).", 256, 256, 2048);
}

#[test]
fn test_backslash_d_plus() {
do_test("\\d+", 1, 65536, 256);
}

#[test]
fn test_non_utf8_byte_strings() {
do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
do_test_bytes(r#"(?x-u)
\xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
( \xed ( \xb0\x80 | \xbf\xbf ) )? )
| \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
}

fn assert_send_and_sync<T: Send + Sync>(_: T) {}

#[test]
Expand Down

0 comments on commit 51ef282

Please sign in to comment.