Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Permit use of (?-u) in byte-regex strategies (#336) #337

Merged
merged 2 commits into from Sep 24, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
111 changes: 103 additions & 8 deletions proptest/src/string.rs
Expand Up @@ -17,7 +17,7 @@ use core::ops::RangeInclusive;
use core::u32;

use regex_syntax::hir::{self, Hir, HirKind::*, Repetition};
use regex_syntax::{Error as ParseError, Parser};
use regex_syntax::{Error as ParseError, ParserBuilder};

use crate::bool;
use crate::char;
Expand Down Expand Up @@ -144,7 +144,8 @@ impl StrategyFromRegex for Vec<u8> {
/// If you don't need error handling and aren't limited by setup time, it is
/// also possible to directly use a `&str` as a strategy with the same effect.
pub fn string_regex(regex: &str) -> ParseResult<String> {
string_regex_parsed(&regex_to_hir(regex)?)
let hir = ParserBuilder::new().build().parse(regex)?;
string_regex_parsed(&hir)
}

/// Like `string_regex()`, but allows providing a pre-parsed expression.
Expand All @@ -161,8 +162,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {

/// Creates a strategy which generates byte strings matching the given regular
/// expression.
///
/// By default, the byte strings generated by this strategy _will_ be valid
/// UTF-8. If you wish to generate byte strings that aren't (necessarily)
/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`.
/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.`
/// will generate newline characters (byte value `0x0A`). See the
/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support)
/// for more information.
pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
bytes_regex_parsed(&regex_to_hir(regex)?)
let hir = ParserBuilder::new()
.utf8(false)
.build()
.parse(regex)?;
bytes_regex_parsed(&hir)
}

/// Like `bytes_regex()`, but allows providing a pre-parsed expression.
Expand Down Expand Up @@ -340,10 +353,6 @@ fn to_bytes(khar: char) -> Vec<u8> {
khar.encode_utf8(&mut buf).as_bytes().to_owned()
}

fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
Ok(Parser::new().parse(pattern)?)
}

fn unsupported<T>(error: &'static str) -> Result<T, Error> {
Err(Error::UnsupportedRegex(error))
}
Expand All @@ -353,9 +362,17 @@ mod test {
use std::collections::HashSet;

use regex::Regex;
use regex::bytes::Regex as BytesRegex;

use super::*;

fn printable_ascii(v: &[u8]) -> String {
v.iter()
.flat_map(|c| std::ascii::escape_default(*c))
.map(|c| char::from_u32(c.into()).unwrap())
.collect()
}

fn do_test(
pattern: &str,
min_distinct: usize,
Expand All @@ -379,6 +396,29 @@ mod test {
);
}

fn do_test_bytes(
pattern: &str,
min_distinct: usize,
max_distinct: usize,
iterations: usize,
) {
let generated = generate_byte_values_matching_regex(pattern, iterations);
assert!(
generated.len() >= min_distinct,
"Expected to generate at least {} strings, but only \
generated {}",
min_distinct,
generated.len()
);
assert!(
generated.len() <= max_distinct,
"Expected to generate at most {} strings, but \
generated {}",
max_distinct,
generated.len()
);
}

fn generate_values_matching_regex(
pattern: &str,
iterations: usize,
Expand Down Expand Up @@ -415,6 +455,42 @@ mod test {
generated
}

fn generate_byte_values_matching_regex(
pattern: &str,
iterations: usize,
) -> HashSet<Vec<u8>> {
let rx = BytesRegex::new(pattern).unwrap();
let mut generated = HashSet::new();

let strategy = bytes_regex(pattern).unwrap();
let mut runner = TestRunner::deterministic();
for _ in 0..iterations {
let mut value = strategy.new_tree(&mut runner).unwrap();

loop {
let s = value.current();
let ok = if let Some(matsch) = rx.find(&s) {
0 == matsch.start() && s.len() == matsch.end()
} else {
false
};
if !ok {
panic!(
"Generated string {:?} which does not match {:?}",
printable_ascii(&s), pattern
);
}

generated.insert(s);

if !value.simplify() {
break;
}
}
}
generated
}

#[test]
fn test_case_insensitive_produces_all_available_values() {
let mut expected: HashSet<String> = HashSet::new();
Expand All @@ -428,6 +504,7 @@ mod test {
#[test]
fn test_literal() {
do_test("foo", 1, 1, 8);
do_test_bytes("foo", 1, 1, 8);
}

#[test]
Expand All @@ -438,36 +515,43 @@ mod test {
#[test]
fn test_alternation() {
do_test("foo|bar|baz", 3, 3, 16);
do_test_bytes("foo|bar|baz", 3, 3, 16);
}

#[test]
fn test_repitition() {
fn test_repetition() {
do_test("a{0,8}", 9, 9, 64);
do_test_bytes("a{0,8}", 9, 9, 64);
}

#[test]
fn test_question() {
do_test("a?", 2, 2, 16);
do_test_bytes("a?", 2, 2, 16);
}

#[test]
fn test_star() {
do_test("a*", 33, 33, 256);
do_test_bytes("a*", 33, 33, 256);
}

#[test]
fn test_plus() {
do_test("a+", 32, 32, 256);
do_test_bytes("a+", 32, 32, 256);
}

#[test]
fn test_n_to_range() {
do_test("a{4,}", 4, 4, 64);
do_test_bytes("a{4,}", 4, 4, 64);
}

#[test]
fn test_concatenation() {
do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
}

#[test]
Expand All @@ -488,13 +572,24 @@ mod test {
#[test]
fn test_dot_s() {
do_test("(?s).", 200, 65536, 256);
do_test_bytes("(?s-u).", 256, 256, 2048);
}

#[test]
fn test_backslash_d_plus() {
do_test("\\d+", 1, 65536, 256);
}

#[test]
fn test_non_utf8_byte_strings() {
do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
do_test_bytes(r#"(?x-u)
\xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
( \xed ( \xb0\x80 | \xbf\xbf ) )? )
| \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
}

fn assert_send_and_sync<T: Send + Sync>(_: T) {}

#[test]
Expand Down