Permit use of (?-u) in byte-regex strategies (#336)

It is desirable to be able to generate, from a regex, byte sequences that are not necessarily valid UTF-8. For example, suppose you have a parser that accepts any string generated by the regex ``` [0-9]+(\.[0-9]*)? ``` Then, in your test suite, you might want to generate strings from the complementary regular language, which you could do with the regex ``` (?s:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*) ``` However, this will still only generate valid UTF-8 strings. Maybe you are parsing directly from byte sequences read from disk, in which case you want to test the parser’s ability to reject invalid UTF-8 _as well as_ valid UTF-8 but not within the accepted language. Then you want this slight variation: ``` (?s-u:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*) ``` But this regex will be rejected by `bytes_regex`, because by default `regex_syntax::Parser` errors out on any regex that potentially matches invalid UTF-8. The application — i.e. proptest — must opt into use of such regexes. This patch makes proptest do just that, for `bytes_regex` only. There should be no change to the behavior of any existing test suite, because opting to allow use of `(?-u)` does not change the semantics of any regex that _doesn’t_ contain `(?-u)`, and any existing regex that _does_ contain `(?-u)` must be incapable of generating invalid UTF-8 for other reasons, or `regex_syntax::Parser` would be rejecting it. (For example, `(?-u:[a-z])` cannot generate invalid UTF-8.) This patch also adds a bunch of tests for `bytes_regex`, which AFAICT was not being tested at all. Some of these use the new functionality and others don’t. There is quite a bit of code duplication in the test helper functions — `do_test` and `do_test_bytes` are almost identical, as are `generate_values_matching_regex` and `generate_byte_values_matching_regex`. I am not good enough at generic metaprogramming in Rust to factor out the duplication.
proptest-rs · Jun 29, 2023 · 51ef282 · 51ef282
1 parent 7d840ca
commit 51ef282
Showing 1 changed file with 103 additions and 8 deletions.
diff --git a/proptest/src/string.rs b/proptest/src/string.rs
@@ -23,7 +23,7 @@ use regex_syntax::hir::{
     RepetitionKind::{self, *},
     RepetitionRange::*,
 };
-use regex_syntax::{Error as ParseError, Parser};
+use regex_syntax::{Error as ParseError, ParserBuilder};
 
 use crate::bool;
 use crate::char;
@@ -150,7 +150,8 @@ impl StrategyFromRegex for Vec<u8> {
 /// If you don't need error handling and aren't limited by setup time, it is
 /// also possible to directly use a `&str` as a strategy with the same effect.
 pub fn string_regex(regex: &str) -> ParseResult<String> {
-    string_regex_parsed(&regex_to_hir(regex)?)
+    let hir = ParserBuilder::new().build().parse(regex)?;
+    string_regex_parsed(&hir)
 }
 
 /// Like `string_regex()`, but allows providing a pre-parsed expression.
@@ -167,8 +168,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
 
 /// Creates a strategy which generates byte strings matching the given regular
 /// expression.
+///
+/// By default, the byte strings generated by this strategy _will_ be valid
+/// UTF-8.  If you wish to generate byte strings that aren't (necessarily)
+/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`.
+/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.`
+/// will generate newline characters (byte value `0x0A`).  See the
+/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support)
+/// for more information.
 pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
-    bytes_regex_parsed(&regex_to_hir(regex)?)
+    let hir = ParserBuilder::new()
+        .allow_invalid_utf8(true)
+        .build()
+        .parse(regex)?;
+    bytes_regex_parsed(&hir)
 }
 
 /// Like `bytes_regex()`, but allows providing a pre-parsed expression.
@@ -357,10 +370,6 @@ fn to_bytes(khar: char) -> Vec<u8> {
     khar.encode_utf8(&mut buf).as_bytes().to_owned()
 }
 
-fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
-    Ok(Parser::new().parse(pattern)?)
-}
-
 fn unsupported<T>(error: &'static str) -> Result<T, Error> {
     Err(Error::UnsupportedRegex(error))
 }
@@ -370,9 +379,17 @@ mod test {
     use std::collections::HashSet;
 
     use regex::Regex;
+    use regex::bytes::Regex as BytesRegex;
 
     use super::*;
 
+    fn printable_ascii(v: &[u8]) -> String {
+        v.iter()
+            .flat_map(|c| std::ascii::escape_default(*c))
+            .map(|c| char::from_u32(c.into()).unwrap())
+            .collect()
+    }
+
     fn do_test(
         pattern: &str,
         min_distinct: usize,
@@ -396,6 +413,29 @@ mod test {
         );
     }
 
+    fn do_test_bytes(
+        pattern: &str,
+        min_distinct: usize,
+        max_distinct: usize,
+        iterations: usize,
+    ) {
+        let generated = generate_byte_values_matching_regex(pattern, iterations);
+        assert!(
+            generated.len() >= min_distinct,
+            "Expected to generate at least {} strings, but only \
+             generated {}",
+            min_distinct,
+            generated.len()
+        );
+        assert!(
+            generated.len() <= max_distinct,
+            "Expected to generate at most {} strings, but \
+             generated {}",
+            max_distinct,
+            generated.len()
+        );
+    }
+
     fn generate_values_matching_regex(
         pattern: &str,
         iterations: usize,
@@ -432,6 +472,42 @@ mod test {
         generated
     }
 
+    fn generate_byte_values_matching_regex(
+        pattern: &str,
+        iterations: usize,
+    ) -> HashSet<Vec<u8>> {
+        let rx = BytesRegex::new(pattern).unwrap();
+        let mut generated = HashSet::new();
+
+        let strategy = bytes_regex(pattern).unwrap();
+        let mut runner = TestRunner::deterministic();
+        for _ in 0..iterations {
+            let mut value = strategy.new_tree(&mut runner).unwrap();
+
+            loop {
+                let s = value.current();
+                let ok = if let Some(matsch) = rx.find(&s) {
+                    0 == matsch.start() && s.len() == matsch.end()
+                } else {
+                    false
+                };
+                if !ok {
+                    panic!(
+                        "Generated string {:?} which does not match {:?}",
+                        printable_ascii(&s), pattern
+                    );
+                }
+
+                generated.insert(s);
+
+                if !value.simplify() {
+                    break;
+                }
+            }
+        }
+        generated
+    }
+
     #[test]
     fn test_case_insensitive_produces_all_available_values() {
         let mut expected: HashSet<String> = HashSet::new();
@@ -445,6 +521,7 @@ mod test {
     #[test]
     fn test_literal() {
         do_test("foo", 1, 1, 8);
+        do_test_bytes("foo", 1, 1, 8);
     }
 
     #[test]
@@ -455,36 +532,43 @@ mod test {
     #[test]
     fn test_alternation() {
         do_test("foo|bar|baz", 3, 3, 16);
+        do_test_bytes("foo|bar|baz", 3, 3, 16);
     }
 
     #[test]
-    fn test_repitition() {
+    fn test_repetition() {
         do_test("a{0,8}", 9, 9, 64);
+        do_test_bytes("a{0,8}", 9, 9, 64);
     }
 
     #[test]
     fn test_question() {
         do_test("a?", 2, 2, 16);
+        do_test_bytes("a?", 2, 2, 16);
     }
 
     #[test]
     fn test_star() {
         do_test("a*", 33, 33, 256);
+        do_test_bytes("a*", 33, 33, 256);
     }
 
     #[test]
     fn test_plus() {
         do_test("a+", 32, 32, 256);
+        do_test_bytes("a+", 32, 32, 256);
     }
 
     #[test]
     fn test_n_to_range() {
         do_test("a{4,}", 4, 4, 64);
+        do_test_bytes("a{4,}", 4, 4, 64);
     }
 
     #[test]
     fn test_concatenation() {
         do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
+        do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
     }
 
     #[test]
@@ -505,13 +589,24 @@ mod test {
     #[test]
     fn test_dot_s() {
         do_test("(?s).", 200, 65536, 256);
+        do_test_bytes("(?s-u).", 256, 256, 2048);
     }
 
     #[test]
     fn test_backslash_d_plus() {
         do_test("\\d+", 1, 65536, 256);
     }
 
+    #[test]
+    fn test_non_utf8_byte_strings() {
+        do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
+        do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
+        do_test_bytes(r#"(?x-u)
+  \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
+          ( \xed ( \xb0\x80 | \xbf\xbf ) )? )
+        | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
+    }
+
     fn assert_send_and_sync<T: Send + Sync>(_: T) {}
 
     #[test]