be exact about whitespace

Be exact about whitespace in parsing. This changes pattern matching in `format::parse::parse` so that it no longer allows arbitrary whitespace before, after, or between the datetime specifiers. `format/parse.rs:datetime_from_str` is now exact about whitespace in both the passed data `s` and the passed strftime format `fmt`. Also be more exacting about colons and whitespace around timezones: instead of unlimited colons and whitespace, only match a limited set of leading colon-and-whitespace patterns. Issue #660
chronotope · Sep 2, 2022 · cb4e2ac · cb4e2ac
1 parent 6e05e83
commit cb4e2ac
Show file tree

Hide file tree

Showing 11 changed files with 1,028 additions and 161 deletions.
diff --git a/src/datetime/mod.rs b/src/datetime/mod.rs
@@ -510,13 +510,18 @@ impl DateTime<FixedOffset> {
     /// RFC 2822 is the internet message standard that specifies the
     /// representation of times in HTTP and email headers.
     ///
+    /// The RFC 2822 standard allows arbitrary intermixed whitespace.
+    /// See [RFC 2822 Appendix A.5].
+    ///
     /// ```
     /// # use chrono::{DateTime, FixedOffset, TimeZone};
     /// assert_eq!(
     ///     DateTime::parse_from_rfc2822("Wed, 18 Feb 2015 23:16:09 GMT").unwrap(),
     ///     FixedOffset::east(0).ymd(2015, 2, 18).and_hms(23, 16, 9)
     /// );
     /// ```
+    ///
+    /// [RFC 2822 Appendix A.5]: https://www.rfc-editor.org/rfc/rfc2822#appendix-A.5
     pub fn parse_from_rfc2822(s: &str) -> ParseResult<DateTime<FixedOffset>> {
         const ITEMS: &[Item<'static>] = &[Item::Fixed(Fixed::RFC2822)];
         let mut parsed = Parsed::new();

diff --git a/src/datetime/tests.rs b/src/datetime/tests.rs
diff --git a/src/format/mod.rs b/src/format/mod.rs
@@ -218,27 +218,27 @@ pub enum Fixed {
     ///
     /// It does not support parsing, its use in the parser is an immediate failure.
     TimezoneName,
-    /// Offset from the local time to UTC (`+09:00` or `-04:00` or `+00:00`).
+    /// Offset from the local time to UTC (`+09:00` or `-0400` or `+00:00`).
     ///
-    /// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace.
+    /// In the parser, the colon may be omitted.
     /// The offset is limited from `-24:00` to `+24:00`,
     /// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
     TimezoneOffsetColon,
-    /// Offset from the local time to UTC with seconds (`+09:00:00` or `-04:00:00` or `+00:00:00`).
+    /// Offset from the local time to UTC with seconds (`+09:00:00` or `-0400:00` or `+000000`).
     ///
-    /// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace.
+    /// In the parser, the colon may be omitted.
     /// The offset is limited from `-24:00:00` to `+24:00:00`,
     /// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
     TimezoneOffsetDoubleColon,
     /// Offset from the local time to UTC without minutes (`+09` or `-04` or `+00`).
     ///
-    /// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace.
+    /// In the parser, the colon may be omitted.
     /// The offset is limited from `-24` to `+24`,
     /// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
     TimezoneOffsetTripleColon,
-    /// Offset from the local time to UTC (`+09:00` or `-04:00` or `Z`).
+    /// Offset from the local time to UTC (`+09:00` or `-0400` or `Z`).
     ///
-    /// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace,
+    /// In the parser, the colon may be omitted,
     /// and `Z` can be either in upper case or in lower case.
     /// The offset is limited from `-24:00` to `+24:00`,
     /// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.

diff --git a/src/format/parse.rs b/src/format/parse.rs
diff --git a/src/format/scan.rs b/src/format/scan.rs
@@ -198,9 +198,69 @@ pub(super) fn space(s: &str) -> ParseResult<&str> {
     }
 }
 
-/// Consumes any number (including zero) of colon or spaces.
+/// Returns the slice of `s` that remains after its first `char`.
+///
+/// Returns an empty slice when `s` is empty or holds exactly one `char`,
+/// whatever the UTF-8 encoded length of that `char` is.
+pub(super) fn s_next(s: &str) -> &str {
+    let mut chars = s.chars();
+    // Step over the first `char` (if any); `as_str` then yields the remainder,
+    // which always starts on a `char` boundary. A byte-length guard
+    // (`s.len() <= 1`) is not enough here: a lone multi-byte `char` such as
+    // "😾" has `len() == 4` but no second `char`, which used to end in a panic.
+    let _ = chars.next();
+    chars.as_str()
+}
+
+/// Drops a single leading whitespace `char` from `s`, if there is one.
+///
+/// `s` is returned unchanged when it is empty or when its first `char` is
+/// not whitespace according to `char::is_whitespace`.
+pub(super) fn space1(s: &str) -> &str {
+    let starts_with_space = s.chars().next().map_or(false, char::is_whitespace);
+    if starts_with_space {
+        s_next(s)
+    } else {
+        s
+    }
+}
+
+/// Skips an optional colon together with at most one whitespace `char` of
+/// padding on either side of it.
+///
+/// At most one of the leading patterns `":"`, `" "`, `" :"`, `": "` or
+/// `" : "` is consumed (where `" "` stands for any single whitespace `char`);
+/// the rest of `s` is returned. Two leading whitespace `char`s in a row match
+/// none of the patterns, so `s` is then returned untouched.
 pub(super) fn colon_or_space(s: &str) -> ParseResult<&str> {
-    Ok(s.trim_left_matches(|c: char| c == ':' || c.is_whitespace()))
+    let mut chars = s.chars();
+    let (first, second, third) = (chars.next(), chars.next(), chars.next());
+    // Number of leading `char`s that belong to the matched pattern.
+    let consumed = match (first, second, third) {
+        // empty input
+        (None, _, _) => 0,
+        // ": "
+        (Some(':'), Some(c1), _) if c1.is_whitespace() => 2,
+        // ":" (at the end of input or followed by a non-whitespace `char`)
+        (Some(':'), _, _) => 1,
+        // neither a colon nor whitespace: nothing to skip
+        (Some(c0), _, _) if !c0.is_whitespace() => 0,
+        // From here on the first `char` is known to be whitespace.
+        // " " at the end of input
+        (Some(_), None, _) => 1,
+        // " : "
+        (Some(_), Some(':'), Some(c2)) if c2.is_whitespace() => 3,
+        // " :"
+        (Some(_), Some(':'), _) => 2,
+        // two whitespace `char`s in a row: no pattern applies
+        (Some(_), Some(c1), _) if c1.is_whitespace() => 0,
+        // " " followed by any other `char`
+        (Some(_), Some(_), _) => 1,
+    };
+    Ok((0..consumed).fold(s, |rest, _| s_next(rest)))
 }
 
 /// Tries to parse `[-+]\d\d` continued by `\d\d`. Return an offset in seconds if possible.
@@ -238,6 +298,16 @@ where
     };
     s = &s[1..];
 
+    // special check for `Z` to return more accurate error `INVALID`.
+    // Otherwise the upcoming match for digits might return error `TOO_SHORT`
+    // which is confusing for the user.
+    match s.as_bytes().first() {
+        Some(&b'Z') | Some(&b'z') => {
+            return Err(INVALID);
+        }
+        _ => {}
+    }
+
     // hours (00--99)
     let hours = match digits(s)? {
         (h1 @ b'0'..=b'9', h2 @ b'0'..=b'9') => i32::from((h1 - b'0') * 10 + (h2 - b'0')),
@@ -413,3 +483,68 @@ fn test_rfc2822_comments() {
         );
     }
 }
+
+#[test]
+fn test_space() {
+    // (input, expected result of `space`)
+    let cases = [
+        ("", Err(TOO_SHORT)),
+        (" ", Ok("")),
+        (" \t", Ok("")),
+        (" \ta", Ok("a")),
+        (" \ta ", Ok("a ")),
+        ("a", Err(INVALID)),
+        ("a ", Err(INVALID)),
+    ];
+    for &(input, ref expected) in cases.iter() {
+        assert_eq!(&space(input), expected, "space({:?})", input);
+    }
+}
+
+#[test]
+fn test_s_next() {
+    // (input, expected remainder after dropping the first `char`)
+    let cases = [
+        ("", ""),
+        (" ", ""),
+        ("a", ""),
+        ("ab", "b"),
+        ("abc", "bc"),
+        ("😾b", "b"),
+        ("a😾", "😾"),
+        ("😾bc", "bc"),
+        ("a😾c", "😾c"),
+    ];
+    for &(input, expected) in cases.iter() {
+        assert_eq!(s_next(input), expected, "s_next({:?})", input);
+    }
+}
+
+#[test]
+fn test_space1() {
+    // (input, expected remainder after dropping at most one leading whitespace)
+    let cases = [
+        ("", ""),
+        (" ", ""),
+        ("\t", ""),
+        ("\t\t", "\t"),
+        ("  ", " "),
+        ("a", "a"),
+        ("a ", "a "),
+        ("ab", "ab"),
+        ("😼", "😼"),
+        ("😼b", "😼b"),
+    ];
+    for &(input, expected) in cases.iter() {
+        assert_eq!(space1(input), expected, "space1({:?})", input);
+    }
+}
+
+#[test]
+fn test_colon_or_space() {
+    // (input, expected remainder inside `Ok`)
+    let cases = [
+        ("", ""),
+        (" ", ""),
+        (":", ""),
+        (" :", ""),
+        (": ", ""),
+        (" : ", ""),
+        (" :: ", ": "),
+        ("😸", "😸"),
+        ("😸😸", "😸😸"),
+        ("😸:", "😸:"),
+        ("😸 ", "😸 "),
+        (" 😸", "😸"),
+        (":😸", "😸"),
+        (":😸 ", "😸 "),
+        (" :😸", "😸"),
+        (" :😸 ", "😸 "),
+        (" :😸:", "😸:"),
+        (": 😸", "😸"),
+        (":  😸", " 😸"),
+        (": :😸", ":😸"),
+        (" : 😸", "😸"),
+        (" ::😸", ":😸"),
+        (" :: 😸", ": 😸"),
+    ];
+    for &(input, expected) in cases.iter() {
+        assert_eq!(colon_or_space(input), Ok(expected), "colon_or_space({:?})", input);
+    }
+}
diff --git a/src/format/strftime.rs b/src/format/strftime.rs
@@ -466,28 +466,54 @@ impl<'a> Iterator for StrftimeItems<'a> {
                 }
             }
 
-            // the next item is space
+            // whitespace
             Some(c) if c.is_whitespace() => {
-                // `%` is not a whitespace, so `c != '%'` is redundant
-                let nextspec = self
-                    .remainder
-                    .find(|c: char| !c.is_whitespace())
-                    .unwrap_or(self.remainder.len());
-                assert!(nextspec > 0);
-                let item = sp!(&self.remainder[..nextspec]);
-                self.remainder = &self.remainder[nextspec..];
+                // TODO(2022-08-30): this must compare the actual whitespace chars.
+                // Are any tests checking for mismatching whitespace? What about wide chars?
+                // The same applies to the literals case below.
+                let ws = self.remainder;
+                let mut end: usize = 0;
+                for (offset, c_) in self.remainder.char_indices() {
+                    if !c_.is_whitespace() {
+                        break;
+                    }
+                    // advance `end` by 1 char
+                    end = offset;
+                }
+                // get the offset of the last char too
+                end += match &self.remainder[end..].char_indices().nth(1) {
+                    Some((offset, _c)) => *offset,
+                    None => self.remainder[end..].len(),
+                };
+                self.remainder = &self.remainder[end..];
+                let item = sp!(&ws[..end]);
                 Some(item)
             }
 
-            // the next item is literal
-            _ => {
-                let nextspec = self
-                    .remainder
-                    .find(|c: char| c.is_whitespace() || c == '%')
-                    .unwrap_or(self.remainder.len());
-                assert!(nextspec > 0);
-                let item = lit!(&self.remainder[..nextspec]);
-                self.remainder = &self.remainder[nextspec..];
+            // literals
+            Some(_c) => {
+                let ws = self.remainder;
+                let mut end: usize = 0;
+                fn is_literal(c: &char) -> bool {
+                    if !c.is_whitespace() && c != &'%' {
+                        return true;
+                    }
+                    false
+                }
+                for (offset, c_) in self.remainder.char_indices() {
+                    if !is_literal(&c_) {
+                        break;
+                    }
+                    // advance `end` by 1 char
+                    end = offset;
+                }
+                // get the offset of the last char too
+                end += match &self.remainder[end..].char_indices().nth(1) {
+                    Some((offset, _c)) => *offset,
+                    None => self.remainder[end..].len(),
+                };
+                self.remainder = &self.remainder[end..];
+                let item = lit!(&ws[..end]);
                 Some(item)
             }
         }
@@ -499,8 +525,11 @@ impl<'a> Iterator for StrftimeItems<'a> {
 fn test_strftime_items() {
     fn parse_and_collect(s: &str) -> Vec<Item<'_>> {
         // map any error into `[Item::Error]`. useful for easy testing.
+        eprintln!("test_strftime_items: parse_and_collect({:?})", s);
         let items = StrftimeItems::new(s);
+        eprintln!("  items: {:?}", &items);
         let items = items.map(|spec| if spec == Item::Error { None } else { Some(spec) });
+        eprintln!("  items: {:?}", &items);
         items.collect::<Option<Vec<_>>>().unwrap_or_else(|| vec![Item::Error])
     }
 
@@ -518,6 +547,7 @@ fn test_strftime_items() {
         parse_and_collect("%Y-%m-%d"),
         [num0!(Year), lit!("-"), num0!(Month), lit!("-"), num0!(Day)]
     );
+    assert_eq!(parse_and_collect("%Y--%m"), [num0!(Year), lit!("--"), num0!(Month)]);
     assert_eq!(parse_and_collect("[%F]"), parse_and_collect("[%Y-%m-%d]"));
     assert_eq!(parse_and_collect("%m %d"), [num0!(Month), sp!(" "), num0!(Day)]);
     assert_eq!(parse_and_collect("%"), [Item::Error]);
@@ -543,6 +573,9 @@ fn test_strftime_items() {
     assert_eq!(parse_and_collect("%0e"), [num0!(Day)]);
     assert_eq!(parse_and_collect("%_e"), [nums!(Day)]);
     assert_eq!(parse_and_collect("%z"), [fix!(TimezoneOffset)]);
+    assert_eq!(parse_and_collect("%:z"), [fix!(TimezoneOffsetColon)]);
+    assert_eq!(parse_and_collect("%Z"), [fix!(TimezoneName)]);
+    assert_eq!(parse_and_collect("%ZZZZ"), [fix!(TimezoneName), lit!("ZZZ")]);
     assert_eq!(parse_and_collect("%#z"), [internal_fix!(TimezoneOffsetPermissive)]);
     assert_eq!(parse_and_collect("%#m"), [Item::Error]);
 }
@@ -643,6 +676,13 @@ fn test_strftime_format() {
     assert_eq!(dt.format("%t").to_string(), "\t");
     assert_eq!(dt.format("%n").to_string(), "\n");
     assert_eq!(dt.format("%%").to_string(), "%");
+
+    // complex format specifiers
+    assert_eq!(dt.format("  %Y%d%m%%%%%t%H%M%S\t").to_string(), "  20010807%%\t003460\t");
+    assert_eq!(
+        dt.format("  %Y%d%m%%%%%t%H:%P:%M%S%:::z\t").to_string(),
+        "  20010807%%\t00:am:3460+09\t"
+    );
 }
 
 #[cfg(feature = "unstable-locales")]