Skip to content

Commit

Permalink
be exact about whitespace
Browse files Browse the repository at this point in the history
Be exact about whitespace in parsing. This changes
pattern matching in `format::parse::parse` as it does not allow
arbitrary whitespace before, after, or between the datetime specifiers.

`format/parse.rs:datetime_from_str` is exact about whitespace in
the passed data `s` and passed strftime format `fmt`.

Also be more exacting about colons and whitespace around timezones.
Instead of unlimited colons and whitespace, only match a more limited
possible set of leading colons and whitespace.

Issue #660
  • Loading branch information
jtmoon79 committed Sep 2, 2022
1 parent 6e05e83 commit cb4e2ac
Show file tree
Hide file tree
Showing 11 changed files with 1,028 additions and 161 deletions.
5 changes: 5 additions & 0 deletions src/datetime/mod.rs
Expand Up @@ -510,13 +510,18 @@ impl DateTime<FixedOffset> {
/// RFC 2822 is the internet message standard that specifies the
/// representation of times in HTTP and email headers.
///
/// The RFC 2822 standard allows arbitrary intermixed whitespace.
/// See [RFC 2822 Appendix A.5]
///
/// ```
/// # use chrono::{DateTime, FixedOffset, TimeZone};
/// assert_eq!(
/// DateTime::parse_from_rfc2822("Wed, 18 Feb 2015 23:16:09 GMT").unwrap(),
/// FixedOffset::east(0).ymd(2015, 2, 18).and_hms(23, 16, 9)
/// );
/// ```
///
/// [RFC 2822 Appendix A.5]: https://www.rfc-editor.org/rfc/rfc2822#appendix-A.5
pub fn parse_from_rfc2822(s: &str) -> ParseResult<DateTime<FixedOffset>> {
const ITEMS: &[Item<'static>] = &[Item::Fixed(Fixed::RFC2822)];
let mut parsed = Parsed::new();
Expand Down
391 changes: 367 additions & 24 deletions src/datetime/tests.rs

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions src/format/mod.rs
Expand Up @@ -218,27 +218,27 @@ pub enum Fixed {
///
/// It does not support parsing, its use in the parser is an immediate failure.
TimezoneName,
/// Offset from the local time to UTC (`+09:00` or `-04:00` or `+00:00`).
/// Offset from the local time to UTC (`+09:00` or `-0400` or `+00:00`).
///
/// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace.
/// In the parser, the colon may be omitted,
/// The offset is limited from `-24:00` to `+24:00`,
/// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
TimezoneOffsetColon,
/// Offset from the local time to UTC with seconds (`+09:00:00` or `-04:00:00` or `+00:00:00`).
/// Offset from the local time to UTC with seconds (`+09:00:00` or `-0400:00` or `+000000`).
///
/// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace.
/// In the parser, the colon may be omitted,
/// The offset is limited from `-24:00:00` to `+24:00:00`,
/// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
TimezoneOffsetDoubleColon,
/// Offset from the local time to UTC without minutes (`+09` or `-04` or `+00`).
///
/// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace.
/// In the parser, the colon may be omitted,
/// The offset is limited from `-24` to `+24`,
/// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
TimezoneOffsetTripleColon,
/// Offset from the local time to UTC (`+09:00` or `-04:00` or `Z`).
/// Offset from the local time to UTC (`+09:00` or `-0400` or `Z`).
///
/// In the parser, the colon can be omitted and/or surrounded with any amount of whitespace,
/// In the parser, the colon may be omitted,
/// and `Z` can be either in upper case or in lower case.
/// The offset is limited from `-24:00` to `+24:00`,
/// which is the same as [`FixedOffset`](../offset/struct.FixedOffset.html)'s range.
Expand Down
384 changes: 322 additions & 62 deletions src/format/parse.rs

Large diffs are not rendered by default.

139 changes: 137 additions & 2 deletions src/format/scan.rs
Expand Up @@ -198,9 +198,69 @@ pub(super) fn space(s: &str) -> ParseResult<&str> {
}
}

/// Consumes any number (including zero) of colon or spaces.
/// returns slice `s` remaining after first char
/// if `s.len() <= 1` then return an empty slice
pub(super) fn s_next(s: &str) -> &str {
if s.len() <= 1 {
return &s[s.len()..];
}
match s.char_indices().nth(1) {
Some((offset, _)) => &s[offset..],
None => {
panic!("unexpected None for s {:?}.char_indices().nth(1)", s)
}
}
}

/// Consume one whitespace from the start of `s` if the first `char` is
/// whitespace.
pub(super) fn space1(s: &str) -> &str {
match s.chars().next() {
Some(c) if c.is_whitespace() => s_next(s),
Some(_) | None => s,
}
}

/// Allow a colon with possible whitespace padding.
/// Consumes zero or one of leading patterns
/// `":"`, `" "`, `" :"`, `": "`, or `" : "`
pub(super) fn colon_or_space(s: &str) -> ParseResult<&str> {
Ok(s.trim_left_matches(|c: char| c == ':' || c.is_whitespace()))
let c0_ = match s.chars().next() {
Some(c) => c,
None => {
return Ok(s);
}
};
if c0_ != ':' && !c0_.is_whitespace() {
return Ok(s);
}
let c1_ = s.chars().nth(1);
match (c0_, c1_) {
(c0, None) if c0 == ':' || c0.is_whitespace() => {
return Ok(s_next(s));
}
(c0, Some(c1)) if c0 == ':' && c1.is_whitespace() => {
return Ok(s_next(s_next(s)));
}
(c0, Some(c1)) if c0 == ':' && !c1.is_whitespace() => {
return Ok(s_next(s));
}
(c0, Some(c1)) if c0.is_whitespace() && (!c1.is_whitespace() && c1 != ':') => {
return Ok(s_next(s));
}
_ => {}
}
let c2_ = s.chars().nth(2);
match (c0_, c1_, c2_) {
(c0, Some(c1), None) if c0.is_whitespace() && c1 == ':' => Ok(s_next(s_next(s))),
(c0, Some(c1), Some(c2)) if c0.is_whitespace() && c1 == ':' && !c2.is_whitespace() => {
Ok(s_next(s_next(s)))
}
(c0, Some(c1), Some(c2)) if c0.is_whitespace() && c1 == ':' && c2.is_whitespace() => {
Ok(s_next(s_next(s_next(s))))
}
_ => Ok(s),
}
}

/// Tries to parse `[-+]\d\d` continued by `\d\d`. Return an offset in seconds if possible.
Expand Down Expand Up @@ -238,6 +298,16 @@ where
};
s = &s[1..];

// special check for `Z` to return more accurate error `INVALID`.
// Otherwise the upcoming match for digits might return error `TOO_SHORT`
// which is confusing for the user.
match s.as_bytes().first() {
Some(&b'Z') | Some(&b'z') => {
return Err(INVALID);
}
_ => {}
}

// hours (00--99)
let hours = match digits(s)? {
(h1 @ b'0'..=b'9', h2 @ b'0'..=b'9') => i32::from((h1 - b'0') * 10 + (h2 - b'0')),
Expand Down Expand Up @@ -413,3 +483,68 @@ fn test_rfc2822_comments() {
);
}
}

#[test]
fn test_space() {
assert_eq!(space(""), Err(TOO_SHORT));
assert_eq!(space(" "), Ok(""));
assert_eq!(space(" \t"), Ok(""));
assert_eq!(space(" \ta"), Ok("a"));
assert_eq!(space(" \ta "), Ok("a "));
assert_eq!(space("a"), Err(INVALID));
assert_eq!(space("a "), Err(INVALID));
}

#[test]
fn test_s_next() {
assert_eq!(s_next(""), "");
assert_eq!(s_next(" "), "");
assert_eq!(s_next("a"), "");
assert_eq!(s_next("ab"), "b");
assert_eq!(s_next("abc"), "bc");
assert_eq!(s_next("😾b"), "b");
assert_eq!(s_next("a😾"), "😾");
assert_eq!(s_next("😾bc"), "bc");
assert_eq!(s_next("a😾c"), "😾c");
}

#[test]
fn test_space1() {
assert_eq!(space1(""), "");
assert_eq!(space1(" "), "");
assert_eq!(space1("\t"), "");
assert_eq!(space1("\t\t"), "\t");
assert_eq!(space1(" "), " ");
assert_eq!(space1("a"), "a");
assert_eq!(space1("a "), "a ");
assert_eq!(space1("ab"), "ab");
assert_eq!(space1("😼"), "😼");
assert_eq!(space1("😼b"), "😼b");
}

#[test]
fn test_colon_or_space() {
assert_eq!(colon_or_space(""), Ok(""));
assert_eq!(colon_or_space(" "), Ok(""));
assert_eq!(colon_or_space(":"), Ok(""));
assert_eq!(colon_or_space(" :"), Ok(""));
assert_eq!(colon_or_space(": "), Ok(""));
assert_eq!(colon_or_space(" : "), Ok(""));
assert_eq!(colon_or_space(" :: "), Ok(": "));
assert_eq!(colon_or_space("😸"), Ok("😸"));
assert_eq!(colon_or_space("😸😸"), Ok("😸😸"));
assert_eq!(colon_or_space("😸:"), Ok("😸:"));
assert_eq!(colon_or_space("😸 "), Ok("😸 "));
assert_eq!(colon_or_space(" 😸"), Ok("😸"));
assert_eq!(colon_or_space(":😸"), Ok("😸"));
assert_eq!(colon_or_space(":😸 "), Ok("😸 "));
assert_eq!(colon_or_space(" :😸"), Ok("😸"));
assert_eq!(colon_or_space(" :😸 "), Ok("😸 "));
assert_eq!(colon_or_space(" :😸:"), Ok("😸:"));
assert_eq!(colon_or_space(": 😸"), Ok("😸"));
assert_eq!(colon_or_space(": 😸"), Ok(" 😸"));
assert_eq!(colon_or_space(": :😸"), Ok(":😸"));
assert_eq!(colon_or_space(" : 😸"), Ok("😸"));
assert_eq!(colon_or_space(" ::😸"), Ok(":😸"));
assert_eq!(colon_or_space(" :: 😸"), Ok(": 😸"));
}
76 changes: 58 additions & 18 deletions src/format/strftime.rs
Expand Up @@ -466,28 +466,54 @@ impl<'a> Iterator for StrftimeItems<'a> {
}
}

// the next item is space
// whitespace
Some(c) if c.is_whitespace() => {
// `%` is not a whitespace, so `c != '%'` is redundant
let nextspec = self
.remainder
.find(|c: char| !c.is_whitespace())
.unwrap_or(self.remainder.len());
assert!(nextspec > 0);
let item = sp!(&self.remainder[..nextspec]);
self.remainder = &self.remainder[nextspec..];
// LAST WORKING HERE 20220830 must compare whitespace chars
// wait, are any tests checking for mismatching whitespace? what about wide chars?
// same for case of literals below
let ws = self.remainder;
let mut end: usize = 0;
for (offset, c_) in self.remainder.char_indices() {
if !c_.is_whitespace() {
break;
}
// advance `end` by 1 char
end = offset;
}
// get the offset of the last char too
end += match &self.remainder[end..].char_indices().nth(1) {
Some((offset, _c)) => *offset,
None => self.remainder[end..].len(),
};
self.remainder = &self.remainder[end..];
let item = sp!(&ws[..end]);
Some(item)
}

// the next item is literal
_ => {
let nextspec = self
.remainder
.find(|c: char| c.is_whitespace() || c == '%')
.unwrap_or(self.remainder.len());
assert!(nextspec > 0);
let item = lit!(&self.remainder[..nextspec]);
self.remainder = &self.remainder[nextspec..];
// literals
Some(_c) => {
let ws = self.remainder;
let mut end: usize = 0;
fn is_literal(c: &char) -> bool {
if !c.is_whitespace() && c != &'%' {
return true;
}
false
}
for (offset, c_) in self.remainder.char_indices() {
if !is_literal(&c_) {
break;
}
// advance `end` by 1 char
end = offset;
}
// get the offset of the last char too
end += match &self.remainder[end..].char_indices().nth(1) {
Some((offset, _c)) => *offset,
None => self.remainder[end..].len(),
};
self.remainder = &self.remainder[end..];
let item = lit!(&ws[..end]);
Some(item)
}
}
Expand All @@ -499,8 +525,11 @@ impl<'a> Iterator for StrftimeItems<'a> {
fn test_strftime_items() {
fn parse_and_collect(s: &str) -> Vec<Item<'_>> {
// map any error into `[Item::Error]`. useful for easy testing.
eprintln!("test_strftime_items: parse_and_collect({:?})", s);
let items = StrftimeItems::new(s);
eprintln!(" items: {:?}", &items);
let items = items.map(|spec| if spec == Item::Error { None } else { Some(spec) });
eprintln!(" items: {:?}", &items);
items.collect::<Option<Vec<_>>>().unwrap_or_else(|| vec![Item::Error])
}

Expand All @@ -518,6 +547,7 @@ fn test_strftime_items() {
parse_and_collect("%Y-%m-%d"),
[num0!(Year), lit!("-"), num0!(Month), lit!("-"), num0!(Day)]
);
assert_eq!(parse_and_collect("%Y--%m"), [num0!(Year), lit!("--"), num0!(Month)]);
assert_eq!(parse_and_collect("[%F]"), parse_and_collect("[%Y-%m-%d]"));
assert_eq!(parse_and_collect("%m %d"), [num0!(Month), sp!(" "), num0!(Day)]);
assert_eq!(parse_and_collect("%"), [Item::Error]);
Expand All @@ -543,6 +573,9 @@ fn test_strftime_items() {
assert_eq!(parse_and_collect("%0e"), [num0!(Day)]);
assert_eq!(parse_and_collect("%_e"), [nums!(Day)]);
assert_eq!(parse_and_collect("%z"), [fix!(TimezoneOffset)]);
assert_eq!(parse_and_collect("%:z"), [fix!(TimezoneOffsetColon)]);
assert_eq!(parse_and_collect("%Z"), [fix!(TimezoneName)]);
assert_eq!(parse_and_collect("%ZZZZ"), [fix!(TimezoneName), lit!("ZZZ")]);
assert_eq!(parse_and_collect("%#z"), [internal_fix!(TimezoneOffsetPermissive)]);
assert_eq!(parse_and_collect("%#m"), [Item::Error]);
}
Expand Down Expand Up @@ -643,6 +676,13 @@ fn test_strftime_format() {
assert_eq!(dt.format("%t").to_string(), "\t");
assert_eq!(dt.format("%n").to_string(), "\n");
assert_eq!(dt.format("%%").to_string(), "%");

// complex format specifiers
assert_eq!(dt.format(" %Y%d%m%%%%%t%H%M%S\t").to_string(), " 20010807%%\t003460\t");
assert_eq!(
dt.format(" %Y%d%m%%%%%t%H:%P:%M%S%:::z\t").to_string(),
" 20010807%%\t00:am:3460+09\t"
);
}

#[cfg(feature = "unstable-locales")]
Expand Down

0 comments on commit cb4e2ac

Please sign in to comment.