Skip to content

Commit

Permalink
fix(token): Don't crash on parsing unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Feb 8, 2024
1 parent 4c248c8 commit 8879269
Showing 1 changed file with 19 additions and 1 deletion.
20 changes: 19 additions & 1 deletion crates/typos/src/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,12 @@ mod parser {
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and we would rather catch
// odd or unexpected cases than strip off start characters from a word, since we aren't doing a
// proper word boundary parse
trace("identifier", take_while(1.., is_xid_continue)).parse_next(input)
trace(
"identifier",
take_while(1.., is_xid_continue)
.verify(|s: &<T as Stream>::Slice| std::str::from_utf8(s.as_bstr()).is_ok()),
)
.parse_next(input)
}

fn ignore<T>(input: &mut T) -> PResult<<T as Stream>::Slice, ()>
Expand Down Expand Up @@ -1310,6 +1315,18 @@ mod test {
assert_eq!(expected, actual);
}

/// With Unicode support disabled, a word containing a non-ASCII character
/// yields no identifiers from either the byte or the `str` entry point.
#[test]
fn tokenize_unicode_without_unicode() {
    let tokenizer = TokenizerBuilder::new().unicode(false).build();
    let text = "appliqués";
    let no_identifiers: Vec<Identifier> = Vec::new();

    // Byte-oriented entry point.
    let from_bytes: Vec<_> = tokenizer.parse_bytes(text.as_bytes()).collect();
    assert_eq!(no_identifiers, from_bytes);

    // String-oriented entry point.
    let from_str: Vec<_> = tokenizer.parse_str(text).collect();
    assert_eq!(no_identifiers, from_str);
}

#[test]
fn split_ident() {
let cases = [
Expand Down Expand Up @@ -1365,6 +1382,7 @@ mod test {
"BFG9000",
&[("BFG", Case::Upper, 0), ("9000", Case::None, 3)],
),
("appliqués", &[("appliqués", Case::Lower, 0)]),
];
for (input, expected) in cases.iter() {
let ident = Identifier::new_unchecked(input, Case::None, 0);
Expand Down

0 comments on commit 8879269

Please sign in to comment.