From 8879269b0d6c74ac3aec50c2beed5eb4bb16669a Mon Sep 17 00:00:00 2001
From: Ed Page
Date: Thu, 8 Feb 2024 07:10:28 -0600
Subject: [PATCH] fix(token): Don't crash on parsing unicode

---
 crates/typos/src/tokens.rs | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 796de4408..8b3332e07 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -159,7 +159,12 @@ mod parser {
         // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
         // or unexpected cases than strip off start characters to a word since we aren't doing a
         // proper word boundary parse
-        trace("identifier", take_while(1.., is_xid_continue)).parse_next(input)
+        trace(
+            "identifier",
+            take_while(1.., is_xid_continue)
+                .verify(|s: &<T as Stream>::Slice| std::str::from_utf8(s.as_bstr()).is_ok()),
+        )
+        .parse_next(input)
     }
 
     fn ignore<T>(input: &mut T) -> PResult<<T as Stream>::Slice, ()>
@@ -1310,6 +1315,18 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_unicode_without_unicode() {
+        let parser = TokenizerBuilder::new().unicode(false).build();
+
+        let input = "appliqués";
+        let expected: Vec<Identifier> = vec![];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn split_ident() {
         let cases = [
@@ -1365,6 +1382,7 @@
                 "BFG9000",
                 &[("BFG", Case::Upper, 0), ("9000", Case::None, 3)],
             ),
+            ("appliqués", &[("appliqués", Case::Lower, 0)]),
         ];
         for (input, expected) in cases.iter() {
            let ident = Identifier::new_unchecked(input, Case::None, 0);
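
Note (illustrative, not part of the patch): the added `.verify()` makes the identifier parser reject any matched byte slice that is not valid UTF-8, and the new test checks that a word like "appliqués" no longer produces a token (or a crash) when unicode support is turned off. For intuition, here is a minimal standalone sketch of the same UTF-8 guard using only std; `accept_token` is a hypothetical name, not from the typos codebase:

    fn accept_token(candidate: &[u8]) -> Option<&str> {
        // Same check as the patch's verify: only accept slices that are valid UTF-8.
        std::str::from_utf8(candidate).ok()
    }

    fn main() {
        let input = "appliqués";
        // The first 8 bytes cut through the two-byte encoding of 'é',
        // leaving an invalid UTF-8 slice that the guard filters out.
        let truncated = &input.as_bytes()[..8];
        assert_eq!(accept_token(b"applique"), Some("applique"));
        assert_eq!(accept_token(truncated), None);
    }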