From c78268c9be8d5585073b77b9b7adc2d1c1527b13 Mon Sep 17 00:00:00 2001
From: John Backes
Date: Fri, 14 Apr 2023 22:37:16 -0700
Subject: [PATCH 1/3] Add ability to tokenize a string and return the decoded
 tokens using the correct BPE model

The _decode_native_and_split function decodes encoded BPE tokens into
their corresponding byte arrays and returns a vector of byte vectors.
The split_by_token_with_special_tokens function takes a string, encodes
it using the BPE model, and decodes the encoded tokens into a vector of
strings. This allows a string to be tokenized and the decoded tokens
returned using the correct BPE model.

Added a corresponding test (cl100k_split_test) to tests/tiktoken.rs.
---
 tiktoken-rs/src/vendor_tiktoken.rs | 26 ++++++++++++++++++++++++++
 tiktoken-rs/tests/tiktoken.rs      |  7 +++++++
 2 files changed, 33 insertions(+)

diff --git a/tiktoken-rs/src/vendor_tiktoken.rs b/tiktoken-rs/src/vendor_tiktoken.rs
index d94b938..ffcccc8 100644
--- a/tiktoken-rs/src/vendor_tiktoken.rs
+++ b/tiktoken-rs/src/vendor_tiktoken.rs
@@ -235,6 +235,18 @@ impl CoreBPE {
         ret
     }
 
+    fn _decode_native_and_split(&self, tokens: &[usize]) -> Vec<Vec<u8>> {
+        let mut ret = Vec::with_capacity(tokens.len());
+        for token in tokens {
+            let token_bytes = self
+                .decoder
+                .get(token)
+                .unwrap_or_else(|| &self.special_tokens_decoder[token]);
+            ret.push(token_bytes.clone());
+        }
+        ret
+    }
+
     fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
         // This is the core of the encoding logic; the other functions in here
         // just make things complicated :-)
@@ -541,6 +553,20 @@ impl CoreBPE {
             Err(e) => Err(anyhow!("Unable to decode into a valid UTF-8 string: {}", e)),
         }
     }
+
+    // tokenize a string and return the decoded tokens using the correct BPE model
+    // for example: "Hello world" -> ["Hello", " world"]
+    pub fn split_by_token_with_special_tokens(&self, text: &str) -> Result<Vec<String>> {
+        // first, encode the text using the BPE model
+        let encoded = self.encode_with_special_tokens(text);
+
+        let tokenized = self._decode_native_and_split(&encoded);
+
+        tokenized
+            .iter()
+            .map(|token| String::from_utf8(token.clone()).map_err(|e| anyhow!(e.to_string())))
+            .collect()
+    }
 }
 
 #[cfg(feature = "python")]
diff --git a/tiktoken-rs/tests/tiktoken.rs b/tiktoken-rs/tests/tiktoken.rs
index 184920f..faae6c6 100644
--- a/tiktoken-rs/tests/tiktoken.rs
+++ b/tiktoken-rs/tests/tiktoken.rs
@@ -82,6 +82,13 @@ fn cl100k_base_test() {
     );
 }
 
+#[test]
+fn cl100k_split_test() {
+    let bpe = cl100k_base().unwrap();
+    let tokenized = bpe.split_by_token_with_special_tokens("This is a test         with a lot of spaces").unwrap();
+    assert_eq!(tokenized, vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]);
+}
+
 #[test]
 fn p50k_base_singleton_test() {
     // let now = std::time::Instant::now();
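At this point in the series the method returns a Result<Vec<String>>
(patch 3 later converts it to an iterator). A minimal sketch of how the
call site reads, assuming the cl100k_base constructor the tests already
use and the anyhow crate that backs the crate's Result alias:

```rust
use tiktoken_rs::cl100k_base;

fn main() -> anyhow::Result<()> {
    // Load the cl100k_base BPE model (the same constructor the tests use).
    let bpe = cl100k_base()?;

    // Encode the text, then decode each token id back to its UTF-8 string.
    let tokens = bpe.split_by_token_with_special_tokens("Hello world")?;

    // The example from the code comment above: two tokens, with the space
    // merged into the following word, as BPE attaches leading spaces.
    assert_eq!(tokens, vec!["Hello", " world"]);
    Ok(())
}
```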
From 7ecf0a1082660d37f82b5dafca47a90eeb45c279 Mon Sep 17 00:00:00 2001
From: John Backes
Date: Sat, 15 Apr 2023 20:38:23 -0700
Subject: [PATCH 2/3] Add ChatCompletionRequestMessage Eq implementation

Add the Eq trait to the ChatCompletionRequestMessage derive list so
that instances can be compared for full equality.
---
 tiktoken-rs/src/api.rs        | 2 +-
 tiktoken-rs/tests/tiktoken.rs | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tiktoken-rs/src/api.rs b/tiktoken-rs/src/api.rs
index 3544927..08aee24 100644
--- a/tiktoken-rs/src/api.rs
+++ b/tiktoken-rs/src/api.rs
@@ -46,7 +46,7 @@ pub fn get_completion_max_tokens(model: &str, prompt: &str) -> Result<usize> {
     Ok(context_size.saturating_sub(prompt_tokens))
 }
 
-#[derive(Debug, Default, Clone, PartialEq)]
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
 pub struct ChatCompletionRequestMessage {
     /// The role of the author of this message.
     pub role: String,
diff --git a/tiktoken-rs/tests/tiktoken.rs b/tiktoken-rs/tests/tiktoken.rs
index faae6c6..7589a36 100644
--- a/tiktoken-rs/tests/tiktoken.rs
+++ b/tiktoken-rs/tests/tiktoken.rs
@@ -85,8 +85,13 @@ fn cl100k_base_test() {
 #[test]
 fn cl100k_split_test() {
     let bpe = cl100k_base().unwrap();
-    let tokenized = bpe.split_by_token_with_special_tokens("This is a test         with a lot of spaces").unwrap();
-    assert_eq!(tokenized, vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]);
+    let tokenized = bpe
+        .split_by_token_with_special_tokens("This is a test         with a lot of spaces")
+        .unwrap();
+    assert_eq!(
+        tokenized,
+        vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]
+    );
 }
 
 #[test]
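Deriving Eq on top of PartialEq adds no methods; it is a marker
asserting that == is a total equivalence relation, which is what APIs
requiring full equality (for example HashSet/HashMap keys, together
with Hash) rely on. A minimal sketch with a stand-in struct: only the
role field visible in the diff is shown, and the Hash derive is added
here purely for the demonstration:

```rust
use std::collections::HashSet;

// Stand-in mirroring the derive this patch changes; fields abbreviated.
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
struct ChatCompletionRequestMessage {
    role: String,
}

fn main() {
    let a = ChatCompletionRequestMessage { role: "user".into() };
    let b = a.clone();

    // PartialEq already provides ==; Eq certifies it as total equality...
    assert_eq!(a, b);

    // ...which set/map keys require (alongside Hash).
    let mut seen = HashSet::new();
    assert!(seen.insert(a));
    assert!(!seen.insert(b)); // equal message, so not inserted again
}
```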
From d3461d3a7aee3d5160b35e18bc997f0442326aa1 Mon Sep 17 00:00:00 2001
From: John Backes
Date: Sat, 15 Apr 2023 22:25:54 -0700
Subject: [PATCH 3/3] Rework split_by_token_with_special_tokens() on CoreBPE
 to return an iterator

Rework CoreBPE's split_by_token_with_special_tokens() so that it
returns a lazy iterator instead of a vector. The method takes a string
slice containing the text to be tokenized, encodes it using the BPE
model, and decodes each encoded token as the iterator is consumed. The
iterator yields each token as a Result<String> so that UTF-8 decoding
errors can be handled per token. The existing cl100k_split_test is
updated to collect the iterator, and a doc example exercises the new
signature.
---
 tiktoken-rs/src/vendor_tiktoken.rs | 73 ++++++++++++++++++++++--------
 tiktoken-rs/tests/tiktoken.rs      |  5 ++-
 2 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/tiktoken-rs/src/vendor_tiktoken.rs b/tiktoken-rs/src/vendor_tiktoken.rs
index ffcccc8..1da339e 100644
--- a/tiktoken-rs/src/vendor_tiktoken.rs
+++ b/tiktoken-rs/src/vendor_tiktoken.rs
@@ -235,16 +235,18 @@ impl CoreBPE {
         ret
     }
 
-    fn _decode_native_and_split(&self, tokens: &[usize]) -> Vec<Vec<u8>> {
-        let mut ret = Vec::with_capacity(tokens.len());
-        for token in tokens {
+    #[allow(clippy::needless_lifetimes)] // the iterator captures a lifetime outside of the function
+    fn _decode_native_and_split<'a>(
+        &'a self,
+        tokens: Vec<usize>,
+    ) -> impl Iterator<Item = Vec<u8>> + '_ {
+        tokens.into_iter().map(move |token| {
             let token_bytes = self
                 .decoder
-                .get(token)
-                .unwrap_or_else(|| &self.special_tokens_decoder[token]);
-            ret.push(token_bytes.clone());
-        }
-        ret
+                .get(&token)
+                .unwrap_or_else(|| &self.special_tokens_decoder[&token]);
+            token_bytes.clone()
+        })
     }
 
     fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
@@ -554,18 +556,53 @@ impl CoreBPE {
         }
     }
 
-    // tokenize a string and return the decoded tokens using the correct BPE model
-    // for example: "Hello world" -> ["Hello", " world"]
-    pub fn split_by_token_with_special_tokens(&self, text: &str) -> Result<Vec<String>> {
-        // first, encode the text using the BPE model
+    /// Tokenize a string and return the decoded tokens using the correct BPE model.
+    ///
+    /// This method takes a string, encodes it using the BPE model, and decodes the encoded tokens
+    /// lazily: the returned iterator yields each decoded token as it is consumed, using the
+    /// correct BPE model.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use tiktoken_rs::cl100k_base;
+    /// let bpe = cl100k_base().unwrap();
+    /// let tokenized: Result<Vec<_>, _> = bpe
+    ///     .split_by_token_with_special_tokens("This is a test         with a lot of spaces")
+    ///     .collect();
+    /// let tokenized = tokenized.unwrap();
+    /// assert_eq!(
+    ///     tokenized,
+    ///     vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]
+    /// );
+    /// ```
+    ///
+    /// # Arguments
+    ///
+    /// * text: A string slice containing the text to be tokenized.
+    ///
+    /// # Returns
+    ///
+    /// * An iterator over Result<String>: each item is a decoded token as a String, or an error
+    /// if the token's bytes cannot be converted into a valid UTF-8 string.
+    ///
+    /// # Errors
+    ///
+    /// The iterator will yield an error if:
+    ///
+    /// * A decoded token cannot be converted into a valid UTF-8 string during the decoding process.
+    ///
+    pub fn split_by_token_with_special_tokens<'a>(
+        &'a self,
+        text: &'a str,
+    ) -> impl Iterator<Item = Result<String>> + 'a {
+        // First, encode the text using the BPE model
         let encoded = self.encode_with_special_tokens(text);
 
-        let tokenized = self._decode_native_and_split(&encoded);
-
-        tokenized
-            .iter()
-            .map(|token| String::from_utf8(token.clone()).map_err(|e| anyhow!(e.to_string())))
-            .collect()
+        // Map each decoded token's bytes to a Result<String>, surfacing
+        // invalid UTF-8 as an error per token
+        self._decode_native_and_split(encoded)
+            .map(|token| String::from_utf8(token).map_err(|e| anyhow!(e.to_string())))
     }
 }
 
diff --git a/tiktoken-rs/tests/tiktoken.rs b/tiktoken-rs/tests/tiktoken.rs
index 7589a36..bfe633d 100644
--- a/tiktoken-rs/tests/tiktoken.rs
+++ b/tiktoken-rs/tests/tiktoken.rs
@@ -85,9 +85,10 @@ fn cl100k_base_test() {
 #[test]
 fn cl100k_split_test() {
     let bpe = cl100k_base().unwrap();
-    let tokenized = bpe
+    let tokenized: Result<Vec<_>, _> = bpe
         .split_by_token_with_special_tokens("This is a test         with a lot of spaces")
-        .unwrap();
+        .collect();
+    let tokenized = tokenized.unwrap();
     assert_eq!(
         tokenized,
         vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]
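With the iterator form, decoding happens lazily as tokens are pulled.
A short sketch of the two consumption patterns: collecting into a
Result<Vec<String>> (as the updated test does) and stopping early; the
"Hello world" strings reuse the example from patch 1's comments:

```rust
use tiktoken_rs::cl100k_base;

fn main() -> anyhow::Result<()> {
    let bpe = cl100k_base()?;

    // Collect the whole stream, surfacing the first UTF-8 error, if any.
    let all = bpe
        .split_by_token_with_special_tokens("Hello world")
        .collect::<anyhow::Result<Vec<String>>>()?;
    assert_eq!(all, vec!["Hello", " world"]);

    // Or pull tokens one at a time; only what is consumed gets decoded.
    let mut stream = bpe.split_by_token_with_special_tokens("Hello world");
    let first = stream.next().transpose()?; // Option<Result<_>> -> Result<Option<_>>
    assert_eq!(first.as_deref(), Some("Hello"));
    Ok(())
}
```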