From c78268c9be8d5585073b77b9b7adc2d1c1527b13 Mon Sep 17 00:00:00 2001
From: John Backes
Date: Fri, 14 Apr 2023 22:37:16 -0700
Subject: [PATCH 1/3] Add ability to tokenize a string and return the decoded
 tokens using the correct BPE model

The _decode_native_and_split function decodes encoded BPE tokens into
their corresponding byte arrays and returns a vector of byte vectors.
The split_by_token_with_special_tokens function takes a string, encodes
it using the BPE model, and decodes the encoded tokens into a vector of
strings. This allows a string to be tokenized and the decoded tokens
returned using the correct BPE model.

Added a corresponding test (cl100k_split_test) to tests/tiktoken.rs.
---
 tiktoken-rs/src/vendor_tiktoken.rs | 26 ++++++++++++++++++++++++++
 tiktoken-rs/tests/tiktoken.rs      |  7 +++++++
 2 files changed, 33 insertions(+)

diff --git a/tiktoken-rs/src/vendor_tiktoken.rs b/tiktoken-rs/src/vendor_tiktoken.rs
index d94b938..ffcccc8 100644
--- a/tiktoken-rs/src/vendor_tiktoken.rs
+++ b/tiktoken-rs/src/vendor_tiktoken.rs
@@ -235,6 +235,18 @@ impl CoreBPE {
         ret
     }
 
+    fn _decode_native_and_split(&self, tokens: &[usize]) -> Vec<Vec<u8>> {
+        let mut ret = Vec::with_capacity(tokens.len());
+        for token in tokens {
+            let token_bytes = self
+                .decoder
+                .get(token)
+                .unwrap_or_else(|| &self.special_tokens_decoder[token]);
+            ret.push(token_bytes.clone());
+        }
+        ret
+    }
+
     fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
         // This is the core of the encoding logic; the other functions in here
         // just make things complicated :-)
@@ -541,6 +553,20 @@ impl CoreBPE {
             Err(e) => Err(anyhow!("Unable to decode into a valid UTF-8 string: {}", e)),
         }
     }
+
+    // tokenize a string and return the decoded tokens using the correct BPE model
+    // for example: "Hello world" -> ["Hello", " world"]
+    pub fn split_by_token_with_special_tokens(&self, text: &str) -> Result<Vec<String>> {
+        // first, encode the text using the BPE model
+        let encoded = self.encode_with_special_tokens(text);
+
+        let tokenized = self._decode_native_and_split(&encoded);
+
+        tokenized
+            .iter()
+            .map(|token| String::from_utf8(token.clone()).map_err(|e| anyhow!(e.to_string())))
+            .collect()
+    }
 }
 
 #[cfg(feature = "python")]
diff --git a/tiktoken-rs/tests/tiktoken.rs b/tiktoken-rs/tests/tiktoken.rs
index 184920f..faae6c6 100644
--- a/tiktoken-rs/tests/tiktoken.rs
+++ b/tiktoken-rs/tests/tiktoken.rs
@@ -82,6 +82,13 @@ fn cl100k_base_test() {
     );
 }
 
+#[test]
+fn cl100k_split_test() {
+    let bpe = cl100k_base().unwrap();
+    let tokenized = bpe.split_by_token_with_special_tokens("This is a test         with a lot of spaces").unwrap();
+    assert_eq!(tokenized, vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]);
+}
+
 #[test]
 fn p50k_base_singleton_test() {
     // let now = std::time::Instant::now();
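At this point in the series the method returns a Result<Vec<String>>
(patch 3 later converts it to an iterator). A minimal sketch of how the
call site reads, assuming the cl100k_base constructor the tests already
use and the anyhow crate that backs the crate's Result alias:

```rust
use tiktoken_rs::cl100k_base;

fn main() -> anyhow::Result<()> {
    // Load the cl100k_base BPE model (the same constructor the tests use).
    let bpe = cl100k_base()?;

    // Encode the text, then decode each token id back to its UTF-8 string.
    let tokens = bpe.split_by_token_with_special_tokens("Hello world")?;

    // The example from the code comment above: two tokens, with the space
    // merged into the following word, as BPE attaches leading spaces.
    assert_eq!(tokens, vec!["Hello", " world"]);
    Ok(())
}
```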
From 7ecf0a1082660d37f82b5dafca47a90eeb45c279 Mon Sep 17 00:00:00 2001
From: John Backes
Date: Sat, 15 Apr 2023 20:38:23 -0700
Subject: [PATCH 2/3] Add ChatCompletionRequestMessage Eq implementation

Add the Eq trait to the ChatCompletionRequestMessage derive list so
that instances can be compared for full equality.
---
 tiktoken-rs/src/api.rs        | 2 +-
 tiktoken-rs/tests/tiktoken.rs | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tiktoken-rs/src/api.rs b/tiktoken-rs/src/api.rs
index 3544927..08aee24 100644
--- a/tiktoken-rs/src/api.rs
+++ b/tiktoken-rs/src/api.rs
@@ -46,7 +46,7 @@ pub fn get_completion_max_tokens(model: &str, prompt: &str) -> Result<usize> {
     Ok(context_size.saturating_sub(prompt_tokens))
 }
 
-#[derive(Debug, Default, Clone, PartialEq)]
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
 pub struct ChatCompletionRequestMessage {
     /// The role of the author of this message.
     pub role: String,
diff --git a/tiktoken-rs/tests/tiktoken.rs b/tiktoken-rs/tests/tiktoken.rs
index faae6c6..7589a36 100644
--- a/tiktoken-rs/tests/tiktoken.rs
+++ b/tiktoken-rs/tests/tiktoken.rs
@@ -85,8 +85,13 @@ fn cl100k_base_test() {
 #[test]
 fn cl100k_split_test() {
     let bpe = cl100k_base().unwrap();
-    let tokenized = bpe.split_by_token_with_special_tokens("This is a test         with a lot of spaces").unwrap();
-    assert_eq!(tokenized, vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]);
+    let tokenized = bpe
+        .split_by_token_with_special_tokens("This is a test         with a lot of spaces")
+        .unwrap();
+    assert_eq!(
+        tokenized,
+        vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]
+    );
 }
 
 #[test]
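Deriving Eq on top of PartialEq adds no methods; it is a marker
asserting that == is a total equivalence relation, which is what APIs
requiring full equality (for example HashSet/HashMap keys, together
with Hash) rely on. A minimal sketch with a stand-in struct: only the
role field visible in the diff is shown, and the Hash derive is added
here purely for the demonstration:

```rust
use std::collections::HashSet;

// Stand-in mirroring the derive this patch changes; fields abbreviated.
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
struct ChatCompletionRequestMessage {
    role: String,
}

fn main() {
    let a = ChatCompletionRequestMessage { role: "user".into() };
    let b = a.clone();

    // PartialEq already provides ==; Eq certifies it as total equality...
    assert_eq!(a, b);

    // ...which set/map keys require (alongside Hash).
    let mut seen = HashSet::new();
    assert!(seen.insert(a));
    assert!(!seen.insert(b)); // equal message, so not inserted again
}
```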
From d3461d3a7aee3d5160b35e18bc997f0442326aa1 Mon Sep 17 00:00:00 2001
From: John Backes
Date: Sat, 15 Apr 2023 22:25:54 -0700
Subject: [PATCH 3/3] Rework split_by_token_with_special_tokens() on CoreBPE
 to return an iterator

Rework CoreBPE's split_by_token_with_special_tokens() so that it
returns a lazy iterator instead of a vector. The method takes a string
slice containing the text to be tokenized, encodes it using the BPE
model, and decodes each encoded token as the iterator is consumed. The
iterator yields each token as a Result<String> so that UTF-8 decoding
errors can be handled per token. The existing cl100k_split_test is
updated to collect the iterator, and a doc example exercises the new
signature.
---
 tiktoken-rs/src/vendor_tiktoken.rs | 73 ++++++++++++++++++++++--------
 tiktoken-rs/tests/tiktoken.rs      |  5 ++-
 2 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/tiktoken-rs/src/vendor_tiktoken.rs b/tiktoken-rs/src/vendor_tiktoken.rs
index ffcccc8..1da339e 100644
--- a/tiktoken-rs/src/vendor_tiktoken.rs
+++ b/tiktoken-rs/src/vendor_tiktoken.rs
@@ -235,16 +235,18 @@ impl CoreBPE {
         ret
     }
 
-    fn _decode_native_and_split(&self, tokens: &[usize]) -> Vec<Vec<u8>> {
-        let mut ret = Vec::with_capacity(tokens.len());
-        for token in tokens {
+    #[allow(clippy::needless_lifetimes)] // the iterator captures a lifetime outside of the function
+    fn _decode_native_and_split<'a>(
+        &'a self,
+        tokens: Vec<usize>,
+    ) -> impl Iterator<Item = Vec<u8>> + '_ {
+        tokens.into_iter().map(move |token| {
             let token_bytes = self
                 .decoder
-                .get(token)
-                .unwrap_or_else(|| &self.special_tokens_decoder[token]);
-            ret.push(token_bytes.clone());
-        }
-        ret
+                .get(&token)
+                .unwrap_or_else(|| &self.special_tokens_decoder[&token]);
+            token_bytes.clone()
+        })
     }
 
     fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
@@ -554,18 +556,53 @@ impl CoreBPE {
         }
     }
 
-    // tokenize a string and return the decoded tokens using the correct BPE model
-    // for example: "Hello world" -> ["Hello", " world"]
-    pub fn split_by_token_with_special_tokens(&self, text: &str) -> Result<Vec<String>> {
-        // first, encode the text using the BPE model
+    /// Tokenize a string and return the decoded tokens using the correct BPE model.
+    ///
+    /// This method takes a string, encodes it using the BPE model, and decodes the encoded tokens
+    /// lazily: the returned iterator yields each decoded token as it is consumed, using the
+    /// correct BPE model.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use tiktoken_rs::cl100k_base;
+    /// let bpe = cl100k_base().unwrap();
+    /// let tokenized: Result<Vec<_>, _> = bpe
+    ///     .split_by_token_with_special_tokens("This is a test         with a lot of spaces")
+    ///     .collect();
+    /// let tokenized = tokenized.unwrap();
+    /// assert_eq!(
+    ///     tokenized,
+    ///     vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]
+    /// );
+    /// ```
+    ///
+    /// # Arguments
+    ///
+    /// * text: A string slice containing the text to be tokenized.
+    ///
+    /// # Returns
+    ///
+    /// * An iterator over Result<String>: each item is a decoded token as a String, or an error
+    /// if the token's bytes cannot be converted into a valid UTF-8 string.
+    ///
+    /// # Errors
+    ///
+    /// The iterator will yield an error if:
+    ///
+    /// * A decoded token cannot be converted into a valid UTF-8 string during the decoding process.
+    ///
+    pub fn split_by_token_with_special_tokens<'a>(
+        &'a self,
+        text: &'a str,
+    ) -> impl Iterator<Item = Result<String>> + 'a {
+        // First, encode the text using the BPE model
         let encoded = self.encode_with_special_tokens(text);
 
-        let tokenized = self._decode_native_and_split(&encoded);
-
-        tokenized
-            .iter()
-            .map(|token| String::from_utf8(token.clone()).map_err(|e| anyhow!(e.to_string())))
-            .collect()
+        // Map each decoded token's bytes to a Result<String>, surfacing
+        // invalid UTF-8 as an error per token
+        self._decode_native_and_split(encoded)
+            .map(|token| String::from_utf8(token).map_err(|e| anyhow!(e.to_string())))
     }
 }
 
diff --git a/tiktoken-rs/tests/tiktoken.rs b/tiktoken-rs/tests/tiktoken.rs
index 7589a36..bfe633d 100644
--- a/tiktoken-rs/tests/tiktoken.rs
+++ b/tiktoken-rs/tests/tiktoken.rs
@@ -85,9 +85,10 @@ fn cl100k_base_test() {
 #[test]
 fn cl100k_split_test() {
     let bpe = cl100k_base().unwrap();
-    let tokenized = bpe
+    let tokenized: Result<Vec<_>, _> = bpe
         .split_by_token_with_special_tokens("This is a test         with a lot of spaces")
-        .unwrap();
+        .collect();
+    let tokenized = tokenized.unwrap();
     assert_eq!(
         tokenized,
         vec!["This", " is", " a", " test", "        ", " with", " a", " lot", " of", " spaces"]
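With the iterator form, decoding happens lazily as tokens are pulled.
A short sketch of the two consumption patterns: collecting into a
Result<Vec<String>> (as the updated test does) and stopping early; the
"Hello world" strings reuse the example from patch 1's comments:

```rust
use tiktoken_rs::cl100k_base;

fn main() -> anyhow::Result<()> {
    let bpe = cl100k_base()?;

    // Collect the whole stream, surfacing the first UTF-8 error, if any.
    let all = bpe
        .split_by_token_with_special_tokens("Hello world")
        .collect::<anyhow::Result<Vec<String>>>()?;
    assert_eq!(all, vec!["Hello", " world"]);

    // Or pull tokens one at a time; only what is consumed gets decoded.
    let mut stream = bpe.split_by_token_with_special_tokens("Hello world");
    let first = stream.next().transpose()?; // Option<Result<_>> -> Result<Option<_>>
    assert_eq!(first.as_deref(), Some("Hello"));
    Ok(())
}
```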