Switch to non-overflowing decoded length formula
No change to benchmarks.

Motivated by the alternate approach in https://github.com/marshallpierce/rust-base64/pull/216/files.
marshallpierce committed Jan 17, 2023
1 parent 92e94d2 commit fa47981
Showing 5 changed files with 72 additions and 45 deletions.
src/decode.rs (5 changes: 0 additions & 5 deletions)
@@ -148,11 +148,6 @@ pub fn decode_engine_slice<E: Engine, T: AsRef<[u8]>>(
/// // start of the next quad of encoded symbols
/// assert_eq!(6, decoded_len_estimate(5));
/// ```
- ///
- /// # Panics
- ///
- /// Panics if decoded length estimation overflows.
- /// This would happen for sizes within a few bytes of the maximum value of `usize`.
pub fn decoded_len_estimate(encoded_len: usize) -> usize {
STANDARD
.internal_decoded_len_estimate(encoded_len)
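For context (not part of the commit): with the overflow panic gone, the estimate is simply total over all of usize. A minimal doctest-style sketch of typical use, assuming decoded_len_estimate is re-exported at the crate root:

    use base64::decoded_len_estimate;

    // "aGVsbG8gd29ybGQ=" is 16 encoded bytes; ceil(16 / 4) * 3 = 12.
    let encoded = "aGVsbG8gd29ybGQ=";
    let buf: Vec<u8> = Vec::with_capacity(decoded_len_estimate(encoded.len()));
    assert!(buf.capacity() >= 12);

    // With the revised formula, even a pathological length near usize::MAX
    // yields an estimate instead of panicking.
    let _ = decoded_len_estimate(usize::MAX);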
src/engine/general_purpose/decode.rs (53 changes: 44 additions & 9 deletions)
@@ -30,16 +30,11 @@ pub struct GeneralPurposeEstimate {

impl GeneralPurposeEstimate {
pub(crate) fn new(encoded_len: usize) -> Self {
+ // Formulas that won't overflow
Self {
- num_chunks: encoded_len
- .checked_add(INPUT_CHUNK_LEN - 1)
- .expect("Overflow when calculating number of chunks in input")
- / INPUT_CHUNK_LEN,
- decoded_len_estimate: encoded_len
- .checked_add(3)
- .expect("Overflow when calculating decoded len estimate")
- / 4
- * 3,
+ num_chunks: encoded_len / INPUT_CHUNK_LEN
+ + (encoded_len % INPUT_CHUNK_LEN > 0) as usize,
+ decoded_len_estimate: (encoded_len / 4 + (encoded_len % 4 > 0) as usize) * 3,
}
}
}
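The heart of the change is the ceiling-division style. A standalone sketch for illustration (not part of the commit; INPUT_CHUNK_LEN is assumed to be 8, as in this decoder):

    const INPUT_CHUNK_LEN: usize = 8;

    let encoded_len = usize::MAX;

    // Old form: add (divisor - 1) before dividing. The addition itself can
    // overflow for lengths near usize::MAX, hence the checked_add + expect.
    assert_eq!(None, encoded_len.checked_add(INPUT_CHUNK_LEN - 1));

    // New form: divide first, then add 1 only when there is a remainder.
    // No intermediate value exceeds encoded_len, so it cannot overflow.
    let num_chunks = encoded_len / INPUT_CHUNK_LEN + (encoded_len % INPUT_CHUNK_LEN > 0) as usize;
    assert_eq!(usize::MAX / INPUT_CHUNK_LEN + 1, num_chunks);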
@@ -345,4 +340,44 @@ mod tests {
decode_chunk(&input[..], 0, &STANDARD.decode_table, &mut output).unwrap();
assert_eq!(&vec![b'f', b'o', b'o', b'b', b'a', b'r', 0, 0], &output);
}

#[test]
fn estimate_short_lengths() {
for (range, (num_chunks, decoded_len_estimate)) in [
(0..=0, (0, 0)),
(1..=4, (1, 3)),
(5..=8, (1, 6)),
(9..=12, (2, 9)),
(13..=16, (2, 12)),
(17..=20, (3, 15)),
] {
for encoded_len in range {
let estimate = GeneralPurposeEstimate::new(encoded_len);
assert_eq!(num_chunks, estimate.num_chunks);
assert_eq!(decoded_len_estimate, estimate.decoded_len_estimate);
}
}
}

#[test]
fn estimate_via_u128_inflation() {
// cover both ends of usize
(0..1000)
.chain(usize::MAX - 1000..=usize::MAX)
.for_each(|encoded_len| {
// inflate to 128 bit type to be able to safely use the easy formulas
let len_128 = encoded_len as u128;

let estimate = GeneralPurposeEstimate::new(encoded_len);
assert_eq!(
((len_128 + (INPUT_CHUNK_LEN - 1) as u128) / (INPUT_CHUNK_LEN as u128))
as usize,
estimate.num_chunks
);
assert_eq!(
((len_128 + 3) / 4 * 3) as usize,
estimate.decoded_len_estimate
);
})
}
}
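The u128 test above relies on the remainder-based form and the add-then-divide form computing the same ceiling; they differ only in whether an intermediate sum can exceed usize::MAX. A small illustrative check of that identity, done in u128 where neither form can overflow:

    let via_remainder = |n: u128| n / 4 + (n % 4 > 0) as u128;
    let via_add = |n: u128| (n + 3) / 4;

    // Spot-check small values plus a range near u64::MAX, where the usize
    // version of the add-then-divide form would overflow on 64-bit targets.
    for n in (0..10_000u128).chain(u64::MAX as u128 - 100..=u64::MAX as u128) {
        assert_eq!(via_add(n), via_remainder(n));
    }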
src/engine/mod.rs (27 changes: 0 additions & 27 deletions)
@@ -62,10 +62,6 @@ pub trait Engine: Send + Sync {
/// As an optimization to prevent the decoded length from being calculated twice, it is
/// sometimes helpful to have a conservative estimate of the decoded size before doing the
/// decoding, so this calculation is done separately and passed to [Engine::decode()] as needed.
- ///
- /// # Panics
- ///
- /// Panics if decoded length estimation overflows.
#[doc(hidden)]
fn internal_decoded_len_estimate(&self, input_len: usize) -> Self::DecodeEstimate;

@@ -225,11 +221,6 @@ pub trait Engine: Send + Sync {
/// .decode("aGVsbG8gaW50ZXJuZXR-Cg").unwrap();
/// println!("{:?}", bytes_url);
/// ```
- ///
- /// # Panics
- ///
- /// Panics if decoded length estimation overflows.
- /// This would happen for sizes within a few bytes of the maximum value of `usize`.
#[cfg(any(feature = "alloc", feature = "std", test))]
fn decode<T: AsRef<[u8]>>(&self, input: T) -> Result<Vec<u8>, DecodeError> {
let input_bytes = input.as_ref();
@@ -272,11 +263,6 @@ pub trait Engine: Send + Sync {
/// println!("{:?}", buffer);
/// }
/// ```
- ///
- /// # Panics
- ///
- /// Panics if decoded length estimation overflows.
- /// This would happen for sizes within a few bytes of the maximum value of `usize`.
#[cfg(any(feature = "alloc", feature = "std", test))]
fn decode_vec<T: AsRef<[u8]>>(
&self,
@@ -312,11 +298,6 @@ pub trait Engine: Send + Sync {
///
/// See [Engine::decode_slice_unchecked] for a version that panics instead of returning an error
/// if the output buffer is too small.
- ///
- /// # Panics
- ///
- /// Panics if decoded length estimation overflows.
- /// This would happen for sizes within a few bytes of the maximum value of `usize`.
fn decode_slice<T: AsRef<[u8]>>(
&self,
input: T,
@@ -344,9 +325,6 @@ pub trait Engine: Send + Sync {
///
/// # Panics
///
- /// Panics if decoded length estimation overflows.
- /// This would happen for sizes within a few bytes of the maximum value of `usize`.
- ///
/// Panics if the provided output buffer is too small for the decoded data.
fn decode_slice_unchecked<T: AsRef<[u8]>>(
&self,
@@ -387,11 +365,6 @@ pub trait DecodeEstimate {
///
/// The estimate must be no larger than the next largest complete triple of decoded bytes.
/// That is, the final quad of tokens to decode may be assumed to be complete with no padding.
- ///
- /// # Panics
- ///
- /// Panics if decoded length estimation overflows.
- /// This would happen for sizes within a few bytes of the maximum value of `usize`.
fn decoded_len_estimate(&self) -> usize;
}

src/engine/tests.rs (26 changes: 26 additions & 0 deletions)
@@ -1200,6 +1200,32 @@ fn decode_length_estimate_delta<E: EngineWrapper>(engine_wrapper: E) {
}
}

#[apply(all_engines)]
fn estimate_via_u128_inflation<E: EngineWrapper>(engine_wrapper: E) {
// cover both ends of usize
(0..1000)
.chain(usize::MAX - 1000..=usize::MAX)
.for_each(|encoded_len| {
// inflate to 128 bit type to be able to safely use the easy formulas
let len_128 = encoded_len as u128;

let estimate = E::standard()
.internal_decoded_len_estimate(encoded_len)
.decoded_len_estimate();

// This check is a little too strict: it requires using the (len + 3) / 4 * 3 formula
// or equivalent, but until other engines come along that use a different formula
// requiring that we think more carefully about what the allowable criteria are, this
// will do.
assert_eq!(
((len_128 + 3) / 4 * 3) as usize,
estimate,
"enc len {}",
encoded_len
);
})
}

/// Returns a tuple of the original data length, the encoded data length (just data), and the length including padding.
///
/// Vecs provided should be empty.
tests/encode.rs (6 changes: 2 additions & 4 deletions)
@@ -26,10 +26,9 @@ fn encode_all_ascii() {
fn encode_all_bytes() {
let mut bytes = Vec::<u8>::with_capacity(256);

- for i in 0..255 {
+ for i in 0..=255 {
bytes.push(i);
}
- bytes.push(255); //bug with "overflowing" ranges?

compare_encode(
"AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7P\
@@ -44,10 +43,9 @@ fn encode_all_bytes() {
fn encode_all_bytes_url() {
let mut bytes = Vec::<u8>::with_capacity(256);

- for i in 0..255 {
+ for i in 0..=255 {
bytes.push(i);
}
- bytes.push(255); //bug with "overflowing" ranges?

assert_eq!(
"AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0\
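An aside on the range change in the two tests above (not part of the commit): the loop variable is inferred as u8 from bytes.push(i), so an exclusive upper bound of 256 does not fit the type, which is why the old code iterated 0..255 and pushed the final 255 by hand. An inclusive range covers the whole byte space directly; a compact equivalent:

    // Every u8 value, in order, via an inclusive range.
    let bytes: Vec<u8> = (0..=u8::MAX).collect();
    assert_eq!(256, bytes.len());
    assert_eq!(Some(&u8::MAX), bytes.last());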
