From 57e2334aa9ebbe212e4ff5fe3065fd812036ad04 Mon Sep 17 00:00:00 2001 From: Marshall Pierce <575695+marshallpierce@users.noreply.github.com> Date: Sun, 21 May 2023 17:43:39 -0600 Subject: [PATCH] Fix DecoderReader handling of padding Fixes #226. DecoderReader now keeps more state so that erroneous internal padding that happens to fall right on a read buffer boundary isn't silently ignored. --- .circleci/config.yml | 2 +- Cargo.toml | 4 +- README.md | 2 +- RELEASE-NOTES.md | 7 + clippy.toml | 2 +- src/engine/general_purpose/decode.rs | 6 +- src/engine/general_purpose/decode_suffix.rs | 17 +- src/engine/general_purpose/mod.rs | 5 +- src/engine/mod.rs | 43 ++- src/engine/naive.rs | 4 +- src/engine/tests.rs | 342 +++++++++++++++++--- src/read/decoder.rs | 65 ++-- src/read/decoder_tests.rs | 157 ++++++++- 13 files changed, 560 insertions(+), 96 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fa98f9c..6fedd67 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ workflows: # be easier on the CI hosts since presumably those fat lower layers will already be cached, and # therefore faster than a minimal, customized alpine. # MSRV - 'rust:1.57.0' + 'rust:1.60.0' ] # a hacky scheme to work around CircleCI's inability to deal with mutable docker tags, forcing us to # get a nightly or stable toolchain via rustup instead of a mutable docker tag diff --git a/Cargo.toml b/Cargo.toml index 33847db..7ce687f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "base64" -version = "0.21.0" +version = "0.21.1" authors = ["Alice Maz ", "Marshall Pierce "] description = "encodes and decodes base64 as bytes or utf8" repository = "https://github.com/marshallpierce/rust-base64" @@ -10,7 +10,7 @@ keywords = ["base64", "utf8", "encode", "decode", "no_std"] categories = ["encoding"] license = "MIT OR Apache-2.0" edition = "2021" -rust-version = "1.57.0" +rust-version = "1.60.0" [[bench]] name = "benchmarks" diff --git a/README.md b/README.md index d7b0885..285f082 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ optionally may allow other behaviors. ## Rust version compatibility -The minimum supported Rust version is 1.57.0. +The minimum supported Rust version is 1.60.0. # Contributing diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 8514b11..f15fa85 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,6 +1,13 @@ # 0.21.1 - Remove the possibility of panicking during decoded length calculations +- `DecoderReader` no longer sometimes erroneously ignores padding [#226](https://github.com/marshallpierce/rust-base64/issues/226) + +## Breaking changes + +- `Engine.internal_decode` return type changed +- Update MSRV to 1.60.0 + # 0.21.0 diff --git a/clippy.toml b/clippy.toml index 23b32c1..16caf02 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1 +1 @@ -msrv = "1.57.0" +msrv = "1.60.0" diff --git a/src/engine/general_purpose/decode.rs b/src/engine/general_purpose/decode.rs index 5e30e45..21a386f 100644 --- a/src/engine/general_purpose/decode.rs +++ b/src/engine/general_purpose/decode.rs @@ -1,5 +1,5 @@ use crate::{ - engine::{general_purpose::INVALID_VALUE, DecodeEstimate, DecodePaddingMode}, + engine::{general_purpose::INVALID_VALUE, DecodeEstimate, DecodeMetadata, DecodePaddingMode}, DecodeError, PAD_BYTE, }; @@ -46,7 +46,7 @@ impl DecodeEstimate for GeneralPurposeEstimate { } /// Helper to avoid duplicating num_chunks calculation, which is costly on short inputs. -/// Returns the number of bytes written, or an error. 
+/// Returns the decode metadata, or an error. // We're on the fragile edge of compiler heuristics here. If this is not inlined, slow. If this is // inlined(always), a different slow. plain ol' inline makes the benchmarks happiest at the moment, // but this is fragile and the best setting changes with only minor code modifications. @@ -58,7 +58,7 @@ pub(crate) fn decode_helper( decode_table: &[u8; 256], decode_allow_trailing_bits: bool, padding_mode: DecodePaddingMode, -) -> Result { +) -> Result { let remainder_len = input.len() % INPUT_CHUNK_LEN; // Because the fast decode loop writes in groups of 8 bytes (unrolled to diff --git a/src/engine/general_purpose/decode_suffix.rs b/src/engine/general_purpose/decode_suffix.rs index 5652035..e1e005d 100644 --- a/src/engine/general_purpose/decode_suffix.rs +++ b/src/engine/general_purpose/decode_suffix.rs @@ -1,13 +1,13 @@ use crate::{ - engine::{general_purpose::INVALID_VALUE, DecodePaddingMode}, + engine::{general_purpose::INVALID_VALUE, DecodeMetadata, DecodePaddingMode}, DecodeError, PAD_BYTE, }; /// Decode the last 1-8 bytes, checking for trailing set bits and padding per the provided /// parameters. /// -/// Returns the total number of bytes decoded, including the ones indicated as already written by -/// `output_index`. +/// Returns the decode metadata representing the total number of bytes decoded, including the ones +/// indicated as already written by `output_index`. pub(crate) fn decode_suffix( input: &[u8], input_index: usize, @@ -16,7 +16,7 @@ pub(crate) fn decode_suffix( decode_table: &[u8; 256], decode_allow_trailing_bits: bool, padding_mode: DecodePaddingMode, -) -> Result { +) -> Result { // Decode any leftovers that aren't a complete input block of 8 bytes. // Use a u64 as a stack-resident 8 byte buffer. let mut leftover_bits: u64 = 0; @@ -157,5 +157,12 @@ pub(crate) fn decode_suffix( leftover_bits_appended_to_buf += 8; } - Ok(output_index) + Ok(DecodeMetadata::new( + output_index, + if padding_bytes > 0 { + Some(input_index + first_padding_index) + } else { + None + }, + )) } diff --git a/src/engine/general_purpose/mod.rs b/src/engine/general_purpose/mod.rs index af8897b..01d2204 100644 --- a/src/engine/general_purpose/mod.rs +++ b/src/engine/general_purpose/mod.rs @@ -2,13 +2,14 @@ use crate::{ alphabet, alphabet::Alphabet, - engine::{Config, DecodePaddingMode}, + engine::{Config, DecodeMetadata, DecodePaddingMode}, DecodeError, }; use core::convert::TryInto; mod decode; pub(crate) mod decode_suffix; + pub use decode::GeneralPurposeEstimate; pub(crate) const INVALID_VALUE: u8 = 255; @@ -170,7 +171,7 @@ impl super::Engine for GeneralPurpose { input: &[u8], output: &mut [u8], estimate: Self::DecodeEstimate, - ) -> Result { + ) -> Result { decode::decode_helper( input, estimate, diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 7467a91..aa41dff 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -73,8 +73,6 @@ pub trait Engine: Send + Sync { /// `decode_estimate` is the result of [Engine::internal_decoded_len_estimate()], which is passed in to avoid /// calculating it again (expensive on short inputs).` /// - /// Returns the number of bytes written to `output`. - /// /// Each complete 4-byte chunk of encoded data decodes to 3 bytes of decoded data, but this /// function must also handle the final possibly partial chunk. 
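To make the new return value concrete, here is a small in-crate sketch (a hypothetical test; `DecodeMetadata` and its constructor are the ones added later in this patch) showing what the suffix decoder now reports for a padded and an unpadded quad:

```rust
#[test]
fn decode_metadata_reports_first_padding_offset() {
    use crate::engine::{general_purpose, DecodeMetadata, Engine as _};

    let engine = general_purpose::STANDARD;
    let mut out = [0_u8; 4];

    // "QQ==" decodes to "A"; the first '=' sits at offset 2
    let padded = engine
        .internal_decode(b"QQ==", &mut out, engine.internal_decoded_len_estimate(4))
        .unwrap();
    assert_eq!(DecodeMetadata::new(1, Some(2)), padded);

    // "QUJD" decodes to "ABC" with no padding at all
    let unpadded = engine
        .internal_decode(b"QUJD", &mut out, engine.internal_decoded_len_estimate(4))
        .unwrap();
    assert_eq!(DecodeMetadata::new(3, None), unpadded);
}
```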
/// If the input length is not a multiple of 4, or uses padding bytes to reach a multiple of 4, @@ -95,7 +93,7 @@ pub trait Engine: Send + Sync { input: &[u8], output: &mut [u8], decode_estimate: Self::DecodeEstimate, - ) -> Result; + ) -> Result; /// Returns the config for this engine. fn config(&self) -> &Self::Config; @@ -202,8 +200,7 @@ pub trait Engine: Send + Sync { Ok(encoded_size) } - /// Decode from string reference as octets using the specified [Engine]. - /// Returns a `Result` containing a `Vec`. + /// Decode the input into a new `Vec`. /// /// # Example /// @@ -228,13 +225,16 @@ pub trait Engine: Send + Sync { let estimate = self.internal_decoded_len_estimate(input_bytes.len()); let mut buffer = vec![0; estimate.decoded_len_estimate()]; - let bytes_written = self.internal_decode(input_bytes, &mut buffer, estimate)?; + let bytes_written = self + .internal_decode(input_bytes, &mut buffer, estimate)? + .decoded_len; buffer.truncate(bytes_written); Ok(buffer) } - /// Decode from string reference as octets. + /// Decode the `input` into the supplied `buffer`. + /// /// Writes into the supplied `Vec`, which may allocate if its internal buffer isn't big enough. /// Returns a `Result` containing an empty tuple, aka `()`. /// @@ -281,7 +281,9 @@ pub trait Engine: Send + Sync { buffer.resize(total_len_estimate, 0); let buffer_slice = &mut buffer.as_mut_slice()[starting_output_len..]; - let bytes_written = self.internal_decode(input_bytes, buffer_slice, estimate)?; + let bytes_written = self + .internal_decode(input_bytes, buffer_slice, estimate)? + .decoded_len; buffer.truncate(starting_output_len + bytes_written); @@ -290,7 +292,8 @@ pub trait Engine: Send + Sync { /// Decode the input into the provided output slice. /// - /// Returns an error if `output` is smaller than the estimated decoded length. + /// Returns the number of bytes written to the slice, or an error if `output` is smaller than + /// the estimated decoded length. /// /// This will not write any bytes past exactly what is decoded (no stray garbage bytes at the end). /// @@ -312,10 +315,13 @@ pub trait Engine: Send + Sync { self.internal_decode(input_bytes, output, estimate) .map_err(|e| e.into()) + .map(|dm| dm.decoded_len) } /// Decode the input into the provided output slice. /// + /// Returns the number of bytes written to the slice. + /// /// This will not write any bytes past exactly what is decoded (no stray garbage bytes at the end). /// /// See [crate::decoded_len_estimate] for calculating buffer sizes. @@ -338,6 +344,7 @@ pub trait Engine: Send + Sync { output, self.internal_decoded_len_estimate(input_bytes.len()), ) + .map(|dm| dm.decoded_len) } } @@ -381,3 +388,21 @@ pub enum DecodePaddingMode { /// Padding must be absent -- for when you want predictable padding, without any wasted bytes. 
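The public slice API keeps returning a plain byte count; only the internal plumbing carries the extra padding offset. An illustrative call (standard engine assumed):

```rust
use base64::{engine::general_purpose::STANDARD, Engine as _};

let mut buf = [0_u8; 16];
// decode_slice still hands back only the number of bytes written; the padding
// position recorded in DecodeMetadata stays internal to the engine
let written = STANDARD.decode_slice(b"aGVsbG8=", &mut buf).unwrap();
assert_eq!(b"hello", &buf[..written]);
```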
RequireNone, } + +/// Metadata about the result of a decode operation +#[derive(PartialEq, Eq, Debug)] +pub struct DecodeMetadata { + /// Number of decoded bytes output + pub(crate) decoded_len: usize, + /// Offset of the first padding byte in the input, if any + pub(crate) padding_offset: Option, +} + +impl DecodeMetadata { + pub(crate) fn new(decoded_bytes: usize, padding_index: Option) -> Self { + Self { + decoded_len: decoded_bytes, + padding_offset: padding_index, + } + } +} diff --git a/src/engine/naive.rs b/src/engine/naive.rs index 6665c5e..42b6085 100644 --- a/src/engine/naive.rs +++ b/src/engine/naive.rs @@ -2,7 +2,7 @@ use crate::{ alphabet::Alphabet, engine::{ general_purpose::{self, decode_table, encode_table}, - Config, DecodeEstimate, DecodePaddingMode, Engine, + Config, DecodeEstimate, DecodeMetadata, DecodePaddingMode, Engine, }, DecodeError, PAD_BYTE, }; @@ -112,7 +112,7 @@ impl Engine for Naive { input: &[u8], output: &mut [u8], estimate: Self::DecodeEstimate, - ) -> Result { + ) -> Result { if estimate.rem == 1 { // trailing whitespace is so common that it's worth it to check the last byte to // possibly return a better error message diff --git a/src/engine/tests.rs b/src/engine/tests.rs index d2851ec..6430b35 100644 --- a/src/engine/tests.rs +++ b/src/engine/tests.rs @@ -8,13 +8,16 @@ use rand::{ }; use rstest::rstest; use rstest_reuse::{apply, template}; -use std::{collections, fmt}; +use std::{collections, fmt, io::Read as _}; use crate::{ alphabet::{Alphabet, STANDARD}, encode::add_padding, encoded_len, - engine::{general_purpose, naive, Config, DecodeEstimate, DecodePaddingMode, Engine}, + engine::{ + general_purpose, naive, Config, DecodeEstimate, DecodeMetadata, DecodePaddingMode, Engine, + }, + read::DecoderReader, tests::{assert_encode_sanity, random_alphabet, random_config}, DecodeError, PAD_BYTE, }; @@ -24,9 +27,20 @@ use crate::{ #[rstest(engine_wrapper, case::general_purpose(GeneralPurposeWrapper {}), case::naive(NaiveWrapper {}), +case::decoder_reader(DecoderReaderEngineWrapper {}), )] fn all_engines(engine_wrapper: E) {} +/// Some decode tests don't make sense for use with `DecoderReader` as they are difficult to +/// reason about or otherwise inapplicable given how DecoderReader slice up its input along +/// chunk boundaries. +#[template] +#[rstest(engine_wrapper, +case::general_purpose(GeneralPurposeWrapper {}), +case::naive(NaiveWrapper {}), +)] +fn all_engines_except_decoder_reader(engine_wrapper: E) {} + #[apply(all_engines)] fn rfc_test_vectors_std_alphabet(engine_wrapper: E) { let data = vec![ @@ -552,7 +566,7 @@ fn decode_invalid_byte_error(engine_wrapper: E) { let len_range = distributions::Uniform::new(1, 1_000); - for _ in 0..10_000 { + for _ in 0..100_000 { let alphabet = random_alphabet(&mut rng); let engine = E::random_alphabet(&mut rng, alphabet); @@ -576,7 +590,7 @@ fn decode_invalid_byte_error(engine_wrapper: E) { let invalid_byte: u8 = loop { let byte: u8 = rng.gen(); - if alphabet.symbols.contains(&byte) { + if alphabet.symbols.contains(&byte) || byte == PAD_BYTE { continue; } else { break byte; @@ -600,7 +614,9 @@ fn decode_invalid_byte_error(engine_wrapper: E) { /// Any amount of padding anywhere before the final non padding character = invalid byte at first /// pad byte. /// From this, we know padding must extend to the end of the input. 
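A concrete instance of the rule stated above, shown with the standard engine (illustrative, not part of the test suite): a symbol following a pad byte is reported as an invalid byte at the first pad byte.

```rust
use base64::{engine::general_purpose::STANDARD, DecodeError, Engine as _};

// the 'A' at offset 5 comes after the '=' at offset 4, so the error names the pad byte
assert_eq!(
    Err(DecodeError::InvalidByte(4, b'=')),
    STANDARD.decode("ABCD=AAA")
);
```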
-#[apply(all_engines)] +// DecoderReader pseudo-engine detects InvalidLastSymbol instead of InvalidLength because it +// can end a decode on the quad that happens to contain the start of the padding +#[apply(all_engines_except_decoder_reader)] fn decode_padding_before_final_non_padding_char_error_invalid_byte( engine_wrapper: E, ) { @@ -644,10 +660,13 @@ fn decode_padding_before_final_non_padding_char_error_invalid_byte( engine_wrapper: E, ) { @@ -655,8 +674,8 @@ fn decode_padding_starts_before_final_chunk_error_invalid_byte // must have at least one prefix quad let prefix_quads_range = distributions::Uniform::from(1..256); - // including 1 just to make sure that it really does produce invalid length - let suffix_pad_len_range = distributions::Uniform::from(1..=4); + // excluding 1 since we don't care about invalid length in this test + let suffix_pad_len_range = distributions::Uniform::from(2..=4); for mode in all_pad_modes() { // we don't encode so we don't care about encode padding let engine = E::standard_with_pad_mode(true, mode); @@ -674,14 +693,48 @@ fn decode_padding_starts_before_final_chunk_error_invalid_byte let padding_start = encoded.len() - padding_len; encoded[padding_start..].fill(PAD_BYTE); - if suffix_len == 1 { - assert_eq!(Err(DecodeError::InvalidLength), engine.decode(&encoded),); - } else { - assert_eq!( - Err(DecodeError::InvalidByte(padding_start, PAD_BYTE)), - engine.decode(&encoded), - ); - } + assert_eq!( + Err(DecodeError::InvalidByte(padding_start, PAD_BYTE)), + engine.decode(&encoded), + "suffix_len: {}, padding_len: {}, b64: {}", + suffix_len, + padding_len, + std::str::from_utf8(&encoded).unwrap() + ); + } + } +} + +/// Any amount of padding before final chunk that crosses over into final chunk with 1 byte = +/// invalid length. +/// From this we know the padding must start in the final chunk. +// DecoderReader pseudo-engine detects InvalidByte instead of InvalidLength because it starts by +// decoding only the available complete quads +#[apply(all_engines_except_decoder_reader)] +fn decode_padding_starts_before_final_chunk_error_invalid_length( + engine_wrapper: E, +) { + let mut rng = seeded_rng(); + + // must have at least one prefix quad + let prefix_quads_range = distributions::Uniform::from(1..256); + for mode in all_pad_modes() { + // we don't encode so we don't care about encode padding + let engine = E::standard_with_pad_mode(true, mode); + for _ in 0..100_000 { + let mut encoded = "ABCD" + .repeat(prefix_quads_range.sample(&mut rng)) + .into_bytes(); + encoded.resize(encoded.len() + 1, PAD_BYTE); + + // amount of padding must be long enough to extend back from suffix into previous + // quads + let padding_len = rng.gen_range(1 + 1..encoded.len()); + // no non-padding after padding in this test, so padding goes to the end + let padding_start = encoded.len() - padding_len; + encoded[padding_start..].fill(PAD_BYTE); + + assert_eq!(Err(DecodeError::InvalidLength), engine.decode(&encoded),); } } } @@ -790,7 +843,9 @@ fn decode_malleability_test_case_2_byte_suffix_no_padding(engi } // https://eprint.iacr.org/2022/361.pdf table 2, test 7 -#[apply(all_engines)] +// DecoderReader pseudo-engine gets InvalidByte at 8 (extra padding) since it decodes the first +// two complete quads correctly. 
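Concrete instances of the two cases separated above, using the same construction the randomized tests use (one prefix quad, a short suffix, padding filled backwards):

```rust
use base64::{engine::general_purpose::STANDARD, DecodeError, Engine as _};

// "ABCDAB" with its last three bytes overwritten by '=': padding reaches back
// past the suffix, so the error points at the first pad byte
assert_eq!(
    Err(DecodeError::InvalidByte(3, b'=')),
    STANDARD.decode("ABC===")
);
// "ABCD=" with its last two bytes overwritten by '=': only a lone extra symbol
// position remains, and since it is padding the length itself is invalid
assert_eq!(Err(DecodeError::InvalidLength), STANDARD.decode("ABC=="));
```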
+#[apply(all_engines_except_decoder_reader)] fn decode_malleability_test_case_2_byte_suffix_too_much_padding( engine_wrapper: E, ) { @@ -864,7 +919,11 @@ fn decode_pad_mode_indifferent_padding_accepts_anything(engine } //this is a MAY in the rfc: https://tools.ietf.org/html/rfc4648#section-3.3 -#[apply(all_engines)] +// DecoderReader pseudo-engine finds the first padding, but doesn't report it as an error, +// because in the next decode it finds more padding, which is reported as InvalidByte, just +// with an offset at its position in the second decode, rather than being linked to the start +// of the padding that was first seen in the previous decode. +#[apply(all_engines_except_decoder_reader)] fn decode_pad_byte_in_penultimate_quad_error(engine_wrapper: E) { for mode in all_pad_modes() { // we don't encode so we don't care about encode padding @@ -898,7 +957,7 @@ fn decode_pad_byte_in_penultimate_quad_error(engine_wrapper: E num_prefix_quads * 4 + num_valid_bytes_penultimate_quad, b'=', ), - engine.decode(&s).unwrap_err() + engine.decode(&s).unwrap_err(), ); } } @@ -958,7 +1017,9 @@ fn decode_absurd_pad_error(engine_wrapper: E) { } } -#[apply(all_engines)] +// DecoderReader pseudo-engine detects InvalidByte instead of InvalidLength because it starts by +// decoding only the available complete quads +#[apply(all_engines_except_decoder_reader)] fn decode_too_much_padding_returns_error(engine_wrapper: E) { for mode in all_pad_modes() { // we don't encode so we don't care about encode padding @@ -984,7 +1045,9 @@ fn decode_too_much_padding_returns_error(engine_wrapper: E) { } } -#[apply(all_engines)] +// DecoderReader pseudo-engine detects InvalidByte instead of InvalidLength because it starts by +// decoding only the available complete quads +#[apply(all_engines_except_decoder_reader)] fn decode_padding_followed_by_non_padding_returns_error(engine_wrapper: E) { for mode in all_pad_modes() { // we don't encode so we don't care about encode padding @@ -1082,27 +1145,43 @@ fn decode_too_few_symbols_in_final_quad_error(engine_wrapper: } } -#[apply(all_engines)] +// DecoderReader pseudo-engine can't handle DecodePaddingMode::RequireNone since it will decode +// a complete quad with padding in it before encountering the stray byte that makes it an invalid +// length +#[apply(all_engines_except_decoder_reader)] fn decode_invalid_trailing_bytes(engine_wrapper: E) { for mode in all_pad_modes() { - // we don't encode so we don't care about encode padding - let engine = E::standard_with_pad_mode(true, mode); + do_invalid_trailing_byte(E::standard_with_pad_mode(true, mode), mode); + } +} - for num_prefix_quads in 0..256 { - let mut s: String = "ABCD".repeat(num_prefix_quads); - s.push_str("Cg==\n"); +#[apply(all_engines)] +fn decode_invalid_trailing_bytes_all_modes(engine_wrapper: E) { + // excluding no padding mode because the DecoderWrapper pseudo-engine will fail with + // InvalidPadding because it will decode the last complete quad with padding first + for mode in pad_modes_allowing_padding() { + do_invalid_trailing_byte(E::standard_with_pad_mode(true, mode), mode); + } +} - // The case of trailing newlines is common enough to warrant a test for a good error - // message. 
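The two trailing-byte flavors exercised here, reduced to their smallest form with no prefix quads (illustrative):

```rust
use base64::{engine::general_purpose::STANDARD, DecodeError, Engine as _};

// trailing junk after a valid, padded quad: reported at the junk byte itself
assert_eq!(
    Err(DecodeError::InvalidByte(4, b'\n')),
    STANDARD.decode("Cg==\n")
);
// trailing *padding* instead: still reported as an invalid length
assert_eq!(Err(DecodeError::InvalidLength), STANDARD.decode("Cg==="));
```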
- assert_eq!( - Err(DecodeError::InvalidByte(num_prefix_quads * 4 + 4, b'\n')), - engine.decode(&s) - ); +#[apply(all_engines)] +fn decode_invalid_trailing_padding_as_invalid_length(engine_wrapper: E) { + // excluding no padding mode because the DecoderWrapper pseudo-engine will fail with + // InvalidPadding because it will decode the last complete quad with padding first + for mode in pad_modes_allowing_padding() { + do_invalid_trailing_padding_as_invalid_length(E::standard_with_pad_mode(true, mode), mode); + } +} - // extra padding, however, is still InvalidLength - let s = s.replace('\n', "="); - assert_eq!(Err(DecodeError::InvalidLength), engine.decode(s)); - } +// DecoderReader pseudo-engine can't handle DecodePaddingMode::RequireNone since it will decode +// a complete quad with padding in it before encountering the stray byte that makes it an invalid +// length +#[apply(all_engines_except_decoder_reader)] +fn decode_invalid_trailing_padding_as_invalid_length_all_modes( + engine_wrapper: E, +) { + for mode in all_pad_modes() { + do_invalid_trailing_padding_as_invalid_length(E::standard_with_pad_mode(true, mode), mode); } } @@ -1180,6 +1259,53 @@ fn decode_into_slice_fits_in_precisely_sized_slice(engine_wrap } } +#[apply(all_engines)] +fn inner_decode_reports_padding_position(engine_wrapper: E) { + let mut b64 = String::new(); + let mut decoded = Vec::new(); + let engine = E::standard(); + + for pad_position in 1..10_000 { + b64.clear(); + decoded.clear(); + // plenty of room for original data + decoded.resize(pad_position, 0); + + for _ in 0..pad_position { + b64.push('A'); + } + // finish the quad with padding + for _ in 0..(4 - (pad_position % 4)) { + b64.push('='); + } + + let decode_res = engine.internal_decode( + b64.as_bytes(), + &mut decoded[..], + engine.internal_decoded_len_estimate(b64.len()), + ); + if pad_position % 4 < 2 { + // impossible padding + assert_eq!( + Err(DecodeError::InvalidByte(pad_position, PAD_BYTE)), + decode_res + ); + } else { + let decoded_bytes = pad_position / 4 * 3 + + match pad_position % 4 { + 0 => 0, + 2 => 1, + 3 => 2, + _ => unreachable!(), + }; + assert_eq!( + Ok(DecodeMetadata::new(decoded_bytes, Some(pad_position))), + decode_res + ); + } + } +} + #[apply(all_engines)] fn decode_length_estimate_delta(engine_wrapper: E) { for engine in [E::standard(), E::standard_unpadded()] { @@ -1229,6 +1355,38 @@ fn estimate_via_u128_inflation(engine_wrapper: E) { }) } +fn do_invalid_trailing_byte(engine: impl Engine, mode: DecodePaddingMode) { + for num_prefix_quads in 0..256 { + let mut s: String = "ABCD".repeat(num_prefix_quads); + s.push_str("Cg==\n"); + + // The case of trailing newlines is common enough to warrant a test for a good error + // message. + assert_eq!( + Err(DecodeError::InvalidByte(num_prefix_quads * 4 + 4, b'\n')), + engine.decode(&s), + "mode: {:?}, input: {}", + mode, + s + ); + } +} + +fn do_invalid_trailing_padding_as_invalid_length(engine: impl Engine, mode: DecodePaddingMode) { + for num_prefix_quads in 0..256 { + let mut s: String = "ABCD".repeat(num_prefix_quads); + s.push_str("Cg==="); + + assert_eq!( + Err(DecodeError::InvalidLength), + engine.decode(&s), + "mode: {:?}, input: {}", + mode, + s + ); + } +} + /// Returns a tuple of the original data length, the encoded data length (just data), and the length including padding. /// /// Vecs provided should be empty. 
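A worked instance of the length arithmetic in `inner_decode_reports_padding_position` above, written as a hypothetical in-crate test: `pad_position = 6` yields one full quad plus `AA==`.

```rust
#[test]
fn padding_position_worked_example() {
    use crate::engine::{general_purpose, DecodeMetadata, Engine as _};

    let engine = general_purpose::STANDARD;
    let mut out = [0_u8; 8];
    let metadata = engine
        .internal_decode(
            b"AAAAAA==",
            &mut out,
            engine.internal_decoded_len_estimate(8),
        )
        .unwrap();
    // 6 / 4 * 3 + 1 = 4 decoded bytes; the first '=' is at offset 6
    assert_eq!(DecodeMetadata::new(4, Some(6)), metadata);
}
```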
@@ -1430,6 +1588,103 @@ impl EngineWrapper for NaiveWrapper { } } +/// A pseudo-Engine that routes all decoding through [DecoderReader] +struct DecoderReaderEngine { + engine: E, +} + +impl From for DecoderReaderEngine { + fn from(value: E) -> Self { + Self { engine: value } + } +} + +impl Engine for DecoderReaderEngine { + type Config = E::Config; + type DecodeEstimate = E::DecodeEstimate; + + fn internal_encode(&self, input: &[u8], output: &mut [u8]) -> usize { + self.engine.internal_encode(input, output) + } + + fn internal_decoded_len_estimate(&self, input_len: usize) -> Self::DecodeEstimate { + self.engine.internal_decoded_len_estimate(input_len) + } + + fn internal_decode( + &self, + input: &[u8], + output: &mut [u8], + decode_estimate: Self::DecodeEstimate, + ) -> Result { + let mut reader = DecoderReader::new(input, &self.engine); + let mut buf = vec![0; input.len()]; + // to avoid effects like not detecting invalid length due to progressively growing + // the output buffer in read_to_end etc, read into a big enough buffer in one go + // to make behavior more consistent with normal engines + let _ = reader + .read(&mut buf) + .and_then(|len| { + buf.truncate(len); + // make sure we got everything + reader.read_to_end(&mut buf) + }) + .map_err(|io_error| { + *io_error + .into_inner() + .and_then(|inner| inner.downcast::().ok()) + .unwrap() + })?; + output[..buf.len()].copy_from_slice(&buf); + Ok(DecodeMetadata::new( + buf.len(), + input + .iter() + .enumerate() + .filter(|(_offset, byte)| **byte == PAD_BYTE) + .map(|(offset, _byte)| offset) + .next(), + )) + } + + fn config(&self) -> &Self::Config { + self.engine.config() + } +} + +struct DecoderReaderEngineWrapper {} + +impl EngineWrapper for DecoderReaderEngineWrapper { + type Engine = DecoderReaderEngine; + + fn standard() -> Self::Engine { + GeneralPurposeWrapper::standard().into() + } + + fn standard_unpadded() -> Self::Engine { + GeneralPurposeWrapper::standard_unpadded().into() + } + + fn standard_with_pad_mode( + encode_pad: bool, + decode_pad_mode: DecodePaddingMode, + ) -> Self::Engine { + GeneralPurposeWrapper::standard_with_pad_mode(encode_pad, decode_pad_mode).into() + } + + fn standard_allow_trailing_bits() -> Self::Engine { + GeneralPurposeWrapper::standard_allow_trailing_bits().into() + } + + fn random(rng: &mut R) -> Self::Engine { + GeneralPurposeWrapper::random(rng).into() + } + + fn random_alphabet(rng: &mut R, alphabet: &Alphabet) -> Self::Engine { + GeneralPurposeWrapper::random_alphabet(rng, alphabet).into() + } +} + fn seeded_rng() -> impl rand::Rng { rngs::SmallRng::from_entropy() } @@ -1442,6 +1697,13 @@ fn all_pad_modes() -> Vec { ] } +fn pad_modes_allowing_padding() -> Vec { + vec![ + DecodePaddingMode::Indifferent, + DecodePaddingMode::RequireCanonical, + ] +} + fn assert_all_suffixes_ok(engine: E, suffixes: Vec<&str>) { for num_prefix_quads in 0..256 { for &suffix in suffixes.iter() { diff --git a/src/read/decoder.rs b/src/read/decoder.rs index 4888c9c..b656ae3 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -1,4 +1,4 @@ -use crate::{engine::Engine, DecodeError}; +use crate::{engine::Engine, DecodeError, PAD_BYTE}; use std::{cmp, fmt, io}; // This should be large, but it has to fit on the stack. 
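For orientation before the `DecoderReader` changes below, here is typical streaming usage against the public API; the error-downcast trick used by the pseudo-engine above is noted in the comment (illustrative):

```rust
use std::io::Read as _;

use base64::{engine::general_purpose::STANDARD, read::DecoderReader};

let mut decoded = Vec::new();
let mut reader = DecoderReader::new(&b"aGVsbG8gd29ybGQ="[..], &STANDARD);
// on malformed input, reads fail with an io::Error of kind InvalidData that
// wraps a DecodeError, which the pseudo-engine above recovers via downcast
reader.read_to_end(&mut decoded).unwrap();
assert_eq!(b"hello world", &decoded[..]);
```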
@@ -46,13 +46,15 @@ pub struct DecoderReader<'e, E: Engine, R: io::Read> { // Technically we only need to hold 2 bytes but then we'd need a separate temporary buffer to // decode 3 bytes into and then juggle copying one byte into the provided read buf and the rest // into here, which seems like a lot of complexity for 1 extra byte of storage. - decoded_buffer: [u8; 3], + decoded_buffer: [u8; DECODED_CHUNK_SIZE], // index of start of decoded data decoded_offset: usize, // length of decoded data decoded_len: usize, // used to provide accurate offsets in errors total_b64_decoded: usize, + // offset of previously seen padding, if any + padding_offset: Option, } impl<'e, E: Engine, R: io::Read> fmt::Debug for DecoderReader<'e, E, R> { @@ -64,6 +66,7 @@ impl<'e, E: Engine, R: io::Read> fmt::Debug for DecoderReader<'e, E, R> { .field("decoded_offset", &self.decoded_offset) .field("decoded_len", &self.decoded_len) .field("total_b64_decoded", &self.total_b64_decoded) + .field("padding_offset", &self.padding_offset) .finish() } } @@ -81,6 +84,7 @@ impl<'e, E: Engine, R: io::Read> DecoderReader<'e, E, R> { decoded_offset: 0, decoded_len: 0, total_b64_decoded: 0, + padding_offset: None, } } @@ -127,20 +131,28 @@ impl<'e, E: Engine, R: io::Read> DecoderReader<'e, E, R> { /// caller's responsibility to choose the number of b64 bytes to decode correctly. /// /// Returns a Result with the number of decoded bytes written to `buf`. - fn decode_to_buf(&mut self, num_bytes: usize, buf: &mut [u8]) -> io::Result { - debug_assert!(self.b64_len >= num_bytes); + fn decode_to_buf(&mut self, b64_len_to_decode: usize, buf: &mut [u8]) -> io::Result { + debug_assert!(self.b64_len >= b64_len_to_decode); debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); debug_assert!(!buf.is_empty()); - let decoded = self + let b64_to_decode = &self.b64_buffer[self.b64_offset..self.b64_offset + b64_len_to_decode]; + let decode_metadata = self .engine .internal_decode( - &self.b64_buffer[self.b64_offset..self.b64_offset + num_bytes], + b64_to_decode, buf, - self.engine.internal_decoded_len_estimate(num_bytes), + self.engine.internal_decoded_len_estimate(b64_len_to_decode), ) .map_err(|e| match e { DecodeError::InvalidByte(offset, byte) => { + // This can be incorrect, but not in a way that probably matters to anyone: + // if there was padding handled in a previous decode, and we are now getting + // InvalidByte due to more padding, we should arguably report InvalidByte with + // PAD_BYTE at the original padding position (`self.padding_offset`), but we + // don't have a good way to tie those two cases together, so instead we + // just report the invalid byte as if the previous padding, and its possibly + // related downgrade to a now invalid byte, didn't happen. 
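To make the offset bookkeeping concrete: errors are reported at positions in the overall b64 stream, not within whichever internal chunk happened to fail. A sketch with hypothetical values:

```rust
use std::io::Read as _;

use base64::{engine::general_purpose::STANDARD, read::DecoderReader, DecodeError};

let mut b64 = vec![b'A'; 8 * 1024];
b64[5000] = b'*'; // '*' is not in the standard alphabet
let mut reader = DecoderReader::new(&b64[..], &STANDARD);
let io_err = reader.read_to_end(&mut Vec::new()).unwrap_err();
let decode_err = io_err
    .into_inner()
    .and_then(|inner| inner.downcast::<DecodeError>().ok())
    .unwrap();
// same offset a bulk decode of the whole input would report
assert_eq!(DecodeError::InvalidByte(5000, b'*'), *decode_err);
```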
DecodeError::InvalidByte(self.total_b64_decoded + offset, byte) } DecodeError::InvalidLength => DecodeError::InvalidLength, @@ -151,13 +163,27 @@ impl<'e, E: Engine, R: io::Read> DecoderReader<'e, E, R> { }) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - self.total_b64_decoded += num_bytes; - self.b64_offset += num_bytes; - self.b64_len -= num_bytes; + if let Some(offset) = self.padding_offset { + // we've already seen padding + if decode_metadata.decoded_len > 0 { + // we read more after already finding padding; report error at first padding byte + return Err(io::Error::new( + io::ErrorKind::InvalidData, + DecodeError::InvalidByte(offset, PAD_BYTE), + )); + } + } + + self.padding_offset = self.padding_offset.or(decode_metadata + .padding_offset + .map(|offset| self.total_b64_decoded + offset)); + self.total_b64_decoded += b64_len_to_decode; + self.b64_offset += b64_len_to_decode; + self.b64_len -= b64_len_to_decode; debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); - Ok(decoded) + Ok(decode_metadata.decoded_len) } /// Unwraps this `DecoderReader`, returning the base reader which it reads base64 encoded @@ -205,9 +231,9 @@ impl<'e, E: Engine, R: io::Read> io::Read for DecoderReader<'e, E, R> { self.decoded_offset < DECODED_CHUNK_SIZE }); - // We shouldn't ever decode into here when we can't immediately write at least one byte into - // the provided buf, so the effective length should only be 3 momentarily between when we - // decode and when we copy into the target buffer. + // We shouldn't ever decode into decoded_buffer when we can't immediately write at least one + // byte into the provided buf, so the effective length should only be 3 momentarily between + // when we decode and when we copy into the target buffer. debug_assert!(self.decoded_len < DECODED_CHUNK_SIZE); debug_assert!(self.decoded_len + self.decoded_offset <= DECODED_CHUNK_SIZE); @@ -217,20 +243,15 @@ impl<'e, E: Engine, R: io::Read> io::Read for DecoderReader<'e, E, R> { } else { let mut at_eof = false; while self.b64_len < BASE64_CHUNK_SIZE { - // Work around lack of copy_within, which is only present in 1.37 // Copy any bytes we have to the start of the buffer. - // We know we have < 1 chunk, so we can use a tiny tmp buffer. 
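The small decoded-byte carry buffer discussed here is what lets a caller ask for fewer bytes than one quad produces; a minimal sketch of that behavior:

```rust
use std::io::Read as _;

use base64::{engine::general_purpose::STANDARD, read::DecoderReader};

let mut reader = DecoderReader::new(&b"aGk="[..], &STANDARD); // "hi"
let mut one = [0_u8; 1];
reader.read_exact(&mut one).unwrap();
assert_eq!(b'h', one[0]);
reader.read_exact(&mut one).unwrap();
assert_eq!(b'i', one[0]);
```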
- let mut memmove_buf = [0_u8; BASE64_CHUNK_SIZE]; - memmove_buf[..self.b64_len].copy_from_slice( - &self.b64_buffer[self.b64_offset..self.b64_offset + self.b64_len], - ); - self.b64_buffer[0..self.b64_len].copy_from_slice(&memmove_buf[..self.b64_len]); + self.b64_buffer + .copy_within(self.b64_offset..self.b64_offset + self.b64_len, 0); self.b64_offset = 0; // then fill in more data let read = self.read_from_delegate()?; if read == 0 { - // we never pass in an empty buf, so 0 => we've hit EOF + // we never read into an empty buf, so 0 => we've hit EOF at_eof = true; break; } diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs index 65d58d8..625a07d 100644 --- a/src/read/decoder_tests.rs +++ b/src/read/decoder_tests.rs @@ -8,9 +8,10 @@ use rand::{Rng as _, RngCore as _}; use super::decoder::{DecoderReader, BUF_SIZE}; use crate::{ + alphabet, engine::{general_purpose::STANDARD, Engine, GeneralPurpose}, tests::{random_alphabet, random_config, random_engine}, - DecodeError, + DecodeError, PAD_BYTE, }; #[test] @@ -247,19 +248,21 @@ fn reports_invalid_byte_correctly() { let mut rng = rand::thread_rng(); let mut bytes = Vec::new(); let mut b64 = String::new(); - let mut decoded = Vec::new(); + let mut stream_decoded = Vec::new(); + let mut bulk_decoded = Vec::new(); for _ in 0..10_000 { bytes.clear(); b64.clear(); - decoded.clear(); + stream_decoded.clear(); + bulk_decoded.clear(); let size = rng.gen_range(1..(10 * BUF_SIZE)); bytes.extend(iter::repeat(0).take(size)); rng.fill_bytes(&mut bytes[..size]); assert_eq!(size, bytes.len()); - let engine = random_engine(&mut rng); + let engine = GeneralPurpose::new(&alphabet::STANDARD, random_config(&mut rng)); engine.encode_string(&bytes[..], &mut b64); // replace one byte, somewhere, with '*', which is invalid @@ -270,9 +273,8 @@ fn reports_invalid_byte_correctly() { let mut wrapped_reader = io::Cursor::new(b64_bytes.clone()); let mut decoder = DecoderReader::new(&mut wrapped_reader, &engine); - // some gymnastics to avoid double-moving the io::Error, which is not Copy let read_decode_err = decoder - .read_to_end(&mut decoded) + .read_to_end(&mut stream_decoded) .map_err(|e| { let kind = e.kind(); let inner = e @@ -283,8 +285,7 @@ fn reports_invalid_byte_correctly() { .err() .and_then(|o| o); - let mut bulk_buf = Vec::new(); - let bulk_decode_err = engine.decode_vec(&b64_bytes[..], &mut bulk_buf).err(); + let bulk_decode_err = engine.decode_vec(&b64_bytes[..], &mut bulk_decoded).err(); // it's tricky to predict where the invalid data's offset will be since if it's in the last // chunk it will be reported at the first padding location because it's treated as invalid @@ -296,6 +297,134 @@ fn reports_invalid_byte_correctly() { } } +#[test] +fn internal_padding_error_with_short_read_concatenated_texts_invalid_byte_error() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut reader_decoded = Vec::new(); + let mut bulk_decoded = Vec::new(); + + // encodes with padding, requires that padding be present so we don't get InvalidPadding + // just because padding is there at all + let engine = STANDARD; + + for _ in 0..10_000 { + bytes.clear(); + b64.clear(); + reader_decoded.clear(); + bulk_decoded.clear(); + + // at least 2 bytes so there can be a split point between bytes + let size = rng.gen_range(2..(10 * BUF_SIZE)); + bytes.resize(size, 0); + rng.fill_bytes(&mut bytes[..size]); + + // Concatenate two valid b64s, yielding padding in the middle. 
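Reduced to its smallest form, this is the concatenation case the test targets (illustrative): two individually valid encodings of "A" glued together now fail, with the error pointing at the first text's padding, matching bulk decode.

```rust
use std::io::Read as _;

use base64::{engine::general_purpose::STANDARD, read::DecoderReader, DecodeError};

let concatenated = b"QQ==QQ=="; // "A" + "A", each encoded separately
let mut reader = DecoderReader::new(&concatenated[..], &STANDARD);
let io_err = reader.read_to_end(&mut Vec::new()).unwrap_err();
let decode_err = io_err
    .into_inner()
    .and_then(|inner| inner.downcast::<DecodeError>().ok())
    .unwrap();
assert_eq!(DecodeError::InvalidByte(2, b'='), *decode_err);
```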
+ // This avoids scenarios that are challenging to assert on, like random padding location + // that might be InvalidLastSymbol when decoded at certain buffer sizes but InvalidByte + // when done all at once. + let split = loop { + // find a split point that will produce padding on the first part + let s = rng.gen_range(1..size); + if s % 3 != 0 { + // short enough to need padding + break s; + }; + }; + + engine.encode_string(&bytes[..split], &mut b64); + assert!(b64.contains('='), "split: {}, b64: {}", split, b64); + let bad_byte_pos = b64.find('=').unwrap(); + engine.encode_string(&bytes[split..], &mut b64); + let b64_bytes = b64.as_bytes(); + + // short read to make it plausible for padding to happen on a read boundary + let read_len = rng.gen_range(1..10); + let mut wrapped_reader = ShortRead { + max_read_len: read_len, + delegate: io::Cursor::new(&b64_bytes), + }; + + let mut decoder = DecoderReader::new(&mut wrapped_reader, &engine); + + let read_decode_err = decoder + .read_to_end(&mut reader_decoded) + .map_err(|e| { + *e.into_inner() + .and_then(|e| e.downcast::().ok()) + .unwrap() + }) + .unwrap_err(); + + let bulk_decode_err = engine.decode_vec(b64_bytes, &mut bulk_decoded).unwrap_err(); + + assert_eq!( + bulk_decode_err, + read_decode_err, + "read len: {}, bad byte pos: {}, b64: {}", + read_len, + bad_byte_pos, + std::str::from_utf8(b64_bytes).unwrap() + ); + assert_eq!( + DecodeError::InvalidByte( + split / 3 * 4 + + match split % 3 { + 1 => 2, + 2 => 3, + _ => unreachable!(), + }, + PAD_BYTE + ), + read_decode_err + ); + } +} + +#[test] +fn internal_padding_anywhere_error() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut reader_decoded = Vec::new(); + + // encodes with padding, requires that padding be present so we don't get InvalidPadding + // just because padding is there at all + let engine = STANDARD; + + for _ in 0..10_000 { + bytes.clear(); + b64.clear(); + reader_decoded.clear(); + + bytes.resize(10 * BUF_SIZE, 0); + rng.fill_bytes(&mut bytes[..]); + + // Just shove a padding byte in there somewhere. + // The specific error to expect is challenging to predict precisely because it + // will vary based on the position of the padding in the quad and the read buffer + // length, but SOMETHING should go wrong. + + engine.encode_string(&bytes[..], &mut b64); + let mut b64_bytes = b64.as_bytes().to_vec(); + // put padding somewhere other than the last quad + b64_bytes[rng.gen_range(0..bytes.len() - 4)] = PAD_BYTE; + + // short read to make it plausible for padding to happen on a read boundary + let read_len = rng.gen_range(1..10); + let mut wrapped_reader = ShortRead { + max_read_len: read_len, + delegate: io::Cursor::new(&b64_bytes), + }; + + let mut decoder = DecoderReader::new(&mut wrapped_reader, &engine); + + let result = decoder.read_to_end(&mut reader_decoded); + assert!(result.is_err()); + } +} + fn consume_with_short_reads_and_validate( rng: &mut rand::rngs::ThreadRng, expected_bytes: &[u8], @@ -344,3 +473,15 @@ impl<'a, 'b, R: io::Read, N: rand::Rng> io::Read for RandomShortRead<'a, 'b, R, self.delegate.read(&mut buf[..effective_len]) } } + +struct ShortRead { + delegate: R, + max_read_len: usize, +} + +impl io::Read for ShortRead { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let len = self.max_read_len.max(buf.len()); + self.delegate.read(&mut buf[..len]) + } +}
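A worked instance of the expected-offset formula used in the concatenation test above (illustrative): a 4-byte first part encodes to eight characters ending in "==", so its first pad byte sits at offset 6, which matches `split / 3 * 4 + 2` for `split % 3 == 1`.

```rust
use base64::{engine::general_purpose::STANDARD, Engine as _};

let first_part = STANDARD.encode([1_u8, 2, 3, 4]);
assert_eq!("AQIDBA==", first_part);
assert_eq!(6, first_part.find('=').unwrap());
assert_eq!(6, 4 / 3 * 4 + 2); // split = 4, split % 3 == 1
```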