Skip to content

Commit

Permalink
Merge pull request #325 from jongiddy/write-multigzdecoder
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnTitor committed Apr 23, 2023
2 parents a9100e5 + ab891f1 commit 70944ea
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 0 deletions.
128 changes: 128 additions & 0 deletions src/gz/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,112 @@ impl<W: Read + Write> Read for GzDecoder<W> {
}
}

/// A gzip streaming decoder that decodes all members of a multistream
///
/// A gzip member consists of a header, compressed data and a trailer. The [gzip
/// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple
/// gzip members to be joined in a single stream. `MultiGzDecoder` will
/// decode all consecutive members while `GzDecoder` will only decompress
/// the first gzip member. The multistream format is commonly used in
/// bioinformatics, for example when using the BGZF compressed data.
///
/// This structure exposes a [`Write`] interface that will consume all gzip members
/// from the written buffers and write uncompressed data to the writer.
#[derive(Debug)]
pub struct MultiGzDecoder<W: Write> {
inner: GzDecoder<W>,
}

impl<W: Write> MultiGzDecoder<W> {
/// Creates a new decoder which will write uncompressed data to the stream.
/// If the gzip stream contains multiple members all will be decoded.
pub fn new(w: W) -> MultiGzDecoder<W> {
MultiGzDecoder {
inner: GzDecoder::new(w),
}
}

/// Returns the header associated with the current member.
pub fn header(&self) -> Option<&GzHeader> {
self.inner.header()
}

/// Acquires a reference to the underlying writer.
pub fn get_ref(&self) -> &W {
self.inner.get_ref()
}

/// Acquires a mutable reference to the underlying writer.
///
/// Note that mutating the output/input state of the stream may corrupt this
/// object, so care must be taken when using this method.
pub fn get_mut(&mut self) -> &mut W {
self.inner.get_mut()
}

/// Attempt to finish this output stream, writing out final chunks of data.
///
/// Note that this function can only be used once data has finished being
/// written to the output stream. After this function is called then further
/// calls to `write` may result in a panic.
///
/// # Panics
///
/// Attempts to write data to this stream may result in a panic after this
/// function is called.
///
/// # Errors
///
/// This function will perform I/O to finish the stream, returning any
/// errors which happen.
pub fn try_finish(&mut self) -> io::Result<()> {
self.inner.try_finish()
}

/// Consumes this decoder, flushing the output stream.
///
/// This will flush the underlying data stream and then return the contained
/// writer if the flush succeeded.
///
/// Note that this function may not be suitable to call in a situation where
/// the underlying stream is an asynchronous I/O stream. To finish a stream
/// the `try_finish` (or `shutdown`) method should be used instead. To
/// re-acquire ownership of a stream it is safe to call this method after
/// `try_finish` or `shutdown` has returned `Ok`.
///
/// # Errors
///
/// This function will perform I/O to complete this stream, and any I/O
/// errors which occur will be returned from this function.
pub fn finish(self) -> io::Result<W> {
self.inner.finish()
}
}

impl<W: Write> Write for MultiGzDecoder<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
if buf.is_empty() {
Ok(0)
} else {
match self.inner.write(buf) {
Ok(0) => {
// When the GzDecoder indicates that it has finished
// create a new GzDecoder to handle additional data.
self.inner.try_finish()?;
let w = self.inner.inner.take_inner().into_inner();
self.inner = GzDecoder::new(w);
self.inner.write(buf)
}
res => res,
}
}
}

fn flush(&mut self) -> io::Result<()> {
self.inner.flush()
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -447,4 +553,26 @@ mod tests {
let return_string = String::from_utf8(writer).expect("String parsing error");
assert_eq!(return_string, STR);
}

// Two or more gzip files concatenated form a multi-member gzip file. MultiGzDecoder will
// concatenate the decoded contents of all members.
#[test]
fn decode_multi_writer() {
let mut e = GzEncoder::new(Vec::new(), Compression::default());
e.write(STR.as_ref()).unwrap();
let bytes = e.finish().unwrap().repeat(2);

let mut writer = Vec::new();
let mut decoder = MultiGzDecoder::new(writer);
let mut count = 0;
while count < bytes.len() {
let n = decoder.write(&bytes[count..]).unwrap();
assert!(n != 0);
count += n;
}
writer = decoder.finish().unwrap();
let return_string = String::from_utf8(writer).expect("String parsing error");
let expected = STR.repeat(2);
assert_eq!(return_string, expected);
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ pub mod write {
pub use crate::deflate::write::DeflateEncoder;
pub use crate::gz::write::GzDecoder;
pub use crate::gz::write::GzEncoder;
pub use crate::gz::write::MultiGzDecoder;
pub use crate::zlib::write::ZlibDecoder;
pub use crate::zlib::write::ZlibEncoder;
}
Expand Down

0 comments on commit 70944ea

Please sign in to comment.