Skip to content

Commit

Permalink
Merge pull request #100 from Caesurus/timdw/PR_to_pierrec
Browse files Browse the repository at this point in the history
Timdw/pr to pierrec
  • Loading branch information
pierrec committed Oct 31, 2020
2 parents e876bbd + 2751f8d commit 0e583d3
Show file tree
Hide file tree
Showing 10 changed files with 336 additions and 8 deletions.
2 changes: 1 addition & 1 deletion cmd/lz4c/compress.go
Expand Up @@ -10,8 +10,8 @@ import (
"code.cloudfoundry.org/bytefmt"
"github.com/schollz/progressbar"

"github.com/pierrec/cmdflag"
"github.com/pierrec/lz4"
"github.com/pierrec/cmdflag"
)

// Compress compresses a set of files or from stdin to stdout.
Expand Down
3 changes: 2 additions & 1 deletion fuzz/lz4.go
Expand Up @@ -2,8 +2,9 @@ package lz4

import (
"bytes"
"github.com/pierrec/lz4"
"io"

"github.com/pierrec/lz4"
)

// Fuzz function for the Reader and Writer.
Expand Down
2 changes: 1 addition & 1 deletion internal/xxh32/xxh32zero_test.go
Expand Up @@ -6,8 +6,8 @@ import (
"hash/fnv"
"testing"

qt "github.com/frankban/quicktest"
"github.com/pierrec/lz4/internal/xxh32"
qt "github.com/frankban/quicktest"
)

type test struct {
Expand Down
13 changes: 8 additions & 5 deletions lz4.go
Expand Up @@ -10,18 +10,20 @@
//
package lz4

import "math/bits"

import "sync"
import (
"math/bits"
"sync"
)

const (
// Extension is the LZ4 frame file name extension
Extension = ".lz4"
// Version is the LZ4 frame format version
Version = 1

frameMagic uint32 = 0x184D2204
frameSkipMagic uint32 = 0x184D2A50
frameMagic uint32 = 0x184D2204
frameSkipMagic uint32 = 0x184D2A50
frameMagicLegacy uint32 = 0x184C2102

// The following constants are used to setup the compression algorithm.
minMatch = 4 // the minimum size of the match sequence size (4 bytes)
Expand Down Expand Up @@ -108,6 +110,7 @@ type Header struct {
done bool // Header processed flag (Read or Write and checked).
}

// Reset clears the done flag so the Header is re-processed
// on the next Read or Write.
func (h *Header) Reset() {
	h.done = false
}
207 changes: 207 additions & 0 deletions reader_legacy.go
@@ -0,0 +1,207 @@
package lz4

import (
"encoding/binary"
"fmt"
"io"
)

// ReaderLegacy implements the legacy (frameMagicLegacy) LZ4 frame decoder.
// The Header is set after the first call to Read().
type ReaderLegacy struct {
	Header
	// Handler called when a block has been successfully read.
	// It provides the number of bytes read.
	OnBlockDone func(size int)

	lastBlock bool      // Set once a decoded block is shorter than 8MB (end of frame).
	buf       [8]byte   // Scrap buffer.
	pos       int64     // Current position in src.
	src       io.Reader // Source.
	zdata     []byte    // Compressed data (shares its backing array with data).
	data      []byte    // Uncompressed data.
	idx       int       // Index of unread bytes into data.
	skip      int64     // Bytes to skip before next read (set by Seek).
	dpos      int64     // Position in dest
}

// NewReaderLegacy returns a new legacy LZ4 frame decoder reading from src.
// No access to the underlying io.Reader is performed.
func NewReaderLegacy(src io.Reader) *ReaderLegacy {
	return &ReaderLegacy{src: src}
}

// readLegacyHeader checks the magic number of a legacy LZ4 frame.
// A legacy frame has no frame descriptor: only the 4-byte magic number
// precedes the blocks. It returns io.EOF on a clean end of stream and
// ErrInvalid when the magic number does not match frameMagicLegacy.
func (z *ReaderLegacy) readLegacyHeader() error {
	z.lastBlock = false
	magic, err := z.readUint32()
	if err != nil {
		// NOTE(review): pos is advanced by 4 even on a failed/partial read,
		// mirroring the success path — confirm this is intended.
		z.pos += 4
		if err == io.ErrUnexpectedEOF {
			return io.EOF
		}
		return err
	}
	if magic != frameMagicLegacy {
		return ErrInvalid
	}
	z.pos += 4

	// Legacy has fixed 8MB blocksizes
	// https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md#legacy-frame
	bSize := blockSize4M * 2

	// Allocate one backing array shared by the compressed (zdata, first half)
	// and uncompressed (data, second half) buffers.
	// The compressed buffer cannot exceed the uncompressed one.
	if n := 2 * bSize; cap(z.zdata) < n {
		z.zdata = make([]byte, n)
	}
	if debugFlag {
		debug("header block max size size=%d", bSize)
	}
	z.zdata = z.zdata[:bSize]
	z.data = z.zdata[:cap(z.zdata)][bSize:]
	z.idx = len(z.data) // no uncompressed bytes available yet

	z.Header.done = true
	if debugFlag {
		debug("header read: %v", z.Header)
	}

	return nil
}

// Read decompresses data from the underlying source into the supplied buffer.
//
// The legacy frame magic number is read lazily on the first call. Each block
// is decompressed in full into an internal buffer and then served to callers
// across subsequent Read calls; io.EOF is returned once the final (short)
// block has been fully consumed.
func (z *ReaderLegacy) Read(buf []byte) (int, error) {
	if debugFlag {
		debug("Read buf len=%d", len(buf))
	}
	// Lazily validate the frame magic on the first call.
	if !z.Header.done {
		if err := z.readLegacyHeader(); err != nil {
			return 0, err
		}
		if debugFlag {
			debug("header read OK compressed buffer %d / %d uncompressed buffer %d : %d index=%d",
				len(z.zdata), cap(z.zdata), len(z.data), cap(z.data), z.idx)
		}
	}

	if len(buf) == 0 {
		return 0, nil
	}

	if z.idx == len(z.data) {
		// No data ready for reading, process the next block.
		if debugFlag {
			debug(" reading block from writer %d %d", z.idx, blockSize4M*2)
		}

		// Reset uncompressed buffer
		z.data = z.zdata[:cap(z.zdata)][len(z.zdata):]

		// Block layout: 4-byte little-endian compressed length, then payload.
		bLen, err := z.readUint32()
		if err != nil {
			return 0, err
		}
		if debugFlag {
			debug(" bLen %d (0x%x) offset = %d (0x%x)", bLen, bLen, z.pos, z.pos)
		}
		z.pos += 4

		// Legacy blocks are always compressed, even when detrimental
		if debugFlag {
			debug(" compressed block size %d", bLen)
		}

		// Guard against a corrupt length that would overflow the shared buffer.
		if int(bLen) > cap(z.data) {
			return 0, fmt.Errorf("lz4: invalid block size: %d", bLen)
		}
		zdata := z.zdata[:bLen]
		if _, err := io.ReadFull(z.src, zdata); err != nil {
			return 0, err
		}
		z.pos += int64(bLen)

		n, err := UncompressBlock(zdata, z.data)
		if err != nil {
			return 0, err
		}

		z.data = z.data[:n]
		if z.OnBlockDone != nil {
			z.OnBlockDone(n)
		}

		z.idx = 0

		// Legacy blocks are fixed to 8MB, if we read a decompressed block smaller than this
		// it means we've reached the end...
		if n < blockSize4M*2 {
			z.lastBlock = true
		}
	}

	// Honor any pending Seek: if the skip spans past this block, consume the
	// block entirely and report 0 bytes read (Read may be called again).
	if z.skip > int64(len(z.data[z.idx:])) {
		z.skip -= int64(len(z.data[z.idx:]))
		z.dpos += int64(len(z.data[z.idx:]))
		z.idx = len(z.data)
		return 0, nil
	}

	z.idx += int(z.skip)
	z.dpos += z.skip
	z.skip = 0

	n := copy(buf, z.data[z.idx:])
	z.idx += n
	z.dpos += int64(n)
	if debugFlag {
		debug("%v] copied %d bytes to input (%d:%d)", z.lastBlock, n, z.idx, len(z.data))
	}
	if z.lastBlock && len(z.data) == z.idx {
		return n, io.EOF
	}
	return n, nil
}

// Seek implements io.Seeker, but supports seeking forward from the current
// position only. Any other seek will return an error. Allows skipping output
// bytes which aren't needed, which in some scenarios is faster than reading
// and discarding them.
// Note this may cause future calls to Read() to read 0 bytes if all of the
// data they would have returned is skipped.
func (z *ReaderLegacy) Seek(offset int64, whence int) (int64, error) {
	pos := z.dpos + z.skip
	if whence != io.SeekCurrent || offset < 0 {
		return pos, ErrUnsupportedSeek
	}
	// Defer the actual skipping to the next Read.
	z.skip += offset
	return pos + offset, nil
}

// Reset discards the Reader's state and makes it equivalent to the
// result of its original state from NewReader, but reading from r instead.
// This permits reusing a Reader rather than allocating a new one.
func (z *ReaderLegacy) Reset(r io.Reader) {
	z.Header = Header{}
	z.pos = 0
	z.src = r
	// Keep the allocated buffers for reuse, but drop their contents.
	z.zdata = z.zdata[:0]
	z.data = z.data[:0]
	z.idx = 0
	// Also clear the read-side bookkeeping. Leaving these stale would let a
	// prior Seek's pending skip and the reported output position (dpos)
	// leak into the next stream.
	z.skip = 0
	z.dpos = 0
	z.lastBlock = false
}

// readUint32 reads a little-endian uint32 from the source into the reader's
// scratch buffer, avoiding a per-call allocation. On error the returned value
// is whatever the scratch buffer holds and must be ignored by the caller.
func (z *ReaderLegacy) readUint32() (uint32, error) {
	scratch := z.buf[:4]
	_, err := io.ReadFull(z.src, scratch)
	return binary.LittleEndian.Uint32(scratch), err
}
117 changes: 117 additions & 0 deletions reader_legacy_test.go
@@ -0,0 +1,117 @@
package lz4_test

import (
"bytes"
"io"
"io/ioutil"
"os"
"reflect"
"strings"
"testing"

"github.com/pierrec/lz4"
)

// TestReaderLegacy decompresses golden legacy-frame files and checks the
// output against the stored raw originals, then exercises ReaderLegacy.Seek:
// invalid seeks (negative offset, SeekStart, SeekEnd) must fail without
// moving the position, and a forward SeekCurrent must land so the tail of
// the stream matches the original.
func TestReaderLegacy(t *testing.T) {
	goldenFiles := []string{
		"testdata/vmlinux_LZ4_19377.lz4",
		"testdata/bzImage_lz4_isolated.lz4",
	}

	for _, fname := range goldenFiles {
		t.Run(fname, func(t *testing.T) {
			// Capture the loop variable before t.Parallel() suspends this
			// subtest and the loop advances.
			fname := fname
			t.Parallel()

			var out bytes.Buffer
			// The expected output lives next to the .lz4 file, without the suffix.
			rawfile := strings.TrimSuffix(fname, ".lz4")
			raw, err := ioutil.ReadFile(rawfile)
			if err != nil {
				t.Fatal(err)
			}

			f, err := os.Open(fname)
			if err != nil {
				t.Fatal(err)
			}
			defer f.Close()

			// Full decompression must reproduce the original byte-for-byte.
			zr := lz4.NewReaderLegacy(f)
			n, err := io.Copy(&out, zr)
			if err != nil {
				t.Fatal(err, n)
			}

			if got, want := int(n), len(raw); got != want {
				t.Errorf("invalid sizes: got %d; want %d", got, want)
			}

			if got, want := out.Bytes(), raw; !reflect.DeepEqual(got, want) {
				t.Fatal("uncompressed data does not match original")
			}

			// The seek scenario below needs at least 20 bytes of output.
			if len(raw) < 20 {
				return
			}

			// Re-open to test partial reads and seeking on a fresh reader.
			f2, err := os.Open(fname)
			if err != nil {
				t.Fatal(err)
			}
			defer f2.Close()

			out.Reset()
			zr = lz4.NewReaderLegacy(f2)
			_, err = io.CopyN(&out, zr, 10)
			if err != nil {
				t.Fatal(err)
			}

			if !reflect.DeepEqual(out.Bytes(), raw[:10]) {
				t.Fatal("partial read does not match original")
			} else {
				t.Log("partial read is ok")
			}

			// Each unsupported seek must error and leave the position at 10.
			pos, err := zr.Seek(-1, io.SeekCurrent)
			if err == nil {
				t.Fatal("expected error from invalid seek")
			}
			if pos != 10 {
				t.Fatalf("unexpected position %d", pos)
			}
			pos, err = zr.Seek(1, io.SeekStart)
			if err == nil {
				t.Fatal("expected error from invalid seek")
			}
			if pos != 10 {
				t.Fatalf("unexpected position %d", pos)
			}
			pos, err = zr.Seek(-1, io.SeekEnd)
			if err == nil {
				t.Fatal("expected error from invalid seek")
			}
			if pos != 10 {
				t.Fatalf("unexpected position %d", pos)
			}

			// Skip forward to 10 bytes before the end of the stream.
			pos, err = zr.Seek(int64(len(raw)-20), io.SeekCurrent)
			if err != nil {
				t.Fatal(err)
			}
			if pos != int64(len(raw)-10) {
				t.Fatalf("unexpected position %d", pos)
			}

			out.Reset()
			_, err = io.CopyN(&out, zr, 10)
			if err != nil {
				t.Fatal(err)
			}

			// The final 10 bytes after the seek must match the original tail.
			if !reflect.DeepEqual(out.Bytes(), raw[len(raw)-10:]) {
				t.Fatal("after seek, partial read does not match original")
			}
		})
	}
}
Binary file added testdata/bzImage_lz4_isolated
Binary file not shown.
Binary file added testdata/bzImage_lz4_isolated.lz4
Binary file not shown.
Binary file added testdata/vmlinux_LZ4_19377
Binary file not shown.
Binary file added testdata/vmlinux_LZ4_19377.lz4
Binary file not shown.

0 comments on commit 0e583d3

Please sign in to comment.