Skip to content

Commit

Permalink
zstd: Remove offset from bitReader
Browse files Browse the repository at this point in the history
Since we read from the end, we can reslice instead of keeping a separate
offset. This gets rid of some bounds checks.

This commit also includes some other micro-optimizations to the bit-reading code. Combined results:

                                                     │   zstd/old   │              zstd/new               │
                                                     │     B/s      │     B/s       vs base               │
Decoder_DecoderSmall/kppkn.gtb.zst/buffered-8          427.6Mi ± 0%   428.2Mi ± 0%  +0.13% (p=0.019 n=10)
Decoder_DecoderSmall/kppkn.gtb.zst/unbuffered-8        511.6Mi ± 3%   516.9Mi ± 3%       ~ (p=0.280 n=10)
Decoder_DecoderSmall/geo.protodata.zst/buffered-8      1.110Gi ± 0%   1.110Gi ± 0%       ~ (p=0.165 n=10)
Decoder_DecoderSmall/geo.protodata.zst/unbuffered-8    824.7Mi ± 2%   827.3Mi ± 2%       ~ (p=0.481 n=10)
Decoder_DecoderSmall/plrabn12.txt.zst/buffered-8       330.4Mi ± 0%   330.3Mi ± 1%       ~ (p=0.645 n=10)
Decoder_DecoderSmall/plrabn12.txt.zst/unbuffered-8     533.3Mi ± 4%   538.8Mi ± 5%       ~ (p=0.393 n=10)
Decoder_DecoderSmall/lcet10.txt.zst/buffered-8         395.0Mi ± 0%   394.6Mi ± 0%  -0.10% (p=0.034 n=10)
Decoder_DecoderSmall/lcet10.txt.zst/unbuffered-8       556.5Mi ± 6%   546.2Mi ± 8%       ~ (p=0.436 n=10)
Decoder_DecoderSmall/asyoulik.txt.zst/buffered-8       342.2Mi ± 0%   342.2Mi ± 0%       ~ (p=0.956 n=10)
Decoder_DecoderSmall/asyoulik.txt.zst/unbuffered-8     436.7Mi ± 2%   435.4Mi ± 3%       ~ (p=0.739 n=10)
Decoder_DecoderSmall/alice29.txt.zst/buffered-8        335.6Mi ± 2%   337.0Mi ± 0%  +0.43% (p=0.000 n=10)
Decoder_DecoderSmall/alice29.txt.zst/unbuffered-8      552.6Mi ± 3%   550.7Mi ± 4%       ~ (p=1.000 n=10)
Decoder_DecoderSmall/html_x_4.zst/buffered-8           2.264Gi ± 0%   2.271Gi ± 0%  +0.29% (p=0.035 n=10)
Decoder_DecoderSmall/html_x_4.zst/unbuffered-8         1.558Gi ± 4%   1.554Gi ± 3%       ~ (p=0.579 n=10)
Decoder_DecoderSmall/paper-100k.pdf.zst/buffered-8     3.554Gi ± 5%   3.610Gi ± 0%  +1.59% (p=0.000 n=10)
Decoder_DecoderSmall/paper-100k.pdf.zst/unbuffered-8   1.701Gi ± 8%   1.709Gi ± 5%       ~ (p=0.631 n=10)
Decoder_DecoderSmall/fireworks.jpeg.zst/buffered-8     7.891Gi ± 4%   8.070Gi ± 0%  +2.26% (p=0.000 n=10)
Decoder_DecoderSmall/fireworks.jpeg.zst/unbuffered-8   3.062Gi ± 4%   3.129Gi ± 2%  +2.16% (p=0.002 n=10)
Decoder_DecoderSmall/urls.10K.zst/buffered-8           525.4Mi ± 6%   553.8Mi ± 0%  +5.39% (p=0.000 n=10)
Decoder_DecoderSmall/urls.10K.zst/unbuffered-8         763.7Mi ± 6%   819.7Mi ± 2%  +7.34% (p=0.000 n=10)
Decoder_DecoderSmall/html.zst/buffered-8               894.8Mi ± 0%   898.8Mi ± 2%  +0.45% (p=0.043 n=10)
Decoder_DecoderSmall/html.zst/unbuffered-8             722.3Mi ± 2%   717.7Mi ± 2%       ~ (p=0.912 n=10)
Decoder_DecoderSmall/comp-data.bin.zst/buffered-8      386.6Mi ± 2%   390.4Mi ± 0%  +1.00% (p=0.000 n=10)
Decoder_DecoderSmall/comp-data.bin.zst/unbuffered-8    145.2Mi ± 2%   148.7Mi ± 1%  +2.42% (p=0.003 n=10)
geomean                                                770.3Mi        777.5Mi       +0.93%
  • Loading branch information
greatroar committed Aug 18, 2023
1 parent b404607 commit 3f4e64f
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 97 deletions.
4 changes: 2 additions & 2 deletions zstd/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
brPointer := GP64()
Load(br.Field("value"), brValue)
Load(br.Field("bitsRead"), brBitsRead)
Load(br.Field("off"), brOffset)
Load(br.Field("in").Base(), brPointer)
Load(br.Field("in").Len(), brOffset)
ADDQ(brOffset, brPointer) // Add current offset to read pointer.
MOVQ(brPointer, brPointerStash)
}
Expand Down Expand Up @@ -438,7 +438,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
br := Dereference(Param("br"))
Store(brValue, br.Field("value"))
Store(brBitsRead.As8(), br.Field("bitsRead"))
Store(brOffset, br.Field("off"))
Store(brOffset, br.Field("in").Len())

if !o.useSeqs {
Comment("Update the context")
Expand Down
34 changes: 15 additions & 19 deletions zstd/bitreader.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
// for aligning the input.
type bitReader struct {
in []byte
off uint // next byte to read is at in[off - 1]
value uint64 // Maybe use [16]byte, but shifting is awkward.
bitsRead uint8
}
Expand All @@ -28,7 +27,6 @@ func (b *bitReader) init(in []byte) error {
return errors.New("corrupt stream: too short")
}
b.in = in
b.off = uint(len(in))
// The highest bit of the last byte indicates where to start
v := in[len(in)-1]
if v == 0 {
Expand Down Expand Up @@ -69,47 +67,45 @@ func (b *bitReader) fillFast() {
if b.bitsRead < 32 {
return
}
// 2 bounds checks.
v := b.in[b.off-4:]
v = v[:4]
v := b.in[len(b.in)-4:]
b.in = b.in[:len(b.in)-4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
}

// fillFastStart() assumes the bitreader is empty and there are at least 8 bytes to read.
func (b *bitReader) fillFastStart() {
// Do a single re-slice to avoid bounds checks.
b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
v := b.in[len(b.in)-8:]
b.in = b.in[:len(b.in)-8]
b.value = binary.LittleEndian.Uint64(v)
b.bitsRead = 0
b.off -= 8
}

// fill() will make sure at least 32 bits are available.
func (b *bitReader) fill() {
if b.bitsRead < 32 {
return
}
if b.off >= 4 {
v := b.in[b.off-4:]
v = v[:4]
if len(b.in) >= 4 {
v := b.in[len(b.in)-4:]
b.in = b.in[:len(b.in)-4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
return
}
for b.off > 0 {
b.value = (b.value << 8) | uint64(b.in[b.off-1])
b.bitsRead -= 8
b.off--

b.bitsRead -= uint8(8 * len(b.in))
for len(b.in) > 0 {
b.value = (b.value << 8) | uint64(b.in[len(b.in)-1])
b.in = b.in[:len(b.in)-1]
}
}

// finished returns true if all bits have been read from the bit stream.
func (b *bitReader) finished() bool {
return b.off == 0 && b.bitsRead >= 64
return len(b.in) == 0 && b.bitsRead >= 64
}

// overread returns true if more bits have been requested than are on the stream.
Expand All @@ -119,7 +115,7 @@ func (b *bitReader) overread() bool {

// remain returns the number of bits remaining.
func (b *bitReader) remain() uint {
return b.off*8 + 64 - uint(b.bitsRead)
return 8*uint(len(b.in)) + 64 - uint(b.bitsRead)
}

// close closes the bitstream and returns an error if out-of-buffer reads occurred.
Expand Down
17 changes: 6 additions & 11 deletions zstd/seqdec.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
return io.ErrUnexpectedEOF
}
var ll, mo, ml int
if br.off > 4+((maxOffsetBits+16+16)>>3) {
if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
// inlined function:
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)

Expand Down Expand Up @@ -452,18 +452,13 @@ func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol)

// extra bits are stored in reverse order.
br.fill()
if s.maxBits <= 32 {
mo += br.getBits(moB)
ml += br.getBits(mlB)
ll += br.getBits(llB)
} else {
mo += br.getBits(moB)
mo += br.getBits(moB)
if s.maxBits > 32 {
br.fill()
// matchlength+literal length, max 32 bits
ml += br.getBits(mlB)
ll += br.getBits(llB)

}
// matchlength+literal length, max 32 bits
ml += br.getBits(mlB)
ll += br.getBits(llB)
mo = s.adjustOffset(mo, ll, moB)
return
}
Expand Down
128 changes: 64 additions & 64 deletions zstd/seqdec_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -301,9 +301,9 @@ sequenceDecs_decode_amd64_match_len_ofs_ok:
MOVQ R12, 152(AX)
MOVQ R13, 160(AX)
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)

// Return success
MOVQ $0x00000000, ret+24(FP)
Expand Down Expand Up @@ -336,11 +336,11 @@ error_overread:
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -603,9 +603,9 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok:
MOVQ R12, 152(AX)
MOVQ R13, 160(AX)
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)

// Return success
MOVQ $0x00000000, ret+24(FP)
Expand Down Expand Up @@ -638,11 +638,11 @@ error_overread:
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -892,9 +892,9 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok:
MOVQ R11, 152(CX)
MOVQ R12, 160(CX)
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)

// Return success
MOVQ $0x00000000, ret+24(FP)
Expand Down Expand Up @@ -927,11 +927,11 @@ error_overread:
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -1152,9 +1152,9 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
MOVQ R11, 152(CX)
MOVQ R12, 160(CX)
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)

// Return success
MOVQ $0x00000000, ret+24(FP)
Expand Down Expand Up @@ -1797,11 +1797,11 @@ empty_seqs:
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -2295,9 +2295,9 @@ handle_loop:

loop_finished:
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)

// Update the context
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -2362,11 +2362,11 @@ error_not_enough_space:
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -2818,9 +2818,9 @@ handle_loop:

loop_finished:
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)

// Update the context
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -2885,11 +2885,11 @@ error_not_enough_space:
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -3485,9 +3485,9 @@ handle_loop:

loop_finished:
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)

// Update the context
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -3552,11 +3552,11 @@ error_not_enough_space:
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -4110,9 +4110,9 @@ handle_loop:

loop_finished:
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)

// Update the context
MOVQ ctx+16(FP), AX
Expand Down
2 changes: 1 addition & 1 deletion zstd/seqdec_generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
for i := range seqs {
var ll, mo, ml int
if br.off > 4+((maxOffsetBits+16+16)>>3) {
if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
// inlined function:
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)

Expand Down

0 comments on commit 3f4e64f

Please sign in to comment.