Skip to content

Commit

Permalink
zstd: Add amd64 match length assembly (#824)
Browse files Browse the repository at this point in the history
* zstd: Add amd64 match length assembly

Copied from the S2 implementation. 5-10% faster.
  • Loading branch information
klauspost committed Jun 12, 2023
1 parent 9a951be commit a3a5dce
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 23 deletions.
2 changes: 1 addition & 1 deletion zstd/blockdec.go
Expand Up @@ -592,7 +592,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
}
seq.fse.setRLE(symb)
if debugDecoder {
printf("RLE set to %+v, code: %v", symb, v)
printf("RLE set to 0x%x, code: %v", symb, v)
}
case compModeFSE:
println("Reading table for", tableIndex(i))
Expand Down
16 changes: 16 additions & 0 deletions zstd/matchlen_amd64.go
@@ -0,0 +1,16 @@
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc

// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.

package zstd

// matchLen returns the number of leading bytes that are identical in a and b.
//
// The declaration has no Go body; under this file's build constraint the
// implementation is supplied externally (assembly).
//
// It assumes that:
//
//	len(a) <= len(b) and len(a) > 0
//
// Callers are responsible for upholding this; the declaration itself
// performs no checks.
//
//go:noescape
func matchLen(a []byte, b []byte) int
68 changes: 68 additions & 0 deletions zstd/matchlen_amd64.s
@@ -0,0 +1,68 @@
// Copied from S2 implementation.

//go:build !appengine && !noasm && gc && !noasm

#include "textflag.h"

// func matchLen(a []byte, b []byte) int
//
// Returns the number of leading bytes that are equal in a and b.
// Only a_len is loaded; b is read at the same offsets as a, so the caller
// must guarantee that b is at least as long as a.
//
// Requires: BMI (TZCNTQ) only when GOAMD64_v3 is defined; otherwise BSFQ is used.
//
// Register roles:
//   AX = base pointer of a
//   CX = base pointer of b
//   DX = number of bytes of a not yet compared
//   SI = number of bytes matched so far (also the current offset)
//   BX = scratch for loaded data / XOR result
//
// NOTE(review): the length and offset are updated and compared with 32-bit
// instructions (CMPL/LEAL/INCL), so only the low 32 bits of a_len take part —
// confirm callers never pass slices of 4 GiB or more.
TEXT ·matchLen(SB), NOSPLIT, $0-56
MOVQ a_base+0(FP), AX
MOVQ b_base+24(FP), CX
MOVQ a_len+8(FP), DX

// Start with zero matched bytes; skip the 8-byte loop when fewer than
// 8 bytes are available.
XORL SI, SI
CMPL DX, $0x08
JB matchlen_match4_standalone

// 8-byte loop: XOR one quadword of a with the quadword of b at the same
// offset. A zero result means all 8 bytes are equal.
matchlen_loopback_standalone:
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
TESTQ BX, BX
JZ matchlen_loop_standalone

// BX is non-zero here: find the index of its lowest set bit. Loads are
// little-endian, so that bit lies in the first differing byte.
#ifdef GOAMD64_v3
TZCNTQ BX, BX
#else
BSFQ BX, BX
#endif
// Convert the bit index to a byte index (divide by 8) and add it to the
// running count, then finish.
SARQ $0x03, BX
LEAL (SI)(BX*1), SI
JMP gen_match_len_end

// All 8 bytes matched: consume them and loop while at least 8 remain.
matchlen_loop_standalone:
LEAL -8(DX), DX
LEAL 8(SI), SI
CMPL DX, $0x08
JAE matchlen_loopback_standalone

// Tail, 4-byte step: taken only if at least 4 bytes remain and they are all
// equal; otherwise fall through to the 2-byte step at the same offset.
matchlen_match4_standalone:
CMPL DX, $0x04
JB matchlen_match2_standalone
MOVL (AX)(SI*1), BX
CMPL (CX)(SI*1), BX
JNE matchlen_match2_standalone
LEAL -4(DX), DX
LEAL 4(SI), SI

// Tail, 2-byte step: same pattern with a 16-bit compare.
matchlen_match2_standalone:
CMPL DX, $0x02
JB matchlen_match1_standalone
MOVW (AX)(SI*1), BX
CMPW (CX)(SI*1), BX
JNE matchlen_match1_standalone
LEAL -2(DX), DX
LEAL 2(SI), SI

// Tail, 1-byte step: count one more byte if one remains and it is equal.
matchlen_match1_standalone:
CMPL DX, $0x01
JB gen_match_len_end
MOVB (AX)(SI*1), BL
CMPB (CX)(SI*1), BL
JNE gen_match_len_end
INCL SI

// Store the matched byte count as the return value.
gen_match_len_end:
MOVQ SI, ret+48(FP)
RET
33 changes: 33 additions & 0 deletions zstd/matchlen_generic.go
@@ -0,0 +1,33 @@
//go:build !amd64 || appengine || !gc || noasm
// +build !amd64 appengine !gc noasm

// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.

package zstd

import (
"encoding/binary"
"math/bits"
)

// matchLen reports how many leading bytes a and b have in common.
// The caller must pass the shorter slice as a.
func matchLen(a, b []byte) (n int) {
	// Fast path: compare eight bytes per step as little-endian words.
	for len(a) >= 8 && len(b) >= 8 {
		x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
		if x != 0 {
			// The lowest set bit of the XOR sits in the first differing
			// byte; shifting by 3 turns the bit index into a byte index.
			firstDiff := bits.TrailingZeros64(x) >> 3
			return n + firstDiff
		}
		n += 8
		a, b = a[8:], b[8:]
	}

	// Slow path: finish the remaining bytes one at a time.
	for i, c := range a {
		if c != b[i] {
			return n
		}
		n++
	}
	return n
}
22 changes: 0 additions & 22 deletions zstd/zstd.go
Expand Up @@ -9,7 +9,6 @@ import (
"errors"
"log"
"math"
"math/bits"
)

// enable debug printing
Expand Down Expand Up @@ -106,27 +105,6 @@ func printf(format string, a ...interface{}) {
}
}

// matchLen computes the length of the longest common prefix of a and b.
// a is required to be no longer than b.
func matchLen(a, b []byte) (n int) {
	// Compare in 8-byte little-endian words while both slices allow it.
	for len(a) >= 8 && len(b) >= 8 {
		if xor := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); xor != 0 {
			// Trailing zero bits / 8 = number of equal bytes in this word.
			return n + bits.TrailingZeros64(xor)>>3
		}
		n += 8
		a = a[8:]
		b = b[8:]
	}

	// Byte-wise comparison of whatever is left of a.
	tail := 0
	for tail < len(a) && a[tail] == b[tail] {
		tail++
	}
	return n + tail
}

// load3232 decodes a little-endian uint32 from b, starting at byte offset i.
func load3232(b []byte, i int32) uint32 {
	// The full slice expression caps capacity at len(b) before applying the
	// offset (presumably a bounds-check hint for the compiler).
	window := b[:len(b):len(b)][i:]
	return binary.LittleEndian.Uint32(window)
}
Expand Down

0 comments on commit a3a5dce

Please sign in to comment.