Skip to content

Commit

Permalink
mask.go: Use SIMD masking for amd64 and arm64
Browse files Browse the repository at this point in the history
goos: windows
goarch: amd64
pkg: nhooyr.io/websocket
cpu: Intel(R) Core(TM) i5-9300H CPU @ 2.40GHz
Benchmark_mask/2/basic-8         	425339004	         2.795 ns/op	 715.66 MB/s
Benchmark_mask/2/nhooyr-8        	379937766	         3.186 ns/op	 627.78 MB/s
Benchmark_mask/2/gorilla-8       	392164167	         3.071 ns/op	 651.24 MB/s
Benchmark_mask/2/gobwas-8        	310037222	         3.880 ns/op	 515.46 MB/s
Benchmark_mask/3/basic-8         	321408024	         3.806 ns/op	 788.32 MB/s
Benchmark_mask/3/nhooyr-8        	350726338	         3.478 ns/op	 862.58 MB/s
Benchmark_mask/3/gorilla-8       	332217727	         3.634 ns/op	 825.43 MB/s
Benchmark_mask/3/gobwas-8        	247376214	         4.886 ns/op	 614.01 MB/s
Benchmark_mask/4/basic-8         	261182472	         4.582 ns/op	 872.91 MB/s
Benchmark_mask/4/nhooyr-8        	381830712	         3.262 ns/op	1226.05 MB/s
Benchmark_mask/4/gorilla-8       	272616304	         4.395 ns/op	 910.04 MB/s
Benchmark_mask/4/gobwas-8        	204574558	         5.855 ns/op	 683.19 MB/s
Benchmark_mask/8/basic-8         	191330037	         6.162 ns/op	1298.24 MB/s
Benchmark_mask/8/nhooyr-8        	369694992	         3.285 ns/op	2435.65 MB/s
Benchmark_mask/8/gorilla-8       	175388466	         6.743 ns/op	1186.48 MB/s
Benchmark_mask/8/gobwas-8        	241719933	         4.886 ns/op	1637.45 MB/s
Benchmark_mask/16/basic-8        	100000000	        10.92 ns/op	1464.83 MB/s
Benchmark_mask/16/nhooyr-8       	272565096	         4.436 ns/op	3606.98 MB/s
Benchmark_mask/16/gorilla-8      	100000000	        11.20 ns/op	1428.53 MB/s
Benchmark_mask/16/gobwas-8       	221356798	         5.405 ns/op	2960.45 MB/s
Benchmark_mask/32/basic-8        	61476984	        20.40 ns/op	1568.80 MB/s
Benchmark_mask/32/nhooyr-8       	238665572	         5.050 ns/op	6337.22 MB/s
Benchmark_mask/32/gorilla-8      	100000000	        12.09 ns/op	2647.28 MB/s
Benchmark_mask/32/gobwas-8       	186077235	         6.477 ns/op	4940.36 MB/s
Benchmark_mask/128/basic-8       	14629720	        80.90 ns/op	1582.19 MB/s
Benchmark_mask/128/nhooyr-8      	181241968	         6.565 ns/op	19497.98 MB/s
Benchmark_mask/128/gorilla-8     	68308342	        16.76 ns/op	7639.37 MB/s
Benchmark_mask/128/gobwas-8      	94582026	        12.97 ns/op	9872.11 MB/s
Benchmark_mask/512/basic-8       	 3921001	       305.6 ns/op	1675.55 MB/s
Benchmark_mask/512/nhooyr-8      	123102199	         9.721 ns/op	52669.11 MB/s
Benchmark_mask/512/gorilla-8     	32355914	        38.18 ns/op	13411.43 MB/s
Benchmark_mask/512/gobwas-8      	31528501	        37.80 ns/op	13544.37 MB/s
Benchmark_mask/4096/basic-8      	  491804	      2381 ns/op	1720.39 MB/s
Benchmark_mask/4096/nhooyr-8     	26159691	        46.98 ns/op	87187.73 MB/s
Benchmark_mask/4096/gorilla-8    	 4898440	       243.6 ns/op	16817.89 MB/s
Benchmark_mask/4096/gobwas-8     	 4336398	       277.2 ns/op	14776.40 MB/s
Benchmark_mask/16384/basic-8     	  113842	      9623 ns/op	1702.66 MB/s
Benchmark_mask/16384/nhooyr-8    	 8088847	       154.5 ns/op	106058.18 MB/s
Benchmark_mask/16384/gorilla-8   	 1282993	       933.6 ns/op	17549.90 MB/s
Benchmark_mask/16384/gobwas-8    	  997347	      1086 ns/op	15093.49 MB/s

We're about 4-5x faster than gorilla now.
  • Loading branch information
wdvxdr1123 authored and nhooyr committed Oct 19, 2023
1 parent 535fd2c commit cfca343
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 1 deletion.
2 changes: 1 addition & 1 deletion frame.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) {
// to be in little endian.
//
// See https://github.com/golang/go/issues/31586
func mask(key uint32, b []byte) uint32 {
func maskGo(key uint32, b []byte) uint32 {

Check failure on line 187 in frame.go

View workflow job for this annotation

GitHub Actions / lint

func maskGo is unused (U1000)
if len(b) >= 8 {
key64 := uint64(key)<<32 | uint64(key)

Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module nhooyr.io/websocket

go 1.19

require golang.org/x/sys v0.13.0
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
152 changes: 152 additions & 0 deletions mask_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include "textflag.h"

// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs len bytes starting at b, in place, with the repeating 4-byte
// key (the low byte of key is applied to the first byte) and returns the key
// rotated so that its low byte is the one that would be applied to the next
// byte after the buffer.
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	// AX = b
	// CX = len (left length)
	// SI = key (uint32)
	// DI = uint64(SI) | uint64(SI)<<32
	MOVQ b+0(FP), AX
	MOVQ len+8(FP), CX
	MOVL key+16(FP), SI

	// calculate the DI
	// DI = SI<<32 | SI
	MOVL SI, DI
	MOVQ DI, DX
	SHLQ $32, DI
	ORQ DX, DI

	// Dispatch on length: short buffers go straight to the scalar tail,
	// medium ones to the SSE loop, and only buffers longer than 128 bytes
	// pay for aligning the pointer to 32 bytes first.
	CMPQ CX, $15
	JLE less_than_16
	CMPQ CX, $63
	JLE less_than_64
	CMPQ CX, $128
	JLE sse
	TESTQ $31, AX
	JNZ unaligned

aligned:
	// useAVX2 is a Go package-level variable read directly from here.
	CMPB ·useAVX2(SB), $1
	JE avx2
	JMP sse

unaligned_loop_1byte:
	// Mask one byte at a time until AX is 8-byte aligned. ROLL $24 is a
	// rotate right by 8, moving the next key byte into the low position.
	XORB SI, (AX)
	INCQ AX
	DECQ CX
	ROLL $24, SI
	TESTQ $7, AX
	JNZ unaligned_loop_1byte

	// calculate DI again since SI was modified
	// DI = SI<<32 | SI
	MOVL SI, DI
	MOVQ DI, DX
	SHLQ $32, DI
	ORQ DX, DI

	TESTQ $31, AX
	JZ aligned

unaligned:
	TESTQ $7, AX // if the pointer is not 8-byte aligned, go byte by byte first.
	JNZ unaligned_loop_1byte

unaligned_loop:
	// Mask 8 bytes at a time until AX is 32-byte aligned.
	// we don't need to check the CX since we know it's above 128
	XORQ DI, (AX)
	ADDQ $8, AX
	SUBQ $8, CX
	TESTQ $31, AX
	JNZ unaligned_loop
	JMP aligned

avx2:
	// Alignment may have consumed bytes, so re-check that a full 128-byte
	// iteration is still available before touching any YMM register.
	CMPQ CX, $0x80
	JL sse
	VMOVQ DI, X0
	VPBROADCASTQ X0, Y0

avx2_loop:
	VPXOR (AX), Y0, Y1
	VPXOR 32(AX), Y0, Y2
	VPXOR 64(AX), Y0, Y3
	VPXOR 96(AX), Y0, Y4
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	ADDQ $0x80, AX
	SUBQ $0x80, CX
	CMPQ CX, $0x80
	JAE avx2_loop // loop if CX >= 0x80

	// Clear the upper halves of the YMM registers before falling through to
	// the legacy-encoded SSE code below and before returning to Go code;
	// leaving them dirty incurs AVX/SSE transition penalties.
	VZEROUPPER

sse:
	CMPQ CX, $0x40
	JL less_than_64
	MOVQ DI, X0
	PUNPCKLQDQ X0, X0

sse_loop:
	MOVOU 0*16(AX), X1
	MOVOU 1*16(AX), X2
	MOVOU 2*16(AX), X3
	MOVOU 3*16(AX), X4
	PXOR X0, X1
	PXOR X0, X2
	PXOR X0, X3
	PXOR X0, X4
	MOVOU X1, 0*16(AX)
	MOVOU X2, 1*16(AX)
	MOVOU X3, 2*16(AX)
	MOVOU X4, 3*16(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, CX
	CMPQ CX, $0x40
	JAE sse_loop

	// Tail: fewer than 64 bytes remain. CX is no longer decremented; each
	// step tests one bit of the remaining length and masks that many bytes.
less_than_64:
	TESTQ $32, CX
	JZ less_than_32
	XORQ DI, (AX)
	XORQ DI, 8(AX)
	XORQ DI, 16(AX)
	XORQ DI, 24(AX)
	ADDQ $32, AX

less_than_32:
	TESTQ $16, CX
	JZ less_than_16
	XORQ DI, (AX)
	XORQ DI, 8(AX)
	ADDQ $16, AX

less_than_16:
	TESTQ $8, CX
	JZ less_than_8
	XORQ DI, (AX)
	ADDQ $8, AX

less_than_8:
	TESTQ $4, CX
	JZ less_than_4
	XORL SI, (AX)
	ADDQ $4, AX

less_than_4:
	// From here on the key must be rotated, because fewer than 4 bytes
	// (one full key period) are consumed per step.
	TESTQ $2, CX
	JZ less_than_2
	XORW SI, (AX)
	ROLL $16, SI
	ADDQ $2, AX

less_than_2:
	TESTQ $1, CX
	JZ done
	XORB SI, (AX)
	ROLL $24, SI

done:
	MOVL SI, ret+24(FP)
	RET
74 changes: 74 additions & 0 deletions mask_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#include "textflag.h"

// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs len bytes starting at b, in place, with the repeating 4-byte
// key (the low byte of key is applied to the first byte) and returns the key
// rotated so that its low byte is the one that would be applied to the next
// byte after the buffer.
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	// R0 = b
	// R1 = len
	// R2 = uint64(key)<<32 | uint64(key)
	// R3 = key (uint32)
	// The argument names below match the Go declaration (b, len, key) so
	// that go vet's asmdecl check accepts them.
	MOVD b+0(FP), R0
	MOVD len+8(FP), R1
	MOVWU key+16(FP), R3
	MOVD R3, R2
	ORR R2<<32, R2, R2
	// V0 holds the key repeated across all 16 bytes.
	VDUP R2, V0.D2
	CMP $64, R1
	BLT less_than_64

// todo: optimize unaligned case
loop_64:
	// Mask 64 bytes per iteration; VST1.P post-increments R0 by 64.
	VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
	VEOR V1.B16, V0.B16, V1.B16
	VEOR V2.B16, V0.B16, V2.B16
	VEOR V3.B16, V0.B16, V3.B16
	VEOR V4.B16, V0.B16, V4.B16
	VST1.P [V1.B16, V2.B16, V3.B16, V4.B16], 64(R0)
	SUBS $64, R1
	CMP $64, R1
	BGE loop_64

	// Tail: fewer than 64 bytes remain. R1 is no longer decremented; each
	// step tests one bit of the remaining length and masks that many bytes.
less_than_64:
	// quick end
	CBZ R1, end
	TBZ $5, R1, less_than32
	VLD1 (R0), [V1.B16, V2.B16]
	VEOR V1.B16, V0.B16, V1.B16
	VEOR V2.B16, V0.B16, V2.B16
	VST1.P [V1.B16, V2.B16], 32(R0)

less_than32:
	TBZ $4, R1, less_than16
	LDP (R0), (R11, R12)
	EOR R11, R2, R11
	EOR R12, R2, R12
	STP.P (R11, R12), 16(R0)

less_than16:
	TBZ $3, R1, less_than8
	MOVD (R0), R11
	EOR R2, R11, R11
	MOVD.P R11, 8(R0)

less_than8:
	TBZ $2, R1, less_than4
	MOVWU (R0), R11
	EORW R2, R11, R11
	MOVWU.P R11, 4(R0)

less_than4:
	// From here on the key must be rotated, because fewer than 4 bytes
	// (one full key period) are consumed per step.
	TBZ $1, R1, less_than2
	MOVHU (R0), R11
	EORW R3, R11, R11
	MOVHU.P R11, 2(R0)
	RORW $16, R3

less_than2:
	TBZ $0, R1, end
	MOVBU (R0), R11
	EORW R3, R11, R11
	MOVBU.P R11, 1(R0)
	RORW $8, R3

end:
	MOVWU R3, ret+24(FP)
	RET
19 changes: 19 additions & 0 deletions mask_asm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//go:build !appengine && (amd64 || arm64)
// +build !appengine
// +build amd64 arm64

package websocket

import "golang.org/x/sys/cpu"

// mask XORs b in place with key using the assembly implementation and
// returns the key value handed back by maskAsm.
//
// An empty slice is returned early with key unchanged: maskAsm takes a
// pointer to the first element, and &b[0] would panic on an empty slice.
func mask(key uint32, b []byte) uint32 {
	n := len(b)
	if n == 0 {
		return key
	}
	return maskAsm(&b[0], n, key)
}

// useAVX2 records whether the CPU reports AVX2 support. It is not read by any
// Go code in this file; the amd64 assembly in mask_amd64.s compares it
// against 1 to choose between its AVX2 and SSE loops, which is why Go-level
// linters report it as unused.
//
//lint:ignore U1000 referenced from mask_amd64.s
var useAVX2 = cpu.X86.HasAVX2

Check failure on line 16 in mask_asm.go

View workflow job for this annotation

GitHub Actions / lint

var useAVX2 is unused (U1000)

// maskAsm is implemented in assembly (mask_amd64.s and mask_arm64.s). It XORs
// the len bytes starting at b, in place, with the repeating 4-byte key and
// returns the rotated key to use for the bytes that follow. Callers must pass
// a valid pointer; mask only calls it when the slice is non-empty.
//
//go:noescape
func maskAsm(b *byte, len int, key uint32) uint32
7 changes: 7 additions & 0 deletions mask_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
//go:build appengine || (!amd64 && !arm64 && !js)

package websocket

// mask is the fallback for builds selected by this file's build constraint
// (appengine, or any architecture other than amd64/arm64 that is not js).
// It simply forwards to the pure-Go implementation maskGo.
func mask(key uint32, b []byte) uint32 {
	next := maskGo(key, b)
	return next
}

0 comments on commit cfca343

Please sign in to comment.