From 5df0303d0a24d67de232a55f55ff3cbf9f8fc6ac Mon Sep 17 00:00:00 2001 From: wdvxdr Date: Mon, 24 Jan 2022 19:25:11 +0800 Subject: [PATCH 01/26] mask.go: Use SIMD masking for amd64 and arm64 goos: windows goarch: amd64 pkg: nhooyr.io/websocket cpu: Intel(R) Core(TM) i5-9300H CPU @ 2.40GHz Benchmark_mask/2/basic-8 425339004 2.795 ns/op 715.66 MB/s Benchmark_mask/2/nhooyr-8 379937766 3.186 ns/op 627.78 MB/s Benchmark_mask/2/gorilla-8 392164167 3.071 ns/op 651.24 MB/s Benchmark_mask/2/gobwas-8 310037222 3.880 ns/op 515.46 MB/s Benchmark_mask/3/basic-8 321408024 3.806 ns/op 788.32 MB/s Benchmark_mask/3/nhooyr-8 350726338 3.478 ns/op 862.58 MB/s Benchmark_mask/3/gorilla-8 332217727 3.634 ns/op 825.43 MB/s Benchmark_mask/3/gobwas-8 247376214 4.886 ns/op 614.01 MB/s Benchmark_mask/4/basic-8 261182472 4.582 ns/op 872.91 MB/s Benchmark_mask/4/nhooyr-8 381830712 3.262 ns/op 1226.05 MB/s Benchmark_mask/4/gorilla-8 272616304 4.395 ns/op 910.04 MB/s Benchmark_mask/4/gobwas-8 204574558 5.855 ns/op 683.19 MB/s Benchmark_mask/8/basic-8 191330037 6.162 ns/op 1298.24 MB/s Benchmark_mask/8/nhooyr-8 369694992 3.285 ns/op 2435.65 MB/s Benchmark_mask/8/gorilla-8 175388466 6.743 ns/op 1186.48 MB/s Benchmark_mask/8/gobwas-8 241719933 4.886 ns/op 1637.45 MB/s Benchmark_mask/16/basic-8 100000000 10.92 ns/op 1464.83 MB/s Benchmark_mask/16/nhooyr-8 272565096 4.436 ns/op 3606.98 MB/s Benchmark_mask/16/gorilla-8 100000000 11.20 ns/op 1428.53 MB/s Benchmark_mask/16/gobwas-8 221356798 5.405 ns/op 2960.45 MB/s Benchmark_mask/32/basic-8 61476984 20.40 ns/op 1568.80 MB/s Benchmark_mask/32/nhooyr-8 238665572 5.050 ns/op 6337.22 MB/s Benchmark_mask/32/gorilla-8 100000000 12.09 ns/op 2647.28 MB/s Benchmark_mask/32/gobwas-8 186077235 6.477 ns/op 4940.36 MB/s Benchmark_mask/128/basic-8 14629720 80.90 ns/op 1582.19 MB/s Benchmark_mask/128/nhooyr-8 181241968 6.565 ns/op 19497.98 MB/s Benchmark_mask/128/gorilla-8 68308342 16.76 ns/op 7639.37 MB/s Benchmark_mask/128/gobwas-8 94582026 12.97 ns/op 9872.11 MB/s Benchmark_mask/512/basic-8 3921001 305.6 ns/op 1675.55 MB/s Benchmark_mask/512/nhooyr-8 123102199 9.721 ns/op 52669.11 MB/s Benchmark_mask/512/gorilla-8 32355914 38.18 ns/op 13411.43 MB/s Benchmark_mask/512/gobwas-8 31528501 37.80 ns/op 13544.37 MB/s Benchmark_mask/4096/basic-8 491804 2381 ns/op 1720.39 MB/s Benchmark_mask/4096/nhooyr-8 26159691 46.98 ns/op 87187.73 MB/s Benchmark_mask/4096/gorilla-8 4898440 243.6 ns/op 16817.89 MB/s Benchmark_mask/4096/gobwas-8 4336398 277.2 ns/op 14776.40 MB/s Benchmark_mask/16384/basic-8 113842 9623 ns/op 1702.66 MB/s Benchmark_mask/16384/nhooyr-8 8088847 154.5 ns/op 106058.18 MB/s Benchmark_mask/16384/gorilla-8 1282993 933.6 ns/op 17549.90 MB/s Benchmark_mask/16384/gobwas-8 997347 1086 ns/op 15093.49 MB/s We're about 4-5x faster then gorilla now. --- frame.go | 2 +- go.mod | 2 + go.sum | 2 + mask_amd64.s | 152 ++++++++++++++++++++++++++++++++++++++++++++++++ mask_arm64.s | 74 +++++++++++++++++++++++ mask_asm.go | 19 ++++++ mask_generic.go | 7 +++ 7 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 go.sum create mode 100644 mask_amd64.s create mode 100644 mask_arm64.s create mode 100644 mask_asm.go create mode 100644 mask_generic.go diff --git a/frame.go b/frame.go index 351632fd..eec15bb9 100644 --- a/frame.go +++ b/frame.go @@ -184,7 +184,7 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) { // to be in little endian. // // See https://github.com/golang/go/issues/31586 -func mask(key uint32, b []byte) uint32 { +func maskGo(key uint32, b []byte) uint32 { if len(b) >= 8 { key64 := uint64(key)<<32 | uint64(key) diff --git a/go.mod b/go.mod index 715a9f7a..285b955f 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module nhooyr.io/websocket go 1.19 + +require golang.org/x/sys v0.13.0 diff --git a/go.sum b/go.sum new file mode 100644 index 00000000..d4673ecf --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/mask_amd64.s b/mask_amd64.s new file mode 100644 index 00000000..caca53ec --- /dev/null +++ b/mask_amd64.s @@ -0,0 +1,152 @@ +#include "textflag.h" + +// func maskAsm(b *byte, len int, key uint32) +TEXT ·maskAsm(SB), NOSPLIT, $0-28 + // AX = b + // CX = len (left length) + // SI = key (uint32) + // DI = uint64(SI) | uint64(SI)<<32 + MOVQ b+0(FP), AX + MOVQ len+8(FP), CX + MOVL key+16(FP), SI + + // calculate the DI + // DI = SI<<32 | SI + MOVL SI, DI + MOVQ DI, DX + SHLQ $32, DI + ORQ DX, DI + + CMPQ CX, $15 + JLE less_than_16 + CMPQ CX, $63 + JLE less_than_64 + CMPQ CX, $128 + JLE sse + TESTQ $31, AX + JNZ unaligned + +aligned: + CMPB ·useAVX2(SB), $1 + JE avx2 + JMP sse + +unaligned_loop_1byte: + XORB SI, (AX) + INCQ AX + DECQ CX + ROLL $24, SI + TESTQ $7, AX + JNZ unaligned_loop_1byte + + // calculate DI again since SI was modified + // DI = SI<<32 | SI + MOVL SI, DI + MOVQ DI, DX + SHLQ $32, DI + ORQ DX, DI + + TESTQ $31, AX + JZ aligned + +unaligned: + TESTQ $7, AX // AND $7 & len, if not zero jump to loop_1b. + JNZ unaligned_loop_1byte + +unaligned_loop: + // we don't need to check the CX since we know it's above 128 + XORQ DI, (AX) + ADDQ $8, AX + SUBQ $8, CX + TESTQ $31, AX + JNZ unaligned_loop + JMP aligned + +avx2: + CMPQ CX, $0x80 + JL sse + VMOVQ DI, X0 + VPBROADCASTQ X0, Y0 + +avx2_loop: + VPXOR (AX), Y0, Y1 + VPXOR 32(AX), Y0, Y2 + VPXOR 64(AX), Y0, Y3 + VPXOR 96(AX), Y0, Y4 + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VMOVDQU Y3, 64(AX) + VMOVDQU Y4, 96(AX) + ADDQ $0x80, AX + SUBQ $0x80, CX + CMPQ CX, $0x80 + JAE avx2_loop // loop if CX >= 0x80 + +sse: + CMPQ CX, $0x40 + JL less_than_64 + MOVQ DI, X0 + PUNPCKLQDQ X0, X0 + +sse_loop: + MOVOU 0*16(AX), X1 + MOVOU 1*16(AX), X2 + MOVOU 2*16(AX), X3 + MOVOU 3*16(AX), X4 + PXOR X0, X1 + PXOR X0, X2 + PXOR X0, X3 + PXOR X0, X4 + MOVOU X1, 0*16(AX) + MOVOU X2, 1*16(AX) + MOVOU X3, 2*16(AX) + MOVOU X4, 3*16(AX) + ADDQ $0x40, AX + SUBQ $0x40, CX + CMPQ CX, $0x40 + JAE sse_loop + +less_than_64: + TESTQ $32, CX + JZ less_than_32 + XORQ DI, (AX) + XORQ DI, 8(AX) + XORQ DI, 16(AX) + XORQ DI, 24(AX) + ADDQ $32, AX + +less_than_32: + TESTQ $16, CX + JZ less_than_16 + XORQ DI, (AX) + XORQ DI, 8(AX) + ADDQ $16, AX + +less_than_16: + TESTQ $8, CX + JZ less_than_8 + XORQ DI, (AX) + ADDQ $8, AX + +less_than_8: + TESTQ $4, CX + JZ less_than_4 + XORL SI, (AX) + ADDQ $4, AX + +less_than_4: + TESTQ $2, CX + JZ less_than_2 + XORW SI, (AX) + ROLL $16, SI + ADDQ $2, AX + +less_than_2: + TESTQ $1, CX + JZ done + XORB SI, (AX) + ROLL $24, SI + +done: + MOVL SI, ret+24(FP) + RET diff --git a/mask_arm64.s b/mask_arm64.s new file mode 100644 index 00000000..624cb720 --- /dev/null +++ b/mask_arm64.s @@ -0,0 +1,74 @@ +#include "textflag.h" + +// func maskAsm(b *byte,len, int, key uint32) +TEXT ·maskAsm(SB), NOSPLIT, $0-28 + // R0 = b + // R1 = len + // R2 = uint64(key)<<32 | uint64(key) + // R3 = key (uint32) + MOVD b_ptr+0(FP), R0 + MOVD b_len+8(FP), R1 + MOVWU key+16(FP), R3 + MOVD R3, R2 + ORR R2<<32, R2, R2 + VDUP R2, V0.D2 + CMP $64, R1 + BLT less_than_64 + + // todo: optimize unaligned case +loop_64: + VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16] + VEOR V1.B16, V0.B16, V1.B16 + VEOR V2.B16, V0.B16, V2.B16 + VEOR V3.B16, V0.B16, V3.B16 + VEOR V4.B16, V0.B16, V4.B16 + VST1.P [V1.B16, V2.B16, V3.B16, V4.B16], 64(R0) + SUBS $64, R1 + CMP $64, R1 + BGE loop_64 + +less_than_64: + // quick end + CBZ R1, end + TBZ $5, R1, less_than32 + VLD1 (R0), [V1.B16, V2.B16] + VEOR V1.B16, V0.B16, V1.B16 + VEOR V2.B16, V0.B16, V2.B16 + VST1.P [V1.B16, V2.B16], 32(R0) + +less_than32: + TBZ $4, R1, less_than16 + LDP (R0), (R11, R12) + EOR R11, R2, R11 + EOR R12, R2, R12 + STP.P (R11, R12), 16(R0) + +less_than16: + TBZ $3, R1, less_than8 + MOVD (R0), R11 + EOR R2, R11, R11 + MOVD.P R11, 8(R0) + +less_than8: + TBZ $2, R1, less_than4 + MOVWU (R0), R11 + EORW R2, R11, R11 + MOVWU.P R11, 4(R0) + +less_than4: + TBZ $1, R1, less_than2 + MOVHU (R0), R11 + EORW R3, R11, R11 + MOVHU.P R11, 2(R0) + RORW $16, R3 + +less_than2: + TBZ $0, R1, end + MOVBU (R0), R11 + EORW R3, R11, R11 + MOVBU.P R11, 1(R0) + RORW $8, R3 + +end: + MOVWU R3, ret+24(FP) + RET diff --git a/mask_asm.go b/mask_asm.go new file mode 100644 index 00000000..a18a20e5 --- /dev/null +++ b/mask_asm.go @@ -0,0 +1,19 @@ +//go:build !appengine && (amd64 || arm64) +// +build !appengine +// +build amd64 arm64 + +package websocket + +import "golang.org/x/sys/cpu" + +func mask(key uint32, b []byte) uint32 { + if len(b) > 0 { + return maskAsm(&b[0], len(b), key) + } + return key +} + +var useAVX2 = cpu.X86.HasAVX2 + +//go:noescape +func maskAsm(b *byte, len int, key uint32) uint32 diff --git a/mask_generic.go b/mask_generic.go new file mode 100644 index 00000000..6331b746 --- /dev/null +++ b/mask_generic.go @@ -0,0 +1,7 @@ +//go:build appengine || (!amd64 && !arm64 && !js) + +package websocket + +func mask(key uint32, b []byte) uint32 { + return maskGo(key, b) +} From cda2170e9b48e7a33e4b5401eb6db31cd61314d4 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 07:41:22 -0700 Subject: [PATCH 02/26] Refactor and compile masking code again --- frame.go | 2 +- internal/examples/go.mod | 2 ++ internal/examples/go.sum | 2 ++ internal/thirdparty/go.mod | 2 +- internal/thirdparty/go.sum | 4 ++-- mask_generic.go => mask.go | 2 +- mask_asm.go | 6 ++---- 7 files changed, 11 insertions(+), 9 deletions(-) rename mask_generic.go => mask.go (63%) diff --git a/frame.go b/frame.go index eec15bb9..362a99e9 100644 --- a/frame.go +++ b/frame.go @@ -173,7 +173,7 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) { return nil } -// mask applies the WebSocket masking algorithm to p +// maskGo applies the WebSocket masking algorithm to p // with the given key. // See https://tools.ietf.org/html/rfc6455#section-5.3 // diff --git a/internal/examples/go.mod b/internal/examples/go.mod index c98b81ce..dfdb8cca 100644 --- a/internal/examples/go.mod +++ b/internal/examples/go.mod @@ -8,3 +8,5 @@ require ( golang.org/x/time v0.3.0 nhooyr.io/websocket v0.0.0-00010101000000-000000000000 ) + +require golang.org/x/sys v0.13.0 // indirect diff --git a/internal/examples/go.sum b/internal/examples/go.sum index f8a07e82..1931a8f2 100644 --- a/internal/examples/go.sum +++ b/internal/examples/go.sum @@ -1,2 +1,4 @@ +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/internal/thirdparty/go.mod b/internal/thirdparty/go.mod index 10eb45c1..3f32a416 100644 --- a/internal/thirdparty/go.mod +++ b/internal/thirdparty/go.mod @@ -34,7 +34,7 @@ require ( golang.org/x/arch v0.3.0 // indirect golang.org/x/crypto v0.9.0 // indirect golang.org/x/net v0.10.0 // indirect - golang.org/x/sys v0.8.0 // indirect + golang.org/x/sys v0.13.0 // indirect golang.org/x/text v0.9.0 // indirect google.golang.org/protobuf v1.30.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/internal/thirdparty/go.sum b/internal/thirdparty/go.sum index a9424b8d..47f324bb 100644 --- a/internal/thirdparty/go.sum +++ b/internal/thirdparty/go.sum @@ -76,8 +76,8 @@ golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= diff --git a/mask_generic.go b/mask.go similarity index 63% rename from mask_generic.go rename to mask.go index 6331b746..7c9fe308 100644 --- a/mask_generic.go +++ b/mask.go @@ -1,4 +1,4 @@ -//go:build appengine || (!amd64 && !arm64 && !js) +//go:build !amd64 && !arm64 && !js package websocket diff --git a/mask_asm.go b/mask_asm.go index a18a20e5..9b370690 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -1,6 +1,4 @@ -//go:build !appengine && (amd64 || arm64) -// +build !appengine -// +build amd64 arm64 +//go:build amd64 || arm64 package websocket @@ -13,7 +11,7 @@ func mask(key uint32, b []byte) uint32 { return key } -var useAVX2 = cpu.X86.HasAVX2 +var useAVX2 = cpu.X86.HasAVX2 //lint:ignore U1000 mask_amd64.s //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 From f5397ae3d1bfdf120ce8a598a0b0b0fe2b86f784 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 07:48:47 -0700 Subject: [PATCH 03/26] mask_asm.go: Disable AVX2 Slower for some reason than just SIMD. Also no dependency on cpu package is nice. --- go.mod | 2 -- go.sum | 2 -- internal/examples/go.mod | 2 -- internal/examples/go.sum | 2 -- mask_asm.go | 4 +--- 5 files changed, 1 insertion(+), 11 deletions(-) diff --git a/go.mod b/go.mod index 285b955f..715a9f7a 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module nhooyr.io/websocket go 1.19 - -require golang.org/x/sys v0.13.0 diff --git a/go.sum b/go.sum index d4673ecf..e69de29b 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +0,0 @@ -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/examples/go.mod b/internal/examples/go.mod index dfdb8cca..c98b81ce 100644 --- a/internal/examples/go.mod +++ b/internal/examples/go.mod @@ -8,5 +8,3 @@ require ( golang.org/x/time v0.3.0 nhooyr.io/websocket v0.0.0-00010101000000-000000000000 ) - -require golang.org/x/sys v0.13.0 // indirect diff --git a/internal/examples/go.sum b/internal/examples/go.sum index 1931a8f2..f8a07e82 100644 --- a/internal/examples/go.sum +++ b/internal/examples/go.sum @@ -1,4 +1,2 @@ -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/mask_asm.go b/mask_asm.go index 9b370690..946bc0f6 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -2,8 +2,6 @@ package websocket -import "golang.org/x/sys/cpu" - func mask(key uint32, b []byte) uint32 { if len(b) > 0 { return maskAsm(&b[0], len(b), key) @@ -11,7 +9,7 @@ func mask(key uint32, b []byte) uint32 { return key } -var useAVX2 = cpu.X86.HasAVX2 //lint:ignore U1000 mask_amd64.s +var useAVX2 = false //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 From 14172e5b461e3b8376d79d2456800b7e100a044b Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 08:05:25 -0700 Subject: [PATCH 04/26] Benchmark pure go masking algorithm separately from assembly --- internal/thirdparty/frame_test.go | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/internal/thirdparty/frame_test.go b/internal/thirdparty/frame_test.go index 1a0ed125..dd0440db 100644 --- a/internal/thirdparty/frame_test.go +++ b/internal/thirdparty/frame_test.go @@ -26,6 +26,9 @@ func gorillaMaskBytes(key [4]byte, pos int, b []byte) int //go:linkname mask nhooyr.io/websocket.mask func mask(key32 uint32, b []byte) int +//go:linkname maskGo nhooyr.io/websocket.maskGo +func maskGo(key32 uint32, b []byte) int + func Benchmark_mask(b *testing.B) { sizes := []int{ 2, @@ -54,7 +57,18 @@ func Benchmark_mask(b *testing.B) { }, { - name: "nhooyr", + name: "nhooyr-go", + fn: func(b *testing.B, key [4]byte, p []byte) { + key32 := binary.LittleEndian.Uint32(key[:]) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + maskGo(key32, p) + } + }, + }, + { + name: "wdvxdr1123-asm", fn: func(b *testing.B, key [4]byte, p []byte) { key32 := binary.LittleEndian.Uint32(key[:]) b.ResetTimer() @@ -64,6 +78,7 @@ func Benchmark_mask(b *testing.B) { } }, }, + { name: "gorilla", fn: func(b *testing.B, key [4]byte, p []byte) { From 685a56e22e4ffe94d73c1da0d0e8746dcd36f165 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 08:08:37 -0700 Subject: [PATCH 05/26] Update README.md to indicate assembly websocket masking --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d093746d..4b2d828e 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ go get nhooyr.io/websocket - [RFC 7692](https://tools.ietf.org/html/rfc7692) permessage-deflate compression - [CloseRead](https://pkg.go.dev/nhooyr.io/websocket#Conn.CloseRead) helper for write only connections - Compile to [Wasm](https://pkg.go.dev/nhooyr.io/websocket#hdr-Wasm) +- WebSocket masking implemented in assembly for amd64 and arm64 [#326](https://github.com/nhooyr/websocket/issues/326) ## Roadmap @@ -36,8 +37,6 @@ See GitHub issues for minor issues but the major future enhancements are: - [ ] Ping pong heartbeat helper [#267](https://github.com/nhooyr/websocket/issues/267) - [ ] Ping pong instrumentation callbacks [#246](https://github.com/nhooyr/websocket/issues/246) - [ ] Graceful shutdown helpers [#209](https://github.com/nhooyr/websocket/issues/209) -- [ ] Assembly for WebSocket masking [#16](https://github.com/nhooyr/websocket/issues/16) - - WIP at [#326](https://github.com/nhooyr/websocket/pull/326), about 3x faster - [ ] HTTP/2 [#4](https://github.com/nhooyr/websocket/issues/4) - [ ] The holy grail [#402](https://github.com/nhooyr/websocket/issues/402) @@ -121,7 +120,7 @@ Advantages of nhooyr.io/websocket: - Gorilla requires registering a pong callback before sending a Ping - Can target Wasm ([gorilla/websocket#432](https://github.com/gorilla/websocket/issues/432)) - Transparent message buffer reuse with [wsjson](https://pkg.go.dev/nhooyr.io/websocket/wsjson) subpackage -- [1.75x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster WebSocket masking implementation in pure Go +- [3x](https://github.com/nhooyr/websocket/pull/326) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go - Gorilla's implementation is slower and uses [unsafe](https://golang.org/pkg/unsafe/). Soon we'll have assembly and be 3x faster [#326](https://github.com/nhooyr/websocket/pull/326) - Full [permessage-deflate](https://tools.ietf.org/html/rfc7692) compression extension support From cb7509ab70e9f9ca4ce47a2eb90ff86ebaee2d28 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 08:47:44 -0700 Subject: [PATCH 06/26] mask_amd64.s: Remove AVX2 fully --- mask_amd64.s | 29 ++--------------------------- mask_arm64.s | 1 - mask_asm.go | 2 -- 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/mask_amd64.s b/mask_amd64.s index caca53ec..bd42be31 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -26,11 +26,6 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 TESTQ $31, AX JNZ unaligned -aligned: - CMPB ·useAVX2(SB), $1 - JE avx2 - JMP sse - unaligned_loop_1byte: XORB SI, (AX) INCQ AX @@ -47,7 +42,7 @@ unaligned_loop_1byte: ORQ DX, DI TESTQ $31, AX - JZ aligned + JZ sse unaligned: TESTQ $7, AX // AND $7 & len, if not zero jump to loop_1b. @@ -60,27 +55,7 @@ unaligned_loop: SUBQ $8, CX TESTQ $31, AX JNZ unaligned_loop - JMP aligned - -avx2: - CMPQ CX, $0x80 - JL sse - VMOVQ DI, X0 - VPBROADCASTQ X0, Y0 - -avx2_loop: - VPXOR (AX), Y0, Y1 - VPXOR 32(AX), Y0, Y2 - VPXOR 64(AX), Y0, Y3 - VPXOR 96(AX), Y0, Y4 - VMOVDQU Y1, (AX) - VMOVDQU Y2, 32(AX) - VMOVDQU Y3, 64(AX) - VMOVDQU Y4, 96(AX) - ADDQ $0x80, AX - SUBQ $0x80, CX - CMPQ CX, $0x80 - JAE avx2_loop // loop if CX >= 0x80 + JMP sse sse: CMPQ CX, $0x40 diff --git a/mask_arm64.s b/mask_arm64.s index 624cb720..b3d48e68 100644 --- a/mask_arm64.s +++ b/mask_arm64.s @@ -15,7 +15,6 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 CMP $64, R1 BLT less_than_64 - // todo: optimize unaligned case loop_64: VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16] VEOR V1.B16, V0.B16, V1.B16 diff --git a/mask_asm.go b/mask_asm.go index 946bc0f6..34021fa7 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -9,7 +9,5 @@ func mask(key uint32, b []byte) uint32 { return key } -var useAVX2 = false - //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 From 3f8c9e07bcaa0a223d092b618c34ca7dba3521db Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 09:20:31 -0700 Subject: [PATCH 07/26] mask_amd64.s: Minor improvements --- frame.go | 2 ++ mask_amd64.s | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/frame.go b/frame.go index 362a99e9..ff09ec26 100644 --- a/frame.go +++ b/frame.go @@ -184,6 +184,8 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) { // to be in little endian. // // See https://github.com/golang/go/issues/31586 +// +//lint:ignore U1000 mask.go func maskGo(key uint32, b []byte) uint32 { if len(b) >= 8 { key64 := uint64(key)<<32 | uint64(key) diff --git a/mask_amd64.s b/mask_amd64.s index bd42be31..935232fa 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -17,8 +17,8 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 SHLQ $32, DI ORQ DX, DI - CMPQ CX, $15 - JLE less_than_16 + CMPQ CX, $7 + JLE less_than_8 CMPQ CX, $63 JLE less_than_64 CMPQ CX, $128 @@ -58,7 +58,7 @@ unaligned_loop: JMP sse sse: - CMPQ CX, $0x40 + CMPQ CX, $64 JL less_than_64 MOVQ DI, X0 PUNPCKLQDQ X0, X0 @@ -76,9 +76,9 @@ sse_loop: MOVOU X2, 1*16(AX) MOVOU X3, 2*16(AX) MOVOU X4, 3*16(AX) - ADDQ $0x40, AX - SUBQ $0x40, CX - CMPQ CX, $0x40 + ADDQ $64, AX + SUBQ $64, CX + CMPQ CX, $64 JAE sse_loop less_than_64: From 367743dc6fe48866a91ef88296699449cca17d4d Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 15:57:47 -0700 Subject: [PATCH 08/26] mask_amd64.sh: Cleanup --- mask_amd64.s | 23 ++++++++++++----------- mask_arm64.s | 4 +++- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mask_amd64.s b/mask_amd64.s index 935232fa..905d7e4a 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -10,18 +10,18 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 MOVQ len+8(FP), CX MOVL key+16(FP), SI - // calculate the DI - // DI = SI<<32 | SI + // Calculate the DI aka the uint64 key. + // DI = uint64(SI) | uint64(SI)<<32 MOVL SI, DI MOVQ DI, DX SHLQ $32, DI ORQ DX, DI - CMPQ CX, $7 - JLE less_than_8 - CMPQ CX, $63 - JLE less_than_64 - CMPQ CX, $128 + CMPQ CX, $8 + JL less_than_8 + CMPQ CX, $64 + JL less_than_64 + CMPQ CX, $512 JLE sse TESTQ $31, AX JNZ unaligned @@ -34,8 +34,8 @@ unaligned_loop_1byte: TESTQ $7, AX JNZ unaligned_loop_1byte - // calculate DI again since SI was modified - // DI = SI<<32 | SI + // Calculate DI again since SI was modified. + // DI = uint64(SI) | uint64(SI)<<32 MOVL SI, DI MOVQ DI, DX SHLQ $32, DI @@ -45,11 +45,12 @@ unaligned_loop_1byte: JZ sse unaligned: - TESTQ $7, AX // AND $7 & len, if not zero jump to loop_1b. + // $7 & len, if not zero jump to loop_1b. + TESTQ $7, AX JNZ unaligned_loop_1byte unaligned_loop: - // we don't need to check the CX since we know it's above 128 + // We don't need to check the CX since we know it's above 512. XORQ DI, (AX) ADDQ $8, AX SUBQ $8, CX diff --git a/mask_arm64.s b/mask_arm64.s index b3d48e68..741b77a5 100644 --- a/mask_arm64.s +++ b/mask_arm64.s @@ -1,6 +1,6 @@ #include "textflag.h" -// func maskAsm(b *byte,len, int, key uint32) +// func maskAsm(b *byte, len int, key uint32) TEXT ·maskAsm(SB), NOSPLIT, $0-28 // R0 = b // R1 = len @@ -15,6 +15,8 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 CMP $64, R1 BLT less_than_64 +// TODO: allign memory like amd64 + loop_64: VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16] VEOR V1.B16, V0.B16, V1.B16 From 27f80cb8b4515ffa660eaa962aa01cd370e4c48e Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 16:24:32 -0700 Subject: [PATCH 09/26] mask.go: Cleanup assembly and add nbio benchmark --- frame.go | 2 +- frame_test.go | 2 +- internal/thirdparty/frame_test.go | 50 ++++++++++++++++++++----------- internal/thirdparty/go.mod | 2 ++ internal/thirdparty/go.sum | 36 ++++++++++++++++++++++ mask.go | 4 +-- mask_amd64.s | 2 -- mask_arm64.s | 2 +- mask_asm.go | 2 +- read.go | 4 +-- write.go | 2 +- 11 files changed, 79 insertions(+), 29 deletions(-) diff --git a/frame.go b/frame.go index ff09ec26..5d826ea3 100644 --- a/frame.go +++ b/frame.go @@ -186,7 +186,7 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) { // See https://github.com/golang/go/issues/31586 // //lint:ignore U1000 mask.go -func maskGo(key uint32, b []byte) uint32 { +func maskGo(b []byte, key uint32) uint32 { if len(b) >= 8 { key64 := uint64(key)<<32 | uint64(key) diff --git a/frame_test.go b/frame_test.go index e697e198..bd626358 100644 --- a/frame_test.go +++ b/frame_test.go @@ -97,7 +97,7 @@ func Test_mask(t *testing.T) { key := []byte{0xa, 0xb, 0xc, 0xff} key32 := binary.LittleEndian.Uint32(key) p := []byte{0xa, 0xb, 0xc, 0xf2, 0xc} - gotKey32 := mask(key32, p) + gotKey32 := mask(p, key32) expP := []byte{0, 0, 0, 0x0d, 0x6} assert.Equal(t, "p", expP, p) diff --git a/internal/thirdparty/frame_test.go b/internal/thirdparty/frame_test.go index dd0440db..7202322d 100644 --- a/internal/thirdparty/frame_test.go +++ b/internal/thirdparty/frame_test.go @@ -8,11 +8,12 @@ import ( "github.com/gobwas/ws" _ "github.com/gorilla/websocket" + _ "github.com/lesismal/nbio/nbhttp/websocket" _ "nhooyr.io/websocket" ) -func basicMask(maskKey [4]byte, pos int, b []byte) int { +func basicMask(b []byte, maskKey [4]byte, pos int) int { for i := range b { b[i] ^= maskKey[pos&3] pos++ @@ -20,26 +21,30 @@ func basicMask(maskKey [4]byte, pos int, b []byte) int { return pos & 3 } -//go:linkname gorillaMaskBytes github.com/gorilla/websocket.maskBytes -func gorillaMaskBytes(key [4]byte, pos int, b []byte) int +//go:linkname maskGo nhooyr.io/websocket.maskGo +func maskGo(b []byte, key32 uint32) int -//go:linkname mask nhooyr.io/websocket.mask -func mask(key32 uint32, b []byte) int +//go:linkname maskAsm nhooyr.io/websocket.maskAsm +func maskAsm(b *byte, len int, key32 uint32) uint32 -//go:linkname maskGo nhooyr.io/websocket.maskGo -func maskGo(key32 uint32, b []byte) int +//go:linkname nbioMaskBytes github.com/lesismal/nbio/nbhttp/websocket.maskXOR +func nbioMaskBytes(b, key []byte) int + +//go:linkname gorillaMaskBytes github.com/gorilla/websocket.maskBytes +func gorillaMaskBytes(key [4]byte, pos int, b []byte) int func Benchmark_mask(b *testing.B) { sizes := []int{ - 2, - 3, - 4, 8, 16, 32, 128, + 256, 512, + 1024, + 2048, 4096, + 8192, 16384, } @@ -51,7 +56,7 @@ func Benchmark_mask(b *testing.B) { name: "basic", fn: func(b *testing.B, key [4]byte, p []byte) { for i := 0; i < b.N; i++ { - basicMask(key, 0, p) + basicMask(p, key, 0) } }, }, @@ -63,7 +68,7 @@ func Benchmark_mask(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - maskGo(key32, p) + maskGo(p, key32) } }, }, @@ -74,7 +79,7 @@ func Benchmark_mask(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - mask(key32, p) + maskAsm(&p[0], len(p), key32) } }, }, @@ -95,16 +100,25 @@ func Benchmark_mask(b *testing.B) { } }, }, + { + name: "nbio", + fn: func(b *testing.B, key [4]byte, p []byte) { + keyb := key[:] + for i := 0; i < b.N; i++ { + nbioMaskBytes(p, keyb) + } + }, + }, } key := [4]byte{1, 2, 3, 4} - for _, size := range sizes { - p := make([]byte, size) + for _, fn := range fns { + b.Run(fn.name, func(b *testing.B) { + for _, size := range sizes { + p := make([]byte, size) - b.Run(strconv.Itoa(size), func(b *testing.B) { - for _, fn := range fns { - b.Run(fn.name, func(b *testing.B) { + b.Run(strconv.Itoa(size), func(b *testing.B) { b.SetBytes(int64(size)) fn.fn(b, key, p) diff --git a/internal/thirdparty/go.mod b/internal/thirdparty/go.mod index 3f32a416..f418d288 100644 --- a/internal/thirdparty/go.mod +++ b/internal/thirdparty/go.mod @@ -8,6 +8,7 @@ require ( github.com/gin-gonic/gin v1.9.1 github.com/gobwas/ws v1.3.0 github.com/gorilla/websocket v1.5.0 + github.com/lesismal/nbio v1.3.18 nhooyr.io/websocket v0.0.0-00010101000000-000000000000 ) @@ -25,6 +26,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/cpuid/v2 v2.2.4 // indirect github.com/leodido/go-urn v1.2.4 // indirect + github.com/lesismal/llib v1.1.12 // indirect github.com/mattn/go-isatty v0.0.19 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect diff --git a/internal/thirdparty/go.sum b/internal/thirdparty/go.sum index 47f324bb..658a4a7b 100644 --- a/internal/thirdparty/go.sum +++ b/internal/thirdparty/go.sum @@ -41,6 +41,10 @@ github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZX github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY= github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q= github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4= +github.com/lesismal/llib v1.1.12 h1:KJFB8bL02V+QGIvILEw/w7s6bKj9Ps9Px97MZP2EOk0= +github.com/lesismal/llib v1.1.12/go.mod h1:70tFXXe7P1FZ02AU9l8LgSOK7d7sRrpnkUr3rd3gKSg= +github.com/lesismal/nbio v1.3.18 h1:kmJZlxjQpVfuCPYcXdv0Biv9LHVViJZet5K99Xs3RAs= +github.com/lesismal/nbio v1.3.18/go.mod h1:KWlouFT5cgDdW5sMX8RsHASUMGniea9X0XIellZ0B38= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -67,19 +71,51 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU= github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k= golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210513122933-cd7d49e622d5/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= golang.org/x/crypto v0.9.0 h1:LF6fAI+IutBocDJ2OT0Q1g8plpYljMZ4+lty+dsqw3g= golang.org/x/crypto v0.9.0/go.mod h1:yrmDGqONDYtNj3tH8X9dzUun2m2lzPa9ngI6/RUPGR0= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210510120150-4163338589ed/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= diff --git a/mask.go b/mask.go index 7c9fe308..b29435e9 100644 --- a/mask.go +++ b/mask.go @@ -2,6 +2,6 @@ package websocket -func mask(key uint32, b []byte) uint32 { - return maskGo(key, b) +func mask(b []byte, key uint32) uint32 { + return maskGo(b, key) } diff --git a/mask_amd64.s b/mask_amd64.s index 905d7e4a..73ae59b4 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -19,8 +19,6 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 CMPQ CX, $8 JL less_than_8 - CMPQ CX, $64 - JL less_than_64 CMPQ CX, $512 JLE sse TESTQ $31, AX diff --git a/mask_arm64.s b/mask_arm64.s index 741b77a5..8fd49aa9 100644 --- a/mask_arm64.s +++ b/mask_arm64.s @@ -4,8 +4,8 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 // R0 = b // R1 = len - // R2 = uint64(key)<<32 | uint64(key) // R3 = key (uint32) + // R2 = uint64(key)<<32 | uint64(key) MOVD b_ptr+0(FP), R0 MOVD b_len+8(FP), R1 MOVWU key+16(FP), R3 diff --git a/mask_asm.go b/mask_asm.go index 34021fa7..b8c4ee66 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -2,7 +2,7 @@ package websocket -func mask(key uint32, b []byte) uint32 { +func mask(b []byte, key uint32) uint32 { if len(b) > 0 { return maskAsm(&b[0], len(b), key) } diff --git a/read.go b/read.go index 8742842e..81b89831 100644 --- a/read.go +++ b/read.go @@ -289,7 +289,7 @@ func (c *Conn) handleControl(ctx context.Context, h header) (err error) { } if h.masked { - mask(h.maskKey, b) + mask(b, h.maskKey) } switch h.opcode { @@ -453,7 +453,7 @@ func (mr *msgReader) read(p []byte) (int, error) { mr.payloadLength -= int64(n) if !mr.c.client { - mr.maskKey = mask(mr.maskKey, p) + mr.maskKey = mask(p, mr.maskKey) } return n, nil diff --git a/write.go b/write.go index 7b1152ce..7ac7ce63 100644 --- a/write.go +++ b/write.go @@ -365,7 +365,7 @@ func (c *Conn) writeFramePayload(p []byte) (n int, err error) { return n, err } - maskKey = mask(maskKey, c.writeBuf[i:c.bw.Buffered()]) + maskKey = mask(c.writeBuf[i:c.bw.Buffered()], maskKey) p = p[j:] n += j From 369d641608eba0a8387f79eb204c56f364c2e31d Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 17:04:52 -0700 Subject: [PATCH 10/26] mask_arm64.s: Cleanup --- mask_amd64.s | 4 ++-- mask_arm64.s | 24 +++++++++++------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/mask_amd64.s b/mask_amd64.s index 73ae59b4..8464440b 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -117,10 +117,10 @@ less_than_4: less_than_2: TESTQ $1, CX - JZ done + JZ end XORB SI, (AX) ROLL $24, SI -done: +end: MOVL SI, ret+24(FP) RET diff --git a/mask_arm64.s b/mask_arm64.s index 8fd49aa9..42a1211f 100644 --- a/mask_arm64.s +++ b/mask_arm64.s @@ -15,7 +15,7 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 CMP $64, R1 BLT less_than_64 -// TODO: allign memory like amd64 +// TODO: align memory like amd64 loop_64: VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16] @@ -29,41 +29,39 @@ loop_64: BGE loop_64 less_than_64: - // quick end - CBZ R1, end - TBZ $5, R1, less_than32 + TBZ $5, R1, less_than_32 VLD1 (R0), [V1.B16, V2.B16] VEOR V1.B16, V0.B16, V1.B16 VEOR V2.B16, V0.B16, V2.B16 VST1.P [V1.B16, V2.B16], 32(R0) -less_than32: - TBZ $4, R1, less_than16 +less_than_32: + TBZ $4, R1, less_than_16 LDP (R0), (R11, R12) EOR R11, R2, R11 EOR R12, R2, R12 STP.P (R11, R12), 16(R0) -less_than16: - TBZ $3, R1, less_than8 +less_than_16: + TBZ $3, R1, less_than_8 MOVD (R0), R11 EOR R2, R11, R11 MOVD.P R11, 8(R0) -less_than8: - TBZ $2, R1, less_than4 +less_than_8: + TBZ $2, R1, less_than_4 MOVWU (R0), R11 EORW R2, R11, R11 MOVWU.P R11, 4(R0) -less_than4: - TBZ $1, R1, less_than2 +less_than_4: + TBZ $1, R1, less_than_2 MOVHU (R0), R11 EORW R3, R11, R11 MOVHU.P R11, 2(R0) RORW $16, R3 -less_than2: +less_than_2: TBZ $0, R1, end MOVBU (R0), R11 EORW R3, R11, R11 From fb13df2dc30520f64bd4daa167ed9c7e739a98b7 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 19:46:00 -0700 Subject: [PATCH 11/26] ci/bench.sh: Benchmark masking on arm64 with QEMU --- ci/bench.sh | 3 +++ internal/thirdparty/frame_test.go | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/ci/bench.sh b/ci/bench.sh index a553b93a..6af59ecf 100755 --- a/ci/bench.sh +++ b/ci/bench.sh @@ -6,4 +6,7 @@ go test --run=^$ --bench=. --benchmem --memprofile ci/out/prof.mem --cpuprofile ( cd ./internal/thirdparty go test --run=^$ --bench=. --benchmem --memprofile ../../ci/out/prof-thirdparty.mem --cpuprofile ../../ci/out/prof-thirdparty.cpu -o ../../ci/out/thirdparty.test "$@" . + + GOARCH=arm64 go test -c -o ../../ci/out/thirdparty-arm64.test . + qemu-aarch64 ../../ci/out/thirdparty-arm64.test --test.run=^$ --test.bench=Benchmark_mask --test.benchmem --test.memprofile ../../ci/out/prof-thirdparty-arm64.mem --test.cpuprofile ../../ci/out/prof-thirdparty-arm64.cpu . ) diff --git a/internal/thirdparty/frame_test.go b/internal/thirdparty/frame_test.go index 7202322d..89042e53 100644 --- a/internal/thirdparty/frame_test.go +++ b/internal/thirdparty/frame_test.go @@ -2,6 +2,7 @@ package thirdparty import ( "encoding/binary" + "runtime" "strconv" "testing" _ "unsafe" @@ -34,6 +35,10 @@ func nbioMaskBytes(b, key []byte) int func gorillaMaskBytes(key [4]byte, pos int, b []byte) int func Benchmark_mask(b *testing.B) { + b.Run(runtime.GOARCH, benchmark_mask) +} + +func benchmark_mask(b *testing.B) { sizes := []int{ 8, 16, From ecf7dec40098cbc3b659d99b467c0d6c97f38c6c Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 19 Oct 2023 19:52:36 -0700 Subject: [PATCH 12/26] ci/bench.sh: Install QEMU on CI --- README.md | 3 +-- ci/bench.sh | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4b2d828e..0f286e63 100644 --- a/README.md +++ b/README.md @@ -120,9 +120,8 @@ Advantages of nhooyr.io/websocket: - Gorilla requires registering a pong callback before sending a Ping - Can target Wasm ([gorilla/websocket#432](https://github.com/gorilla/websocket/issues/432)) - Transparent message buffer reuse with [wsjson](https://pkg.go.dev/nhooyr.io/websocket/wsjson) subpackage -- [3x](https://github.com/nhooyr/websocket/pull/326) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go +- [4x](https://github.com/nhooyr/websocket/pull/326) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go - Gorilla's implementation is slower and uses [unsafe](https://golang.org/pkg/unsafe/). - Soon we'll have assembly and be 3x faster [#326](https://github.com/nhooyr/websocket/pull/326) - Full [permessage-deflate](https://tools.ietf.org/html/rfc7692) compression extension support - Gorilla only supports no context takeover mode - [CloseRead](https://pkg.go.dev/nhooyr.io/websocket#Conn.CloseRead) helper for write only connections ([gorilla/websocket#492](https://github.com/gorilla/websocket/issues/492)) diff --git a/ci/bench.sh b/ci/bench.sh index 6af59ecf..afc2d825 100755 --- a/ci/bench.sh +++ b/ci/bench.sh @@ -8,5 +8,10 @@ go test --run=^$ --bench=. --benchmem --memprofile ci/out/prof.mem --cpuprofile go test --run=^$ --bench=. --benchmem --memprofile ../../ci/out/prof-thirdparty.mem --cpuprofile ../../ci/out/prof-thirdparty.cpu -o ../../ci/out/thirdparty.test "$@" . GOARCH=arm64 go test -c -o ../../ci/out/thirdparty-arm64.test . + if [ "${CI-}" ]; then + sudo apt-get update + sudo apt-get install -y qemu-user-static + alias qemu-aarch64=qemu-aarch64-static + fi qemu-aarch64 ../../ci/out/thirdparty-arm64.test --test.run=^$ --test.bench=Benchmark_mask --test.benchmem --test.memprofile ../../ci/out/prof-thirdparty-arm64.mem --test.cpuprofile ../../ci/out/prof-thirdparty-arm64.cpu . ) From d34e5d4b8e4c3d9ab9311d43492fe0bded560ccc Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Fri, 20 Oct 2023 07:47:25 -0700 Subject: [PATCH 13/26] wsjson: Add json.Encoder vs json.Marshal benchmark json.Encoder is 42% faster than json.Marshal thanks to the memory reuse. goos: linux goarch: amd64 pkg: nhooyr.io/websocket/wsjson cpu: 12th Gen Intel(R) Core(TM) i5-1235U BenchmarkJSON/json.Encoder-12 3517579 340.2 ns/op 24 B/op 1 allocs/op BenchmarkJSON/json.Marshal-12 2374086 484.3 ns/op 728 B/op 2 allocs/op Closes #409 --- wsjson/wsjson_test.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 wsjson/wsjson_test.go diff --git a/wsjson/wsjson_test.go b/wsjson/wsjson_test.go new file mode 100644 index 00000000..a70e808c --- /dev/null +++ b/wsjson/wsjson_test.go @@ -0,0 +1,24 @@ +package wsjson_test + +import ( + "encoding/json" + "io" + "strings" + "testing" +) + +func BenchmarkJSON(b *testing.B) { + msg := []byte(strings.Repeat("1234", 128)) + b.SetBytes(int64(len(msg))) + b.ReportAllocs() + b.Run("json.Encoder", func(b *testing.B) { + for i := 0; i < b.N; i++ { + json.NewEncoder(io.Discard).Encode(msg) + } + }) + b.Run("json.Marshal", func(b *testing.B) { + for i := 0; i < b.N; i++ { + json.Marshal(msg) + } + }) +} From e25d9681bd6cc0a9176a2fa35d1a1e16bdcd685d Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Fri, 20 Oct 2023 07:55:15 -0700 Subject: [PATCH 14/26] ci/bench.sh: Don't profile by default --- ci/bench.sh | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/ci/bench.sh b/ci/bench.sh index afc2d825..afe0490b 100755 --- a/ci/bench.sh +++ b/ci/bench.sh @@ -2,16 +2,19 @@ set -eu cd -- "$(dirname "$0")/.." -go test --run=^$ --bench=. --benchmem --memprofile ci/out/prof.mem --cpuprofile ci/out/prof.cpu -o ci/out/websocket.test "$@" . +go test --run=^$ --bench=. --benchmem "$@" ./... +# For profiling add: --memprofile ci/out/prof.mem --cpuprofile ci/out/prof.cpu -o ci/out/websocket.test ( cd ./internal/thirdparty - go test --run=^$ --bench=. --benchmem --memprofile ../../ci/out/prof-thirdparty.mem --cpuprofile ../../ci/out/prof-thirdparty.cpu -o ../../ci/out/thirdparty.test "$@" . + go test --run=^$ --bench=. --benchmem "$@" . - GOARCH=arm64 go test -c -o ../../ci/out/thirdparty-arm64.test . - if [ "${CI-}" ]; then - sudo apt-get update - sudo apt-get install -y qemu-user-static - alias qemu-aarch64=qemu-aarch64-static + GOARCH=arm64 go test -c -o ../../ci/out/thirdparty-arm64.test "$@" . + if [ "$#" -eq 0 ]; then + if [ "${CI-}" ]; then + sudo apt-get update + sudo apt-get install -y qemu-user-static + alias qemu-aarch64=qemu-aarch64-static + fi + qemu-aarch64 ../../ci/out/thirdparty-arm64.test --test.run=^$ --test.bench=Benchmark_mask --test.benchmem fi - qemu-aarch64 ../../ci/out/thirdparty-arm64.test --test.run=^$ --test.bench=Benchmark_mask --test.benchmem --test.memprofile ../../ci/out/prof-thirdparty-arm64.mem --test.cpuprofile ../../ci/out/prof-thirdparty-arm64.cpu . ) From 640e3c25bcb716956df6452b85ef7ead53477040 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Fri, 20 Oct 2023 07:56:39 -0700 Subject: [PATCH 15/26] ci/bench.sh: Try function instead of alias --- ci/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/bench.sh b/ci/bench.sh index afe0490b..5b1360d0 100755 --- a/ci/bench.sh +++ b/ci/bench.sh @@ -13,7 +13,7 @@ go test --run=^$ --bench=. --benchmem "$@" ./... if [ "${CI-}" ]; then sudo apt-get update sudo apt-get install -y qemu-user-static - alias qemu-aarch64=qemu-aarch64-static + qemu-aarch64() { qemu-aarch64-static "$@" } fi qemu-aarch64 ../../ci/out/thirdparty-arm64.test --test.run=^$ --test.bench=Benchmark_mask --test.benchmem fi From 0596e7a0e19d842ac2d9db0d598902dba6485cc4 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Fri, 20 Oct 2023 08:04:36 -0700 Subject: [PATCH 16/26] wsjson: Extend benchmark with multiple sizes [qrvnl@dios ~/src/websocket] 130$ go test -bench=. ./wsjson/ goos: linux goarch: amd64 pkg: nhooyr.io/websocket/wsjson cpu: 12th Gen Intel(R) Core(TM) i5-1235U BenchmarkJSON/json.Encoder/8-12 14041426 72.59 ns/op 110.21 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/16-12 13936426 86.99 ns/op 183.92 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/32-12 11416401 115.3 ns/op 277.59 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/128-12 4600574 264.7 ns/op 483.55 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/256-12 2710398 433.9 ns/op 590.06 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/512-12 1588930 717.3 ns/op 713.82 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/1024-12 823138 1484 ns/op 689.80 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/2048-12 402823 2875 ns/op 712.32 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/4096-12 213926 5602 ns/op 731.14 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/8192-12 92864 11281 ns/op 726.19 MB/s 16 B/op 1 allocs/op BenchmarkJSON/json.Encoder/16384-12 39318 29203 ns/op 561.04 MB/s 19 B/op 1 allocs/op BenchmarkJSON/json.Marshal/8-12 10768671 114.5 ns/op 69.89 MB/s 48 B/op 2 allocs/op BenchmarkJSON/json.Marshal/16-12 10140996 113.9 ns/op 140.51 MB/s 64 B/op 2 allocs/op BenchmarkJSON/json.Marshal/32-12 9211780 121.6 ns/op 263.06 MB/s 64 B/op 2 allocs/op BenchmarkJSON/json.Marshal/128-12 4632796 264.2 ns/op 484.53 MB/s 224 B/op 2 allocs/op BenchmarkJSON/json.Marshal/256-12 2441511 473.5 ns/op 540.65 MB/s 432 B/op 2 allocs/op BenchmarkJSON/json.Marshal/512-12 1298788 896.2 ns/op 571.27 MB/s 912 B/op 2 allocs/op BenchmarkJSON/json.Marshal/1024-12 602084 1866 ns/op 548.83 MB/s 1808 B/op 2 allocs/op BenchmarkJSON/json.Marshal/2048-12 341151 3817 ns/op 536.61 MB/s 3474 B/op 2 allocs/op BenchmarkJSON/json.Marshal/4096-12 175594 7034 ns/op 582.32 MB/s 6548 B/op 2 allocs/op BenchmarkJSON/json.Marshal/8192-12 83222 15023 ns/op 545.30 MB/s 13591 B/op 2 allocs/op BenchmarkJSON/json.Marshal/16384-12 33087 39348 ns/op 416.39 MB/s 27304 B/op 2 allocs/op PASS ok nhooyr.io/websocket/wsjson 32.934s --- wsjson/wsjson_test.go | 45 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/wsjson/wsjson_test.go b/wsjson/wsjson_test.go index a70e808c..080ab38d 100644 --- a/wsjson/wsjson_test.go +++ b/wsjson/wsjson_test.go @@ -3,22 +3,51 @@ package wsjson_test import ( "encoding/json" "io" - "strings" + "strconv" "testing" + + "nhooyr.io/websocket/internal/test/xrand" ) func BenchmarkJSON(b *testing.B) { - msg := []byte(strings.Repeat("1234", 128)) - b.SetBytes(int64(len(msg))) - b.ReportAllocs() + sizes := []int{ + 8, + 16, + 32, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + } + b.Run("json.Encoder", func(b *testing.B) { - for i := 0; i < b.N; i++ { - json.NewEncoder(io.Discard).Encode(msg) + for _, size := range sizes { + b.Run(strconv.Itoa(size), func(b *testing.B) { + msg := xrand.String(size) + b.SetBytes(int64(size)) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + json.NewEncoder(io.Discard).Encode(msg) + } + }) } }) b.Run("json.Marshal", func(b *testing.B) { - for i := 0; i < b.N; i++ { - json.Marshal(msg) + for _, size := range sizes { + b.Run(strconv.Itoa(size), func(b *testing.B) { + msg := xrand.String(size) + b.SetBytes(int64(size)) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + json.Marshal(msg) + } + }) } }) } From 30447a3e05b34bbf55ed531aebeb31192dacd251 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Fri, 20 Oct 2023 08:08:21 -0700 Subject: [PATCH 17/26] ci/bench.sh: Just symlink the expected qemu-aarch64 binary name --- ci/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/bench.sh b/ci/bench.sh index 5b1360d0..30c06986 100755 --- a/ci/bench.sh +++ b/ci/bench.sh @@ -13,7 +13,7 @@ go test --run=^$ --bench=. --benchmem "$@" ./... if [ "${CI-}" ]; then sudo apt-get update sudo apt-get install -y qemu-user-static - qemu-aarch64() { qemu-aarch64-static "$@" } + ln -s /usr/bin/qemu-aarch64-static /usr/local/bin/qemu-aarch64 fi qemu-aarch64 ../../ci/out/thirdparty-arm64.test --test.run=^$ --test.bench=Benchmark_mask --test.benchmem fi From f4e61e5a124dfdc0df65ae401b6f7c134005fc5f Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Fri, 20 Oct 2023 21:45:28 -0700 Subject: [PATCH 18/26] ci/fmt.sh: Error if changes on CI --- ci/fmt.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/fmt.sh b/ci/fmt.sh index 6e5a68e4..31d0c15d 100755 --- a/ci/fmt.sh +++ b/ci/fmt.sh @@ -18,3 +18,7 @@ npx prettier@3.0.3 \ $(git ls-files "*.yml" "*.md" "*.js" "*.css" "*.html") go run golang.org/x/tools/cmd/stringer@latest -type=opcode,MessageType,StatusCode -output=stringer.go + +if [ "${CI-}" ]; then + git diff --exit-code +fi From f533f430c7d63e9e0bceb2dcbbd5d75602803b82 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Sat, 21 Oct 2023 05:45:11 -0700 Subject: [PATCH 19/26] mask.go: Reorganize --- frame.go | 125 ------------------------------------------------ mask.go | 131 +++++++++++++++++++++++++++++++++++++++++++++++++-- mask_amd64.s | 36 +++++++++++--- mask_asm.go | 2 + mask_go.go | 7 +++ 5 files changed, 166 insertions(+), 135 deletions(-) create mode 100644 mask_go.go diff --git a/frame.go b/frame.go index 5d826ea3..d5631863 100644 --- a/frame.go +++ b/frame.go @@ -8,7 +8,6 @@ import ( "fmt" "io" "math" - "math/bits" "nhooyr.io/websocket/internal/errd" ) @@ -172,127 +171,3 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) { return nil } - -// maskGo applies the WebSocket masking algorithm to p -// with the given key. -// See https://tools.ietf.org/html/rfc6455#section-5.3 -// -// The returned value is the correctly rotated key to -// to continue to mask/unmask the message. -// -// It is optimized for LittleEndian and expects the key -// to be in little endian. -// -// See https://github.com/golang/go/issues/31586 -// -//lint:ignore U1000 mask.go -func maskGo(b []byte, key uint32) uint32 { - if len(b) >= 8 { - key64 := uint64(key)<<32 | uint64(key) - - // At some point in the future we can clean these unrolled loops up. - // See https://github.com/golang/go/issues/31586#issuecomment-487436401 - - // Then we xor until b is less than 128 bytes. - for len(b) >= 128 { - v := binary.LittleEndian.Uint64(b) - binary.LittleEndian.PutUint64(b, v^key64) - v = binary.LittleEndian.Uint64(b[8:16]) - binary.LittleEndian.PutUint64(b[8:16], v^key64) - v = binary.LittleEndian.Uint64(b[16:24]) - binary.LittleEndian.PutUint64(b[16:24], v^key64) - v = binary.LittleEndian.Uint64(b[24:32]) - binary.LittleEndian.PutUint64(b[24:32], v^key64) - v = binary.LittleEndian.Uint64(b[32:40]) - binary.LittleEndian.PutUint64(b[32:40], v^key64) - v = binary.LittleEndian.Uint64(b[40:48]) - binary.LittleEndian.PutUint64(b[40:48], v^key64) - v = binary.LittleEndian.Uint64(b[48:56]) - binary.LittleEndian.PutUint64(b[48:56], v^key64) - v = binary.LittleEndian.Uint64(b[56:64]) - binary.LittleEndian.PutUint64(b[56:64], v^key64) - v = binary.LittleEndian.Uint64(b[64:72]) - binary.LittleEndian.PutUint64(b[64:72], v^key64) - v = binary.LittleEndian.Uint64(b[72:80]) - binary.LittleEndian.PutUint64(b[72:80], v^key64) - v = binary.LittleEndian.Uint64(b[80:88]) - binary.LittleEndian.PutUint64(b[80:88], v^key64) - v = binary.LittleEndian.Uint64(b[88:96]) - binary.LittleEndian.PutUint64(b[88:96], v^key64) - v = binary.LittleEndian.Uint64(b[96:104]) - binary.LittleEndian.PutUint64(b[96:104], v^key64) - v = binary.LittleEndian.Uint64(b[104:112]) - binary.LittleEndian.PutUint64(b[104:112], v^key64) - v = binary.LittleEndian.Uint64(b[112:120]) - binary.LittleEndian.PutUint64(b[112:120], v^key64) - v = binary.LittleEndian.Uint64(b[120:128]) - binary.LittleEndian.PutUint64(b[120:128], v^key64) - b = b[128:] - } - - // Then we xor until b is less than 64 bytes. - for len(b) >= 64 { - v := binary.LittleEndian.Uint64(b) - binary.LittleEndian.PutUint64(b, v^key64) - v = binary.LittleEndian.Uint64(b[8:16]) - binary.LittleEndian.PutUint64(b[8:16], v^key64) - v = binary.LittleEndian.Uint64(b[16:24]) - binary.LittleEndian.PutUint64(b[16:24], v^key64) - v = binary.LittleEndian.Uint64(b[24:32]) - binary.LittleEndian.PutUint64(b[24:32], v^key64) - v = binary.LittleEndian.Uint64(b[32:40]) - binary.LittleEndian.PutUint64(b[32:40], v^key64) - v = binary.LittleEndian.Uint64(b[40:48]) - binary.LittleEndian.PutUint64(b[40:48], v^key64) - v = binary.LittleEndian.Uint64(b[48:56]) - binary.LittleEndian.PutUint64(b[48:56], v^key64) - v = binary.LittleEndian.Uint64(b[56:64]) - binary.LittleEndian.PutUint64(b[56:64], v^key64) - b = b[64:] - } - - // Then we xor until b is less than 32 bytes. - for len(b) >= 32 { - v := binary.LittleEndian.Uint64(b) - binary.LittleEndian.PutUint64(b, v^key64) - v = binary.LittleEndian.Uint64(b[8:16]) - binary.LittleEndian.PutUint64(b[8:16], v^key64) - v = binary.LittleEndian.Uint64(b[16:24]) - binary.LittleEndian.PutUint64(b[16:24], v^key64) - v = binary.LittleEndian.Uint64(b[24:32]) - binary.LittleEndian.PutUint64(b[24:32], v^key64) - b = b[32:] - } - - // Then we xor until b is less than 16 bytes. - for len(b) >= 16 { - v := binary.LittleEndian.Uint64(b) - binary.LittleEndian.PutUint64(b, v^key64) - v = binary.LittleEndian.Uint64(b[8:16]) - binary.LittleEndian.PutUint64(b[8:16], v^key64) - b = b[16:] - } - - // Then we xor until b is less than 8 bytes. - for len(b) >= 8 { - v := binary.LittleEndian.Uint64(b) - binary.LittleEndian.PutUint64(b, v^key64) - b = b[8:] - } - } - - // Then we xor until b is less than 4 bytes. - for len(b) >= 4 { - v := binary.LittleEndian.Uint32(b) - binary.LittleEndian.PutUint32(b, v^key) - b = b[4:] - } - - // xor remaining bytes. - for i := range b { - b[i] ^= byte(key) - key = bits.RotateLeft32(key, -8) - } - - return key -} diff --git a/mask.go b/mask.go index b29435e9..5f0746dc 100644 --- a/mask.go +++ b/mask.go @@ -1,7 +1,130 @@ -//go:build !amd64 && !arm64 && !js - package websocket -func mask(b []byte, key uint32) uint32 { - return maskGo(b, key) +import ( + "encoding/binary" + "math/bits" +) + +// maskGo applies the WebSocket masking algorithm to p +// with the given key. +// See https://tools.ietf.org/html/rfc6455#section-5.3 +// +// The returned value is the correctly rotated key to +// to continue to mask/unmask the message. +// +// It is optimized for LittleEndian and expects the key +// to be in little endian. +// +// See https://github.com/golang/go/issues/31586 +// +//lint:ignore U1000 mask.go +func maskGo(b []byte, key uint32) uint32 { + if len(b) >= 8 { + key64 := uint64(key)<<32 | uint64(key) + + // At some point in the future we can clean these unrolled loops up. + // See https://github.com/golang/go/issues/31586#issuecomment-487436401 + + // Then we xor until b is less than 128 bytes. + for len(b) >= 128 { + v := binary.LittleEndian.Uint64(b) + binary.LittleEndian.PutUint64(b, v^key64) + v = binary.LittleEndian.Uint64(b[8:16]) + binary.LittleEndian.PutUint64(b[8:16], v^key64) + v = binary.LittleEndian.Uint64(b[16:24]) + binary.LittleEndian.PutUint64(b[16:24], v^key64) + v = binary.LittleEndian.Uint64(b[24:32]) + binary.LittleEndian.PutUint64(b[24:32], v^key64) + v = binary.LittleEndian.Uint64(b[32:40]) + binary.LittleEndian.PutUint64(b[32:40], v^key64) + v = binary.LittleEndian.Uint64(b[40:48]) + binary.LittleEndian.PutUint64(b[40:48], v^key64) + v = binary.LittleEndian.Uint64(b[48:56]) + binary.LittleEndian.PutUint64(b[48:56], v^key64) + v = binary.LittleEndian.Uint64(b[56:64]) + binary.LittleEndian.PutUint64(b[56:64], v^key64) + v = binary.LittleEndian.Uint64(b[64:72]) + binary.LittleEndian.PutUint64(b[64:72], v^key64) + v = binary.LittleEndian.Uint64(b[72:80]) + binary.LittleEndian.PutUint64(b[72:80], v^key64) + v = binary.LittleEndian.Uint64(b[80:88]) + binary.LittleEndian.PutUint64(b[80:88], v^key64) + v = binary.LittleEndian.Uint64(b[88:96]) + binary.LittleEndian.PutUint64(b[88:96], v^key64) + v = binary.LittleEndian.Uint64(b[96:104]) + binary.LittleEndian.PutUint64(b[96:104], v^key64) + v = binary.LittleEndian.Uint64(b[104:112]) + binary.LittleEndian.PutUint64(b[104:112], v^key64) + v = binary.LittleEndian.Uint64(b[112:120]) + binary.LittleEndian.PutUint64(b[112:120], v^key64) + v = binary.LittleEndian.Uint64(b[120:128]) + binary.LittleEndian.PutUint64(b[120:128], v^key64) + b = b[128:] + } + + // Then we xor until b is less than 64 bytes. + for len(b) >= 64 { + v := binary.LittleEndian.Uint64(b) + binary.LittleEndian.PutUint64(b, v^key64) + v = binary.LittleEndian.Uint64(b[8:16]) + binary.LittleEndian.PutUint64(b[8:16], v^key64) + v = binary.LittleEndian.Uint64(b[16:24]) + binary.LittleEndian.PutUint64(b[16:24], v^key64) + v = binary.LittleEndian.Uint64(b[24:32]) + binary.LittleEndian.PutUint64(b[24:32], v^key64) + v = binary.LittleEndian.Uint64(b[32:40]) + binary.LittleEndian.PutUint64(b[32:40], v^key64) + v = binary.LittleEndian.Uint64(b[40:48]) + binary.LittleEndian.PutUint64(b[40:48], v^key64) + v = binary.LittleEndian.Uint64(b[48:56]) + binary.LittleEndian.PutUint64(b[48:56], v^key64) + v = binary.LittleEndian.Uint64(b[56:64]) + binary.LittleEndian.PutUint64(b[56:64], v^key64) + b = b[64:] + } + + // Then we xor until b is less than 32 bytes. + for len(b) >= 32 { + v := binary.LittleEndian.Uint64(b) + binary.LittleEndian.PutUint64(b, v^key64) + v = binary.LittleEndian.Uint64(b[8:16]) + binary.LittleEndian.PutUint64(b[8:16], v^key64) + v = binary.LittleEndian.Uint64(b[16:24]) + binary.LittleEndian.PutUint64(b[16:24], v^key64) + v = binary.LittleEndian.Uint64(b[24:32]) + binary.LittleEndian.PutUint64(b[24:32], v^key64) + b = b[32:] + } + + // Then we xor until b is less than 16 bytes. + for len(b) >= 16 { + v := binary.LittleEndian.Uint64(b) + binary.LittleEndian.PutUint64(b, v^key64) + v = binary.LittleEndian.Uint64(b[8:16]) + binary.LittleEndian.PutUint64(b[8:16], v^key64) + b = b[16:] + } + + // Then we xor until b is less than 8 bytes. + for len(b) >= 8 { + v := binary.LittleEndian.Uint64(b) + binary.LittleEndian.PutUint64(b, v^key64) + b = b[8:] + } + } + + // Then we xor until b is less than 4 bytes. + for len(b) >= 4 { + v := binary.LittleEndian.Uint32(b) + binary.LittleEndian.PutUint32(b, v^key) + b = b[4:] + } + + // xor remaining bytes. + for i := range b { + b[i] ^= byte(key) + key = bits.RotateLeft32(key, -8) + } + + return key } diff --git a/mask_amd64.s b/mask_amd64.s index 8464440b..ba37731d 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -19,11 +19,16 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 CMPQ CX, $8 JL less_than_8 - CMPQ CX, $512 + CMPQ CX, $128 JLE sse TESTQ $31, AX JNZ unaligned +aligned: + CMPB ·useAVX2(SB), $1 + JE avx2 + JMP sse + unaligned_loop_1byte: XORB SI, (AX) INCQ AX @@ -40,7 +45,7 @@ unaligned_loop_1byte: ORQ DX, DI TESTQ $31, AX - JZ sse + JZ aligned unaligned: // $7 & len, if not zero jump to loop_1b. @@ -54,8 +59,27 @@ unaligned_loop: SUBQ $8, CX TESTQ $31, AX JNZ unaligned_loop - JMP sse - + JMP aligned + +avx2: + CMPQ CX, $128 + JL sse + VMOVQ DI, X0 + VPBROADCASTQ X0, Y0 + +// TODO: shouldn't these be aligned movs now? +// TODO: should be 256? +avx2_loop: + VMOVDQU (AX), Y1 + VPXOR Y0, Y1, Y2 + VMOVDQU Y2, (AX) + ADDQ $128, AX + SUBQ $128, CX + CMPQ CX, $128 + // Loop if CX >= 128. + JAE avx2_loop + +// TODO: should be 128? sse: CMPQ CX, $64 JL less_than_64 @@ -63,8 +87,8 @@ sse: PUNPCKLQDQ X0, X0 sse_loop: - MOVOU 0*16(AX), X1 - MOVOU 1*16(AX), X2 + MOVOU (AX), X1 + MOVOU 16(AX), X2 MOVOU 2*16(AX), X3 MOVOU 3*16(AX), X4 PXOR X0, X1 diff --git a/mask_asm.go b/mask_asm.go index b8c4ee66..1f294982 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -9,5 +9,7 @@ func mask(b []byte, key uint32) uint32 { return key } +var useAVX2 = true + //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 diff --git a/mask_go.go b/mask_go.go new file mode 100644 index 00000000..b29435e9 --- /dev/null +++ b/mask_go.go @@ -0,0 +1,7 @@ +//go:build !amd64 && !arm64 && !js + +package websocket + +func mask(b []byte, key uint32) uint32 { + return maskGo(b, key) +} From a1bb44194159a5ff19ddb3032b796d8466d64d7a Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Tue, 6 Feb 2024 16:59:47 -0800 Subject: [PATCH 20/26] ci: Fix dev coverage output --- .github/workflows/daily.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index b1e64fbc..340de501 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -50,5 +50,5 @@ jobs: - run: AUTOBAHN=1 ./ci/test.sh - uses: actions/upload-artifact@v3 with: - name: coverage.html + name: coverage-dev.html path: ./ci/out/coverage.html From fee373961a0522e40495f989bdd408146390f7e0 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Wed, 7 Feb 2024 02:42:51 -0800 Subject: [PATCH 21/26] mask_asm: Note implementation may not be perfect --- go.mod | 2 ++ go.sum | 2 ++ mask_arm64.s | 3 +-- mask_asm.go | 9 ++++++++- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 715a9f7a..dbc4a5a7 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module nhooyr.io/websocket go 1.19 + +require golang.org/x/sys v0.17.0 // indirect diff --git a/go.sum b/go.sum index e69de29b..735d9a79 100644 --- a/go.sum +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/mask_arm64.s b/mask_arm64.s index 42a1211f..e494b43a 100644 --- a/mask_arm64.s +++ b/mask_arm64.s @@ -15,8 +15,6 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 CMP $64, R1 BLT less_than_64 -// TODO: align memory like amd64 - loop_64: VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16] VEOR V1.B16, V0.B16, V1.B16 @@ -29,6 +27,7 @@ loop_64: BGE loop_64 less_than_64: + CBZ R1, end TBZ $5, R1, less_than_32 VLD1 (R0), [V1.B16, V2.B16] VEOR V1.B16, V0.B16, V1.B16 diff --git a/mask_asm.go b/mask_asm.go index 1f294982..865cd4b8 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -2,6 +2,8 @@ package websocket +import "golang.org/x/sys/cpu" + func mask(b []byte, key uint32) uint32 { if len(b) > 0 { return maskAsm(&b[0], len(b), key) @@ -9,7 +11,12 @@ func mask(b []byte, key uint32) uint32 { return key } -var useAVX2 = true +var useAVX2 = cpu.X86.HasAVX2 +// @nhooyr: I am not confident that the amd64 or the arm64 implementations of this +// function are perfect. There are almost certainly missing optimizations or +// opportunities for // simplification. I'm confident there are no bugs though. +// For example, the arm64 implementation doesn't align memory like the amd64. +// Or the amd64 implementation could use AVX512 instead of just AVX2. //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 From 68fc887a3af8be45880f95f2a5f249a84b2b99b8 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 22 Feb 2024 04:51:47 -0800 Subject: [PATCH 22/26] mask.go: Revert my changes I'm just not good enough at assembly. I added tests to confirm that @wdvxdr's implementation works correctly and matches the output of the basic masking loop. --- go.mod | 2 +- internal/examples/go.mod | 2 ++ internal/examples/go.sum | 2 ++ internal/thirdparty/go.mod | 2 +- internal/thirdparty/go.sum | 4 +-- mask_amd64.s | 62 ++++++++++++++++---------------- mask_asm.go | 2 ++ mask_asm_test.go | 11 ++++++ mask_test.go | 73 ++++++++++++++++++++++++++++++++++++++ 9 files changed, 126 insertions(+), 34 deletions(-) create mode 100644 mask_asm_test.go create mode 100644 mask_test.go diff --git a/go.mod b/go.mod index dbc4a5a7..c6ec72cc 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,4 @@ module nhooyr.io/websocket go 1.19 -require golang.org/x/sys v0.17.0 // indirect +require golang.org/x/sys v0.17.0 diff --git a/internal/examples/go.mod b/internal/examples/go.mod index c98b81ce..50695945 100644 --- a/internal/examples/go.mod +++ b/internal/examples/go.mod @@ -8,3 +8,5 @@ require ( golang.org/x/time v0.3.0 nhooyr.io/websocket v0.0.0-00010101000000-000000000000 ) + +require golang.org/x/sys v0.17.0 // indirect diff --git a/internal/examples/go.sum b/internal/examples/go.sum index f8a07e82..06068548 100644 --- a/internal/examples/go.sum +++ b/internal/examples/go.sum @@ -1,2 +1,4 @@ +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/internal/thirdparty/go.mod b/internal/thirdparty/go.mod index f418d288..d991dd64 100644 --- a/internal/thirdparty/go.mod +++ b/internal/thirdparty/go.mod @@ -36,7 +36,7 @@ require ( golang.org/x/arch v0.3.0 // indirect golang.org/x/crypto v0.9.0 // indirect golang.org/x/net v0.10.0 // indirect - golang.org/x/sys v0.13.0 // indirect + golang.org/x/sys v0.17.0 // indirect golang.org/x/text v0.9.0 // indirect google.golang.org/protobuf v1.30.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/internal/thirdparty/go.sum b/internal/thirdparty/go.sum index 658a4a7b..1f542103 100644 --- a/internal/thirdparty/go.sum +++ b/internal/thirdparty/go.sum @@ -100,8 +100,8 @@ golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= diff --git a/mask_amd64.s b/mask_amd64.s index ba37731d..caca53ec 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -10,15 +10,17 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 MOVQ len+8(FP), CX MOVL key+16(FP), SI - // Calculate the DI aka the uint64 key. - // DI = uint64(SI) | uint64(SI)<<32 + // calculate the DI + // DI = SI<<32 | SI MOVL SI, DI MOVQ DI, DX SHLQ $32, DI ORQ DX, DI - CMPQ CX, $8 - JL less_than_8 + CMPQ CX, $15 + JLE less_than_16 + CMPQ CX, $63 + JLE less_than_64 CMPQ CX, $128 JLE sse TESTQ $31, AX @@ -37,8 +39,8 @@ unaligned_loop_1byte: TESTQ $7, AX JNZ unaligned_loop_1byte - // Calculate DI again since SI was modified. - // DI = uint64(SI) | uint64(SI)<<32 + // calculate DI again since SI was modified + // DI = SI<<32 | SI MOVL SI, DI MOVQ DI, DX SHLQ $32, DI @@ -48,12 +50,11 @@ unaligned_loop_1byte: JZ aligned unaligned: - // $7 & len, if not zero jump to loop_1b. - TESTQ $7, AX + TESTQ $7, AX // AND $7 & len, if not zero jump to loop_1b. JNZ unaligned_loop_1byte unaligned_loop: - // We don't need to check the CX since we know it's above 512. + // we don't need to check the CX since we know it's above 128 XORQ DI, (AX) ADDQ $8, AX SUBQ $8, CX @@ -62,33 +63,34 @@ unaligned_loop: JMP aligned avx2: - CMPQ CX, $128 + CMPQ CX, $0x80 JL sse VMOVQ DI, X0 VPBROADCASTQ X0, Y0 -// TODO: shouldn't these be aligned movs now? -// TODO: should be 256? avx2_loop: - VMOVDQU (AX), Y1 - VPXOR Y0, Y1, Y2 - VMOVDQU Y2, (AX) - ADDQ $128, AX - SUBQ $128, CX - CMPQ CX, $128 - // Loop if CX >= 128. - JAE avx2_loop - -// TODO: should be 128? + VPXOR (AX), Y0, Y1 + VPXOR 32(AX), Y0, Y2 + VPXOR 64(AX), Y0, Y3 + VPXOR 96(AX), Y0, Y4 + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VMOVDQU Y3, 64(AX) + VMOVDQU Y4, 96(AX) + ADDQ $0x80, AX + SUBQ $0x80, CX + CMPQ CX, $0x80 + JAE avx2_loop // loop if CX >= 0x80 + sse: - CMPQ CX, $64 + CMPQ CX, $0x40 JL less_than_64 MOVQ DI, X0 PUNPCKLQDQ X0, X0 sse_loop: - MOVOU (AX), X1 - MOVOU 16(AX), X2 + MOVOU 0*16(AX), X1 + MOVOU 1*16(AX), X2 MOVOU 2*16(AX), X3 MOVOU 3*16(AX), X4 PXOR X0, X1 @@ -99,9 +101,9 @@ sse_loop: MOVOU X2, 1*16(AX) MOVOU X3, 2*16(AX) MOVOU X4, 3*16(AX) - ADDQ $64, AX - SUBQ $64, CX - CMPQ CX, $64 + ADDQ $0x40, AX + SUBQ $0x40, CX + CMPQ CX, $0x40 JAE sse_loop less_than_64: @@ -141,10 +143,10 @@ less_than_4: less_than_2: TESTQ $1, CX - JZ end + JZ done XORB SI, (AX) ROLL $24, SI -end: +done: MOVL SI, ret+24(FP) RET diff --git a/mask_asm.go b/mask_asm.go index 865cd4b8..bf4bb635 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -11,6 +11,7 @@ func mask(b []byte, key uint32) uint32 { return key } +//lint:ignore U1000 mask_*.s var useAVX2 = cpu.X86.HasAVX2 // @nhooyr: I am not confident that the amd64 or the arm64 implementations of this @@ -18,5 +19,6 @@ var useAVX2 = cpu.X86.HasAVX2 // opportunities for // simplification. I'm confident there are no bugs though. // For example, the arm64 implementation doesn't align memory like the amd64. // Or the amd64 implementation could use AVX512 instead of just AVX2. +// //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 diff --git a/mask_asm_test.go b/mask_asm_test.go new file mode 100644 index 00000000..416cbc43 --- /dev/null +++ b/mask_asm_test.go @@ -0,0 +1,11 @@ +//go:build amd64 || arm64 + +package websocket + +import "testing" + +func TestMaskASM(t *testing.T) { + t.Parallel() + + testMask(t, "maskASM", mask) +} diff --git a/mask_test.go b/mask_test.go new file mode 100644 index 00000000..5c3d43c4 --- /dev/null +++ b/mask_test.go @@ -0,0 +1,73 @@ +package websocket + +import ( + "bytes" + "crypto/rand" + "encoding/binary" + "math/big" + "math/bits" + "testing" + + "nhooyr.io/websocket/internal/test/assert" +) + +func basicMask(b []byte, key uint32) uint32 { + for i := range b { + b[i] ^= byte(key) + key = bits.RotateLeft32(key, -8) + } + return key +} + +func basicMask2(b []byte, key uint32) uint32 { + keyb := binary.LittleEndian.AppendUint32(nil, key) + pos := 0 + for i := range b { + b[i] ^= keyb[pos&3] + pos++ + } + return bits.RotateLeft32(key, (pos&3)*-8) +} + +func TestMask(t *testing.T) { + t.Parallel() + + testMask(t, "basicMask", basicMask) + testMask(t, "maskGo", maskGo) + testMask(t, "basicMask2", basicMask2) +} + +func testMask(t *testing.T, name string, fn func(b []byte, key uint32) uint32) { + t.Run(name, func(t *testing.T) { + t.Parallel() + for i := 0; i < 9999; i++ { + keyb := make([]byte, 4) + _, err := rand.Read(keyb) + assert.Success(t, err) + key := binary.LittleEndian.Uint32(keyb) + + n, err := rand.Int(rand.Reader, big.NewInt(1<<16)) + assert.Success(t, err) + + b := make([]byte, 1+n.Int64()) + _, err = rand.Read(b) + assert.Success(t, err) + + b2 := make([]byte, len(b)) + copy(b2, b) + b3 := make([]byte, len(b)) + copy(b3, b) + + key2 := basicMask(b2, key) + key3 := fn(b3, key) + + if key2 != key3 { + t.Errorf("expected key %X but got %X", key2, key3) + } + if !bytes.Equal(b2, b3) { + t.Error("bad bytes") + return + } + } + }) +} From f62cef395d5228b955967e05a7e7cbf5a0ab8f93 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 22 Feb 2024 05:00:41 -0800 Subject: [PATCH 23/26] test.sh: Test assembly masking on arm64 --- ci/test.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ci/test.sh b/ci/test.sh index 83bb9832..a3007614 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -11,6 +11,19 @@ cd -- "$(dirname "$0")/.." go test "$@" ./... ) +( + GOARCH=arm64 go test -c -o ./ci/out/websocket-arm64.test "$@" . + if [ "$#" -eq 0 ]; then + if [ "${CI-}" ]; then + sudo apt-get update + sudo apt-get install -y qemu-user-static + ln -s /usr/bin/qemu-aarch64-static /usr/local/bin/qemu-aarch64 + fi + qemu-aarch64 ./ci/out/websocket-arm64.test -test.run=TestMask + fi +) + + go install github.com/agnivade/wasmbrowsertest@latest go test --race --bench=. --timeout=1h --covermode=atomic --coverprofile=ci/out/coverage.prof --coverpkg=./... "$@" ./... sed -i.bak '/stringer\.go/d' ci/out/coverage.prof From 92acb74883ce505cd4eefd32841ef807de3e78f8 Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 22 Feb 2024 05:06:05 -0800 Subject: [PATCH 24/26] internal/xcpu: Vendor golang.org/x/sys/cpu Standard library does this too. Unfortunate wish they just exposed it in the standard library. Perhaps we can isolate the specific code we need later. --- go.mod | 2 - go.sum | 2 - internal/examples/go.mod | 2 - internal/examples/go.sum | 2 - internal/xcpu/.gitattributes | 10 + internal/xcpu/.gitignore | 2 + internal/xcpu/README.md | 3 + internal/xcpu/asm_aix_ppc64.s | 17 ++ internal/xcpu/byteorder.go | 66 ++++++ internal/xcpu/cpu.go | 290 ++++++++++++++++++++++++++ internal/xcpu/cpu_aix.go | 33 +++ internal/xcpu/cpu_arm.go | 73 +++++++ internal/xcpu/cpu_arm64.go | 172 +++++++++++++++ internal/xcpu/cpu_arm64.s | 31 +++ internal/xcpu/cpu_gc_arm64.go | 11 + internal/xcpu/cpu_gc_s390x.go | 21 ++ internal/xcpu/cpu_gc_x86.go | 15 ++ internal/xcpu/cpu_gccgo_arm64.go | 11 + internal/xcpu/cpu_gccgo_s390x.go | 22 ++ internal/xcpu/cpu_gccgo_x86.c | 37 ++++ internal/xcpu/cpu_gccgo_x86.go | 31 +++ internal/xcpu/cpu_linux.go | 15 ++ internal/xcpu/cpu_linux_arm.go | 39 ++++ internal/xcpu/cpu_linux_arm64.go | 111 ++++++++++ internal/xcpu/cpu_linux_mips64x.go | 22 ++ internal/xcpu/cpu_linux_noinit.go | 9 + internal/xcpu/cpu_linux_ppc64x.go | 30 +++ internal/xcpu/cpu_linux_s390x.go | 40 ++++ internal/xcpu/cpu_loong64.go | 12 ++ internal/xcpu/cpu_mips64x.go | 15 ++ internal/xcpu/cpu_mipsx.go | 11 + internal/xcpu/cpu_netbsd_arm64.go | 173 +++++++++++++++ internal/xcpu/cpu_openbsd_arm64.go | 65 ++++++ internal/xcpu/cpu_openbsd_arm64.s | 11 + internal/xcpu/cpu_other_arm.go | 9 + internal/xcpu/cpu_other_arm64.go | 9 + internal/xcpu/cpu_other_mips64x.go | 11 + internal/xcpu/cpu_other_ppc64x.go | 12 ++ internal/xcpu/cpu_other_riscv64.go | 11 + internal/xcpu/cpu_ppc64x.go | 16 ++ internal/xcpu/cpu_riscv64.go | 11 + internal/xcpu/cpu_s390x.go | 172 +++++++++++++++ internal/xcpu/cpu_s390x.s | 57 +++++ internal/xcpu/cpu_wasm.go | 17 ++ internal/xcpu/cpu_x86.go | 151 ++++++++++++++ internal/xcpu/cpu_x86.s | 26 +++ internal/xcpu/cpu_zos.go | 10 + internal/xcpu/cpu_zos_s390x.go | 25 +++ internal/xcpu/endian_big.go | 10 + internal/xcpu/endian_little.go | 10 + internal/xcpu/hwcap_linux.go | 71 +++++++ internal/xcpu/parse.go | 43 ++++ internal/xcpu/proc_cpuinfo_linux.go | 53 +++++ internal/xcpu/runtime_auxv.go | 16 ++ internal/xcpu/runtime_auxv_go121.go | 18 ++ internal/xcpu/syscall_aix_gccgo.go | 26 +++ internal/xcpu/syscall_aix_ppc64_gc.go | 35 ++++ mask_asm.go | 4 +- mask_test.go | 46 ++-- 59 files changed, 2242 insertions(+), 33 deletions(-) create mode 100644 internal/xcpu/.gitattributes create mode 100644 internal/xcpu/.gitignore create mode 100644 internal/xcpu/README.md create mode 100644 internal/xcpu/asm_aix_ppc64.s create mode 100644 internal/xcpu/byteorder.go create mode 100644 internal/xcpu/cpu.go create mode 100644 internal/xcpu/cpu_aix.go create mode 100644 internal/xcpu/cpu_arm.go create mode 100644 internal/xcpu/cpu_arm64.go create mode 100644 internal/xcpu/cpu_arm64.s create mode 100644 internal/xcpu/cpu_gc_arm64.go create mode 100644 internal/xcpu/cpu_gc_s390x.go create mode 100644 internal/xcpu/cpu_gc_x86.go create mode 100644 internal/xcpu/cpu_gccgo_arm64.go create mode 100644 internal/xcpu/cpu_gccgo_s390x.go create mode 100644 internal/xcpu/cpu_gccgo_x86.c create mode 100644 internal/xcpu/cpu_gccgo_x86.go create mode 100644 internal/xcpu/cpu_linux.go create mode 100644 internal/xcpu/cpu_linux_arm.go create mode 100644 internal/xcpu/cpu_linux_arm64.go create mode 100644 internal/xcpu/cpu_linux_mips64x.go create mode 100644 internal/xcpu/cpu_linux_noinit.go create mode 100644 internal/xcpu/cpu_linux_ppc64x.go create mode 100644 internal/xcpu/cpu_linux_s390x.go create mode 100644 internal/xcpu/cpu_loong64.go create mode 100644 internal/xcpu/cpu_mips64x.go create mode 100644 internal/xcpu/cpu_mipsx.go create mode 100644 internal/xcpu/cpu_netbsd_arm64.go create mode 100644 internal/xcpu/cpu_openbsd_arm64.go create mode 100644 internal/xcpu/cpu_openbsd_arm64.s create mode 100644 internal/xcpu/cpu_other_arm.go create mode 100644 internal/xcpu/cpu_other_arm64.go create mode 100644 internal/xcpu/cpu_other_mips64x.go create mode 100644 internal/xcpu/cpu_other_ppc64x.go create mode 100644 internal/xcpu/cpu_other_riscv64.go create mode 100644 internal/xcpu/cpu_ppc64x.go create mode 100644 internal/xcpu/cpu_riscv64.go create mode 100644 internal/xcpu/cpu_s390x.go create mode 100644 internal/xcpu/cpu_s390x.s create mode 100644 internal/xcpu/cpu_wasm.go create mode 100644 internal/xcpu/cpu_x86.go create mode 100644 internal/xcpu/cpu_x86.s create mode 100644 internal/xcpu/cpu_zos.go create mode 100644 internal/xcpu/cpu_zos_s390x.go create mode 100644 internal/xcpu/endian_big.go create mode 100644 internal/xcpu/endian_little.go create mode 100644 internal/xcpu/hwcap_linux.go create mode 100644 internal/xcpu/parse.go create mode 100644 internal/xcpu/proc_cpuinfo_linux.go create mode 100644 internal/xcpu/runtime_auxv.go create mode 100644 internal/xcpu/runtime_auxv_go121.go create mode 100644 internal/xcpu/syscall_aix_gccgo.go create mode 100644 internal/xcpu/syscall_aix_ppc64_gc.go diff --git a/go.mod b/go.mod index c6ec72cc..715a9f7a 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module nhooyr.io/websocket go 1.19 - -require golang.org/x/sys v0.17.0 diff --git a/go.sum b/go.sum index 735d9a79..e69de29b 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +0,0 @@ -golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/internal/examples/go.mod b/internal/examples/go.mod index 50695945..c98b81ce 100644 --- a/internal/examples/go.mod +++ b/internal/examples/go.mod @@ -8,5 +8,3 @@ require ( golang.org/x/time v0.3.0 nhooyr.io/websocket v0.0.0-00010101000000-000000000000 ) - -require golang.org/x/sys v0.17.0 // indirect diff --git a/internal/examples/go.sum b/internal/examples/go.sum index 06068548..f8a07e82 100644 --- a/internal/examples/go.sum +++ b/internal/examples/go.sum @@ -1,4 +1,2 @@ -golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/internal/xcpu/.gitattributes b/internal/xcpu/.gitattributes new file mode 100644 index 00000000..d2f212e5 --- /dev/null +++ b/internal/xcpu/.gitattributes @@ -0,0 +1,10 @@ +# Treat all files in this repo as binary, with no git magic updating +# line endings. Windows users contributing to Go will need to use a +# modern version of git and editors capable of LF line endings. +# +# We'll prevent accidental CRLF line endings from entering the repo +# via the git-review gofmt checks. +# +# See golang.org/issue/9281 + +* -text diff --git a/internal/xcpu/.gitignore b/internal/xcpu/.gitignore new file mode 100644 index 00000000..5a9d62ef --- /dev/null +++ b/internal/xcpu/.gitignore @@ -0,0 +1,2 @@ +# Add no patterns to .gitignore except for files generated by the build. +last-change diff --git a/internal/xcpu/README.md b/internal/xcpu/README.md new file mode 100644 index 00000000..96a1a30f --- /dev/null +++ b/internal/xcpu/README.md @@ -0,0 +1,3 @@ +# cpu + +Vendored from https://github.com/golang/sys diff --git a/internal/xcpu/asm_aix_ppc64.s b/internal/xcpu/asm_aix_ppc64.s new file mode 100644 index 00000000..269e173c --- /dev/null +++ b/internal/xcpu/asm_aix_ppc64.s @@ -0,0 +1,17 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc + +#include "textflag.h" + +// +// System calls for ppc64, AIX are implemented in runtime/syscall_aix.go +// + +TEXT ·syscall6(SB),NOSPLIT,$0-88 + JMP syscall·syscall6(SB) + +TEXT ·rawSyscall6(SB),NOSPLIT,$0-88 + JMP syscall·rawSyscall6(SB) diff --git a/internal/xcpu/byteorder.go b/internal/xcpu/byteorder.go new file mode 100644 index 00000000..8f28d86c --- /dev/null +++ b/internal/xcpu/byteorder.go @@ -0,0 +1,66 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +import ( + "runtime" +) + +// byteOrder is a subset of encoding/binary.ByteOrder. +type byteOrder interface { + Uint32([]byte) uint32 + Uint64([]byte) uint64 +} + +type littleEndian struct{} +type bigEndian struct{} + +func (littleEndian) Uint32(b []byte) uint32 { + _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func (littleEndian) Uint64(b []byte) uint64 { + _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func (bigEndian) Uint32(b []byte) uint32 { + _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 + return uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 +} + +func (bigEndian) Uint64(b []byte) uint64 { + _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 + return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | + uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56 +} + +// hostByteOrder returns littleEndian on little-endian machines and +// bigEndian on big-endian machines. +func hostByteOrder() byteOrder { + switch runtime.GOARCH { + case "386", "amd64", "amd64p32", + "alpha", + "arm", "arm64", + "loong64", + "mipsle", "mips64le", "mips64p32le", + "nios2", + "ppc64le", + "riscv", "riscv64", + "sh": + return littleEndian{} + case "armbe", "arm64be", + "m68k", + "mips", "mips64", "mips64p32", + "ppc", "ppc64", + "s390", "s390x", + "shbe", + "sparc", "sparc64": + return bigEndian{} + } + panic("unknown architecture") +} diff --git a/internal/xcpu/cpu.go b/internal/xcpu/cpu.go new file mode 100644 index 00000000..5fc15019 --- /dev/null +++ b/internal/xcpu/cpu.go @@ -0,0 +1,290 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cpu implements processor feature detection for +// various CPU architectures. +package xcpu + +import ( + "os" + "strings" +) + +// Initialized reports whether the CPU features were initialized. +// +// For some GOOS/GOARCH combinations initialization of the CPU features depends +// on reading an operating specific file, e.g. /proc/self/auxv on linux/arm +// Initialized will report false if reading the file fails. +var Initialized bool + +// CacheLinePad is used to pad structs to avoid false sharing. +type CacheLinePad struct{ _ [cacheLineSize]byte } + +// X86 contains the supported CPU features of the +// current X86/AMD64 platform. If the current platform +// is not X86/AMD64 then all feature flags are false. +// +// X86 is padded to avoid false sharing. Further the HasAVX +// and HasAVX2 are only set if the OS supports XMM and YMM +// registers in addition to the CPUID feature bit being set. +var X86 struct { + _ CacheLinePad + HasAES bool // AES hardware implementation (AES NI) + HasADX bool // Multi-precision add-carry instruction extensions + HasAVX bool // Advanced vector extension + HasAVX2 bool // Advanced vector extension 2 + HasAVX512 bool // Advanced vector extension 512 + HasAVX512F bool // Advanced vector extension 512 Foundation Instructions + HasAVX512CD bool // Advanced vector extension 512 Conflict Detection Instructions + HasAVX512ER bool // Advanced vector extension 512 Exponential and Reciprocal Instructions + HasAVX512PF bool // Advanced vector extension 512 Prefetch Instructions + HasAVX512VL bool // Advanced vector extension 512 Vector Length Extensions + HasAVX512BW bool // Advanced vector extension 512 Byte and Word Instructions + HasAVX512DQ bool // Advanced vector extension 512 Doubleword and Quadword Instructions + HasAVX512IFMA bool // Advanced vector extension 512 Integer Fused Multiply Add + HasAVX512VBMI bool // Advanced vector extension 512 Vector Byte Manipulation Instructions + HasAVX5124VNNIW bool // Advanced vector extension 512 Vector Neural Network Instructions Word variable precision + HasAVX5124FMAPS bool // Advanced vector extension 512 Fused Multiply Accumulation Packed Single precision + HasAVX512VPOPCNTDQ bool // Advanced vector extension 512 Double and quad word population count instructions + HasAVX512VPCLMULQDQ bool // Advanced vector extension 512 Vector carry-less multiply operations + HasAVX512VNNI bool // Advanced vector extension 512 Vector Neural Network Instructions + HasAVX512GFNI bool // Advanced vector extension 512 Galois field New Instructions + HasAVX512VAES bool // Advanced vector extension 512 Vector AES instructions + HasAVX512VBMI2 bool // Advanced vector extension 512 Vector Byte Manipulation Instructions 2 + HasAVX512BITALG bool // Advanced vector extension 512 Bit Algorithms + HasAVX512BF16 bool // Advanced vector extension 512 BFloat16 Instructions + HasAMXTile bool // Advanced Matrix Extension Tile instructions + HasAMXInt8 bool // Advanced Matrix Extension Int8 instructions + HasAMXBF16 bool // Advanced Matrix Extension BFloat16 instructions + HasBMI1 bool // Bit manipulation instruction set 1 + HasBMI2 bool // Bit manipulation instruction set 2 + HasCX16 bool // Compare and exchange 16 Bytes + HasERMS bool // Enhanced REP for MOVSB and STOSB + HasFMA bool // Fused-multiply-add instructions + HasOSXSAVE bool // OS supports XSAVE/XRESTOR for saving/restoring XMM registers. + HasPCLMULQDQ bool // PCLMULQDQ instruction - most often used for AES-GCM + HasPOPCNT bool // Hamming weight instruction POPCNT. + HasRDRAND bool // RDRAND instruction (on-chip random number generator) + HasRDSEED bool // RDSEED instruction (on-chip random number generator) + HasSSE2 bool // Streaming SIMD extension 2 (always available on amd64) + HasSSE3 bool // Streaming SIMD extension 3 + HasSSSE3 bool // Supplemental streaming SIMD extension 3 + HasSSE41 bool // Streaming SIMD extension 4 and 4.1 + HasSSE42 bool // Streaming SIMD extension 4 and 4.2 + _ CacheLinePad +} + +// ARM64 contains the supported CPU features of the +// current ARMv8(aarch64) platform. If the current platform +// is not arm64 then all feature flags are false. +var ARM64 struct { + _ CacheLinePad + HasFP bool // Floating-point instruction set (always available) + HasASIMD bool // Advanced SIMD (always available) + HasEVTSTRM bool // Event stream support + HasAES bool // AES hardware implementation + HasPMULL bool // Polynomial multiplication instruction set + HasSHA1 bool // SHA1 hardware implementation + HasSHA2 bool // SHA2 hardware implementation + HasCRC32 bool // CRC32 hardware implementation + HasATOMICS bool // Atomic memory operation instruction set + HasFPHP bool // Half precision floating-point instruction set + HasASIMDHP bool // Advanced SIMD half precision instruction set + HasCPUID bool // CPUID identification scheme registers + HasASIMDRDM bool // Rounding double multiply add/subtract instruction set + HasJSCVT bool // Javascript conversion from floating-point to integer + HasFCMA bool // Floating-point multiplication and addition of complex numbers + HasLRCPC bool // Release Consistent processor consistent support + HasDCPOP bool // Persistent memory support + HasSHA3 bool // SHA3 hardware implementation + HasSM3 bool // SM3 hardware implementation + HasSM4 bool // SM4 hardware implementation + HasASIMDDP bool // Advanced SIMD double precision instruction set + HasSHA512 bool // SHA512 hardware implementation + HasSVE bool // Scalable Vector Extensions + HasASIMDFHM bool // Advanced SIMD multiplication FP16 to FP32 + _ CacheLinePad +} + +// ARM contains the supported CPU features of the current ARM (32-bit) platform. +// All feature flags are false if: +// 1. the current platform is not arm, or +// 2. the current operating system is not Linux. +var ARM struct { + _ CacheLinePad + HasSWP bool // SWP instruction support + HasHALF bool // Half-word load and store support + HasTHUMB bool // ARM Thumb instruction set + Has26BIT bool // Address space limited to 26-bits + HasFASTMUL bool // 32-bit operand, 64-bit result multiplication support + HasFPA bool // Floating point arithmetic support + HasVFP bool // Vector floating point support + HasEDSP bool // DSP Extensions support + HasJAVA bool // Java instruction set + HasIWMMXT bool // Intel Wireless MMX technology support + HasCRUNCH bool // MaverickCrunch context switching and handling + HasTHUMBEE bool // Thumb EE instruction set + HasNEON bool // NEON instruction set + HasVFPv3 bool // Vector floating point version 3 support + HasVFPv3D16 bool // Vector floating point version 3 D8-D15 + HasTLS bool // Thread local storage support + HasVFPv4 bool // Vector floating point version 4 support + HasIDIVA bool // Integer divide instruction support in ARM mode + HasIDIVT bool // Integer divide instruction support in Thumb mode + HasVFPD32 bool // Vector floating point version 3 D15-D31 + HasLPAE bool // Large Physical Address Extensions + HasEVTSTRM bool // Event stream support + HasAES bool // AES hardware implementation + HasPMULL bool // Polynomial multiplication instruction set + HasSHA1 bool // SHA1 hardware implementation + HasSHA2 bool // SHA2 hardware implementation + HasCRC32 bool // CRC32 hardware implementation + _ CacheLinePad +} + +// MIPS64X contains the supported CPU features of the current mips64/mips64le +// platforms. If the current platform is not mips64/mips64le or the current +// operating system is not Linux then all feature flags are false. +var MIPS64X struct { + _ CacheLinePad + HasMSA bool // MIPS SIMD architecture + _ CacheLinePad +} + +// PPC64 contains the supported CPU features of the current ppc64/ppc64le platforms. +// If the current platform is not ppc64/ppc64le then all feature flags are false. +// +// For ppc64/ppc64le, it is safe to check only for ISA level starting on ISA v3.00, +// since there are no optional categories. There are some exceptions that also +// require kernel support to work (DARN, SCV), so there are feature bits for +// those as well. The struct is padded to avoid false sharing. +var PPC64 struct { + _ CacheLinePad + HasDARN bool // Hardware random number generator (requires kernel enablement) + HasSCV bool // Syscall vectored (requires kernel enablement) + IsPOWER8 bool // ISA v2.07 (POWER8) + IsPOWER9 bool // ISA v3.00 (POWER9), implies IsPOWER8 + _ CacheLinePad +} + +// S390X contains the supported CPU features of the current IBM Z +// (s390x) platform. If the current platform is not IBM Z then all +// feature flags are false. +// +// S390X is padded to avoid false sharing. Further HasVX is only set +// if the OS supports vector registers in addition to the STFLE +// feature bit being set. +var S390X struct { + _ CacheLinePad + HasZARCH bool // z/Architecture mode is active [mandatory] + HasSTFLE bool // store facility list extended + HasLDISP bool // long (20-bit) displacements + HasEIMM bool // 32-bit immediates + HasDFP bool // decimal floating point + HasETF3EH bool // ETF-3 enhanced + HasMSA bool // message security assist (CPACF) + HasAES bool // KM-AES{128,192,256} functions + HasAESCBC bool // KMC-AES{128,192,256} functions + HasAESCTR bool // KMCTR-AES{128,192,256} functions + HasAESGCM bool // KMA-GCM-AES{128,192,256} functions + HasGHASH bool // KIMD-GHASH function + HasSHA1 bool // K{I,L}MD-SHA-1 functions + HasSHA256 bool // K{I,L}MD-SHA-256 functions + HasSHA512 bool // K{I,L}MD-SHA-512 functions + HasSHA3 bool // K{I,L}MD-SHA3-{224,256,384,512} and K{I,L}MD-SHAKE-{128,256} functions + HasVX bool // vector facility + HasVXE bool // vector-enhancements facility 1 + _ CacheLinePad +} + +func init() { + archInit() + initOptions() + processOptions() +} + +// options contains the cpu debug options that can be used in GODEBUG. +// Options are arch dependent and are added by the arch specific initOptions functions. +// Features that are mandatory for the specific GOARCH should have the Required field set +// (e.g. SSE2 on amd64). +var options []option + +// Option names should be lower case. e.g. avx instead of AVX. +type option struct { + Name string + Feature *bool + Specified bool // whether feature value was specified in GODEBUG + Enable bool // whether feature should be enabled + Required bool // whether feature is mandatory and can not be disabled +} + +func processOptions() { + env := os.Getenv("GODEBUG") +field: + for env != "" { + field := "" + i := strings.IndexByte(env, ',') + if i < 0 { + field, env = env, "" + } else { + field, env = env[:i], env[i+1:] + } + if len(field) < 4 || field[:4] != "cpu." { + continue + } + i = strings.IndexByte(field, '=') + if i < 0 { + print("GODEBUG sys/cpu: no value specified for \"", field, "\"\n") + continue + } + key, value := field[4:i], field[i+1:] // e.g. "SSE2", "on" + + var enable bool + switch value { + case "on": + enable = true + case "off": + enable = false + default: + print("GODEBUG sys/cpu: value \"", value, "\" not supported for cpu option \"", key, "\"\n") + continue field + } + + if key == "all" { + for i := range options { + options[i].Specified = true + options[i].Enable = enable || options[i].Required + } + continue field + } + + for i := range options { + if options[i].Name == key { + options[i].Specified = true + options[i].Enable = enable + continue field + } + } + + print("GODEBUG sys/cpu: unknown cpu feature \"", key, "\"\n") + } + + for _, o := range options { + if !o.Specified { + continue + } + + if o.Enable && !*o.Feature { + print("GODEBUG sys/cpu: can not enable \"", o.Name, "\", missing CPU support\n") + continue + } + + if !o.Enable && o.Required { + print("GODEBUG sys/cpu: can not disable \"", o.Name, "\", required CPU feature\n") + continue + } + + *o.Feature = o.Enable + } +} diff --git a/internal/xcpu/cpu_aix.go b/internal/xcpu/cpu_aix.go new file mode 100644 index 00000000..5e6e2583 --- /dev/null +++ b/internal/xcpu/cpu_aix.go @@ -0,0 +1,33 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix + +package xcpu + +const ( + // getsystemcfg constants + _SC_IMPL = 2 + _IMPL_POWER8 = 0x10000 + _IMPL_POWER9 = 0x20000 +) + +func archInit() { + impl := getsystemcfg(_SC_IMPL) + if impl&_IMPL_POWER8 != 0 { + PPC64.IsPOWER8 = true + } + if impl&_IMPL_POWER9 != 0 { + PPC64.IsPOWER8 = true + PPC64.IsPOWER9 = true + } + + Initialized = true +} + +func getsystemcfg(label int) (n uint64) { + r0, _ := callgetsystemcfg(label) + n = uint64(r0) + return +} diff --git a/internal/xcpu/cpu_arm.go b/internal/xcpu/cpu_arm.go new file mode 100644 index 00000000..ff120458 --- /dev/null +++ b/internal/xcpu/cpu_arm.go @@ -0,0 +1,73 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +const cacheLineSize = 32 + +// HWCAP/HWCAP2 bits. +// These are specific to Linux. +const ( + hwcap_SWP = 1 << 0 + hwcap_HALF = 1 << 1 + hwcap_THUMB = 1 << 2 + hwcap_26BIT = 1 << 3 + hwcap_FAST_MULT = 1 << 4 + hwcap_FPA = 1 << 5 + hwcap_VFP = 1 << 6 + hwcap_EDSP = 1 << 7 + hwcap_JAVA = 1 << 8 + hwcap_IWMMXT = 1 << 9 + hwcap_CRUNCH = 1 << 10 + hwcap_THUMBEE = 1 << 11 + hwcap_NEON = 1 << 12 + hwcap_VFPv3 = 1 << 13 + hwcap_VFPv3D16 = 1 << 14 + hwcap_TLS = 1 << 15 + hwcap_VFPv4 = 1 << 16 + hwcap_IDIVA = 1 << 17 + hwcap_IDIVT = 1 << 18 + hwcap_VFPD32 = 1 << 19 + hwcap_LPAE = 1 << 20 + hwcap_EVTSTRM = 1 << 21 + + hwcap2_AES = 1 << 0 + hwcap2_PMULL = 1 << 1 + hwcap2_SHA1 = 1 << 2 + hwcap2_SHA2 = 1 << 3 + hwcap2_CRC32 = 1 << 4 +) + +func initOptions() { + options = []option{ + {Name: "pmull", Feature: &ARM.HasPMULL}, + {Name: "sha1", Feature: &ARM.HasSHA1}, + {Name: "sha2", Feature: &ARM.HasSHA2}, + {Name: "swp", Feature: &ARM.HasSWP}, + {Name: "thumb", Feature: &ARM.HasTHUMB}, + {Name: "thumbee", Feature: &ARM.HasTHUMBEE}, + {Name: "tls", Feature: &ARM.HasTLS}, + {Name: "vfp", Feature: &ARM.HasVFP}, + {Name: "vfpd32", Feature: &ARM.HasVFPD32}, + {Name: "vfpv3", Feature: &ARM.HasVFPv3}, + {Name: "vfpv3d16", Feature: &ARM.HasVFPv3D16}, + {Name: "vfpv4", Feature: &ARM.HasVFPv4}, + {Name: "half", Feature: &ARM.HasHALF}, + {Name: "26bit", Feature: &ARM.Has26BIT}, + {Name: "fastmul", Feature: &ARM.HasFASTMUL}, + {Name: "fpa", Feature: &ARM.HasFPA}, + {Name: "edsp", Feature: &ARM.HasEDSP}, + {Name: "java", Feature: &ARM.HasJAVA}, + {Name: "iwmmxt", Feature: &ARM.HasIWMMXT}, + {Name: "crunch", Feature: &ARM.HasCRUNCH}, + {Name: "neon", Feature: &ARM.HasNEON}, + {Name: "idivt", Feature: &ARM.HasIDIVT}, + {Name: "idiva", Feature: &ARM.HasIDIVA}, + {Name: "lpae", Feature: &ARM.HasLPAE}, + {Name: "evtstrm", Feature: &ARM.HasEVTSTRM}, + {Name: "aes", Feature: &ARM.HasAES}, + {Name: "crc32", Feature: &ARM.HasCRC32}, + } + +} diff --git a/internal/xcpu/cpu_arm64.go b/internal/xcpu/cpu_arm64.go new file mode 100644 index 00000000..3d4113a5 --- /dev/null +++ b/internal/xcpu/cpu_arm64.go @@ -0,0 +1,172 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +import "runtime" + +// cacheLineSize is used to prevent false sharing of cache lines. +// We choose 128 because Apple Silicon, a.k.a. M1, has 128-byte cache line size. +// It doesn't cost much and is much more future-proof. +const cacheLineSize = 128 + +func initOptions() { + options = []option{ + {Name: "fp", Feature: &ARM64.HasFP}, + {Name: "asimd", Feature: &ARM64.HasASIMD}, + {Name: "evstrm", Feature: &ARM64.HasEVTSTRM}, + {Name: "aes", Feature: &ARM64.HasAES}, + {Name: "fphp", Feature: &ARM64.HasFPHP}, + {Name: "jscvt", Feature: &ARM64.HasJSCVT}, + {Name: "lrcpc", Feature: &ARM64.HasLRCPC}, + {Name: "pmull", Feature: &ARM64.HasPMULL}, + {Name: "sha1", Feature: &ARM64.HasSHA1}, + {Name: "sha2", Feature: &ARM64.HasSHA2}, + {Name: "sha3", Feature: &ARM64.HasSHA3}, + {Name: "sha512", Feature: &ARM64.HasSHA512}, + {Name: "sm3", Feature: &ARM64.HasSM3}, + {Name: "sm4", Feature: &ARM64.HasSM4}, + {Name: "sve", Feature: &ARM64.HasSVE}, + {Name: "crc32", Feature: &ARM64.HasCRC32}, + {Name: "atomics", Feature: &ARM64.HasATOMICS}, + {Name: "asimdhp", Feature: &ARM64.HasASIMDHP}, + {Name: "cpuid", Feature: &ARM64.HasCPUID}, + {Name: "asimrdm", Feature: &ARM64.HasASIMDRDM}, + {Name: "fcma", Feature: &ARM64.HasFCMA}, + {Name: "dcpop", Feature: &ARM64.HasDCPOP}, + {Name: "asimddp", Feature: &ARM64.HasASIMDDP}, + {Name: "asimdfhm", Feature: &ARM64.HasASIMDFHM}, + } +} + +func archInit() { + switch runtime.GOOS { + case "freebsd": + readARM64Registers() + case "linux", "netbsd", "openbsd": + doinit() + default: + // Many platforms don't seem to allow reading these registers. + setMinimalFeatures() + } +} + +// setMinimalFeatures fakes the minimal ARM64 features expected by +// TestARM64minimalFeatures. +func setMinimalFeatures() { + ARM64.HasASIMD = true + ARM64.HasFP = true +} + +func readARM64Registers() { + Initialized = true + + parseARM64SystemRegisters(getisar0(), getisar1(), getpfr0()) +} + +func parseARM64SystemRegisters(isar0, isar1, pfr0 uint64) { + // ID_AA64ISAR0_EL1 + switch extractBits(isar0, 4, 7) { + case 1: + ARM64.HasAES = true + case 2: + ARM64.HasAES = true + ARM64.HasPMULL = true + } + + switch extractBits(isar0, 8, 11) { + case 1: + ARM64.HasSHA1 = true + } + + switch extractBits(isar0, 12, 15) { + case 1: + ARM64.HasSHA2 = true + case 2: + ARM64.HasSHA2 = true + ARM64.HasSHA512 = true + } + + switch extractBits(isar0, 16, 19) { + case 1: + ARM64.HasCRC32 = true + } + + switch extractBits(isar0, 20, 23) { + case 2: + ARM64.HasATOMICS = true + } + + switch extractBits(isar0, 28, 31) { + case 1: + ARM64.HasASIMDRDM = true + } + + switch extractBits(isar0, 32, 35) { + case 1: + ARM64.HasSHA3 = true + } + + switch extractBits(isar0, 36, 39) { + case 1: + ARM64.HasSM3 = true + } + + switch extractBits(isar0, 40, 43) { + case 1: + ARM64.HasSM4 = true + } + + switch extractBits(isar0, 44, 47) { + case 1: + ARM64.HasASIMDDP = true + } + + // ID_AA64ISAR1_EL1 + switch extractBits(isar1, 0, 3) { + case 1: + ARM64.HasDCPOP = true + } + + switch extractBits(isar1, 12, 15) { + case 1: + ARM64.HasJSCVT = true + } + + switch extractBits(isar1, 16, 19) { + case 1: + ARM64.HasFCMA = true + } + + switch extractBits(isar1, 20, 23) { + case 1: + ARM64.HasLRCPC = true + } + + // ID_AA64PFR0_EL1 + switch extractBits(pfr0, 16, 19) { + case 0: + ARM64.HasFP = true + case 1: + ARM64.HasFP = true + ARM64.HasFPHP = true + } + + switch extractBits(pfr0, 20, 23) { + case 0: + ARM64.HasASIMD = true + case 1: + ARM64.HasASIMD = true + ARM64.HasASIMDHP = true + } + + switch extractBits(pfr0, 32, 35) { + case 1: + ARM64.HasSVE = true + } +} + +func extractBits(data uint64, start, end uint) uint { + return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) +} diff --git a/internal/xcpu/cpu_arm64.s b/internal/xcpu/cpu_arm64.s new file mode 100644 index 00000000..fcb9a388 --- /dev/null +++ b/internal/xcpu/cpu_arm64.s @@ -0,0 +1,31 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc + +#include "textflag.h" + +// func getisar0() uint64 +TEXT ·getisar0(SB),NOSPLIT,$0-8 + // get Instruction Set Attributes 0 into x0 + // mrs x0, ID_AA64ISAR0_EL1 = d5380600 + WORD $0xd5380600 + MOVD R0, ret+0(FP) + RET + +// func getisar1() uint64 +TEXT ·getisar1(SB),NOSPLIT,$0-8 + // get Instruction Set Attributes 1 into x0 + // mrs x0, ID_AA64ISAR1_EL1 = d5380620 + WORD $0xd5380620 + MOVD R0, ret+0(FP) + RET + +// func getpfr0() uint64 +TEXT ·getpfr0(SB),NOSPLIT,$0-8 + // get Processor Feature Register 0 into x0 + // mrs x0, ID_AA64PFR0_EL1 = d5380400 + WORD $0xd5380400 + MOVD R0, ret+0(FP) + RET diff --git a/internal/xcpu/cpu_gc_arm64.go b/internal/xcpu/cpu_gc_arm64.go new file mode 100644 index 00000000..26d3050d --- /dev/null +++ b/internal/xcpu/cpu_gc_arm64.go @@ -0,0 +1,11 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc + +package xcpu + +func getisar0() uint64 +func getisar1() uint64 +func getpfr0() uint64 diff --git a/internal/xcpu/cpu_gc_s390x.go b/internal/xcpu/cpu_gc_s390x.go new file mode 100644 index 00000000..34ca88b7 --- /dev/null +++ b/internal/xcpu/cpu_gc_s390x.go @@ -0,0 +1,21 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc + +package xcpu + +// haveAsmFunctions reports whether the other functions in this file can +// be safely called. +func haveAsmFunctions() bool { return true } + +// The following feature detection functions are defined in cpu_s390x.s. +// They are likely to be expensive to call so the results should be cached. +func stfle() facilityList +func kmQuery() queryResult +func kmcQuery() queryResult +func kmctrQuery() queryResult +func kmaQuery() queryResult +func kimdQuery() queryResult +func klmdQuery() queryResult diff --git a/internal/xcpu/cpu_gc_x86.go b/internal/xcpu/cpu_gc_x86.go new file mode 100644 index 00000000..9d6f61c2 --- /dev/null +++ b/internal/xcpu/cpu_gc_x86.go @@ -0,0 +1,15 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64 || amd64p32) && gc + +package xcpu + +// cpuid is implemented in cpu_x86.s for gc compiler +// and in cpu_gccgo.c for gccgo. +func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) + +// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler +// and in cpu_gccgo.c for gccgo. +func xgetbv() (eax, edx uint32) diff --git a/internal/xcpu/cpu_gccgo_arm64.go b/internal/xcpu/cpu_gccgo_arm64.go new file mode 100644 index 00000000..d6c2a3a8 --- /dev/null +++ b/internal/xcpu/cpu_gccgo_arm64.go @@ -0,0 +1,11 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gccgo + +package xcpu + +func getisar0() uint64 { return 0 } +func getisar1() uint64 { return 0 } +func getpfr0() uint64 { return 0 } diff --git a/internal/xcpu/cpu_gccgo_s390x.go b/internal/xcpu/cpu_gccgo_s390x.go new file mode 100644 index 00000000..4deec625 --- /dev/null +++ b/internal/xcpu/cpu_gccgo_s390x.go @@ -0,0 +1,22 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gccgo + +package xcpu + +// haveAsmFunctions reports whether the other functions in this file can +// be safely called. +func haveAsmFunctions() bool { return false } + +// TODO(mundaym): the following feature detection functions are currently +// stubs. See https://golang.org/cl/162887 for how to fix this. +// They are likely to be expensive to call so the results should be cached. +func stfle() facilityList { panic("not implemented for gccgo") } +func kmQuery() queryResult { panic("not implemented for gccgo") } +func kmcQuery() queryResult { panic("not implemented for gccgo") } +func kmctrQuery() queryResult { panic("not implemented for gccgo") } +func kmaQuery() queryResult { panic("not implemented for gccgo") } +func kimdQuery() queryResult { panic("not implemented for gccgo") } +func klmdQuery() queryResult { panic("not implemented for gccgo") } diff --git a/internal/xcpu/cpu_gccgo_x86.c b/internal/xcpu/cpu_gccgo_x86.c new file mode 100644 index 00000000..3f73a05d --- /dev/null +++ b/internal/xcpu/cpu_gccgo_x86.c @@ -0,0 +1,37 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64 || amd64p32) && gccgo + +#include +#include +#include + +// Need to wrap __get_cpuid_count because it's declared as static. +int +gccgoGetCpuidCount(uint32_t leaf, uint32_t subleaf, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + return __get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx); +} + +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#pragma GCC push_options +#pragma GCC target("xsave") +#pragma clang attribute push (__attribute__((target("xsave"))), apply_to=function) + +// xgetbv reads the contents of an XCR (Extended Control Register) +// specified in the ECX register into registers EDX:EAX. +// Currently, the only supported value for XCR is 0. +void +gccgoXgetbv(uint32_t *eax, uint32_t *edx) +{ + uint64_t v = _xgetbv(0); + *eax = v & 0xffffffff; + *edx = v >> 32; +} + +#pragma clang attribute pop +#pragma GCC pop_options diff --git a/internal/xcpu/cpu_gccgo_x86.go b/internal/xcpu/cpu_gccgo_x86.go new file mode 100644 index 00000000..e66c6ee9 --- /dev/null +++ b/internal/xcpu/cpu_gccgo_x86.go @@ -0,0 +1,31 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64 || amd64p32) && gccgo + +package xcpu + +//extern gccgoGetCpuidCount +func gccgoGetCpuidCount(eaxArg, ecxArg uint32, eax, ebx, ecx, edx *uint32) + +func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) { + var a, b, c, d uint32 + gccgoGetCpuidCount(eaxArg, ecxArg, &a, &b, &c, &d) + return a, b, c, d +} + +//extern gccgoXgetbv +func gccgoXgetbv(eax, edx *uint32) + +func xgetbv() (eax, edx uint32) { + var a, d uint32 + gccgoXgetbv(&a, &d) + return a, d +} + +// gccgo doesn't build on Darwin, per: +// https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/gcc.rb#L76 +func darwinSupportsAVX512() bool { + return false +} diff --git a/internal/xcpu/cpu_linux.go b/internal/xcpu/cpu_linux.go new file mode 100644 index 00000000..10a48916 --- /dev/null +++ b/internal/xcpu/cpu_linux.go @@ -0,0 +1,15 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !386 && !amd64 && !amd64p32 && !arm64 + +package xcpu + +func archInit() { + if err := readHWCAP(); err != nil { + return + } + doinit() + Initialized = true +} diff --git a/internal/xcpu/cpu_linux_arm.go b/internal/xcpu/cpu_linux_arm.go new file mode 100644 index 00000000..28e32637 --- /dev/null +++ b/internal/xcpu/cpu_linux_arm.go @@ -0,0 +1,39 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +func doinit() { + ARM.HasSWP = isSet(hwCap, hwcap_SWP) + ARM.HasHALF = isSet(hwCap, hwcap_HALF) + ARM.HasTHUMB = isSet(hwCap, hwcap_THUMB) + ARM.Has26BIT = isSet(hwCap, hwcap_26BIT) + ARM.HasFASTMUL = isSet(hwCap, hwcap_FAST_MULT) + ARM.HasFPA = isSet(hwCap, hwcap_FPA) + ARM.HasVFP = isSet(hwCap, hwcap_VFP) + ARM.HasEDSP = isSet(hwCap, hwcap_EDSP) + ARM.HasJAVA = isSet(hwCap, hwcap_JAVA) + ARM.HasIWMMXT = isSet(hwCap, hwcap_IWMMXT) + ARM.HasCRUNCH = isSet(hwCap, hwcap_CRUNCH) + ARM.HasTHUMBEE = isSet(hwCap, hwcap_THUMBEE) + ARM.HasNEON = isSet(hwCap, hwcap_NEON) + ARM.HasVFPv3 = isSet(hwCap, hwcap_VFPv3) + ARM.HasVFPv3D16 = isSet(hwCap, hwcap_VFPv3D16) + ARM.HasTLS = isSet(hwCap, hwcap_TLS) + ARM.HasVFPv4 = isSet(hwCap, hwcap_VFPv4) + ARM.HasIDIVA = isSet(hwCap, hwcap_IDIVA) + ARM.HasIDIVT = isSet(hwCap, hwcap_IDIVT) + ARM.HasVFPD32 = isSet(hwCap, hwcap_VFPD32) + ARM.HasLPAE = isSet(hwCap, hwcap_LPAE) + ARM.HasEVTSTRM = isSet(hwCap, hwcap_EVTSTRM) + ARM.HasAES = isSet(hwCap2, hwcap2_AES) + ARM.HasPMULL = isSet(hwCap2, hwcap2_PMULL) + ARM.HasSHA1 = isSet(hwCap2, hwcap2_SHA1) + ARM.HasSHA2 = isSet(hwCap2, hwcap2_SHA2) + ARM.HasCRC32 = isSet(hwCap2, hwcap2_CRC32) +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/internal/xcpu/cpu_linux_arm64.go b/internal/xcpu/cpu_linux_arm64.go new file mode 100644 index 00000000..481f450b --- /dev/null +++ b/internal/xcpu/cpu_linux_arm64.go @@ -0,0 +1,111 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +import ( + "strings" + "syscall" +) + +// HWCAP/HWCAP2 bits. These are exposed by Linux. +const ( + hwcap_FP = 1 << 0 + hwcap_ASIMD = 1 << 1 + hwcap_EVTSTRM = 1 << 2 + hwcap_AES = 1 << 3 + hwcap_PMULL = 1 << 4 + hwcap_SHA1 = 1 << 5 + hwcap_SHA2 = 1 << 6 + hwcap_CRC32 = 1 << 7 + hwcap_ATOMICS = 1 << 8 + hwcap_FPHP = 1 << 9 + hwcap_ASIMDHP = 1 << 10 + hwcap_CPUID = 1 << 11 + hwcap_ASIMDRDM = 1 << 12 + hwcap_JSCVT = 1 << 13 + hwcap_FCMA = 1 << 14 + hwcap_LRCPC = 1 << 15 + hwcap_DCPOP = 1 << 16 + hwcap_SHA3 = 1 << 17 + hwcap_SM3 = 1 << 18 + hwcap_SM4 = 1 << 19 + hwcap_ASIMDDP = 1 << 20 + hwcap_SHA512 = 1 << 21 + hwcap_SVE = 1 << 22 + hwcap_ASIMDFHM = 1 << 23 +) + +// linuxKernelCanEmulateCPUID reports whether we're running +// on Linux 4.11+. Ideally we'd like to ask the question about +// whether the current kernel contains +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=77c97b4ee21290f5f083173d957843b615abbff2 +// but the version number will have to do. +func linuxKernelCanEmulateCPUID() bool { + var un syscall.Utsname + syscall.Uname(&un) + var sb strings.Builder + for _, b := range un.Release[:] { + if b == 0 { + break + } + sb.WriteByte(byte(b)) + } + major, minor, _, ok := parseRelease(sb.String()) + return ok && (major > 4 || major == 4 && minor >= 11) +} + +func doinit() { + if err := readHWCAP(); err != nil { + // We failed to read /proc/self/auxv. This can happen if the binary has + // been given extra capabilities(7) with /bin/setcap. + // + // When this happens, we have two options. If the Linux kernel is new + // enough (4.11+), we can read the arm64 registers directly which'll + // trap into the kernel and then return back to userspace. + // + // But on older kernels, such as Linux 4.4.180 as used on many Synology + // devices, calling readARM64Registers (specifically getisar0) will + // cause a SIGILL and we'll die. So for older kernels, parse /proc/cpuinfo + // instead. + // + // See golang/go#57336. + if linuxKernelCanEmulateCPUID() { + readARM64Registers() + } else { + readLinuxProcCPUInfo() + } + return + } + + // HWCAP feature bits + ARM64.HasFP = isSet(hwCap, hwcap_FP) + ARM64.HasASIMD = isSet(hwCap, hwcap_ASIMD) + ARM64.HasEVTSTRM = isSet(hwCap, hwcap_EVTSTRM) + ARM64.HasAES = isSet(hwCap, hwcap_AES) + ARM64.HasPMULL = isSet(hwCap, hwcap_PMULL) + ARM64.HasSHA1 = isSet(hwCap, hwcap_SHA1) + ARM64.HasSHA2 = isSet(hwCap, hwcap_SHA2) + ARM64.HasCRC32 = isSet(hwCap, hwcap_CRC32) + ARM64.HasATOMICS = isSet(hwCap, hwcap_ATOMICS) + ARM64.HasFPHP = isSet(hwCap, hwcap_FPHP) + ARM64.HasASIMDHP = isSet(hwCap, hwcap_ASIMDHP) + ARM64.HasCPUID = isSet(hwCap, hwcap_CPUID) + ARM64.HasASIMDRDM = isSet(hwCap, hwcap_ASIMDRDM) + ARM64.HasJSCVT = isSet(hwCap, hwcap_JSCVT) + ARM64.HasFCMA = isSet(hwCap, hwcap_FCMA) + ARM64.HasLRCPC = isSet(hwCap, hwcap_LRCPC) + ARM64.HasDCPOP = isSet(hwCap, hwcap_DCPOP) + ARM64.HasSHA3 = isSet(hwCap, hwcap_SHA3) + ARM64.HasSM3 = isSet(hwCap, hwcap_SM3) + ARM64.HasSM4 = isSet(hwCap, hwcap_SM4) + ARM64.HasASIMDDP = isSet(hwCap, hwcap_ASIMDDP) + ARM64.HasSHA512 = isSet(hwCap, hwcap_SHA512) + ARM64.HasSVE = isSet(hwCap, hwcap_SVE) + ARM64.HasASIMDFHM = isSet(hwCap, hwcap_ASIMDFHM) +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/internal/xcpu/cpu_linux_mips64x.go b/internal/xcpu/cpu_linux_mips64x.go new file mode 100644 index 00000000..15fdee9c --- /dev/null +++ b/internal/xcpu/cpu_linux_mips64x.go @@ -0,0 +1,22 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && (mips64 || mips64le) + +package xcpu + +// HWCAP bits. These are exposed by the Linux kernel 5.4. +const ( + // CPU features + hwcap_MIPS_MSA = 1 << 1 +) + +func doinit() { + // HWCAP feature bits + MIPS64X.HasMSA = isSet(hwCap, hwcap_MIPS_MSA) +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/internal/xcpu/cpu_linux_noinit.go b/internal/xcpu/cpu_linux_noinit.go new file mode 100644 index 00000000..878e56fb --- /dev/null +++ b/internal/xcpu/cpu_linux_noinit.go @@ -0,0 +1,9 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x + +package xcpu + +func doinit() {} diff --git a/internal/xcpu/cpu_linux_ppc64x.go b/internal/xcpu/cpu_linux_ppc64x.go new file mode 100644 index 00000000..6a8ea12a --- /dev/null +++ b/internal/xcpu/cpu_linux_ppc64x.go @@ -0,0 +1,30 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && (ppc64 || ppc64le) + +package xcpu + +// HWCAP/HWCAP2 bits. These are exposed by the kernel. +const ( + // ISA Level + _PPC_FEATURE2_ARCH_2_07 = 0x80000000 + _PPC_FEATURE2_ARCH_3_00 = 0x00800000 + + // CPU features + _PPC_FEATURE2_DARN = 0x00200000 + _PPC_FEATURE2_SCV = 0x00100000 +) + +func doinit() { + // HWCAP2 feature bits + PPC64.IsPOWER8 = isSet(hwCap2, _PPC_FEATURE2_ARCH_2_07) + PPC64.IsPOWER9 = isSet(hwCap2, _PPC_FEATURE2_ARCH_3_00) + PPC64.HasDARN = isSet(hwCap2, _PPC_FEATURE2_DARN) + PPC64.HasSCV = isSet(hwCap2, _PPC_FEATURE2_SCV) +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/internal/xcpu/cpu_linux_s390x.go b/internal/xcpu/cpu_linux_s390x.go new file mode 100644 index 00000000..ff0ca7f4 --- /dev/null +++ b/internal/xcpu/cpu_linux_s390x.go @@ -0,0 +1,40 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +const ( + // bit mask values from /usr/include/bits/hwcap.h + hwcap_ZARCH = 2 + hwcap_STFLE = 4 + hwcap_MSA = 8 + hwcap_LDISP = 16 + hwcap_EIMM = 32 + hwcap_DFP = 64 + hwcap_ETF3EH = 256 + hwcap_VX = 2048 + hwcap_VXE = 8192 +) + +func initS390Xbase() { + // test HWCAP bit vector + has := func(featureMask uint) bool { + return hwCap&featureMask == featureMask + } + + // mandatory + S390X.HasZARCH = has(hwcap_ZARCH) + + // optional + S390X.HasSTFLE = has(hwcap_STFLE) + S390X.HasLDISP = has(hwcap_LDISP) + S390X.HasEIMM = has(hwcap_EIMM) + S390X.HasETF3EH = has(hwcap_ETF3EH) + S390X.HasDFP = has(hwcap_DFP) + S390X.HasMSA = has(hwcap_MSA) + S390X.HasVX = has(hwcap_VX) + if S390X.HasVX { + S390X.HasVXE = has(hwcap_VXE) + } +} diff --git a/internal/xcpu/cpu_loong64.go b/internal/xcpu/cpu_loong64.go new file mode 100644 index 00000000..fdb21c60 --- /dev/null +++ b/internal/xcpu/cpu_loong64.go @@ -0,0 +1,12 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build loong64 + +package xcpu + +const cacheLineSize = 64 + +func initOptions() { +} diff --git a/internal/xcpu/cpu_mips64x.go b/internal/xcpu/cpu_mips64x.go new file mode 100644 index 00000000..447fee98 --- /dev/null +++ b/internal/xcpu/cpu_mips64x.go @@ -0,0 +1,15 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips64 || mips64le + +package xcpu + +const cacheLineSize = 32 + +func initOptions() { + options = []option{ + {Name: "msa", Feature: &MIPS64X.HasMSA}, + } +} diff --git a/internal/xcpu/cpu_mipsx.go b/internal/xcpu/cpu_mipsx.go new file mode 100644 index 00000000..6efa1917 --- /dev/null +++ b/internal/xcpu/cpu_mipsx.go @@ -0,0 +1,11 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips || mipsle + +package xcpu + +const cacheLineSize = 32 + +func initOptions() {} diff --git a/internal/xcpu/cpu_netbsd_arm64.go b/internal/xcpu/cpu_netbsd_arm64.go new file mode 100644 index 00000000..b84b4408 --- /dev/null +++ b/internal/xcpu/cpu_netbsd_arm64.go @@ -0,0 +1,173 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +import ( + "syscall" + "unsafe" +) + +// Minimal copy of functionality from x/sys/unix so the cpu package can call +// sysctl without depending on x/sys/unix. + +const ( + _CTL_QUERY = -2 + + _SYSCTL_VERS_1 = 0x1000000 +) + +var _zero uintptr + +func sysctl(mib []int32, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { + var _p0 unsafe.Pointer + if len(mib) > 0 { + _p0 = unsafe.Pointer(&mib[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, errno := syscall.Syscall6( + syscall.SYS___SYSCTL, + uintptr(_p0), + uintptr(len(mib)), + uintptr(unsafe.Pointer(old)), + uintptr(unsafe.Pointer(oldlen)), + uintptr(unsafe.Pointer(new)), + uintptr(newlen)) + if errno != 0 { + return errno + } + return nil +} + +type sysctlNode struct { + Flags uint32 + Num int32 + Name [32]int8 + Ver uint32 + __rsvd uint32 + Un [16]byte + _sysctl_size [8]byte + _sysctl_func [8]byte + _sysctl_parent [8]byte + _sysctl_desc [8]byte +} + +func sysctlNodes(mib []int32) ([]sysctlNode, error) { + var olen uintptr + + // Get a list of all sysctl nodes below the given MIB by performing + // a sysctl for the given MIB with CTL_QUERY appended. + mib = append(mib, _CTL_QUERY) + qnode := sysctlNode{Flags: _SYSCTL_VERS_1} + qp := (*byte)(unsafe.Pointer(&qnode)) + sz := unsafe.Sizeof(qnode) + if err := sysctl(mib, nil, &olen, qp, sz); err != nil { + return nil, err + } + + // Now that we know the size, get the actual nodes. + nodes := make([]sysctlNode, olen/sz) + np := (*byte)(unsafe.Pointer(&nodes[0])) + if err := sysctl(mib, np, &olen, qp, sz); err != nil { + return nil, err + } + + return nodes, nil +} + +func nametomib(name string) ([]int32, error) { + // Split name into components. + var parts []string + last := 0 + for i := 0; i < len(name); i++ { + if name[i] == '.' { + parts = append(parts, name[last:i]) + last = i + 1 + } + } + parts = append(parts, name[last:]) + + mib := []int32{} + // Discover the nodes and construct the MIB OID. + for partno, part := range parts { + nodes, err := sysctlNodes(mib) + if err != nil { + return nil, err + } + for _, node := range nodes { + n := make([]byte, 0) + for i := range node.Name { + if node.Name[i] != 0 { + n = append(n, byte(node.Name[i])) + } + } + if string(n) == part { + mib = append(mib, int32(node.Num)) + break + } + } + if len(mib) != partno+1 { + return nil, err + } + } + + return mib, nil +} + +// aarch64SysctlCPUID is struct aarch64_sysctl_cpu_id from NetBSD's +type aarch64SysctlCPUID struct { + midr uint64 /* Main ID Register */ + revidr uint64 /* Revision ID Register */ + mpidr uint64 /* Multiprocessor Affinity Register */ + aa64dfr0 uint64 /* A64 Debug Feature Register 0 */ + aa64dfr1 uint64 /* A64 Debug Feature Register 1 */ + aa64isar0 uint64 /* A64 Instruction Set Attribute Register 0 */ + aa64isar1 uint64 /* A64 Instruction Set Attribute Register 1 */ + aa64mmfr0 uint64 /* A64 Memory Model Feature Register 0 */ + aa64mmfr1 uint64 /* A64 Memory Model Feature Register 1 */ + aa64mmfr2 uint64 /* A64 Memory Model Feature Register 2 */ + aa64pfr0 uint64 /* A64 Processor Feature Register 0 */ + aa64pfr1 uint64 /* A64 Processor Feature Register 1 */ + aa64zfr0 uint64 /* A64 SVE Feature ID Register 0 */ + mvfr0 uint32 /* Media and VFP Feature Register 0 */ + mvfr1 uint32 /* Media and VFP Feature Register 1 */ + mvfr2 uint32 /* Media and VFP Feature Register 2 */ + pad uint32 + clidr uint64 /* Cache Level ID Register */ + ctr uint64 /* Cache Type Register */ +} + +func sysctlCPUID(name string) (*aarch64SysctlCPUID, error) { + mib, err := nametomib(name) + if err != nil { + return nil, err + } + + out := aarch64SysctlCPUID{} + n := unsafe.Sizeof(out) + _, _, errno := syscall.Syscall6( + syscall.SYS___SYSCTL, + uintptr(unsafe.Pointer(&mib[0])), + uintptr(len(mib)), + uintptr(unsafe.Pointer(&out)), + uintptr(unsafe.Pointer(&n)), + uintptr(0), + uintptr(0)) + if errno != 0 { + return nil, errno + } + return &out, nil +} + +func doinit() { + cpuid, err := sysctlCPUID("machdep.cpu0.cpu_id") + if err != nil { + setMinimalFeatures() + return + } + parseARM64SystemRegisters(cpuid.aa64isar0, cpuid.aa64isar1, cpuid.aa64pfr0) + + Initialized = true +} diff --git a/internal/xcpu/cpu_openbsd_arm64.go b/internal/xcpu/cpu_openbsd_arm64.go new file mode 100644 index 00000000..2459a486 --- /dev/null +++ b/internal/xcpu/cpu_openbsd_arm64.go @@ -0,0 +1,65 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +import ( + "syscall" + "unsafe" +) + +// Minimal copy of functionality from x/sys/unix so the cpu package can call +// sysctl without depending on x/sys/unix. + +const ( + // From OpenBSD's sys/sysctl.h. + _CTL_MACHDEP = 7 + + // From OpenBSD's machine/cpu.h. + _CPU_ID_AA64ISAR0 = 2 + _CPU_ID_AA64ISAR1 = 3 +) + +// Implemented in the runtime package (runtime/sys_openbsd3.go) +func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) + +//go:linkname syscall_syscall6 syscall.syscall6 + +func sysctl(mib []uint32, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { + _, _, errno := syscall_syscall6(libc_sysctl_trampoline_addr, uintptr(unsafe.Pointer(&mib[0])), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) + if errno != 0 { + return errno + } + return nil +} + +var libc_sysctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_sysctl sysctl "libc.so" + +func sysctlUint64(mib []uint32) (uint64, bool) { + var out uint64 + nout := unsafe.Sizeof(out) + if err := sysctl(mib, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0); err != nil { + return 0, false + } + return out, true +} + +func doinit() { + setMinimalFeatures() + + // Get ID_AA64ISAR0 and ID_AA64ISAR1 from sysctl. + isar0, ok := sysctlUint64([]uint32{_CTL_MACHDEP, _CPU_ID_AA64ISAR0}) + if !ok { + return + } + isar1, ok := sysctlUint64([]uint32{_CTL_MACHDEP, _CPU_ID_AA64ISAR1}) + if !ok { + return + } + parseARM64SystemRegisters(isar0, isar1, 0) + + Initialized = true +} diff --git a/internal/xcpu/cpu_openbsd_arm64.s b/internal/xcpu/cpu_openbsd_arm64.s new file mode 100644 index 00000000..054ba05d --- /dev/null +++ b/internal/xcpu/cpu_openbsd_arm64.s @@ -0,0 +1,11 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_sysctl(SB) + +GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 +DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) diff --git a/internal/xcpu/cpu_other_arm.go b/internal/xcpu/cpu_other_arm.go new file mode 100644 index 00000000..e3247948 --- /dev/null +++ b/internal/xcpu/cpu_other_arm.go @@ -0,0 +1,9 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux && arm + +package xcpu + +func archInit() {} diff --git a/internal/xcpu/cpu_other_arm64.go b/internal/xcpu/cpu_other_arm64.go new file mode 100644 index 00000000..5257a0b6 --- /dev/null +++ b/internal/xcpu/cpu_other_arm64.go @@ -0,0 +1,9 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux && !netbsd && !openbsd && arm64 + +package xcpu + +func doinit() {} diff --git a/internal/xcpu/cpu_other_mips64x.go b/internal/xcpu/cpu_other_mips64x.go new file mode 100644 index 00000000..b1ddc9d5 --- /dev/null +++ b/internal/xcpu/cpu_other_mips64x.go @@ -0,0 +1,11 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux && (mips64 || mips64le) + +package xcpu + +func archInit() { + Initialized = true +} diff --git a/internal/xcpu/cpu_other_ppc64x.go b/internal/xcpu/cpu_other_ppc64x.go new file mode 100644 index 00000000..00a08baa --- /dev/null +++ b/internal/xcpu/cpu_other_ppc64x.go @@ -0,0 +1,12 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !aix && !linux && (ppc64 || ppc64le) + +package xcpu + +func archInit() { + PPC64.IsPOWER8 = true + Initialized = true +} diff --git a/internal/xcpu/cpu_other_riscv64.go b/internal/xcpu/cpu_other_riscv64.go new file mode 100644 index 00000000..7f8fd1fc --- /dev/null +++ b/internal/xcpu/cpu_other_riscv64.go @@ -0,0 +1,11 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux && riscv64 + +package xcpu + +func archInit() { + Initialized = true +} diff --git a/internal/xcpu/cpu_ppc64x.go b/internal/xcpu/cpu_ppc64x.go new file mode 100644 index 00000000..22afeec2 --- /dev/null +++ b/internal/xcpu/cpu_ppc64x.go @@ -0,0 +1,16 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64 || ppc64le + +package xcpu + +const cacheLineSize = 128 + +func initOptions() { + options = []option{ + {Name: "darn", Feature: &PPC64.HasDARN}, + {Name: "scv", Feature: &PPC64.HasSCV}, + } +} diff --git a/internal/xcpu/cpu_riscv64.go b/internal/xcpu/cpu_riscv64.go new file mode 100644 index 00000000..28e57b68 --- /dev/null +++ b/internal/xcpu/cpu_riscv64.go @@ -0,0 +1,11 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build riscv64 + +package xcpu + +const cacheLineSize = 64 + +func initOptions() {} diff --git a/internal/xcpu/cpu_s390x.go b/internal/xcpu/cpu_s390x.go new file mode 100644 index 00000000..e85a8c5d --- /dev/null +++ b/internal/xcpu/cpu_s390x.go @@ -0,0 +1,172 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +const cacheLineSize = 256 + +func initOptions() { + options = []option{ + {Name: "zarch", Feature: &S390X.HasZARCH, Required: true}, + {Name: "stfle", Feature: &S390X.HasSTFLE, Required: true}, + {Name: "ldisp", Feature: &S390X.HasLDISP, Required: true}, + {Name: "eimm", Feature: &S390X.HasEIMM, Required: true}, + {Name: "dfp", Feature: &S390X.HasDFP}, + {Name: "etf3eh", Feature: &S390X.HasETF3EH}, + {Name: "msa", Feature: &S390X.HasMSA}, + {Name: "aes", Feature: &S390X.HasAES}, + {Name: "aescbc", Feature: &S390X.HasAESCBC}, + {Name: "aesctr", Feature: &S390X.HasAESCTR}, + {Name: "aesgcm", Feature: &S390X.HasAESGCM}, + {Name: "ghash", Feature: &S390X.HasGHASH}, + {Name: "sha1", Feature: &S390X.HasSHA1}, + {Name: "sha256", Feature: &S390X.HasSHA256}, + {Name: "sha3", Feature: &S390X.HasSHA3}, + {Name: "sha512", Feature: &S390X.HasSHA512}, + {Name: "vx", Feature: &S390X.HasVX}, + {Name: "vxe", Feature: &S390X.HasVXE}, + } +} + +// bitIsSet reports whether the bit at index is set. The bit index +// is in big endian order, so bit index 0 is the leftmost bit. +func bitIsSet(bits []uint64, index uint) bool { + return bits[index/64]&((1<<63)>>(index%64)) != 0 +} + +// facility is a bit index for the named facility. +type facility uint8 + +const ( + // mandatory facilities + zarch facility = 1 // z architecture mode is active + stflef facility = 7 // store-facility-list-extended + ldisp facility = 18 // long-displacement + eimm facility = 21 // extended-immediate + + // miscellaneous facilities + dfp facility = 42 // decimal-floating-point + etf3eh facility = 30 // extended-translation 3 enhancement + + // cryptography facilities + msa facility = 17 // message-security-assist + msa3 facility = 76 // message-security-assist extension 3 + msa4 facility = 77 // message-security-assist extension 4 + msa5 facility = 57 // message-security-assist extension 5 + msa8 facility = 146 // message-security-assist extension 8 + msa9 facility = 155 // message-security-assist extension 9 + + // vector facilities + vx facility = 129 // vector facility + vxe facility = 135 // vector-enhancements 1 + vxe2 facility = 148 // vector-enhancements 2 +) + +// facilityList contains the result of an STFLE call. +// Bits are numbered in big endian order so the +// leftmost bit (the MSB) is at index 0. +type facilityList struct { + bits [4]uint64 +} + +// Has reports whether the given facilities are present. +func (s *facilityList) Has(fs ...facility) bool { + if len(fs) == 0 { + panic("no facility bits provided") + } + for _, f := range fs { + if !bitIsSet(s.bits[:], uint(f)) { + return false + } + } + return true +} + +// function is the code for the named cryptographic function. +type function uint8 + +const ( + // KM{,A,C,CTR} function codes + aes128 function = 18 // AES-128 + aes192 function = 19 // AES-192 + aes256 function = 20 // AES-256 + + // K{I,L}MD function codes + sha1 function = 1 // SHA-1 + sha256 function = 2 // SHA-256 + sha512 function = 3 // SHA-512 + sha3_224 function = 32 // SHA3-224 + sha3_256 function = 33 // SHA3-256 + sha3_384 function = 34 // SHA3-384 + sha3_512 function = 35 // SHA3-512 + shake128 function = 36 // SHAKE-128 + shake256 function = 37 // SHAKE-256 + + // KLMD function codes + ghash function = 65 // GHASH +) + +// queryResult contains the result of a Query function +// call. Bits are numbered in big endian order so the +// leftmost bit (the MSB) is at index 0. +type queryResult struct { + bits [2]uint64 +} + +// Has reports whether the given functions are present. +func (q *queryResult) Has(fns ...function) bool { + if len(fns) == 0 { + panic("no function codes provided") + } + for _, f := range fns { + if !bitIsSet(q.bits[:], uint(f)) { + return false + } + } + return true +} + +func doinit() { + initS390Xbase() + + // We need implementations of stfle, km and so on + // to detect cryptographic features. + if !haveAsmFunctions() { + return + } + + // optional cryptographic functions + if S390X.HasMSA { + aes := []function{aes128, aes192, aes256} + + // cipher message + km, kmc := kmQuery(), kmcQuery() + S390X.HasAES = km.Has(aes...) + S390X.HasAESCBC = kmc.Has(aes...) + if S390X.HasSTFLE { + facilities := stfle() + if facilities.Has(msa4) { + kmctr := kmctrQuery() + S390X.HasAESCTR = kmctr.Has(aes...) + } + if facilities.Has(msa8) { + kma := kmaQuery() + S390X.HasAESGCM = kma.Has(aes...) + } + } + + // compute message digest + kimd := kimdQuery() // intermediate (no padding) + klmd := klmdQuery() // last (padding) + S390X.HasSHA1 = kimd.Has(sha1) && klmd.Has(sha1) + S390X.HasSHA256 = kimd.Has(sha256) && klmd.Has(sha256) + S390X.HasSHA512 = kimd.Has(sha512) && klmd.Has(sha512) + S390X.HasGHASH = kimd.Has(ghash) // KLMD-GHASH does not exist + sha3 := []function{ + sha3_224, sha3_256, sha3_384, sha3_512, + shake128, shake256, + } + S390X.HasSHA3 = kimd.Has(sha3...) && klmd.Has(sha3...) + } +} diff --git a/internal/xcpu/cpu_s390x.s b/internal/xcpu/cpu_s390x.s new file mode 100644 index 00000000..1fb4b701 --- /dev/null +++ b/internal/xcpu/cpu_s390x.s @@ -0,0 +1,57 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc + +#include "textflag.h" + +// func stfle() facilityList +TEXT ·stfle(SB), NOSPLIT|NOFRAME, $0-32 + MOVD $ret+0(FP), R1 + MOVD $3, R0 // last doubleword index to store + XC $32, (R1), (R1) // clear 4 doublewords (32 bytes) + WORD $0xb2b01000 // store facility list extended (STFLE) + RET + +// func kmQuery() queryResult +TEXT ·kmQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KM-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB92E0024 // cipher message (KM) + RET + +// func kmcQuery() queryResult +TEXT ·kmcQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KMC-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB92F0024 // cipher message with chaining (KMC) + RET + +// func kmctrQuery() queryResult +TEXT ·kmctrQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KMCTR-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB92D4024 // cipher message with counter (KMCTR) + RET + +// func kmaQuery() queryResult +TEXT ·kmaQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KMA-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xb9296024 // cipher message with authentication (KMA) + RET + +// func kimdQuery() queryResult +TEXT ·kimdQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KIMD-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB93E0024 // compute intermediate message digest (KIMD) + RET + +// func klmdQuery() queryResult +TEXT ·klmdQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KLMD-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB93F0024 // compute last message digest (KLMD) + RET diff --git a/internal/xcpu/cpu_wasm.go b/internal/xcpu/cpu_wasm.go new file mode 100644 index 00000000..230aaab4 --- /dev/null +++ b/internal/xcpu/cpu_wasm.go @@ -0,0 +1,17 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build wasm + +package xcpu + +// We're compiling the cpu package for an unknown (software-abstracted) CPU. +// Make CacheLinePad an empty struct and hope that the usual struct alignment +// rules are good enough. + +const cacheLineSize = 0 + +func initOptions() {} + +func archInit() {} diff --git a/internal/xcpu/cpu_x86.go b/internal/xcpu/cpu_x86.go new file mode 100644 index 00000000..d2f83468 --- /dev/null +++ b/internal/xcpu/cpu_x86.go @@ -0,0 +1,151 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build 386 || amd64 || amd64p32 + +package xcpu + +import "runtime" + +const cacheLineSize = 64 + +func initOptions() { + options = []option{ + {Name: "adx", Feature: &X86.HasADX}, + {Name: "aes", Feature: &X86.HasAES}, + {Name: "avx", Feature: &X86.HasAVX}, + {Name: "avx2", Feature: &X86.HasAVX2}, + {Name: "avx512", Feature: &X86.HasAVX512}, + {Name: "avx512f", Feature: &X86.HasAVX512F}, + {Name: "avx512cd", Feature: &X86.HasAVX512CD}, + {Name: "avx512er", Feature: &X86.HasAVX512ER}, + {Name: "avx512pf", Feature: &X86.HasAVX512PF}, + {Name: "avx512vl", Feature: &X86.HasAVX512VL}, + {Name: "avx512bw", Feature: &X86.HasAVX512BW}, + {Name: "avx512dq", Feature: &X86.HasAVX512DQ}, + {Name: "avx512ifma", Feature: &X86.HasAVX512IFMA}, + {Name: "avx512vbmi", Feature: &X86.HasAVX512VBMI}, + {Name: "avx512vnniw", Feature: &X86.HasAVX5124VNNIW}, + {Name: "avx5124fmaps", Feature: &X86.HasAVX5124FMAPS}, + {Name: "avx512vpopcntdq", Feature: &X86.HasAVX512VPOPCNTDQ}, + {Name: "avx512vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ}, + {Name: "avx512vnni", Feature: &X86.HasAVX512VNNI}, + {Name: "avx512gfni", Feature: &X86.HasAVX512GFNI}, + {Name: "avx512vaes", Feature: &X86.HasAVX512VAES}, + {Name: "avx512vbmi2", Feature: &X86.HasAVX512VBMI2}, + {Name: "avx512bitalg", Feature: &X86.HasAVX512BITALG}, + {Name: "avx512bf16", Feature: &X86.HasAVX512BF16}, + {Name: "amxtile", Feature: &X86.HasAMXTile}, + {Name: "amxint8", Feature: &X86.HasAMXInt8}, + {Name: "amxbf16", Feature: &X86.HasAMXBF16}, + {Name: "bmi1", Feature: &X86.HasBMI1}, + {Name: "bmi2", Feature: &X86.HasBMI2}, + {Name: "cx16", Feature: &X86.HasCX16}, + {Name: "erms", Feature: &X86.HasERMS}, + {Name: "fma", Feature: &X86.HasFMA}, + {Name: "osxsave", Feature: &X86.HasOSXSAVE}, + {Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ}, + {Name: "popcnt", Feature: &X86.HasPOPCNT}, + {Name: "rdrand", Feature: &X86.HasRDRAND}, + {Name: "rdseed", Feature: &X86.HasRDSEED}, + {Name: "sse3", Feature: &X86.HasSSE3}, + {Name: "sse41", Feature: &X86.HasSSE41}, + {Name: "sse42", Feature: &X86.HasSSE42}, + {Name: "ssse3", Feature: &X86.HasSSSE3}, + + // These capabilities should always be enabled on amd64: + {Name: "sse2", Feature: &X86.HasSSE2, Required: runtime.GOARCH == "amd64"}, + } +} + +func archInit() { + + Initialized = true + + maxID, _, _, _ := cpuid(0, 0) + + if maxID < 1 { + return + } + + _, _, ecx1, edx1 := cpuid(1, 0) + X86.HasSSE2 = isSet(26, edx1) + + X86.HasSSE3 = isSet(0, ecx1) + X86.HasPCLMULQDQ = isSet(1, ecx1) + X86.HasSSSE3 = isSet(9, ecx1) + X86.HasFMA = isSet(12, ecx1) + X86.HasCX16 = isSet(13, ecx1) + X86.HasSSE41 = isSet(19, ecx1) + X86.HasSSE42 = isSet(20, ecx1) + X86.HasPOPCNT = isSet(23, ecx1) + X86.HasAES = isSet(25, ecx1) + X86.HasOSXSAVE = isSet(27, ecx1) + X86.HasRDRAND = isSet(30, ecx1) + + var osSupportsAVX, osSupportsAVX512 bool + // For XGETBV, OSXSAVE bit is required and sufficient. + if X86.HasOSXSAVE { + eax, _ := xgetbv() + // Check if XMM and YMM registers have OS support. + osSupportsAVX = isSet(1, eax) && isSet(2, eax) + + if runtime.GOOS == "darwin" { + // Darwin doesn't save/restore AVX-512 mask registers correctly across signal handlers. + // Since users can't rely on mask register contents, let's not advertise AVX-512 support. + // See issue 49233. + osSupportsAVX512 = false + } else { + // Check if OPMASK and ZMM registers have OS support. + osSupportsAVX512 = osSupportsAVX && isSet(5, eax) && isSet(6, eax) && isSet(7, eax) + } + } + + X86.HasAVX = isSet(28, ecx1) && osSupportsAVX + + if maxID < 7 { + return + } + + _, ebx7, ecx7, edx7 := cpuid(7, 0) + X86.HasBMI1 = isSet(3, ebx7) + X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX + X86.HasBMI2 = isSet(8, ebx7) + X86.HasERMS = isSet(9, ebx7) + X86.HasRDSEED = isSet(18, ebx7) + X86.HasADX = isSet(19, ebx7) + + X86.HasAVX512 = isSet(16, ebx7) && osSupportsAVX512 // Because avx-512 foundation is the core required extension + if X86.HasAVX512 { + X86.HasAVX512F = true + X86.HasAVX512CD = isSet(28, ebx7) + X86.HasAVX512ER = isSet(27, ebx7) + X86.HasAVX512PF = isSet(26, ebx7) + X86.HasAVX512VL = isSet(31, ebx7) + X86.HasAVX512BW = isSet(30, ebx7) + X86.HasAVX512DQ = isSet(17, ebx7) + X86.HasAVX512IFMA = isSet(21, ebx7) + X86.HasAVX512VBMI = isSet(1, ecx7) + X86.HasAVX5124VNNIW = isSet(2, edx7) + X86.HasAVX5124FMAPS = isSet(3, edx7) + X86.HasAVX512VPOPCNTDQ = isSet(14, ecx7) + X86.HasAVX512VPCLMULQDQ = isSet(10, ecx7) + X86.HasAVX512VNNI = isSet(11, ecx7) + X86.HasAVX512GFNI = isSet(8, ecx7) + X86.HasAVX512VAES = isSet(9, ecx7) + X86.HasAVX512VBMI2 = isSet(6, ecx7) + X86.HasAVX512BITALG = isSet(12, ecx7) + + eax71, _, _, _ := cpuid(7, 1) + X86.HasAVX512BF16 = isSet(5, eax71) + } + + X86.HasAMXTile = isSet(24, edx7) + X86.HasAMXInt8 = isSet(25, edx7) + X86.HasAMXBF16 = isSet(22, edx7) +} + +func isSet(bitpos uint, value uint32) bool { + return value&(1<> 63)) +) + +// For those platforms don't have a 'cpuid' equivalent we use HWCAP/HWCAP2 +// These are initialized in cpu_$GOARCH.go +// and should not be changed after they are initialized. +var hwCap uint +var hwCap2 uint + +func readHWCAP() error { + // For Go 1.21+, get auxv from the Go runtime. + if a := getAuxv(); len(a) > 0 { + for len(a) >= 2 { + tag, val := a[0], uint(a[1]) + a = a[2:] + switch tag { + case _AT_HWCAP: + hwCap = val + case _AT_HWCAP2: + hwCap2 = val + } + } + return nil + } + + buf, err := os.ReadFile(procAuxv) + if err != nil { + // e.g. on android /proc/self/auxv is not accessible, so silently + // ignore the error and leave Initialized = false. On some + // architectures (e.g. arm64) doinit() implements a fallback + // readout and will set Initialized = true again. + return err + } + bo := hostByteOrder() + for len(buf) >= 2*(uintSize/8) { + var tag, val uint + switch uintSize { + case 32: + tag = uint(bo.Uint32(buf[0:])) + val = uint(bo.Uint32(buf[4:])) + buf = buf[8:] + case 64: + tag = uint(bo.Uint64(buf[0:])) + val = uint(bo.Uint64(buf[8:])) + buf = buf[16:] + } + switch tag { + case _AT_HWCAP: + hwCap = val + case _AT_HWCAP2: + hwCap2 = val + } + } + return nil +} diff --git a/internal/xcpu/parse.go b/internal/xcpu/parse.go new file mode 100644 index 00000000..be30b60f --- /dev/null +++ b/internal/xcpu/parse.go @@ -0,0 +1,43 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +import "strconv" + +// parseRelease parses a dot-separated version number. It follows the semver +// syntax, but allows the minor and patch versions to be elided. +// +// This is a copy of the Go runtime's parseRelease from +// https://golang.org/cl/209597. +func parseRelease(rel string) (major, minor, patch int, ok bool) { + // Strip anything after a dash or plus. + for i := 0; i < len(rel); i++ { + if rel[i] == '-' || rel[i] == '+' { + rel = rel[:i] + break + } + } + + next := func() (int, bool) { + for i := 0; i < len(rel); i++ { + if rel[i] == '.' { + ver, err := strconv.Atoi(rel[:i]) + rel = rel[i+1:] + return ver, err == nil + } + } + ver, err := strconv.Atoi(rel) + rel = "" + return ver, err == nil + } + if major, ok = next(); !ok || rel == "" { + return + } + if minor, ok = next(); !ok || rel == "" { + return + } + patch, ok = next() + return +} diff --git a/internal/xcpu/proc_cpuinfo_linux.go b/internal/xcpu/proc_cpuinfo_linux.go new file mode 100644 index 00000000..9c88d24e --- /dev/null +++ b/internal/xcpu/proc_cpuinfo_linux.go @@ -0,0 +1,53 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && arm64 + +package xcpu + +import ( + "errors" + "io" + "os" + "strings" +) + +func readLinuxProcCPUInfo() error { + f, err := os.Open("/proc/cpuinfo") + if err != nil { + return err + } + defer f.Close() + + var buf [1 << 10]byte // enough for first CPU + n, err := io.ReadFull(f, buf[:]) + if err != nil && err != io.ErrUnexpectedEOF { + return err + } + in := string(buf[:n]) + const features = "\nFeatures : " + i := strings.Index(in, features) + if i == -1 { + return errors.New("no CPU features found") + } + in = in[i+len(features):] + if i := strings.Index(in, "\n"); i != -1 { + in = in[:i] + } + m := map[string]*bool{} + + initOptions() // need it early here; it's harmless to call twice + for _, o := range options { + m[o.Name] = o.Feature + } + // The EVTSTRM field has alias "evstrm" in Go, but Linux calls it "evtstrm". + m["evtstrm"] = &ARM64.HasEVTSTRM + + for _, f := range strings.Fields(in) { + if p, ok := m[f]; ok { + *p = true + } + } + return nil +} diff --git a/internal/xcpu/runtime_auxv.go b/internal/xcpu/runtime_auxv.go new file mode 100644 index 00000000..b842842e --- /dev/null +++ b/internal/xcpu/runtime_auxv.go @@ -0,0 +1,16 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xcpu + +// getAuxvFn is non-nil on Go 1.21+ (via runtime_auxv_go121.go init) +// on platforms that use auxv. +var getAuxvFn func() []uintptr + +func getAuxv() []uintptr { + if getAuxvFn == nil { + return nil + } + return getAuxvFn() +} diff --git a/internal/xcpu/runtime_auxv_go121.go b/internal/xcpu/runtime_auxv_go121.go new file mode 100644 index 00000000..b4dba06a --- /dev/null +++ b/internal/xcpu/runtime_auxv_go121.go @@ -0,0 +1,18 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.21 + +package xcpu + +import ( + _ "unsafe" // for linkname +) + +//go:linkname runtime_getAuxv runtime.getAuxv +func runtime_getAuxv() []uintptr + +func init() { + getAuxvFn = runtime_getAuxv +} diff --git a/internal/xcpu/syscall_aix_gccgo.go b/internal/xcpu/syscall_aix_gccgo.go new file mode 100644 index 00000000..905566fe --- /dev/null +++ b/internal/xcpu/syscall_aix_gccgo.go @@ -0,0 +1,26 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Recreate a getsystemcfg syscall handler instead of +// using the one provided by x/sys/unix to avoid having +// the dependency between them. (See golang.org/issue/32102) +// Moreover, this file will be used during the building of +// gccgo's libgo and thus must not used a CGo method. + +//go:build aix && gccgo + +package xcpu + +import ( + "syscall" +) + +//extern getsystemcfg +func gccgoGetsystemcfg(label uint32) (r uint64) + +func callgetsystemcfg(label int) (r1 uintptr, e1 syscall.Errno) { + r1 = uintptr(gccgoGetsystemcfg(uint32(label))) + e1 = syscall.GetErrno() + return +} diff --git a/internal/xcpu/syscall_aix_ppc64_gc.go b/internal/xcpu/syscall_aix_ppc64_gc.go new file mode 100644 index 00000000..18837396 --- /dev/null +++ b/internal/xcpu/syscall_aix_ppc64_gc.go @@ -0,0 +1,35 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Minimal copy of x/sys/unix so the cpu package can make a +// system call on AIX without depending on x/sys/unix. +// (See golang.org/issue/32102) + +//go:build aix && ppc64 && gc + +package xcpu + +import ( + "syscall" + "unsafe" +) + +//go:cgo_import_dynamic libc_getsystemcfg getsystemcfg "libc.a/shr_64.o" + +//go:linkname libc_getsystemcfg libc_getsystemcfg + +type syscallFunc uintptr + +var libc_getsystemcfg syscallFunc + +type errno = syscall.Errno + +// Implemented in runtime/syscall_aix.go. +func rawSyscall6(trap, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err errno) +func syscall6(trap, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err errno) + +func callgetsystemcfg(label int) (r1 uintptr, e1 errno) { + r1, _, e1 = syscall6(uintptr(unsafe.Pointer(&libc_getsystemcfg)), 1, uintptr(label), 0, 0, 0, 0, 0) + return +} diff --git a/mask_asm.go b/mask_asm.go index bf4bb635..3b1ee517 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -2,7 +2,7 @@ package websocket -import "golang.org/x/sys/cpu" +import "nhooyr.io/websocket/internal/xcpu" func mask(b []byte, key uint32) uint32 { if len(b) > 0 { @@ -12,7 +12,7 @@ func mask(b []byte, key uint32) uint32 { } //lint:ignore U1000 mask_*.s -var useAVX2 = cpu.X86.HasAVX2 +var useAVX2 = xcpu.X86.HasAVX2 // @nhooyr: I am not confident that the amd64 or the arm64 implementations of this // function are perfect. There are almost certainly missing optimizations or diff --git a/mask_test.go b/mask_test.go index 5c3d43c4..54f55e43 100644 --- a/mask_test.go +++ b/mask_test.go @@ -40,34 +40,34 @@ func TestMask(t *testing.T) { func testMask(t *testing.T, name string, fn func(b []byte, key uint32) uint32) { t.Run(name, func(t *testing.T) { t.Parallel() - for i := 0; i < 9999; i++ { - keyb := make([]byte, 4) - _, err := rand.Read(keyb) - assert.Success(t, err) - key := binary.LittleEndian.Uint32(keyb) + for i := 0; i < 9999; i++ { + keyb := make([]byte, 4) + _, err := rand.Read(keyb) + assert.Success(t, err) + key := binary.LittleEndian.Uint32(keyb) - n, err := rand.Int(rand.Reader, big.NewInt(1<<16)) - assert.Success(t, err) + n, err := rand.Int(rand.Reader, big.NewInt(1<<16)) + assert.Success(t, err) - b := make([]byte, 1+n.Int64()) - _, err = rand.Read(b) - assert.Success(t, err) + b := make([]byte, 1+n.Int64()) + _, err = rand.Read(b) + assert.Success(t, err) - b2 := make([]byte, len(b)) - copy(b2, b) - b3 := make([]byte, len(b)) - copy(b3, b) + b2 := make([]byte, len(b)) + copy(b2, b) + b3 := make([]byte, len(b)) + copy(b3, b) - key2 := basicMask(b2, key) - key3 := fn(b3, key) + key2 := basicMask(b2, key) + key3 := fn(b3, key) - if key2 != key3 { - t.Errorf("expected key %X but got %X", key2, key3) + if key2 != key3 { + t.Errorf("expected key %X but got %X", key2, key3) + } + if !bytes.Equal(b2, b3) { + t.Error("bad bytes") + return + } } - if !bytes.Equal(b2, b3) { - t.Error("bad bytes") - return - } - } }) } From 17e1b864a276ee7a45d6b14f4ed8445a05de543c Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 22 Feb 2024 05:14:52 -0800 Subject: [PATCH 25/26] mask_asm: Disable AVX2 See https://github.com/nhooyr/websocket/pull/326#issuecomment-1771138049 --- README.md | 2 +- internal/xcpu/.gitattributes | 10 - internal/xcpu/.gitignore | 2 - internal/xcpu/README.md | 3 - internal/xcpu/asm_aix_ppc64.s | 17 -- internal/xcpu/byteorder.go | 66 ------ internal/xcpu/cpu.go | 290 -------------------------- internal/xcpu/cpu_aix.go | 33 --- internal/xcpu/cpu_arm.go | 73 ------- internal/xcpu/cpu_arm64.go | 172 --------------- internal/xcpu/cpu_arm64.s | 31 --- internal/xcpu/cpu_gc_arm64.go | 11 - internal/xcpu/cpu_gc_s390x.go | 21 -- internal/xcpu/cpu_gc_x86.go | 15 -- internal/xcpu/cpu_gccgo_arm64.go | 11 - internal/xcpu/cpu_gccgo_s390x.go | 22 -- internal/xcpu/cpu_gccgo_x86.c | 37 ---- internal/xcpu/cpu_gccgo_x86.go | 31 --- internal/xcpu/cpu_linux.go | 15 -- internal/xcpu/cpu_linux_arm.go | 39 ---- internal/xcpu/cpu_linux_arm64.go | 111 ---------- internal/xcpu/cpu_linux_mips64x.go | 22 -- internal/xcpu/cpu_linux_noinit.go | 9 - internal/xcpu/cpu_linux_ppc64x.go | 30 --- internal/xcpu/cpu_linux_s390x.go | 40 ---- internal/xcpu/cpu_loong64.go | 12 -- internal/xcpu/cpu_mips64x.go | 15 -- internal/xcpu/cpu_mipsx.go | 11 - internal/xcpu/cpu_netbsd_arm64.go | 173 --------------- internal/xcpu/cpu_openbsd_arm64.go | 65 ------ internal/xcpu/cpu_openbsd_arm64.s | 11 - internal/xcpu/cpu_other_arm.go | 9 - internal/xcpu/cpu_other_arm64.go | 9 - internal/xcpu/cpu_other_mips64x.go | 11 - internal/xcpu/cpu_other_ppc64x.go | 12 -- internal/xcpu/cpu_other_riscv64.go | 11 - internal/xcpu/cpu_ppc64x.go | 16 -- internal/xcpu/cpu_riscv64.go | 11 - internal/xcpu/cpu_s390x.go | 172 --------------- internal/xcpu/cpu_s390x.s | 57 ----- internal/xcpu/cpu_wasm.go | 17 -- internal/xcpu/cpu_x86.go | 151 -------------- internal/xcpu/cpu_x86.s | 26 --- internal/xcpu/cpu_zos.go | 10 - internal/xcpu/cpu_zos_s390x.go | 25 --- internal/xcpu/endian_big.go | 10 - internal/xcpu/endian_little.go | 10 - internal/xcpu/hwcap_linux.go | 71 ------- internal/xcpu/parse.go | 43 ---- internal/xcpu/proc_cpuinfo_linux.go | 53 ----- internal/xcpu/runtime_auxv.go | 16 -- internal/xcpu/runtime_auxv_go121.go | 18 -- internal/xcpu/syscall_aix_gccgo.go | 26 --- internal/xcpu/syscall_aix_ppc64_gc.go | 35 ---- mask_amd64.s | 29 +-- mask_asm.go | 9 +- 56 files changed, 6 insertions(+), 2251 deletions(-) delete mode 100644 internal/xcpu/.gitattributes delete mode 100644 internal/xcpu/.gitignore delete mode 100644 internal/xcpu/README.md delete mode 100644 internal/xcpu/asm_aix_ppc64.s delete mode 100644 internal/xcpu/byteorder.go delete mode 100644 internal/xcpu/cpu.go delete mode 100644 internal/xcpu/cpu_aix.go delete mode 100644 internal/xcpu/cpu_arm.go delete mode 100644 internal/xcpu/cpu_arm64.go delete mode 100644 internal/xcpu/cpu_arm64.s delete mode 100644 internal/xcpu/cpu_gc_arm64.go delete mode 100644 internal/xcpu/cpu_gc_s390x.go delete mode 100644 internal/xcpu/cpu_gc_x86.go delete mode 100644 internal/xcpu/cpu_gccgo_arm64.go delete mode 100644 internal/xcpu/cpu_gccgo_s390x.go delete mode 100644 internal/xcpu/cpu_gccgo_x86.c delete mode 100644 internal/xcpu/cpu_gccgo_x86.go delete mode 100644 internal/xcpu/cpu_linux.go delete mode 100644 internal/xcpu/cpu_linux_arm.go delete mode 100644 internal/xcpu/cpu_linux_arm64.go delete mode 100644 internal/xcpu/cpu_linux_mips64x.go delete mode 100644 internal/xcpu/cpu_linux_noinit.go delete mode 100644 internal/xcpu/cpu_linux_ppc64x.go delete mode 100644 internal/xcpu/cpu_linux_s390x.go delete mode 100644 internal/xcpu/cpu_loong64.go delete mode 100644 internal/xcpu/cpu_mips64x.go delete mode 100644 internal/xcpu/cpu_mipsx.go delete mode 100644 internal/xcpu/cpu_netbsd_arm64.go delete mode 100644 internal/xcpu/cpu_openbsd_arm64.go delete mode 100644 internal/xcpu/cpu_openbsd_arm64.s delete mode 100644 internal/xcpu/cpu_other_arm.go delete mode 100644 internal/xcpu/cpu_other_arm64.go delete mode 100644 internal/xcpu/cpu_other_mips64x.go delete mode 100644 internal/xcpu/cpu_other_ppc64x.go delete mode 100644 internal/xcpu/cpu_other_riscv64.go delete mode 100644 internal/xcpu/cpu_ppc64x.go delete mode 100644 internal/xcpu/cpu_riscv64.go delete mode 100644 internal/xcpu/cpu_s390x.go delete mode 100644 internal/xcpu/cpu_s390x.s delete mode 100644 internal/xcpu/cpu_wasm.go delete mode 100644 internal/xcpu/cpu_x86.go delete mode 100644 internal/xcpu/cpu_x86.s delete mode 100644 internal/xcpu/cpu_zos.go delete mode 100644 internal/xcpu/cpu_zos_s390x.go delete mode 100644 internal/xcpu/endian_big.go delete mode 100644 internal/xcpu/endian_little.go delete mode 100644 internal/xcpu/hwcap_linux.go delete mode 100644 internal/xcpu/parse.go delete mode 100644 internal/xcpu/proc_cpuinfo_linux.go delete mode 100644 internal/xcpu/runtime_auxv.go delete mode 100644 internal/xcpu/runtime_auxv_go121.go delete mode 100644 internal/xcpu/syscall_aix_gccgo.go delete mode 100644 internal/xcpu/syscall_aix_ppc64_gc.go diff --git a/README.md b/README.md index 0f286e63..3dead855 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ Advantages of nhooyr.io/websocket: - Gorilla requires registering a pong callback before sending a Ping - Can target Wasm ([gorilla/websocket#432](https://github.com/gorilla/websocket/issues/432)) - Transparent message buffer reuse with [wsjson](https://pkg.go.dev/nhooyr.io/websocket/wsjson) subpackage -- [4x](https://github.com/nhooyr/websocket/pull/326) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go +- [3-4x](https://github.com/nhooyr/websocket/pull/326) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go - Gorilla's implementation is slower and uses [unsafe](https://golang.org/pkg/unsafe/). - Full [permessage-deflate](https://tools.ietf.org/html/rfc7692) compression extension support - Gorilla only supports no context takeover mode diff --git a/internal/xcpu/.gitattributes b/internal/xcpu/.gitattributes deleted file mode 100644 index d2f212e5..00000000 --- a/internal/xcpu/.gitattributes +++ /dev/null @@ -1,10 +0,0 @@ -# Treat all files in this repo as binary, with no git magic updating -# line endings. Windows users contributing to Go will need to use a -# modern version of git and editors capable of LF line endings. -# -# We'll prevent accidental CRLF line endings from entering the repo -# via the git-review gofmt checks. -# -# See golang.org/issue/9281 - -* -text diff --git a/internal/xcpu/.gitignore b/internal/xcpu/.gitignore deleted file mode 100644 index 5a9d62ef..00000000 --- a/internal/xcpu/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Add no patterns to .gitignore except for files generated by the build. -last-change diff --git a/internal/xcpu/README.md b/internal/xcpu/README.md deleted file mode 100644 index 96a1a30f..00000000 --- a/internal/xcpu/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# cpu - -Vendored from https://github.com/golang/sys diff --git a/internal/xcpu/asm_aix_ppc64.s b/internal/xcpu/asm_aix_ppc64.s deleted file mode 100644 index 269e173c..00000000 --- a/internal/xcpu/asm_aix_ppc64.s +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc - -#include "textflag.h" - -// -// System calls for ppc64, AIX are implemented in runtime/syscall_aix.go -// - -TEXT ·syscall6(SB),NOSPLIT,$0-88 - JMP syscall·syscall6(SB) - -TEXT ·rawSyscall6(SB),NOSPLIT,$0-88 - JMP syscall·rawSyscall6(SB) diff --git a/internal/xcpu/byteorder.go b/internal/xcpu/byteorder.go deleted file mode 100644 index 8f28d86c..00000000 --- a/internal/xcpu/byteorder.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -import ( - "runtime" -) - -// byteOrder is a subset of encoding/binary.ByteOrder. -type byteOrder interface { - Uint32([]byte) uint32 - Uint64([]byte) uint64 -} - -type littleEndian struct{} -type bigEndian struct{} - -func (littleEndian) Uint32(b []byte) uint32 { - _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func (littleEndian) Uint64(b []byte) uint64 { - _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -func (bigEndian) Uint32(b []byte) uint32 { - _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 - return uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 -} - -func (bigEndian) Uint64(b []byte) uint64 { - _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 - return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | - uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56 -} - -// hostByteOrder returns littleEndian on little-endian machines and -// bigEndian on big-endian machines. -func hostByteOrder() byteOrder { - switch runtime.GOARCH { - case "386", "amd64", "amd64p32", - "alpha", - "arm", "arm64", - "loong64", - "mipsle", "mips64le", "mips64p32le", - "nios2", - "ppc64le", - "riscv", "riscv64", - "sh": - return littleEndian{} - case "armbe", "arm64be", - "m68k", - "mips", "mips64", "mips64p32", - "ppc", "ppc64", - "s390", "s390x", - "shbe", - "sparc", "sparc64": - return bigEndian{} - } - panic("unknown architecture") -} diff --git a/internal/xcpu/cpu.go b/internal/xcpu/cpu.go deleted file mode 100644 index 5fc15019..00000000 --- a/internal/xcpu/cpu.go +++ /dev/null @@ -1,290 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package cpu implements processor feature detection for -// various CPU architectures. -package xcpu - -import ( - "os" - "strings" -) - -// Initialized reports whether the CPU features were initialized. -// -// For some GOOS/GOARCH combinations initialization of the CPU features depends -// on reading an operating specific file, e.g. /proc/self/auxv on linux/arm -// Initialized will report false if reading the file fails. -var Initialized bool - -// CacheLinePad is used to pad structs to avoid false sharing. -type CacheLinePad struct{ _ [cacheLineSize]byte } - -// X86 contains the supported CPU features of the -// current X86/AMD64 platform. If the current platform -// is not X86/AMD64 then all feature flags are false. -// -// X86 is padded to avoid false sharing. Further the HasAVX -// and HasAVX2 are only set if the OS supports XMM and YMM -// registers in addition to the CPUID feature bit being set. -var X86 struct { - _ CacheLinePad - HasAES bool // AES hardware implementation (AES NI) - HasADX bool // Multi-precision add-carry instruction extensions - HasAVX bool // Advanced vector extension - HasAVX2 bool // Advanced vector extension 2 - HasAVX512 bool // Advanced vector extension 512 - HasAVX512F bool // Advanced vector extension 512 Foundation Instructions - HasAVX512CD bool // Advanced vector extension 512 Conflict Detection Instructions - HasAVX512ER bool // Advanced vector extension 512 Exponential and Reciprocal Instructions - HasAVX512PF bool // Advanced vector extension 512 Prefetch Instructions - HasAVX512VL bool // Advanced vector extension 512 Vector Length Extensions - HasAVX512BW bool // Advanced vector extension 512 Byte and Word Instructions - HasAVX512DQ bool // Advanced vector extension 512 Doubleword and Quadword Instructions - HasAVX512IFMA bool // Advanced vector extension 512 Integer Fused Multiply Add - HasAVX512VBMI bool // Advanced vector extension 512 Vector Byte Manipulation Instructions - HasAVX5124VNNIW bool // Advanced vector extension 512 Vector Neural Network Instructions Word variable precision - HasAVX5124FMAPS bool // Advanced vector extension 512 Fused Multiply Accumulation Packed Single precision - HasAVX512VPOPCNTDQ bool // Advanced vector extension 512 Double and quad word population count instructions - HasAVX512VPCLMULQDQ bool // Advanced vector extension 512 Vector carry-less multiply operations - HasAVX512VNNI bool // Advanced vector extension 512 Vector Neural Network Instructions - HasAVX512GFNI bool // Advanced vector extension 512 Galois field New Instructions - HasAVX512VAES bool // Advanced vector extension 512 Vector AES instructions - HasAVX512VBMI2 bool // Advanced vector extension 512 Vector Byte Manipulation Instructions 2 - HasAVX512BITALG bool // Advanced vector extension 512 Bit Algorithms - HasAVX512BF16 bool // Advanced vector extension 512 BFloat16 Instructions - HasAMXTile bool // Advanced Matrix Extension Tile instructions - HasAMXInt8 bool // Advanced Matrix Extension Int8 instructions - HasAMXBF16 bool // Advanced Matrix Extension BFloat16 instructions - HasBMI1 bool // Bit manipulation instruction set 1 - HasBMI2 bool // Bit manipulation instruction set 2 - HasCX16 bool // Compare and exchange 16 Bytes - HasERMS bool // Enhanced REP for MOVSB and STOSB - HasFMA bool // Fused-multiply-add instructions - HasOSXSAVE bool // OS supports XSAVE/XRESTOR for saving/restoring XMM registers. - HasPCLMULQDQ bool // PCLMULQDQ instruction - most often used for AES-GCM - HasPOPCNT bool // Hamming weight instruction POPCNT. - HasRDRAND bool // RDRAND instruction (on-chip random number generator) - HasRDSEED bool // RDSEED instruction (on-chip random number generator) - HasSSE2 bool // Streaming SIMD extension 2 (always available on amd64) - HasSSE3 bool // Streaming SIMD extension 3 - HasSSSE3 bool // Supplemental streaming SIMD extension 3 - HasSSE41 bool // Streaming SIMD extension 4 and 4.1 - HasSSE42 bool // Streaming SIMD extension 4 and 4.2 - _ CacheLinePad -} - -// ARM64 contains the supported CPU features of the -// current ARMv8(aarch64) platform. If the current platform -// is not arm64 then all feature flags are false. -var ARM64 struct { - _ CacheLinePad - HasFP bool // Floating-point instruction set (always available) - HasASIMD bool // Advanced SIMD (always available) - HasEVTSTRM bool // Event stream support - HasAES bool // AES hardware implementation - HasPMULL bool // Polynomial multiplication instruction set - HasSHA1 bool // SHA1 hardware implementation - HasSHA2 bool // SHA2 hardware implementation - HasCRC32 bool // CRC32 hardware implementation - HasATOMICS bool // Atomic memory operation instruction set - HasFPHP bool // Half precision floating-point instruction set - HasASIMDHP bool // Advanced SIMD half precision instruction set - HasCPUID bool // CPUID identification scheme registers - HasASIMDRDM bool // Rounding double multiply add/subtract instruction set - HasJSCVT bool // Javascript conversion from floating-point to integer - HasFCMA bool // Floating-point multiplication and addition of complex numbers - HasLRCPC bool // Release Consistent processor consistent support - HasDCPOP bool // Persistent memory support - HasSHA3 bool // SHA3 hardware implementation - HasSM3 bool // SM3 hardware implementation - HasSM4 bool // SM4 hardware implementation - HasASIMDDP bool // Advanced SIMD double precision instruction set - HasSHA512 bool // SHA512 hardware implementation - HasSVE bool // Scalable Vector Extensions - HasASIMDFHM bool // Advanced SIMD multiplication FP16 to FP32 - _ CacheLinePad -} - -// ARM contains the supported CPU features of the current ARM (32-bit) platform. -// All feature flags are false if: -// 1. the current platform is not arm, or -// 2. the current operating system is not Linux. -var ARM struct { - _ CacheLinePad - HasSWP bool // SWP instruction support - HasHALF bool // Half-word load and store support - HasTHUMB bool // ARM Thumb instruction set - Has26BIT bool // Address space limited to 26-bits - HasFASTMUL bool // 32-bit operand, 64-bit result multiplication support - HasFPA bool // Floating point arithmetic support - HasVFP bool // Vector floating point support - HasEDSP bool // DSP Extensions support - HasJAVA bool // Java instruction set - HasIWMMXT bool // Intel Wireless MMX technology support - HasCRUNCH bool // MaverickCrunch context switching and handling - HasTHUMBEE bool // Thumb EE instruction set - HasNEON bool // NEON instruction set - HasVFPv3 bool // Vector floating point version 3 support - HasVFPv3D16 bool // Vector floating point version 3 D8-D15 - HasTLS bool // Thread local storage support - HasVFPv4 bool // Vector floating point version 4 support - HasIDIVA bool // Integer divide instruction support in ARM mode - HasIDIVT bool // Integer divide instruction support in Thumb mode - HasVFPD32 bool // Vector floating point version 3 D15-D31 - HasLPAE bool // Large Physical Address Extensions - HasEVTSTRM bool // Event stream support - HasAES bool // AES hardware implementation - HasPMULL bool // Polynomial multiplication instruction set - HasSHA1 bool // SHA1 hardware implementation - HasSHA2 bool // SHA2 hardware implementation - HasCRC32 bool // CRC32 hardware implementation - _ CacheLinePad -} - -// MIPS64X contains the supported CPU features of the current mips64/mips64le -// platforms. If the current platform is not mips64/mips64le or the current -// operating system is not Linux then all feature flags are false. -var MIPS64X struct { - _ CacheLinePad - HasMSA bool // MIPS SIMD architecture - _ CacheLinePad -} - -// PPC64 contains the supported CPU features of the current ppc64/ppc64le platforms. -// If the current platform is not ppc64/ppc64le then all feature flags are false. -// -// For ppc64/ppc64le, it is safe to check only for ISA level starting on ISA v3.00, -// since there are no optional categories. There are some exceptions that also -// require kernel support to work (DARN, SCV), so there are feature bits for -// those as well. The struct is padded to avoid false sharing. -var PPC64 struct { - _ CacheLinePad - HasDARN bool // Hardware random number generator (requires kernel enablement) - HasSCV bool // Syscall vectored (requires kernel enablement) - IsPOWER8 bool // ISA v2.07 (POWER8) - IsPOWER9 bool // ISA v3.00 (POWER9), implies IsPOWER8 - _ CacheLinePad -} - -// S390X contains the supported CPU features of the current IBM Z -// (s390x) platform. If the current platform is not IBM Z then all -// feature flags are false. -// -// S390X is padded to avoid false sharing. Further HasVX is only set -// if the OS supports vector registers in addition to the STFLE -// feature bit being set. -var S390X struct { - _ CacheLinePad - HasZARCH bool // z/Architecture mode is active [mandatory] - HasSTFLE bool // store facility list extended - HasLDISP bool // long (20-bit) displacements - HasEIMM bool // 32-bit immediates - HasDFP bool // decimal floating point - HasETF3EH bool // ETF-3 enhanced - HasMSA bool // message security assist (CPACF) - HasAES bool // KM-AES{128,192,256} functions - HasAESCBC bool // KMC-AES{128,192,256} functions - HasAESCTR bool // KMCTR-AES{128,192,256} functions - HasAESGCM bool // KMA-GCM-AES{128,192,256} functions - HasGHASH bool // KIMD-GHASH function - HasSHA1 bool // K{I,L}MD-SHA-1 functions - HasSHA256 bool // K{I,L}MD-SHA-256 functions - HasSHA512 bool // K{I,L}MD-SHA-512 functions - HasSHA3 bool // K{I,L}MD-SHA3-{224,256,384,512} and K{I,L}MD-SHAKE-{128,256} functions - HasVX bool // vector facility - HasVXE bool // vector-enhancements facility 1 - _ CacheLinePad -} - -func init() { - archInit() - initOptions() - processOptions() -} - -// options contains the cpu debug options that can be used in GODEBUG. -// Options are arch dependent and are added by the arch specific initOptions functions. -// Features that are mandatory for the specific GOARCH should have the Required field set -// (e.g. SSE2 on amd64). -var options []option - -// Option names should be lower case. e.g. avx instead of AVX. -type option struct { - Name string - Feature *bool - Specified bool // whether feature value was specified in GODEBUG - Enable bool // whether feature should be enabled - Required bool // whether feature is mandatory and can not be disabled -} - -func processOptions() { - env := os.Getenv("GODEBUG") -field: - for env != "" { - field := "" - i := strings.IndexByte(env, ',') - if i < 0 { - field, env = env, "" - } else { - field, env = env[:i], env[i+1:] - } - if len(field) < 4 || field[:4] != "cpu." { - continue - } - i = strings.IndexByte(field, '=') - if i < 0 { - print("GODEBUG sys/cpu: no value specified for \"", field, "\"\n") - continue - } - key, value := field[4:i], field[i+1:] // e.g. "SSE2", "on" - - var enable bool - switch value { - case "on": - enable = true - case "off": - enable = false - default: - print("GODEBUG sys/cpu: value \"", value, "\" not supported for cpu option \"", key, "\"\n") - continue field - } - - if key == "all" { - for i := range options { - options[i].Specified = true - options[i].Enable = enable || options[i].Required - } - continue field - } - - for i := range options { - if options[i].Name == key { - options[i].Specified = true - options[i].Enable = enable - continue field - } - } - - print("GODEBUG sys/cpu: unknown cpu feature \"", key, "\"\n") - } - - for _, o := range options { - if !o.Specified { - continue - } - - if o.Enable && !*o.Feature { - print("GODEBUG sys/cpu: can not enable \"", o.Name, "\", missing CPU support\n") - continue - } - - if !o.Enable && o.Required { - print("GODEBUG sys/cpu: can not disable \"", o.Name, "\", required CPU feature\n") - continue - } - - *o.Feature = o.Enable - } -} diff --git a/internal/xcpu/cpu_aix.go b/internal/xcpu/cpu_aix.go deleted file mode 100644 index 5e6e2583..00000000 --- a/internal/xcpu/cpu_aix.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build aix - -package xcpu - -const ( - // getsystemcfg constants - _SC_IMPL = 2 - _IMPL_POWER8 = 0x10000 - _IMPL_POWER9 = 0x20000 -) - -func archInit() { - impl := getsystemcfg(_SC_IMPL) - if impl&_IMPL_POWER8 != 0 { - PPC64.IsPOWER8 = true - } - if impl&_IMPL_POWER9 != 0 { - PPC64.IsPOWER8 = true - PPC64.IsPOWER9 = true - } - - Initialized = true -} - -func getsystemcfg(label int) (n uint64) { - r0, _ := callgetsystemcfg(label) - n = uint64(r0) - return -} diff --git a/internal/xcpu/cpu_arm.go b/internal/xcpu/cpu_arm.go deleted file mode 100644 index ff120458..00000000 --- a/internal/xcpu/cpu_arm.go +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -const cacheLineSize = 32 - -// HWCAP/HWCAP2 bits. -// These are specific to Linux. -const ( - hwcap_SWP = 1 << 0 - hwcap_HALF = 1 << 1 - hwcap_THUMB = 1 << 2 - hwcap_26BIT = 1 << 3 - hwcap_FAST_MULT = 1 << 4 - hwcap_FPA = 1 << 5 - hwcap_VFP = 1 << 6 - hwcap_EDSP = 1 << 7 - hwcap_JAVA = 1 << 8 - hwcap_IWMMXT = 1 << 9 - hwcap_CRUNCH = 1 << 10 - hwcap_THUMBEE = 1 << 11 - hwcap_NEON = 1 << 12 - hwcap_VFPv3 = 1 << 13 - hwcap_VFPv3D16 = 1 << 14 - hwcap_TLS = 1 << 15 - hwcap_VFPv4 = 1 << 16 - hwcap_IDIVA = 1 << 17 - hwcap_IDIVT = 1 << 18 - hwcap_VFPD32 = 1 << 19 - hwcap_LPAE = 1 << 20 - hwcap_EVTSTRM = 1 << 21 - - hwcap2_AES = 1 << 0 - hwcap2_PMULL = 1 << 1 - hwcap2_SHA1 = 1 << 2 - hwcap2_SHA2 = 1 << 3 - hwcap2_CRC32 = 1 << 4 -) - -func initOptions() { - options = []option{ - {Name: "pmull", Feature: &ARM.HasPMULL}, - {Name: "sha1", Feature: &ARM.HasSHA1}, - {Name: "sha2", Feature: &ARM.HasSHA2}, - {Name: "swp", Feature: &ARM.HasSWP}, - {Name: "thumb", Feature: &ARM.HasTHUMB}, - {Name: "thumbee", Feature: &ARM.HasTHUMBEE}, - {Name: "tls", Feature: &ARM.HasTLS}, - {Name: "vfp", Feature: &ARM.HasVFP}, - {Name: "vfpd32", Feature: &ARM.HasVFPD32}, - {Name: "vfpv3", Feature: &ARM.HasVFPv3}, - {Name: "vfpv3d16", Feature: &ARM.HasVFPv3D16}, - {Name: "vfpv4", Feature: &ARM.HasVFPv4}, - {Name: "half", Feature: &ARM.HasHALF}, - {Name: "26bit", Feature: &ARM.Has26BIT}, - {Name: "fastmul", Feature: &ARM.HasFASTMUL}, - {Name: "fpa", Feature: &ARM.HasFPA}, - {Name: "edsp", Feature: &ARM.HasEDSP}, - {Name: "java", Feature: &ARM.HasJAVA}, - {Name: "iwmmxt", Feature: &ARM.HasIWMMXT}, - {Name: "crunch", Feature: &ARM.HasCRUNCH}, - {Name: "neon", Feature: &ARM.HasNEON}, - {Name: "idivt", Feature: &ARM.HasIDIVT}, - {Name: "idiva", Feature: &ARM.HasIDIVA}, - {Name: "lpae", Feature: &ARM.HasLPAE}, - {Name: "evtstrm", Feature: &ARM.HasEVTSTRM}, - {Name: "aes", Feature: &ARM.HasAES}, - {Name: "crc32", Feature: &ARM.HasCRC32}, - } - -} diff --git a/internal/xcpu/cpu_arm64.go b/internal/xcpu/cpu_arm64.go deleted file mode 100644 index 3d4113a5..00000000 --- a/internal/xcpu/cpu_arm64.go +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -import "runtime" - -// cacheLineSize is used to prevent false sharing of cache lines. -// We choose 128 because Apple Silicon, a.k.a. M1, has 128-byte cache line size. -// It doesn't cost much and is much more future-proof. -const cacheLineSize = 128 - -func initOptions() { - options = []option{ - {Name: "fp", Feature: &ARM64.HasFP}, - {Name: "asimd", Feature: &ARM64.HasASIMD}, - {Name: "evstrm", Feature: &ARM64.HasEVTSTRM}, - {Name: "aes", Feature: &ARM64.HasAES}, - {Name: "fphp", Feature: &ARM64.HasFPHP}, - {Name: "jscvt", Feature: &ARM64.HasJSCVT}, - {Name: "lrcpc", Feature: &ARM64.HasLRCPC}, - {Name: "pmull", Feature: &ARM64.HasPMULL}, - {Name: "sha1", Feature: &ARM64.HasSHA1}, - {Name: "sha2", Feature: &ARM64.HasSHA2}, - {Name: "sha3", Feature: &ARM64.HasSHA3}, - {Name: "sha512", Feature: &ARM64.HasSHA512}, - {Name: "sm3", Feature: &ARM64.HasSM3}, - {Name: "sm4", Feature: &ARM64.HasSM4}, - {Name: "sve", Feature: &ARM64.HasSVE}, - {Name: "crc32", Feature: &ARM64.HasCRC32}, - {Name: "atomics", Feature: &ARM64.HasATOMICS}, - {Name: "asimdhp", Feature: &ARM64.HasASIMDHP}, - {Name: "cpuid", Feature: &ARM64.HasCPUID}, - {Name: "asimrdm", Feature: &ARM64.HasASIMDRDM}, - {Name: "fcma", Feature: &ARM64.HasFCMA}, - {Name: "dcpop", Feature: &ARM64.HasDCPOP}, - {Name: "asimddp", Feature: &ARM64.HasASIMDDP}, - {Name: "asimdfhm", Feature: &ARM64.HasASIMDFHM}, - } -} - -func archInit() { - switch runtime.GOOS { - case "freebsd": - readARM64Registers() - case "linux", "netbsd", "openbsd": - doinit() - default: - // Many platforms don't seem to allow reading these registers. - setMinimalFeatures() - } -} - -// setMinimalFeatures fakes the minimal ARM64 features expected by -// TestARM64minimalFeatures. -func setMinimalFeatures() { - ARM64.HasASIMD = true - ARM64.HasFP = true -} - -func readARM64Registers() { - Initialized = true - - parseARM64SystemRegisters(getisar0(), getisar1(), getpfr0()) -} - -func parseARM64SystemRegisters(isar0, isar1, pfr0 uint64) { - // ID_AA64ISAR0_EL1 - switch extractBits(isar0, 4, 7) { - case 1: - ARM64.HasAES = true - case 2: - ARM64.HasAES = true - ARM64.HasPMULL = true - } - - switch extractBits(isar0, 8, 11) { - case 1: - ARM64.HasSHA1 = true - } - - switch extractBits(isar0, 12, 15) { - case 1: - ARM64.HasSHA2 = true - case 2: - ARM64.HasSHA2 = true - ARM64.HasSHA512 = true - } - - switch extractBits(isar0, 16, 19) { - case 1: - ARM64.HasCRC32 = true - } - - switch extractBits(isar0, 20, 23) { - case 2: - ARM64.HasATOMICS = true - } - - switch extractBits(isar0, 28, 31) { - case 1: - ARM64.HasASIMDRDM = true - } - - switch extractBits(isar0, 32, 35) { - case 1: - ARM64.HasSHA3 = true - } - - switch extractBits(isar0, 36, 39) { - case 1: - ARM64.HasSM3 = true - } - - switch extractBits(isar0, 40, 43) { - case 1: - ARM64.HasSM4 = true - } - - switch extractBits(isar0, 44, 47) { - case 1: - ARM64.HasASIMDDP = true - } - - // ID_AA64ISAR1_EL1 - switch extractBits(isar1, 0, 3) { - case 1: - ARM64.HasDCPOP = true - } - - switch extractBits(isar1, 12, 15) { - case 1: - ARM64.HasJSCVT = true - } - - switch extractBits(isar1, 16, 19) { - case 1: - ARM64.HasFCMA = true - } - - switch extractBits(isar1, 20, 23) { - case 1: - ARM64.HasLRCPC = true - } - - // ID_AA64PFR0_EL1 - switch extractBits(pfr0, 16, 19) { - case 0: - ARM64.HasFP = true - case 1: - ARM64.HasFP = true - ARM64.HasFPHP = true - } - - switch extractBits(pfr0, 20, 23) { - case 0: - ARM64.HasASIMD = true - case 1: - ARM64.HasASIMD = true - ARM64.HasASIMDHP = true - } - - switch extractBits(pfr0, 32, 35) { - case 1: - ARM64.HasSVE = true - } -} - -func extractBits(data uint64, start, end uint) uint { - return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) -} diff --git a/internal/xcpu/cpu_arm64.s b/internal/xcpu/cpu_arm64.s deleted file mode 100644 index fcb9a388..00000000 --- a/internal/xcpu/cpu_arm64.s +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc - -#include "textflag.h" - -// func getisar0() uint64 -TEXT ·getisar0(SB),NOSPLIT,$0-8 - // get Instruction Set Attributes 0 into x0 - // mrs x0, ID_AA64ISAR0_EL1 = d5380600 - WORD $0xd5380600 - MOVD R0, ret+0(FP) - RET - -// func getisar1() uint64 -TEXT ·getisar1(SB),NOSPLIT,$0-8 - // get Instruction Set Attributes 1 into x0 - // mrs x0, ID_AA64ISAR1_EL1 = d5380620 - WORD $0xd5380620 - MOVD R0, ret+0(FP) - RET - -// func getpfr0() uint64 -TEXT ·getpfr0(SB),NOSPLIT,$0-8 - // get Processor Feature Register 0 into x0 - // mrs x0, ID_AA64PFR0_EL1 = d5380400 - WORD $0xd5380400 - MOVD R0, ret+0(FP) - RET diff --git a/internal/xcpu/cpu_gc_arm64.go b/internal/xcpu/cpu_gc_arm64.go deleted file mode 100644 index 26d3050d..00000000 --- a/internal/xcpu/cpu_gc_arm64.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc - -package xcpu - -func getisar0() uint64 -func getisar1() uint64 -func getpfr0() uint64 diff --git a/internal/xcpu/cpu_gc_s390x.go b/internal/xcpu/cpu_gc_s390x.go deleted file mode 100644 index 34ca88b7..00000000 --- a/internal/xcpu/cpu_gc_s390x.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc - -package xcpu - -// haveAsmFunctions reports whether the other functions in this file can -// be safely called. -func haveAsmFunctions() bool { return true } - -// The following feature detection functions are defined in cpu_s390x.s. -// They are likely to be expensive to call so the results should be cached. -func stfle() facilityList -func kmQuery() queryResult -func kmcQuery() queryResult -func kmctrQuery() queryResult -func kmaQuery() queryResult -func kimdQuery() queryResult -func klmdQuery() queryResult diff --git a/internal/xcpu/cpu_gc_x86.go b/internal/xcpu/cpu_gc_x86.go deleted file mode 100644 index 9d6f61c2..00000000 --- a/internal/xcpu/cpu_gc_x86.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (386 || amd64 || amd64p32) && gc - -package xcpu - -// cpuid is implemented in cpu_x86.s for gc compiler -// and in cpu_gccgo.c for gccgo. -func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) - -// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler -// and in cpu_gccgo.c for gccgo. -func xgetbv() (eax, edx uint32) diff --git a/internal/xcpu/cpu_gccgo_arm64.go b/internal/xcpu/cpu_gccgo_arm64.go deleted file mode 100644 index d6c2a3a8..00000000 --- a/internal/xcpu/cpu_gccgo_arm64.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gccgo - -package xcpu - -func getisar0() uint64 { return 0 } -func getisar1() uint64 { return 0 } -func getpfr0() uint64 { return 0 } diff --git a/internal/xcpu/cpu_gccgo_s390x.go b/internal/xcpu/cpu_gccgo_s390x.go deleted file mode 100644 index 4deec625..00000000 --- a/internal/xcpu/cpu_gccgo_s390x.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gccgo - -package xcpu - -// haveAsmFunctions reports whether the other functions in this file can -// be safely called. -func haveAsmFunctions() bool { return false } - -// TODO(mundaym): the following feature detection functions are currently -// stubs. See https://golang.org/cl/162887 for how to fix this. -// They are likely to be expensive to call so the results should be cached. -func stfle() facilityList { panic("not implemented for gccgo") } -func kmQuery() queryResult { panic("not implemented for gccgo") } -func kmcQuery() queryResult { panic("not implemented for gccgo") } -func kmctrQuery() queryResult { panic("not implemented for gccgo") } -func kmaQuery() queryResult { panic("not implemented for gccgo") } -func kimdQuery() queryResult { panic("not implemented for gccgo") } -func klmdQuery() queryResult { panic("not implemented for gccgo") } diff --git a/internal/xcpu/cpu_gccgo_x86.c b/internal/xcpu/cpu_gccgo_x86.c deleted file mode 100644 index 3f73a05d..00000000 --- a/internal/xcpu/cpu_gccgo_x86.c +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (386 || amd64 || amd64p32) && gccgo - -#include -#include -#include - -// Need to wrap __get_cpuid_count because it's declared as static. -int -gccgoGetCpuidCount(uint32_t leaf, uint32_t subleaf, - uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx) -{ - return __get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx); -} - -#pragma GCC diagnostic ignored "-Wunknown-pragmas" -#pragma GCC push_options -#pragma GCC target("xsave") -#pragma clang attribute push (__attribute__((target("xsave"))), apply_to=function) - -// xgetbv reads the contents of an XCR (Extended Control Register) -// specified in the ECX register into registers EDX:EAX. -// Currently, the only supported value for XCR is 0. -void -gccgoXgetbv(uint32_t *eax, uint32_t *edx) -{ - uint64_t v = _xgetbv(0); - *eax = v & 0xffffffff; - *edx = v >> 32; -} - -#pragma clang attribute pop -#pragma GCC pop_options diff --git a/internal/xcpu/cpu_gccgo_x86.go b/internal/xcpu/cpu_gccgo_x86.go deleted file mode 100644 index e66c6ee9..00000000 --- a/internal/xcpu/cpu_gccgo_x86.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (386 || amd64 || amd64p32) && gccgo - -package xcpu - -//extern gccgoGetCpuidCount -func gccgoGetCpuidCount(eaxArg, ecxArg uint32, eax, ebx, ecx, edx *uint32) - -func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) { - var a, b, c, d uint32 - gccgoGetCpuidCount(eaxArg, ecxArg, &a, &b, &c, &d) - return a, b, c, d -} - -//extern gccgoXgetbv -func gccgoXgetbv(eax, edx *uint32) - -func xgetbv() (eax, edx uint32) { - var a, d uint32 - gccgoXgetbv(&a, &d) - return a, d -} - -// gccgo doesn't build on Darwin, per: -// https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/gcc.rb#L76 -func darwinSupportsAVX512() bool { - return false -} diff --git a/internal/xcpu/cpu_linux.go b/internal/xcpu/cpu_linux.go deleted file mode 100644 index 10a48916..00000000 --- a/internal/xcpu/cpu_linux.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !386 && !amd64 && !amd64p32 && !arm64 - -package xcpu - -func archInit() { - if err := readHWCAP(); err != nil { - return - } - doinit() - Initialized = true -} diff --git a/internal/xcpu/cpu_linux_arm.go b/internal/xcpu/cpu_linux_arm.go deleted file mode 100644 index 28e32637..00000000 --- a/internal/xcpu/cpu_linux_arm.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -func doinit() { - ARM.HasSWP = isSet(hwCap, hwcap_SWP) - ARM.HasHALF = isSet(hwCap, hwcap_HALF) - ARM.HasTHUMB = isSet(hwCap, hwcap_THUMB) - ARM.Has26BIT = isSet(hwCap, hwcap_26BIT) - ARM.HasFASTMUL = isSet(hwCap, hwcap_FAST_MULT) - ARM.HasFPA = isSet(hwCap, hwcap_FPA) - ARM.HasVFP = isSet(hwCap, hwcap_VFP) - ARM.HasEDSP = isSet(hwCap, hwcap_EDSP) - ARM.HasJAVA = isSet(hwCap, hwcap_JAVA) - ARM.HasIWMMXT = isSet(hwCap, hwcap_IWMMXT) - ARM.HasCRUNCH = isSet(hwCap, hwcap_CRUNCH) - ARM.HasTHUMBEE = isSet(hwCap, hwcap_THUMBEE) - ARM.HasNEON = isSet(hwCap, hwcap_NEON) - ARM.HasVFPv3 = isSet(hwCap, hwcap_VFPv3) - ARM.HasVFPv3D16 = isSet(hwCap, hwcap_VFPv3D16) - ARM.HasTLS = isSet(hwCap, hwcap_TLS) - ARM.HasVFPv4 = isSet(hwCap, hwcap_VFPv4) - ARM.HasIDIVA = isSet(hwCap, hwcap_IDIVA) - ARM.HasIDIVT = isSet(hwCap, hwcap_IDIVT) - ARM.HasVFPD32 = isSet(hwCap, hwcap_VFPD32) - ARM.HasLPAE = isSet(hwCap, hwcap_LPAE) - ARM.HasEVTSTRM = isSet(hwCap, hwcap_EVTSTRM) - ARM.HasAES = isSet(hwCap2, hwcap2_AES) - ARM.HasPMULL = isSet(hwCap2, hwcap2_PMULL) - ARM.HasSHA1 = isSet(hwCap2, hwcap2_SHA1) - ARM.HasSHA2 = isSet(hwCap2, hwcap2_SHA2) - ARM.HasCRC32 = isSet(hwCap2, hwcap2_CRC32) -} - -func isSet(hwc uint, value uint) bool { - return hwc&value != 0 -} diff --git a/internal/xcpu/cpu_linux_arm64.go b/internal/xcpu/cpu_linux_arm64.go deleted file mode 100644 index 481f450b..00000000 --- a/internal/xcpu/cpu_linux_arm64.go +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -import ( - "strings" - "syscall" -) - -// HWCAP/HWCAP2 bits. These are exposed by Linux. -const ( - hwcap_FP = 1 << 0 - hwcap_ASIMD = 1 << 1 - hwcap_EVTSTRM = 1 << 2 - hwcap_AES = 1 << 3 - hwcap_PMULL = 1 << 4 - hwcap_SHA1 = 1 << 5 - hwcap_SHA2 = 1 << 6 - hwcap_CRC32 = 1 << 7 - hwcap_ATOMICS = 1 << 8 - hwcap_FPHP = 1 << 9 - hwcap_ASIMDHP = 1 << 10 - hwcap_CPUID = 1 << 11 - hwcap_ASIMDRDM = 1 << 12 - hwcap_JSCVT = 1 << 13 - hwcap_FCMA = 1 << 14 - hwcap_LRCPC = 1 << 15 - hwcap_DCPOP = 1 << 16 - hwcap_SHA3 = 1 << 17 - hwcap_SM3 = 1 << 18 - hwcap_SM4 = 1 << 19 - hwcap_ASIMDDP = 1 << 20 - hwcap_SHA512 = 1 << 21 - hwcap_SVE = 1 << 22 - hwcap_ASIMDFHM = 1 << 23 -) - -// linuxKernelCanEmulateCPUID reports whether we're running -// on Linux 4.11+. Ideally we'd like to ask the question about -// whether the current kernel contains -// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=77c97b4ee21290f5f083173d957843b615abbff2 -// but the version number will have to do. -func linuxKernelCanEmulateCPUID() bool { - var un syscall.Utsname - syscall.Uname(&un) - var sb strings.Builder - for _, b := range un.Release[:] { - if b == 0 { - break - } - sb.WriteByte(byte(b)) - } - major, minor, _, ok := parseRelease(sb.String()) - return ok && (major > 4 || major == 4 && minor >= 11) -} - -func doinit() { - if err := readHWCAP(); err != nil { - // We failed to read /proc/self/auxv. This can happen if the binary has - // been given extra capabilities(7) with /bin/setcap. - // - // When this happens, we have two options. If the Linux kernel is new - // enough (4.11+), we can read the arm64 registers directly which'll - // trap into the kernel and then return back to userspace. - // - // But on older kernels, such as Linux 4.4.180 as used on many Synology - // devices, calling readARM64Registers (specifically getisar0) will - // cause a SIGILL and we'll die. So for older kernels, parse /proc/cpuinfo - // instead. - // - // See golang/go#57336. - if linuxKernelCanEmulateCPUID() { - readARM64Registers() - } else { - readLinuxProcCPUInfo() - } - return - } - - // HWCAP feature bits - ARM64.HasFP = isSet(hwCap, hwcap_FP) - ARM64.HasASIMD = isSet(hwCap, hwcap_ASIMD) - ARM64.HasEVTSTRM = isSet(hwCap, hwcap_EVTSTRM) - ARM64.HasAES = isSet(hwCap, hwcap_AES) - ARM64.HasPMULL = isSet(hwCap, hwcap_PMULL) - ARM64.HasSHA1 = isSet(hwCap, hwcap_SHA1) - ARM64.HasSHA2 = isSet(hwCap, hwcap_SHA2) - ARM64.HasCRC32 = isSet(hwCap, hwcap_CRC32) - ARM64.HasATOMICS = isSet(hwCap, hwcap_ATOMICS) - ARM64.HasFPHP = isSet(hwCap, hwcap_FPHP) - ARM64.HasASIMDHP = isSet(hwCap, hwcap_ASIMDHP) - ARM64.HasCPUID = isSet(hwCap, hwcap_CPUID) - ARM64.HasASIMDRDM = isSet(hwCap, hwcap_ASIMDRDM) - ARM64.HasJSCVT = isSet(hwCap, hwcap_JSCVT) - ARM64.HasFCMA = isSet(hwCap, hwcap_FCMA) - ARM64.HasLRCPC = isSet(hwCap, hwcap_LRCPC) - ARM64.HasDCPOP = isSet(hwCap, hwcap_DCPOP) - ARM64.HasSHA3 = isSet(hwCap, hwcap_SHA3) - ARM64.HasSM3 = isSet(hwCap, hwcap_SM3) - ARM64.HasSM4 = isSet(hwCap, hwcap_SM4) - ARM64.HasASIMDDP = isSet(hwCap, hwcap_ASIMDDP) - ARM64.HasSHA512 = isSet(hwCap, hwcap_SHA512) - ARM64.HasSVE = isSet(hwCap, hwcap_SVE) - ARM64.HasASIMDFHM = isSet(hwCap, hwcap_ASIMDFHM) -} - -func isSet(hwc uint, value uint) bool { - return hwc&value != 0 -} diff --git a/internal/xcpu/cpu_linux_mips64x.go b/internal/xcpu/cpu_linux_mips64x.go deleted file mode 100644 index 15fdee9c..00000000 --- a/internal/xcpu/cpu_linux_mips64x.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build linux && (mips64 || mips64le) - -package xcpu - -// HWCAP bits. These are exposed by the Linux kernel 5.4. -const ( - // CPU features - hwcap_MIPS_MSA = 1 << 1 -) - -func doinit() { - // HWCAP feature bits - MIPS64X.HasMSA = isSet(hwCap, hwcap_MIPS_MSA) -} - -func isSet(hwc uint, value uint) bool { - return hwc&value != 0 -} diff --git a/internal/xcpu/cpu_linux_noinit.go b/internal/xcpu/cpu_linux_noinit.go deleted file mode 100644 index 878e56fb..00000000 --- a/internal/xcpu/cpu_linux_noinit.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x - -package xcpu - -func doinit() {} diff --git a/internal/xcpu/cpu_linux_ppc64x.go b/internal/xcpu/cpu_linux_ppc64x.go deleted file mode 100644 index 6a8ea12a..00000000 --- a/internal/xcpu/cpu_linux_ppc64x.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build linux && (ppc64 || ppc64le) - -package xcpu - -// HWCAP/HWCAP2 bits. These are exposed by the kernel. -const ( - // ISA Level - _PPC_FEATURE2_ARCH_2_07 = 0x80000000 - _PPC_FEATURE2_ARCH_3_00 = 0x00800000 - - // CPU features - _PPC_FEATURE2_DARN = 0x00200000 - _PPC_FEATURE2_SCV = 0x00100000 -) - -func doinit() { - // HWCAP2 feature bits - PPC64.IsPOWER8 = isSet(hwCap2, _PPC_FEATURE2_ARCH_2_07) - PPC64.IsPOWER9 = isSet(hwCap2, _PPC_FEATURE2_ARCH_3_00) - PPC64.HasDARN = isSet(hwCap2, _PPC_FEATURE2_DARN) - PPC64.HasSCV = isSet(hwCap2, _PPC_FEATURE2_SCV) -} - -func isSet(hwc uint, value uint) bool { - return hwc&value != 0 -} diff --git a/internal/xcpu/cpu_linux_s390x.go b/internal/xcpu/cpu_linux_s390x.go deleted file mode 100644 index ff0ca7f4..00000000 --- a/internal/xcpu/cpu_linux_s390x.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -const ( - // bit mask values from /usr/include/bits/hwcap.h - hwcap_ZARCH = 2 - hwcap_STFLE = 4 - hwcap_MSA = 8 - hwcap_LDISP = 16 - hwcap_EIMM = 32 - hwcap_DFP = 64 - hwcap_ETF3EH = 256 - hwcap_VX = 2048 - hwcap_VXE = 8192 -) - -func initS390Xbase() { - // test HWCAP bit vector - has := func(featureMask uint) bool { - return hwCap&featureMask == featureMask - } - - // mandatory - S390X.HasZARCH = has(hwcap_ZARCH) - - // optional - S390X.HasSTFLE = has(hwcap_STFLE) - S390X.HasLDISP = has(hwcap_LDISP) - S390X.HasEIMM = has(hwcap_EIMM) - S390X.HasETF3EH = has(hwcap_ETF3EH) - S390X.HasDFP = has(hwcap_DFP) - S390X.HasMSA = has(hwcap_MSA) - S390X.HasVX = has(hwcap_VX) - if S390X.HasVX { - S390X.HasVXE = has(hwcap_VXE) - } -} diff --git a/internal/xcpu/cpu_loong64.go b/internal/xcpu/cpu_loong64.go deleted file mode 100644 index fdb21c60..00000000 --- a/internal/xcpu/cpu_loong64.go +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build loong64 - -package xcpu - -const cacheLineSize = 64 - -func initOptions() { -} diff --git a/internal/xcpu/cpu_mips64x.go b/internal/xcpu/cpu_mips64x.go deleted file mode 100644 index 447fee98..00000000 --- a/internal/xcpu/cpu_mips64x.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build mips64 || mips64le - -package xcpu - -const cacheLineSize = 32 - -func initOptions() { - options = []option{ - {Name: "msa", Feature: &MIPS64X.HasMSA}, - } -} diff --git a/internal/xcpu/cpu_mipsx.go b/internal/xcpu/cpu_mipsx.go deleted file mode 100644 index 6efa1917..00000000 --- a/internal/xcpu/cpu_mipsx.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build mips || mipsle - -package xcpu - -const cacheLineSize = 32 - -func initOptions() {} diff --git a/internal/xcpu/cpu_netbsd_arm64.go b/internal/xcpu/cpu_netbsd_arm64.go deleted file mode 100644 index b84b4408..00000000 --- a/internal/xcpu/cpu_netbsd_arm64.go +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -import ( - "syscall" - "unsafe" -) - -// Minimal copy of functionality from x/sys/unix so the cpu package can call -// sysctl without depending on x/sys/unix. - -const ( - _CTL_QUERY = -2 - - _SYSCTL_VERS_1 = 0x1000000 -) - -var _zero uintptr - -func sysctl(mib []int32, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { - var _p0 unsafe.Pointer - if len(mib) > 0 { - _p0 = unsafe.Pointer(&mib[0]) - } else { - _p0 = unsafe.Pointer(&_zero) - } - _, _, errno := syscall.Syscall6( - syscall.SYS___SYSCTL, - uintptr(_p0), - uintptr(len(mib)), - uintptr(unsafe.Pointer(old)), - uintptr(unsafe.Pointer(oldlen)), - uintptr(unsafe.Pointer(new)), - uintptr(newlen)) - if errno != 0 { - return errno - } - return nil -} - -type sysctlNode struct { - Flags uint32 - Num int32 - Name [32]int8 - Ver uint32 - __rsvd uint32 - Un [16]byte - _sysctl_size [8]byte - _sysctl_func [8]byte - _sysctl_parent [8]byte - _sysctl_desc [8]byte -} - -func sysctlNodes(mib []int32) ([]sysctlNode, error) { - var olen uintptr - - // Get a list of all sysctl nodes below the given MIB by performing - // a sysctl for the given MIB with CTL_QUERY appended. - mib = append(mib, _CTL_QUERY) - qnode := sysctlNode{Flags: _SYSCTL_VERS_1} - qp := (*byte)(unsafe.Pointer(&qnode)) - sz := unsafe.Sizeof(qnode) - if err := sysctl(mib, nil, &olen, qp, sz); err != nil { - return nil, err - } - - // Now that we know the size, get the actual nodes. - nodes := make([]sysctlNode, olen/sz) - np := (*byte)(unsafe.Pointer(&nodes[0])) - if err := sysctl(mib, np, &olen, qp, sz); err != nil { - return nil, err - } - - return nodes, nil -} - -func nametomib(name string) ([]int32, error) { - // Split name into components. - var parts []string - last := 0 - for i := 0; i < len(name); i++ { - if name[i] == '.' { - parts = append(parts, name[last:i]) - last = i + 1 - } - } - parts = append(parts, name[last:]) - - mib := []int32{} - // Discover the nodes and construct the MIB OID. - for partno, part := range parts { - nodes, err := sysctlNodes(mib) - if err != nil { - return nil, err - } - for _, node := range nodes { - n := make([]byte, 0) - for i := range node.Name { - if node.Name[i] != 0 { - n = append(n, byte(node.Name[i])) - } - } - if string(n) == part { - mib = append(mib, int32(node.Num)) - break - } - } - if len(mib) != partno+1 { - return nil, err - } - } - - return mib, nil -} - -// aarch64SysctlCPUID is struct aarch64_sysctl_cpu_id from NetBSD's -type aarch64SysctlCPUID struct { - midr uint64 /* Main ID Register */ - revidr uint64 /* Revision ID Register */ - mpidr uint64 /* Multiprocessor Affinity Register */ - aa64dfr0 uint64 /* A64 Debug Feature Register 0 */ - aa64dfr1 uint64 /* A64 Debug Feature Register 1 */ - aa64isar0 uint64 /* A64 Instruction Set Attribute Register 0 */ - aa64isar1 uint64 /* A64 Instruction Set Attribute Register 1 */ - aa64mmfr0 uint64 /* A64 Memory Model Feature Register 0 */ - aa64mmfr1 uint64 /* A64 Memory Model Feature Register 1 */ - aa64mmfr2 uint64 /* A64 Memory Model Feature Register 2 */ - aa64pfr0 uint64 /* A64 Processor Feature Register 0 */ - aa64pfr1 uint64 /* A64 Processor Feature Register 1 */ - aa64zfr0 uint64 /* A64 SVE Feature ID Register 0 */ - mvfr0 uint32 /* Media and VFP Feature Register 0 */ - mvfr1 uint32 /* Media and VFP Feature Register 1 */ - mvfr2 uint32 /* Media and VFP Feature Register 2 */ - pad uint32 - clidr uint64 /* Cache Level ID Register */ - ctr uint64 /* Cache Type Register */ -} - -func sysctlCPUID(name string) (*aarch64SysctlCPUID, error) { - mib, err := nametomib(name) - if err != nil { - return nil, err - } - - out := aarch64SysctlCPUID{} - n := unsafe.Sizeof(out) - _, _, errno := syscall.Syscall6( - syscall.SYS___SYSCTL, - uintptr(unsafe.Pointer(&mib[0])), - uintptr(len(mib)), - uintptr(unsafe.Pointer(&out)), - uintptr(unsafe.Pointer(&n)), - uintptr(0), - uintptr(0)) - if errno != 0 { - return nil, errno - } - return &out, nil -} - -func doinit() { - cpuid, err := sysctlCPUID("machdep.cpu0.cpu_id") - if err != nil { - setMinimalFeatures() - return - } - parseARM64SystemRegisters(cpuid.aa64isar0, cpuid.aa64isar1, cpuid.aa64pfr0) - - Initialized = true -} diff --git a/internal/xcpu/cpu_openbsd_arm64.go b/internal/xcpu/cpu_openbsd_arm64.go deleted file mode 100644 index 2459a486..00000000 --- a/internal/xcpu/cpu_openbsd_arm64.go +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -import ( - "syscall" - "unsafe" -) - -// Minimal copy of functionality from x/sys/unix so the cpu package can call -// sysctl without depending on x/sys/unix. - -const ( - // From OpenBSD's sys/sysctl.h. - _CTL_MACHDEP = 7 - - // From OpenBSD's machine/cpu.h. - _CPU_ID_AA64ISAR0 = 2 - _CPU_ID_AA64ISAR1 = 3 -) - -// Implemented in the runtime package (runtime/sys_openbsd3.go) -func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) - -//go:linkname syscall_syscall6 syscall.syscall6 - -func sysctl(mib []uint32, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { - _, _, errno := syscall_syscall6(libc_sysctl_trampoline_addr, uintptr(unsafe.Pointer(&mib[0])), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) - if errno != 0 { - return errno - } - return nil -} - -var libc_sysctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sysctl sysctl "libc.so" - -func sysctlUint64(mib []uint32) (uint64, bool) { - var out uint64 - nout := unsafe.Sizeof(out) - if err := sysctl(mib, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0); err != nil { - return 0, false - } - return out, true -} - -func doinit() { - setMinimalFeatures() - - // Get ID_AA64ISAR0 and ID_AA64ISAR1 from sysctl. - isar0, ok := sysctlUint64([]uint32{_CTL_MACHDEP, _CPU_ID_AA64ISAR0}) - if !ok { - return - } - isar1, ok := sysctlUint64([]uint32{_CTL_MACHDEP, _CPU_ID_AA64ISAR1}) - if !ok { - return - } - parseARM64SystemRegisters(isar0, isar1, 0) - - Initialized = true -} diff --git a/internal/xcpu/cpu_openbsd_arm64.s b/internal/xcpu/cpu_openbsd_arm64.s deleted file mode 100644 index 054ba05d..00000000 --- a/internal/xcpu/cpu_openbsd_arm64.s +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 - JMP libc_sysctl(SB) - -GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 -DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) diff --git a/internal/xcpu/cpu_other_arm.go b/internal/xcpu/cpu_other_arm.go deleted file mode 100644 index e3247948..00000000 --- a/internal/xcpu/cpu_other_arm.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !linux && arm - -package xcpu - -func archInit() {} diff --git a/internal/xcpu/cpu_other_arm64.go b/internal/xcpu/cpu_other_arm64.go deleted file mode 100644 index 5257a0b6..00000000 --- a/internal/xcpu/cpu_other_arm64.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !linux && !netbsd && !openbsd && arm64 - -package xcpu - -func doinit() {} diff --git a/internal/xcpu/cpu_other_mips64x.go b/internal/xcpu/cpu_other_mips64x.go deleted file mode 100644 index b1ddc9d5..00000000 --- a/internal/xcpu/cpu_other_mips64x.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !linux && (mips64 || mips64le) - -package xcpu - -func archInit() { - Initialized = true -} diff --git a/internal/xcpu/cpu_other_ppc64x.go b/internal/xcpu/cpu_other_ppc64x.go deleted file mode 100644 index 00a08baa..00000000 --- a/internal/xcpu/cpu_other_ppc64x.go +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !aix && !linux && (ppc64 || ppc64le) - -package xcpu - -func archInit() { - PPC64.IsPOWER8 = true - Initialized = true -} diff --git a/internal/xcpu/cpu_other_riscv64.go b/internal/xcpu/cpu_other_riscv64.go deleted file mode 100644 index 7f8fd1fc..00000000 --- a/internal/xcpu/cpu_other_riscv64.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !linux && riscv64 - -package xcpu - -func archInit() { - Initialized = true -} diff --git a/internal/xcpu/cpu_ppc64x.go b/internal/xcpu/cpu_ppc64x.go deleted file mode 100644 index 22afeec2..00000000 --- a/internal/xcpu/cpu_ppc64x.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build ppc64 || ppc64le - -package xcpu - -const cacheLineSize = 128 - -func initOptions() { - options = []option{ - {Name: "darn", Feature: &PPC64.HasDARN}, - {Name: "scv", Feature: &PPC64.HasSCV}, - } -} diff --git a/internal/xcpu/cpu_riscv64.go b/internal/xcpu/cpu_riscv64.go deleted file mode 100644 index 28e57b68..00000000 --- a/internal/xcpu/cpu_riscv64.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build riscv64 - -package xcpu - -const cacheLineSize = 64 - -func initOptions() {} diff --git a/internal/xcpu/cpu_s390x.go b/internal/xcpu/cpu_s390x.go deleted file mode 100644 index e85a8c5d..00000000 --- a/internal/xcpu/cpu_s390x.go +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -const cacheLineSize = 256 - -func initOptions() { - options = []option{ - {Name: "zarch", Feature: &S390X.HasZARCH, Required: true}, - {Name: "stfle", Feature: &S390X.HasSTFLE, Required: true}, - {Name: "ldisp", Feature: &S390X.HasLDISP, Required: true}, - {Name: "eimm", Feature: &S390X.HasEIMM, Required: true}, - {Name: "dfp", Feature: &S390X.HasDFP}, - {Name: "etf3eh", Feature: &S390X.HasETF3EH}, - {Name: "msa", Feature: &S390X.HasMSA}, - {Name: "aes", Feature: &S390X.HasAES}, - {Name: "aescbc", Feature: &S390X.HasAESCBC}, - {Name: "aesctr", Feature: &S390X.HasAESCTR}, - {Name: "aesgcm", Feature: &S390X.HasAESGCM}, - {Name: "ghash", Feature: &S390X.HasGHASH}, - {Name: "sha1", Feature: &S390X.HasSHA1}, - {Name: "sha256", Feature: &S390X.HasSHA256}, - {Name: "sha3", Feature: &S390X.HasSHA3}, - {Name: "sha512", Feature: &S390X.HasSHA512}, - {Name: "vx", Feature: &S390X.HasVX}, - {Name: "vxe", Feature: &S390X.HasVXE}, - } -} - -// bitIsSet reports whether the bit at index is set. The bit index -// is in big endian order, so bit index 0 is the leftmost bit. -func bitIsSet(bits []uint64, index uint) bool { - return bits[index/64]&((1<<63)>>(index%64)) != 0 -} - -// facility is a bit index for the named facility. -type facility uint8 - -const ( - // mandatory facilities - zarch facility = 1 // z architecture mode is active - stflef facility = 7 // store-facility-list-extended - ldisp facility = 18 // long-displacement - eimm facility = 21 // extended-immediate - - // miscellaneous facilities - dfp facility = 42 // decimal-floating-point - etf3eh facility = 30 // extended-translation 3 enhancement - - // cryptography facilities - msa facility = 17 // message-security-assist - msa3 facility = 76 // message-security-assist extension 3 - msa4 facility = 77 // message-security-assist extension 4 - msa5 facility = 57 // message-security-assist extension 5 - msa8 facility = 146 // message-security-assist extension 8 - msa9 facility = 155 // message-security-assist extension 9 - - // vector facilities - vx facility = 129 // vector facility - vxe facility = 135 // vector-enhancements 1 - vxe2 facility = 148 // vector-enhancements 2 -) - -// facilityList contains the result of an STFLE call. -// Bits are numbered in big endian order so the -// leftmost bit (the MSB) is at index 0. -type facilityList struct { - bits [4]uint64 -} - -// Has reports whether the given facilities are present. -func (s *facilityList) Has(fs ...facility) bool { - if len(fs) == 0 { - panic("no facility bits provided") - } - for _, f := range fs { - if !bitIsSet(s.bits[:], uint(f)) { - return false - } - } - return true -} - -// function is the code for the named cryptographic function. -type function uint8 - -const ( - // KM{,A,C,CTR} function codes - aes128 function = 18 // AES-128 - aes192 function = 19 // AES-192 - aes256 function = 20 // AES-256 - - // K{I,L}MD function codes - sha1 function = 1 // SHA-1 - sha256 function = 2 // SHA-256 - sha512 function = 3 // SHA-512 - sha3_224 function = 32 // SHA3-224 - sha3_256 function = 33 // SHA3-256 - sha3_384 function = 34 // SHA3-384 - sha3_512 function = 35 // SHA3-512 - shake128 function = 36 // SHAKE-128 - shake256 function = 37 // SHAKE-256 - - // KLMD function codes - ghash function = 65 // GHASH -) - -// queryResult contains the result of a Query function -// call. Bits are numbered in big endian order so the -// leftmost bit (the MSB) is at index 0. -type queryResult struct { - bits [2]uint64 -} - -// Has reports whether the given functions are present. -func (q *queryResult) Has(fns ...function) bool { - if len(fns) == 0 { - panic("no function codes provided") - } - for _, f := range fns { - if !bitIsSet(q.bits[:], uint(f)) { - return false - } - } - return true -} - -func doinit() { - initS390Xbase() - - // We need implementations of stfle, km and so on - // to detect cryptographic features. - if !haveAsmFunctions() { - return - } - - // optional cryptographic functions - if S390X.HasMSA { - aes := []function{aes128, aes192, aes256} - - // cipher message - km, kmc := kmQuery(), kmcQuery() - S390X.HasAES = km.Has(aes...) - S390X.HasAESCBC = kmc.Has(aes...) - if S390X.HasSTFLE { - facilities := stfle() - if facilities.Has(msa4) { - kmctr := kmctrQuery() - S390X.HasAESCTR = kmctr.Has(aes...) - } - if facilities.Has(msa8) { - kma := kmaQuery() - S390X.HasAESGCM = kma.Has(aes...) - } - } - - // compute message digest - kimd := kimdQuery() // intermediate (no padding) - klmd := klmdQuery() // last (padding) - S390X.HasSHA1 = kimd.Has(sha1) && klmd.Has(sha1) - S390X.HasSHA256 = kimd.Has(sha256) && klmd.Has(sha256) - S390X.HasSHA512 = kimd.Has(sha512) && klmd.Has(sha512) - S390X.HasGHASH = kimd.Has(ghash) // KLMD-GHASH does not exist - sha3 := []function{ - sha3_224, sha3_256, sha3_384, sha3_512, - shake128, shake256, - } - S390X.HasSHA3 = kimd.Has(sha3...) && klmd.Has(sha3...) - } -} diff --git a/internal/xcpu/cpu_s390x.s b/internal/xcpu/cpu_s390x.s deleted file mode 100644 index 1fb4b701..00000000 --- a/internal/xcpu/cpu_s390x.s +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc - -#include "textflag.h" - -// func stfle() facilityList -TEXT ·stfle(SB), NOSPLIT|NOFRAME, $0-32 - MOVD $ret+0(FP), R1 - MOVD $3, R0 // last doubleword index to store - XC $32, (R1), (R1) // clear 4 doublewords (32 bytes) - WORD $0xb2b01000 // store facility list extended (STFLE) - RET - -// func kmQuery() queryResult -TEXT ·kmQuery(SB), NOSPLIT|NOFRAME, $0-16 - MOVD $0, R0 // set function code to 0 (KM-Query) - MOVD $ret+0(FP), R1 // address of 16-byte return value - WORD $0xB92E0024 // cipher message (KM) - RET - -// func kmcQuery() queryResult -TEXT ·kmcQuery(SB), NOSPLIT|NOFRAME, $0-16 - MOVD $0, R0 // set function code to 0 (KMC-Query) - MOVD $ret+0(FP), R1 // address of 16-byte return value - WORD $0xB92F0024 // cipher message with chaining (KMC) - RET - -// func kmctrQuery() queryResult -TEXT ·kmctrQuery(SB), NOSPLIT|NOFRAME, $0-16 - MOVD $0, R0 // set function code to 0 (KMCTR-Query) - MOVD $ret+0(FP), R1 // address of 16-byte return value - WORD $0xB92D4024 // cipher message with counter (KMCTR) - RET - -// func kmaQuery() queryResult -TEXT ·kmaQuery(SB), NOSPLIT|NOFRAME, $0-16 - MOVD $0, R0 // set function code to 0 (KMA-Query) - MOVD $ret+0(FP), R1 // address of 16-byte return value - WORD $0xb9296024 // cipher message with authentication (KMA) - RET - -// func kimdQuery() queryResult -TEXT ·kimdQuery(SB), NOSPLIT|NOFRAME, $0-16 - MOVD $0, R0 // set function code to 0 (KIMD-Query) - MOVD $ret+0(FP), R1 // address of 16-byte return value - WORD $0xB93E0024 // compute intermediate message digest (KIMD) - RET - -// func klmdQuery() queryResult -TEXT ·klmdQuery(SB), NOSPLIT|NOFRAME, $0-16 - MOVD $0, R0 // set function code to 0 (KLMD-Query) - MOVD $ret+0(FP), R1 // address of 16-byte return value - WORD $0xB93F0024 // compute last message digest (KLMD) - RET diff --git a/internal/xcpu/cpu_wasm.go b/internal/xcpu/cpu_wasm.go deleted file mode 100644 index 230aaab4..00000000 --- a/internal/xcpu/cpu_wasm.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build wasm - -package xcpu - -// We're compiling the cpu package for an unknown (software-abstracted) CPU. -// Make CacheLinePad an empty struct and hope that the usual struct alignment -// rules are good enough. - -const cacheLineSize = 0 - -func initOptions() {} - -func archInit() {} diff --git a/internal/xcpu/cpu_x86.go b/internal/xcpu/cpu_x86.go deleted file mode 100644 index d2f83468..00000000 --- a/internal/xcpu/cpu_x86.go +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build 386 || amd64 || amd64p32 - -package xcpu - -import "runtime" - -const cacheLineSize = 64 - -func initOptions() { - options = []option{ - {Name: "adx", Feature: &X86.HasADX}, - {Name: "aes", Feature: &X86.HasAES}, - {Name: "avx", Feature: &X86.HasAVX}, - {Name: "avx2", Feature: &X86.HasAVX2}, - {Name: "avx512", Feature: &X86.HasAVX512}, - {Name: "avx512f", Feature: &X86.HasAVX512F}, - {Name: "avx512cd", Feature: &X86.HasAVX512CD}, - {Name: "avx512er", Feature: &X86.HasAVX512ER}, - {Name: "avx512pf", Feature: &X86.HasAVX512PF}, - {Name: "avx512vl", Feature: &X86.HasAVX512VL}, - {Name: "avx512bw", Feature: &X86.HasAVX512BW}, - {Name: "avx512dq", Feature: &X86.HasAVX512DQ}, - {Name: "avx512ifma", Feature: &X86.HasAVX512IFMA}, - {Name: "avx512vbmi", Feature: &X86.HasAVX512VBMI}, - {Name: "avx512vnniw", Feature: &X86.HasAVX5124VNNIW}, - {Name: "avx5124fmaps", Feature: &X86.HasAVX5124FMAPS}, - {Name: "avx512vpopcntdq", Feature: &X86.HasAVX512VPOPCNTDQ}, - {Name: "avx512vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ}, - {Name: "avx512vnni", Feature: &X86.HasAVX512VNNI}, - {Name: "avx512gfni", Feature: &X86.HasAVX512GFNI}, - {Name: "avx512vaes", Feature: &X86.HasAVX512VAES}, - {Name: "avx512vbmi2", Feature: &X86.HasAVX512VBMI2}, - {Name: "avx512bitalg", Feature: &X86.HasAVX512BITALG}, - {Name: "avx512bf16", Feature: &X86.HasAVX512BF16}, - {Name: "amxtile", Feature: &X86.HasAMXTile}, - {Name: "amxint8", Feature: &X86.HasAMXInt8}, - {Name: "amxbf16", Feature: &X86.HasAMXBF16}, - {Name: "bmi1", Feature: &X86.HasBMI1}, - {Name: "bmi2", Feature: &X86.HasBMI2}, - {Name: "cx16", Feature: &X86.HasCX16}, - {Name: "erms", Feature: &X86.HasERMS}, - {Name: "fma", Feature: &X86.HasFMA}, - {Name: "osxsave", Feature: &X86.HasOSXSAVE}, - {Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ}, - {Name: "popcnt", Feature: &X86.HasPOPCNT}, - {Name: "rdrand", Feature: &X86.HasRDRAND}, - {Name: "rdseed", Feature: &X86.HasRDSEED}, - {Name: "sse3", Feature: &X86.HasSSE3}, - {Name: "sse41", Feature: &X86.HasSSE41}, - {Name: "sse42", Feature: &X86.HasSSE42}, - {Name: "ssse3", Feature: &X86.HasSSSE3}, - - // These capabilities should always be enabled on amd64: - {Name: "sse2", Feature: &X86.HasSSE2, Required: runtime.GOARCH == "amd64"}, - } -} - -func archInit() { - - Initialized = true - - maxID, _, _, _ := cpuid(0, 0) - - if maxID < 1 { - return - } - - _, _, ecx1, edx1 := cpuid(1, 0) - X86.HasSSE2 = isSet(26, edx1) - - X86.HasSSE3 = isSet(0, ecx1) - X86.HasPCLMULQDQ = isSet(1, ecx1) - X86.HasSSSE3 = isSet(9, ecx1) - X86.HasFMA = isSet(12, ecx1) - X86.HasCX16 = isSet(13, ecx1) - X86.HasSSE41 = isSet(19, ecx1) - X86.HasSSE42 = isSet(20, ecx1) - X86.HasPOPCNT = isSet(23, ecx1) - X86.HasAES = isSet(25, ecx1) - X86.HasOSXSAVE = isSet(27, ecx1) - X86.HasRDRAND = isSet(30, ecx1) - - var osSupportsAVX, osSupportsAVX512 bool - // For XGETBV, OSXSAVE bit is required and sufficient. - if X86.HasOSXSAVE { - eax, _ := xgetbv() - // Check if XMM and YMM registers have OS support. - osSupportsAVX = isSet(1, eax) && isSet(2, eax) - - if runtime.GOOS == "darwin" { - // Darwin doesn't save/restore AVX-512 mask registers correctly across signal handlers. - // Since users can't rely on mask register contents, let's not advertise AVX-512 support. - // See issue 49233. - osSupportsAVX512 = false - } else { - // Check if OPMASK and ZMM registers have OS support. - osSupportsAVX512 = osSupportsAVX && isSet(5, eax) && isSet(6, eax) && isSet(7, eax) - } - } - - X86.HasAVX = isSet(28, ecx1) && osSupportsAVX - - if maxID < 7 { - return - } - - _, ebx7, ecx7, edx7 := cpuid(7, 0) - X86.HasBMI1 = isSet(3, ebx7) - X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX - X86.HasBMI2 = isSet(8, ebx7) - X86.HasERMS = isSet(9, ebx7) - X86.HasRDSEED = isSet(18, ebx7) - X86.HasADX = isSet(19, ebx7) - - X86.HasAVX512 = isSet(16, ebx7) && osSupportsAVX512 // Because avx-512 foundation is the core required extension - if X86.HasAVX512 { - X86.HasAVX512F = true - X86.HasAVX512CD = isSet(28, ebx7) - X86.HasAVX512ER = isSet(27, ebx7) - X86.HasAVX512PF = isSet(26, ebx7) - X86.HasAVX512VL = isSet(31, ebx7) - X86.HasAVX512BW = isSet(30, ebx7) - X86.HasAVX512DQ = isSet(17, ebx7) - X86.HasAVX512IFMA = isSet(21, ebx7) - X86.HasAVX512VBMI = isSet(1, ecx7) - X86.HasAVX5124VNNIW = isSet(2, edx7) - X86.HasAVX5124FMAPS = isSet(3, edx7) - X86.HasAVX512VPOPCNTDQ = isSet(14, ecx7) - X86.HasAVX512VPCLMULQDQ = isSet(10, ecx7) - X86.HasAVX512VNNI = isSet(11, ecx7) - X86.HasAVX512GFNI = isSet(8, ecx7) - X86.HasAVX512VAES = isSet(9, ecx7) - X86.HasAVX512VBMI2 = isSet(6, ecx7) - X86.HasAVX512BITALG = isSet(12, ecx7) - - eax71, _, _, _ := cpuid(7, 1) - X86.HasAVX512BF16 = isSet(5, eax71) - } - - X86.HasAMXTile = isSet(24, edx7) - X86.HasAMXInt8 = isSet(25, edx7) - X86.HasAMXBF16 = isSet(22, edx7) -} - -func isSet(bitpos uint, value uint32) bool { - return value&(1<> 63)) -) - -// For those platforms don't have a 'cpuid' equivalent we use HWCAP/HWCAP2 -// These are initialized in cpu_$GOARCH.go -// and should not be changed after they are initialized. -var hwCap uint -var hwCap2 uint - -func readHWCAP() error { - // For Go 1.21+, get auxv from the Go runtime. - if a := getAuxv(); len(a) > 0 { - for len(a) >= 2 { - tag, val := a[0], uint(a[1]) - a = a[2:] - switch tag { - case _AT_HWCAP: - hwCap = val - case _AT_HWCAP2: - hwCap2 = val - } - } - return nil - } - - buf, err := os.ReadFile(procAuxv) - if err != nil { - // e.g. on android /proc/self/auxv is not accessible, so silently - // ignore the error and leave Initialized = false. On some - // architectures (e.g. arm64) doinit() implements a fallback - // readout and will set Initialized = true again. - return err - } - bo := hostByteOrder() - for len(buf) >= 2*(uintSize/8) { - var tag, val uint - switch uintSize { - case 32: - tag = uint(bo.Uint32(buf[0:])) - val = uint(bo.Uint32(buf[4:])) - buf = buf[8:] - case 64: - tag = uint(bo.Uint64(buf[0:])) - val = uint(bo.Uint64(buf[8:])) - buf = buf[16:] - } - switch tag { - case _AT_HWCAP: - hwCap = val - case _AT_HWCAP2: - hwCap2 = val - } - } - return nil -} diff --git a/internal/xcpu/parse.go b/internal/xcpu/parse.go deleted file mode 100644 index be30b60f..00000000 --- a/internal/xcpu/parse.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -import "strconv" - -// parseRelease parses a dot-separated version number. It follows the semver -// syntax, but allows the minor and patch versions to be elided. -// -// This is a copy of the Go runtime's parseRelease from -// https://golang.org/cl/209597. -func parseRelease(rel string) (major, minor, patch int, ok bool) { - // Strip anything after a dash or plus. - for i := 0; i < len(rel); i++ { - if rel[i] == '-' || rel[i] == '+' { - rel = rel[:i] - break - } - } - - next := func() (int, bool) { - for i := 0; i < len(rel); i++ { - if rel[i] == '.' { - ver, err := strconv.Atoi(rel[:i]) - rel = rel[i+1:] - return ver, err == nil - } - } - ver, err := strconv.Atoi(rel) - rel = "" - return ver, err == nil - } - if major, ok = next(); !ok || rel == "" { - return - } - if minor, ok = next(); !ok || rel == "" { - return - } - patch, ok = next() - return -} diff --git a/internal/xcpu/proc_cpuinfo_linux.go b/internal/xcpu/proc_cpuinfo_linux.go deleted file mode 100644 index 9c88d24e..00000000 --- a/internal/xcpu/proc_cpuinfo_linux.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build linux && arm64 - -package xcpu - -import ( - "errors" - "io" - "os" - "strings" -) - -func readLinuxProcCPUInfo() error { - f, err := os.Open("/proc/cpuinfo") - if err != nil { - return err - } - defer f.Close() - - var buf [1 << 10]byte // enough for first CPU - n, err := io.ReadFull(f, buf[:]) - if err != nil && err != io.ErrUnexpectedEOF { - return err - } - in := string(buf[:n]) - const features = "\nFeatures : " - i := strings.Index(in, features) - if i == -1 { - return errors.New("no CPU features found") - } - in = in[i+len(features):] - if i := strings.Index(in, "\n"); i != -1 { - in = in[:i] - } - m := map[string]*bool{} - - initOptions() // need it early here; it's harmless to call twice - for _, o := range options { - m[o.Name] = o.Feature - } - // The EVTSTRM field has alias "evstrm" in Go, but Linux calls it "evtstrm". - m["evtstrm"] = &ARM64.HasEVTSTRM - - for _, f := range strings.Fields(in) { - if p, ok := m[f]; ok { - *p = true - } - } - return nil -} diff --git a/internal/xcpu/runtime_auxv.go b/internal/xcpu/runtime_auxv.go deleted file mode 100644 index b842842e..00000000 --- a/internal/xcpu/runtime_auxv.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2023 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package xcpu - -// getAuxvFn is non-nil on Go 1.21+ (via runtime_auxv_go121.go init) -// on platforms that use auxv. -var getAuxvFn func() []uintptr - -func getAuxv() []uintptr { - if getAuxvFn == nil { - return nil - } - return getAuxvFn() -} diff --git a/internal/xcpu/runtime_auxv_go121.go b/internal/xcpu/runtime_auxv_go121.go deleted file mode 100644 index b4dba06a..00000000 --- a/internal/xcpu/runtime_auxv_go121.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2023 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.21 - -package xcpu - -import ( - _ "unsafe" // for linkname -) - -//go:linkname runtime_getAuxv runtime.getAuxv -func runtime_getAuxv() []uintptr - -func init() { - getAuxvFn = runtime_getAuxv -} diff --git a/internal/xcpu/syscall_aix_gccgo.go b/internal/xcpu/syscall_aix_gccgo.go deleted file mode 100644 index 905566fe..00000000 --- a/internal/xcpu/syscall_aix_gccgo.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Recreate a getsystemcfg syscall handler instead of -// using the one provided by x/sys/unix to avoid having -// the dependency between them. (See golang.org/issue/32102) -// Moreover, this file will be used during the building of -// gccgo's libgo and thus must not used a CGo method. - -//go:build aix && gccgo - -package xcpu - -import ( - "syscall" -) - -//extern getsystemcfg -func gccgoGetsystemcfg(label uint32) (r uint64) - -func callgetsystemcfg(label int) (r1 uintptr, e1 syscall.Errno) { - r1 = uintptr(gccgoGetsystemcfg(uint32(label))) - e1 = syscall.GetErrno() - return -} diff --git a/internal/xcpu/syscall_aix_ppc64_gc.go b/internal/xcpu/syscall_aix_ppc64_gc.go deleted file mode 100644 index 18837396..00000000 --- a/internal/xcpu/syscall_aix_ppc64_gc.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Minimal copy of x/sys/unix so the cpu package can make a -// system call on AIX without depending on x/sys/unix. -// (See golang.org/issue/32102) - -//go:build aix && ppc64 && gc - -package xcpu - -import ( - "syscall" - "unsafe" -) - -//go:cgo_import_dynamic libc_getsystemcfg getsystemcfg "libc.a/shr_64.o" - -//go:linkname libc_getsystemcfg libc_getsystemcfg - -type syscallFunc uintptr - -var libc_getsystemcfg syscallFunc - -type errno = syscall.Errno - -// Implemented in runtime/syscall_aix.go. -func rawSyscall6(trap, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err errno) -func syscall6(trap, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err errno) - -func callgetsystemcfg(label int) (r1 uintptr, e1 errno) { - r1, _, e1 = syscall6(uintptr(unsafe.Pointer(&libc_getsystemcfg)), 1, uintptr(label), 0, 0, 0, 0, 0) - return -} diff --git a/mask_amd64.s b/mask_amd64.s index caca53ec..bd42be31 100644 --- a/mask_amd64.s +++ b/mask_amd64.s @@ -26,11 +26,6 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28 TESTQ $31, AX JNZ unaligned -aligned: - CMPB ·useAVX2(SB), $1 - JE avx2 - JMP sse - unaligned_loop_1byte: XORB SI, (AX) INCQ AX @@ -47,7 +42,7 @@ unaligned_loop_1byte: ORQ DX, DI TESTQ $31, AX - JZ aligned + JZ sse unaligned: TESTQ $7, AX // AND $7 & len, if not zero jump to loop_1b. @@ -60,27 +55,7 @@ unaligned_loop: SUBQ $8, CX TESTQ $31, AX JNZ unaligned_loop - JMP aligned - -avx2: - CMPQ CX, $0x80 - JL sse - VMOVQ DI, X0 - VPBROADCASTQ X0, Y0 - -avx2_loop: - VPXOR (AX), Y0, Y1 - VPXOR 32(AX), Y0, Y2 - VPXOR 64(AX), Y0, Y3 - VPXOR 96(AX), Y0, Y4 - VMOVDQU Y1, (AX) - VMOVDQU Y2, 32(AX) - VMOVDQU Y3, 64(AX) - VMOVDQU Y4, 96(AX) - ADDQ $0x80, AX - SUBQ $0x80, CX - CMPQ CX, $0x80 - JAE avx2_loop // loop if CX >= 0x80 + JMP sse sse: CMPQ CX, $0x40 diff --git a/mask_asm.go b/mask_asm.go index 3b1ee517..259eec03 100644 --- a/mask_asm.go +++ b/mask_asm.go @@ -2,8 +2,6 @@ package websocket -import "nhooyr.io/websocket/internal/xcpu" - func mask(b []byte, key uint32) uint32 { if len(b) > 0 { return maskAsm(&b[0], len(b), key) @@ -11,14 +9,13 @@ func mask(b []byte, key uint32) uint32 { return key } -//lint:ignore U1000 mask_*.s -var useAVX2 = xcpu.X86.HasAVX2 - // @nhooyr: I am not confident that the amd64 or the arm64 implementations of this // function are perfect. There are almost certainly missing optimizations or -// opportunities for // simplification. I'm confident there are no bugs though. +// opportunities for simplification. I'm confident there are no bugs though. // For example, the arm64 implementation doesn't align memory like the amd64. // Or the amd64 implementation could use AVX512 instead of just AVX2. +// The AVX2 code I had to disable anyway as it wasn't performing as expected. +// See https://github.com/nhooyr/websocket/pull/326#issuecomment-1771138049 // //go:noescape func maskAsm(b *byte, len int, key uint32) uint32 From 2cd18b3742d1b29df86bd8daa81fc55fe26f9f8c Mon Sep 17 00:00:00 2001 From: Anmol Sethi Date: Thu, 22 Feb 2024 05:39:37 -0800 Subject: [PATCH 26/26] README.md: Link to assembly benchmark results --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dead855..cde3ec6d 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ Advantages of nhooyr.io/websocket: - Gorilla requires registering a pong callback before sending a Ping - Can target Wasm ([gorilla/websocket#432](https://github.com/gorilla/websocket/issues/432)) - Transparent message buffer reuse with [wsjson](https://pkg.go.dev/nhooyr.io/websocket/wsjson) subpackage -- [3-4x](https://github.com/nhooyr/websocket/pull/326) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go +- [3.5x](https://github.com/nhooyr/websocket/pull/326#issuecomment-1959470758) faster WebSocket masking implementation in assembly for amd64 and arm64 and [2x](https://github.com/nhooyr/websocket/releases/tag/v1.7.4) faster implementation in pure Go - Gorilla's implementation is slower and uses [unsafe](https://golang.org/pkg/unsafe/). - Full [permessage-deflate](https://tools.ietf.org/html/rfc7692) compression extension support - Gorilla only supports no context takeover mode