From 84de8017bb1b887dfadcc085ea38595031bcc7da Mon Sep 17 00:00:00 2001 From: Bas Westerbaan Date: Tue, 18 Apr 2023 14:11:16 +0200 Subject: [PATCH] Add TurboShake{128,256} --- internal/sha3/keccakf.go | 12 ++- internal/sha3/sha3.go | 7 +- internal/sha3/sha3_test.go | 38 +++++++++- internal/sha3/shake.go | 36 +++++++++ pke/kyber/internal/common/sample.go | 2 +- sign/dilithium/mode2/internal/sample.go | 4 +- sign/dilithium/mode2aes/internal/sample.go | 4 +- sign/dilithium/mode3/internal/sample.go | 4 +- sign/dilithium/mode3aes/internal/sample.go | 4 +- sign/dilithium/mode5/internal/sample.go | 4 +- sign/dilithium/mode5aes/internal/sample.go | 4 +- simd/keccakf1600/example_test.go | 2 +- simd/keccakf1600/f1600x.go | 42 +++++++---- simd/keccakf1600/f1600x2_arm64.go | 6 +- simd/keccakf1600/f1600x2_arm64.s | 10 ++- simd/keccakf1600/f1600x4_amd64.go | 4 +- simd/keccakf1600/f1600x4_amd64.s | 17 +++-- simd/keccakf1600/f1600x4stubs_amd64.go | 3 +- simd/keccakf1600/f1600x_test.go | 85 +++++++++++++++------- simd/keccakf1600/fallback.go | 4 +- simd/keccakf1600/internal/asm/go.mod | 6 +- simd/keccakf1600/internal/asm/go.sum | 44 +++++++++++ simd/keccakf1600/internal/asm/src.go | 9 ++- 23 files changed, 273 insertions(+), 78 deletions(-) diff --git a/internal/sha3/keccakf.go b/internal/sha3/keccakf.go index ab19d0ad..1755fd1e 100644 --- a/internal/sha3/keccakf.go +++ b/internal/sha3/keccakf.go @@ -6,13 +6,21 @@ package sha3 // KeccakF1600 applies the Keccak permutation to a 1600b-wide // state represented as a slice of 25 uint64s. +// If turbo is true, applies the 12-round variant instead of the +// regular 24-round variant. // nolint:funlen -func KeccakF1600(a *[25]uint64) { +func KeccakF1600(a *[25]uint64, turbo bool) { // Implementation translated from Keccak-inplace.c // in the keccak reference code. 
var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64 - for i := 0; i < 24; i += 4 { + i := 0 + + if turbo { + i = 12 + } + + for ; i < 24; i += 4 { // Combines the 5 steps in each round into 2 steps. // Unrolls 4 rounds per loop and spreads some steps across rounds. diff --git a/internal/sha3/sha3.go b/internal/sha3/sha3.go index b35cd006..01806d7d 100644 --- a/internal/sha3/sha3.go +++ b/internal/sha3/sha3.go @@ -51,6 +51,7 @@ type State struct { // Specific to SHA-3 and SHAKE. outputLen int // the default output size in bytes state spongeDirection // whether the sponge is absorbing or squeezing + turbo bool // Whether we're using 12 rounds instead of 24 } // BlockSize returns the rate of sponge underlying this hash function. @@ -86,11 +87,11 @@ func (d *State) permute() { xorIn(d, d.buf()) d.bufe = 0 d.bufo = 0 - KeccakF1600(&d.a) + KeccakF1600(&d.a, d.turbo) case spongeSqueezing: // If we're squeezing, we need to apply the permutation before // copying more output. - KeccakF1600(&d.a) + KeccakF1600(&d.a, d.turbo) d.bufe = d.rate d.bufo = 0 copyOut(d, d.buf()) @@ -136,7 +137,7 @@ func (d *State) Write(p []byte) (written int, err error) { // The fast path; absorb a full "rate" bytes of input and apply the permutation. xorIn(d, p[:d.rate]) p = p[d.rate:] - KeccakF1600(&d.a) + KeccakF1600(&d.a, d.turbo) } else { // The slow path; buffer the input until we can fill the sponge, and then xor it in. todo := d.rate - bufl diff --git a/internal/sha3/sha3_test.go b/internal/sha3/sha3_test.go index b4850afa..6e6930d4 100644 --- a/internal/sha3/sha3_test.go +++ b/internal/sha3/sha3_test.go @@ -161,13 +161,23 @@ func sequentialBytes(size int) []byte { return result } +// BenchmarkPermutationFunctionTurbo measures the speed of the 12-round +// permutation function with no input data. 
+func BenchmarkPermutationFunctionTurbo(b *testing.B) { + b.SetBytes(int64(200)) + var lanes [25]uint64 + for i := 0; i < b.N; i++ { + KeccakF1600(&lanes, true) + } +} + // BenchmarkPermutationFunction measures the speed of the permutation function // with no input data. func BenchmarkPermutationFunction(b *testing.B) { b.SetBytes(int64(200)) var lanes [25]uint64 for i := 0; i < b.N; i++ { - KeccakF1600(&lanes) + KeccakF1600(&lanes, false) } } @@ -220,6 +230,9 @@ func BenchmarkShake256_MTU(b *testing.B) { benchmarkShake(b, NewShake256(), 135 func BenchmarkShake256_16x(b *testing.B) { benchmarkShake(b, NewShake256(), 16, 1024) } func BenchmarkShake256_1MiB(b *testing.B) { benchmarkShake(b, NewShake256(), 1024, 1024) } +func BenchmarkTurboShake128_1MiB(b *testing.B) { benchmarkShake(b, NewTurboShake128(0x37), 1024, 1024) } +func BenchmarkTurboShake256_1MiB(b *testing.B) { benchmarkShake(b, NewTurboShake256(0x37), 1024, 1024) } + func BenchmarkSha3_512_1MiB(b *testing.B) { benchmarkHash(b, New512(), 1024, 1024) } func Example_sum() { @@ -247,3 +260,26 @@ func Example_mac() { fmt.Printf("%x\n", h) // Output: 78de2974bd2711d5549ffd32b753ef0f5fa80a0db2556db60f0987eb8a9218ff } + +func TestTurboShake128(t *testing.T) { + out := make([]byte, 64) + TurboShakeSum128(out, []byte{}, 0x07) + if hex.EncodeToString(out) != "5a223ad30b3b8c66a243048cfced430f54e7529287d15150b973133adfac6a2ffe2708e73061e09a4000168ba9c8ca1813198f7bbed4984b4185f2c2580ee623" { + t.Fatal() + } + + h := NewTurboShake128(0x07) + out = make([]byte, 10032) + _, _ = h.Read(out) + if hex.EncodeToString(out[len(out)-32:]) != "7593a28020a3c4ae0d605fd61f5eb56eccd27cc3d12ff09f78369772a460c55d" { + t.Fatal() + } + + out = make([]byte, 32) + TurboShakeSum128(out, []byte{0xff}, 0x06) + if hex.EncodeToString(out) != "8ec9c66465ed0d4a6c35d13506718d687a25cb05c74cca1e42501abd83874a67" { + t.Fatal() + } + + // TODO all tests +} diff --git a/internal/sha3/shake.go b/internal/sha3/shake.go index b92c5b7d..ac9954fe 100644 --- 
a/internal/sha3/shake.go +++ b/internal/sha3/shake.go @@ -57,6 +57,17 @@ func NewShake128() State { return State{rate: rate128, dsbyte: dsbyteShake} } +// NewTurboShake128 creates a new TurboSHAKE128 variable-output-length ShakeHash. +// Its generic security strength is 128 bits against all attacks if at +// least 32 bytes of its output are used. +// D is the domain separation byte; it must be between 0x01 and 0x7f inclusive, or the function panics. +func NewTurboShake128(D byte) State { + if D == 0 || D > 0x7f { + panic("turboshake: D out of range") + } + return State{rate: rate128, dsbyte: D, turbo: true} +} + // NewShake256 creates a new SHAKE256 variable-output-length ShakeHash. // Its generic security strength is 256 bits against all attacks if // at least 64 bytes of its output are used. @@ -64,6 +75,17 @@ func NewShake256() State { return State{rate: rate256, dsbyte: dsbyteShake} } +// NewTurboShake256 creates a new TurboSHAKE256 variable-output-length ShakeHash. +// Its generic security strength is 256 bits against all attacks if +// at least 64 bytes of its output are used. +// D is the domain separation byte; it must be between 0x01 and 0x7f inclusive, or the function panics. +func NewTurboShake256(D byte) State { + if D == 0 || D > 0x7f { + panic("turboshake: D out of range") + } + return State{rate: rate256, dsbyte: D, turbo: true} +} + // ShakeSum128 writes an arbitrary-length digest of data into hash. func ShakeSum128(hash, data []byte) { h := NewShake128() @@ -77,3 +99,17 @@ func ShakeSum256(hash, data []byte) { _, _ = h.Write(data) _, _ = h.Read(hash) } + +// TurboShakeSum128 writes an arbitrary-length digest of data into hash, using D as the domain separation byte. +func TurboShakeSum128(hash, data []byte, D byte) { + h := NewTurboShake128(D) + _, _ = h.Write(data) + _, _ = h.Read(hash) +} + +// TurboShakeSum256 writes an arbitrary-length digest of data into hash, using D as the domain separation byte. 
+func TurboShakeSum256(hash, data []byte, D byte) { + h := NewTurboShake256(D) + _, _ = h.Write(data) + _, _ = h.Read(hash) +} diff --git a/pke/kyber/internal/common/sample.go b/pke/kyber/internal/common/sample.go index 1f15f32c..ed5a33dd 100644 --- a/pke/kyber/internal/common/sample.go +++ b/pke/kyber/internal/common/sample.go @@ -100,7 +100,7 @@ func (p *Poly) DeriveNoise2(seed []byte, nonce uint8) { // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*Poly, seed *[32]byte, xs, ys [4]uint8) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/sign/dilithium/mode2/internal/sample.go b/sign/dilithium/mode2/internal/sample.go index db5aac8c..fa866293 100644 --- a/sign/dilithium/mode2/internal/sample.go +++ b/sign/dilithium/mode2/internal/sample.go @@ -20,7 +20,7 @@ var DeriveX4Available = keccakf1600.IsEnabledX4() && !UseAES // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { @@ -248,7 +248,7 @@ func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) { // This function is currently not used (yet). 
func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed *[32]byte) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/sign/dilithium/mode2aes/internal/sample.go b/sign/dilithium/mode2aes/internal/sample.go index db5aac8c..fa866293 100644 --- a/sign/dilithium/mode2aes/internal/sample.go +++ b/sign/dilithium/mode2aes/internal/sample.go @@ -20,7 +20,7 @@ var DeriveX4Available = keccakf1600.IsEnabledX4() && !UseAES // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { @@ -248,7 +248,7 @@ func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) { // This function is currently not used (yet). func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed *[32]byte) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/sign/dilithium/mode3/internal/sample.go b/sign/dilithium/mode3/internal/sample.go index c6ae31dd..ccde1f2e 100644 --- a/sign/dilithium/mode3/internal/sample.go +++ b/sign/dilithium/mode3/internal/sample.go @@ -18,7 +18,7 @@ var DeriveX4Available = keccakf1600.IsEnabledX4() && !UseAES // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { @@ -246,7 +246,7 @@ func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) { // This function is currently not used (yet). 
func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed *[32]byte) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/sign/dilithium/mode3aes/internal/sample.go b/sign/dilithium/mode3aes/internal/sample.go index db5aac8c..fa866293 100644 --- a/sign/dilithium/mode3aes/internal/sample.go +++ b/sign/dilithium/mode3aes/internal/sample.go @@ -20,7 +20,7 @@ var DeriveX4Available = keccakf1600.IsEnabledX4() && !UseAES // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { @@ -248,7 +248,7 @@ func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) { // This function is currently not used (yet). func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed *[32]byte) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/sign/dilithium/mode5/internal/sample.go b/sign/dilithium/mode5/internal/sample.go index db5aac8c..fa866293 100644 --- a/sign/dilithium/mode5/internal/sample.go +++ b/sign/dilithium/mode5/internal/sample.go @@ -20,7 +20,7 @@ var DeriveX4Available = keccakf1600.IsEnabledX4() && !UseAES // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { @@ -248,7 +248,7 @@ func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) { // This function is currently not used (yet). 
func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed *[32]byte) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/sign/dilithium/mode5aes/internal/sample.go b/sign/dilithium/mode5aes/internal/sample.go index db5aac8c..fa866293 100644 --- a/sign/dilithium/mode5aes/internal/sample.go +++ b/sign/dilithium/mode5aes/internal/sample.go @@ -20,7 +20,7 @@ var DeriveX4Available = keccakf1600.IsEnabledX4() && !UseAES // Can only be called when DeriveX4Available is true. func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { @@ -248,7 +248,7 @@ func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) { // This function is currently not used (yet). func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed *[32]byte) { var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // Absorb the seed in the four states for i := 0; i < 4; i++ { diff --git a/simd/keccakf1600/example_test.go b/simd/keccakf1600/example_test.go index 8c2e4c87..0385de4e 100644 --- a/simd/keccakf1600/example_test.go +++ b/simd/keccakf1600/example_test.go @@ -38,7 +38,7 @@ func Example() { // type is used to ensure that the encapsulated [100]uint64 is aligned // properly to be used efficiently with vector instructions.) var perm keccakf1600.StateX4 - state := perm.Initialize() + state := perm.Initialize(false) // state is initialized with zeroes. 
As the messages fit within one // block, we only need to write the messages, domain separators diff --git a/simd/keccakf1600/f1600x.go b/simd/keccakf1600/f1600x.go index 7ce0c2ef..20ac96f0 100644 --- a/simd/keccakf1600/f1600x.go +++ b/simd/keccakf1600/f1600x.go @@ -17,6 +17,7 @@ package keccakf1600 import ( + "runtime" "unsafe" "github.com/cloudflare/circl/internal/sha3" @@ -37,6 +38,9 @@ type StateX4 struct { // Offset into a that is 32 byte aligned. offset int + + // If true, permute will use 12-round keccak instead of 24-round keccak + turbo bool } // StateX2 contains state for the two-way permutation including the two @@ -53,6 +57,9 @@ type StateX2 struct { // Offset into a that is 32 byte aligned. offset int + + // If true, permute will use 12-round keccak instead of 24-round keccak + turbo bool } // IsEnabledX4 returns true if the architecture supports a four-way SIMD @@ -61,15 +68,14 @@ func IsEnabledX4() bool { return cpu.X86.HasAVX2 } // IsEnabledX2 returns true if the architecture supports a two-way SIMD // implementation provided in this package. -func IsEnabledX2() bool { - // After Go 1.16 the flag cpu.ARM64.HasSHA3 is no longer exposed. - return false -} +func IsEnabledX2() bool { return enabledX2 } // Initialize the state and returns the buffer on which the four permutations // will act: a uint64 slice of length 100. The first permutation will act // on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc. -func (s *StateX4) Initialize() []uint64 { +// If turbo is true, Permute will use the 12-round variant instead of the usual 24 rounds. +func (s *StateX4) Initialize(turbo bool) []uint64 { + s.turbo = turbo rp := unsafe.Pointer(&s.a[0]) // uint64s are always aligned by a multiple of 8. Compute the remainder @@ -87,7 +93,9 @@ func (s *StateX4) Initialize() []uint64 { // Initialize the state and returns the buffer on which the two permutations // will act: a uint64 slice of length 50. 
The first permutation will act // on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}. -func (s *StateX2) Initialize() []uint64 { +// If turbo is true, Permute will use the 12-round variant instead of the usual 24 rounds. +func (s *StateX2) Initialize(turbo bool) []uint64 { + s.turbo = turbo rp := unsafe.Pointer(&s.a[0]) // uint64s are always aligned by a multiple of 8. Compute the remainder @@ -106,9 +114,9 @@ func (s *StateX2) Initialize() []uint64 { // returned from Initialize(). func (s *StateX4) Permute() { if IsEnabledX4() { - permuteSIMDx4(s.a[s.offset:]) + permuteSIMDx4(s.a[s.offset:], s.turbo) } else { - permuteScalarX4(s.a[s.offset:]) // A slower generic implementation. + permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation. } } @@ -116,34 +124,40 @@ func (s *StateX4) Permute() { // returned from Initialize(). func (s *StateX2) Permute() { if IsEnabledX2() { - permuteSIMDx2(s.a[s.offset:]) + permuteSIMDx2(s.a[s.offset:], s.turbo) } else { - permuteScalarX2(s.a[s.offset:]) // A slower generic implementation. + permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation. 
} } -func permuteScalarX4(a []uint64) { +func permuteScalarX4(a []uint64, turbo bool) { var buf [25]uint64 for i := 0; i < 4; i++ { for j := 0; j < 25; j++ { buf[j] = a[4*j+i] } - sha3.KeccakF1600(&buf) + sha3.KeccakF1600(&buf, turbo) for j := 0; j < 25; j++ { a[4*j+i] = buf[j] } } } -func permuteScalarX2(a []uint64) { +func permuteScalarX2(a []uint64, turbo bool) { var buf [25]uint64 for i := 0; i < 2; i++ { for j := 0; j < 25; j++ { buf[j] = a[2*j+i] } - sha3.KeccakF1600(&buf) + sha3.KeccakF1600(&buf, turbo) for j := 0; j < 25; j++ { a[2*j+i] = buf[j] } } } + +var enabledX2 bool + +func init() { + enabledX2 = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin" +} diff --git a/simd/keccakf1600/f1600x2_arm64.go b/simd/keccakf1600/f1600x2_arm64.go index 75461dd7..44f05436 100644 --- a/simd/keccakf1600/f1600x2_arm64.go +++ b/simd/keccakf1600/f1600x2_arm64.go @@ -5,9 +5,9 @@ package keccakf1600 import "github.com/cloudflare/circl/internal/sha3" -func permuteSIMDx2(state []uint64) { f1600x2ARM(&state[0], &sha3.RC) } +func permuteSIMDx2(state []uint64, turbo bool) { f1600x2ARM(&state[0], &sha3.RC, turbo) } -func permuteSIMDx4(state []uint64) { permuteScalarX4(state) } +func permuteSIMDx4(state []uint64, turbo bool) { permuteScalarX4(state, turbo) } //go:noescape -func f1600x2ARM(state *uint64, rc *[24]uint64) +func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool) diff --git a/simd/keccakf1600/f1600x2_arm64.s b/simd/keccakf1600/f1600x2_arm64.s index 1e8547f9..fe1c852a 100644 --- a/simd/keccakf1600/f1600x2_arm64.s +++ b/simd/keccakf1600/f1600x2_arm64.s @@ -4,8 +4,8 @@ #include "textflag.h" -// func f1600x2ARM(state *uint64, rc *[24]uint64) -TEXT ·f1600x2ARM(SB), NOSPLIT, $0-16 +// func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool) +TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17 MOVD state+0(FP), R0 MOVD rc+8(FP), R1 MOVD R0, R2 @@ -19,6 +19,12 @@ TEXT ·f1600x2ARM(SB), NOSPLIT, $0-16 VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16] VLD1.P (R0), [V24.B16] + MOVBU 
turbo+16(FP), R4 + CBZ R4, loop + + SUBS $12, R3, R3 + ADD $96, R1, R1 + loop: // Execute theta but without xorring into the state yet. VEOR3 V10.B16, V5.B16, V0.B16, V25.B16 diff --git a/simd/keccakf1600/f1600x4_amd64.go b/simd/keccakf1600/f1600x4_amd64.go index ac5c658d..669ce65f 100644 --- a/simd/keccakf1600/f1600x4_amd64.go +++ b/simd/keccakf1600/f1600x4_amd64.go @@ -2,6 +2,6 @@ package keccakf1600 import "github.com/cloudflare/circl/internal/sha3" -func permuteSIMDx4(state []uint64) { f1600x4AVX2(&state[0], &sha3.RC) } +func permuteSIMDx4(state []uint64, turbo bool) { f1600x4AVX2(&state[0], &sha3.RC, turbo) } -func permuteSIMDx2(state []uint64) { permuteScalarX2(state) } +func permuteSIMDx2(state []uint64, turbo bool) { permuteScalarX2(state, turbo) } diff --git a/simd/keccakf1600/f1600x4_amd64.s b/simd/keccakf1600/f1600x4_amd64.s index 194981f1..314a8555 100644 --- a/simd/keccakf1600/f1600x4_amd64.s +++ b/simd/keccakf1600/f1600x4_amd64.s @@ -1,15 +1,20 @@ // Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT. 
-// +build amd64 +//go:build amd64 #include "textflag.h" -// func f1600x4AVX2(state *uint64, rc *[24]uint64) +// func f1600x4AVX2(state *uint64, rc *[24]uint64, turbo bool) // Requires: AVX, AVX2 -TEXT ·f1600x4AVX2(SB), NOSPLIT, $0-16 - MOVQ state+0(FP), AX - MOVQ rc+8(FP), CX - MOVQ $0x0000000000000006, DX +TEXT ·f1600x4AVX2(SB), NOSPLIT, $0-17 + MOVQ state+0(FP), AX + MOVQ rc+8(FP), CX + MOVQ $0x0000000000000006, DX + MOVBQZX turbo+16(FP), BX + TESTQ BX, BX + JZ loop + MOVQ $0x0000000000000003, DX + ADDQ $0x60, CX loop: VMOVDQA (AX), Y0 diff --git a/simd/keccakf1600/f1600x4stubs_amd64.go b/simd/keccakf1600/f1600x4stubs_amd64.go index 76c6cf99..de289441 100644 --- a/simd/keccakf1600/f1600x4stubs_amd64.go +++ b/simd/keccakf1600/f1600x4stubs_amd64.go @@ -1,9 +1,8 @@ // Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT. //go:build amd64 -// +build amd64 package keccakf1600 //go:noescape -func f1600x4AVX2(state *uint64, rc *[24]uint64) +func f1600x4AVX2(state *uint64, rc *[24]uint64, turbo bool) diff --git a/simd/keccakf1600/f1600x_test.go b/simd/keccakf1600/f1600x_test.go index 09e83bba..cd0cae40 100644 --- a/simd/keccakf1600/f1600x_test.go +++ b/simd/keccakf1600/f1600x_test.go @@ -1,6 +1,9 @@ package keccakf1600 -import "testing" +import ( + "reflect" + "testing" +) // From the Keccak code package. 
var permutationOfZeroes = [25]uint64{ @@ -16,10 +19,10 @@ var permutationOfZeroes = [25]uint64{ } func TestKeccakF1600x2(t *testing.T) { - test := func(t *testing.T, f func(s *StateX2, a []uint64)) { + test := func(t *testing.T, turbo bool, f func(s *StateX2, a []uint64)) { t.Helper() var state StateX2 - a := state.Initialize() + a := state.Initialize(turbo) f(&state, a) for i := 0; i < 25; i++ { for j := 0; j < 2; j++ { @@ -31,18 +34,18 @@ func TestKeccakF1600x2(t *testing.T) { } t.Run("Generic", func(t *testing.T) { - test(t, func(s *StateX2, a []uint64) { permuteScalarX2(a) }) + test(t, false, func(s *StateX2, a []uint64) { permuteScalarX2(a, false) }) }) t.Run("SIMD", func(t *testing.T) { - test(t, func(s *StateX2, a []uint64) { s.Permute() }) + test(t, false, func(s *StateX2, a []uint64) { s.Permute() }) }) } func TestKeccakF1600x4(t *testing.T) { - test := func(t *testing.T, f func(s *StateX4, a []uint64)) { + test := func(t *testing.T, turbo bool, f func(s *StateX4, a []uint64)) { t.Helper() var state StateX4 - a := state.Initialize() + a := state.Initialize(turbo) f(&state, a) for i := 0; i < 25; i++ { for j := 0; j < 4; j++ { @@ -54,45 +57,77 @@ func TestKeccakF1600x4(t *testing.T) { } t.Run("Generic", func(t *testing.T) { - test(t, func(s *StateX4, a []uint64) { permuteScalarX4(a) }) + test(t, false, func(s *StateX4, a []uint64) { permuteScalarX4(a, false) }) }) t.Run("SIMD", func(t *testing.T) { - test(t, func(s *StateX4, a []uint64) { s.Permute() }) + test(t, false, func(s *StateX4, a []uint64) { s.Permute() }) }) } +func TestTurboX2(t *testing.T) { + var state1, state2 StateX2 + a1 := state1.Initialize(true) + a2 := state2.Initialize(true) + permuteScalarX2(a1, true) + state2.Permute() + if !reflect.DeepEqual(a1, a2) { + t.Fatal() + } +} + +func TestTurboX4(t *testing.T) { + var state1, state2 StateX4 + a1 := state1.Initialize(true) + a2 := state2.Initialize(true) + permuteScalarX4(a1, true) + state2.Permute() + if !reflect.DeepEqual(a1, a2) { + 
t.Fatal() + } +} + func BenchmarkF1600x2(b *testing.B) { - benchmark := func(b *testing.B, f func(s *StateX2, a []uint64)) { + benchmark := func(b *testing.B, turbo bool, f func(s *StateX2, a []uint64)) { var state StateX2 - a := state.Initialize() + a := state.Initialize(turbo) for i := 0; i < b.N; i++ { f(&state, a) } } - b.Run("Generic", func(b *testing.B) { - benchmark(b, func(s *StateX2, a []uint64) { permuteScalarX2(a) }) - }) - b.Run("SIMD", func(b *testing.B) { - benchmark(b, func(s *StateX2, a []uint64) { s.Permute() }) - }) + bench2 := func(b *testing.B, turbo bool) { + b.Run("Generic", func(b *testing.B) { + benchmark(b, turbo, func(s *StateX2, a []uint64) { permuteScalarX2(a, turbo) }) + }) + b.Run("SIMD", func(b *testing.B) { + benchmark(b, turbo, func(s *StateX2, a []uint64) { s.Permute() }) + }) + } + + b.Run("Regular", func(b *testing.B) { bench2(b, false) }) + b.Run("Turbo", func(b *testing.B) { bench2(b, true) }) } func BenchmarkF1600x4(b *testing.B) { - benchmark := func(b *testing.B, f func(s *StateX4, a []uint64)) { + benchmark := func(b *testing.B, turbo bool, f func(s *StateX4, a []uint64)) { var state StateX4 - a := state.Initialize() + a := state.Initialize(turbo) for i := 0; i < b.N; i++ { f(&state, a) } } - b.Run("Generic", func(b *testing.B) { - benchmark(b, func(s *StateX4, a []uint64) { permuteScalarX4(a) }) - }) - b.Run("SIMD", func(b *testing.B) { - benchmark(b, func(s *StateX4, a []uint64) { s.Permute() }) - }) + bench2 := func(b *testing.B, turbo bool) { + b.Run("Generic", func(b *testing.B) { + benchmark(b, turbo, func(s *StateX4, a []uint64) { permuteScalarX4(a, turbo) }) + }) + b.Run("SIMD", func(b *testing.B) { + benchmark(b, turbo, func(s *StateX4, a []uint64) { s.Permute() }) + }) + } + + b.Run("Regular", func(b *testing.B) { bench2(b, false) }) + b.Run("Turbo", func(b *testing.B) { bench2(b, true) }) } diff --git a/simd/keccakf1600/fallback.go b/simd/keccakf1600/fallback.go index 5d56c09b..5287c1f5 100644 --- 
a/simd/keccakf1600/fallback.go +++ b/simd/keccakf1600/fallback.go @@ -3,6 +3,6 @@ package keccakf1600 -func permuteSIMDx2(state []uint64) { permuteScalarX2(state) } +func permuteSIMDx2(state []uint64, turbo bool) { permuteScalarX2(state, turbo) } -func permuteSIMDx4(state []uint64) { permuteScalarX4(state) } +func permuteSIMDx4(state []uint64, turbo bool) { permuteScalarX4(state, turbo) } diff --git a/simd/keccakf1600/internal/asm/go.mod b/simd/keccakf1600/internal/asm/go.mod index 3858e4ef..7543b8f5 100644 --- a/simd/keccakf1600/internal/asm/go.mod +++ b/simd/keccakf1600/internal/asm/go.mod @@ -2,4 +2,8 @@ module github.com/cloudflare/circl/simd/keccakf1600/internal/asm go 1.12 -require github.com/mmcloughlin/avo v0.0.0-20200523190732-4439b6b2c061 +require ( + github.com/mmcloughlin/avo v0.5.0 + golang.org/x/tools v0.8.0 // indirect + golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect +) diff --git a/simd/keccakf1600/internal/asm/go.sum b/simd/keccakf1600/internal/asm/go.sum index b292bf5c..a80d5be3 100644 --- a/simd/keccakf1600/internal/asm/go.sum +++ b/simd/keccakf1600/internal/asm/go.sum @@ -1,24 +1,68 @@ github.com/mmcloughlin/avo v0.0.0-20200523190732-4439b6b2c061 h1:UCU8+cLbbvyxi0sQ9fSeoEhZgvrrD9HKMtX6Gmc1vk8= github.com/mmcloughlin/avo v0.0.0-20200523190732-4439b6b2c061/go.mod h1:wqKykBG2QzQDJEzvRkcS8x6MiSJkF52hXZsXcjaB3ls= +github.com/mmcloughlin/avo v0.5.0 h1:nAco9/aI9Lg2kiuROBY6BhCI/z0t5jEvJfjWbL8qXLU= +github.com/mmcloughlin/avo v0.5.0/go.mod h1:ChHFdoV7ql95Wi7vuq2YT1bwCJqiWdZrQ1im3VujLYM= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/arch v0.0.0-20190909030613-46d78d1859ac/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= +golang.org/x/arch v0.1.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod 
h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.1.0/go.mod h1:RecgLatLF4+eUMCP1PoPZQb+cVrJcOPbHkTkbkB9sbw= golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.6.0/go.mod h1:4mET923SAdbXp2ki8ey+zGs1SLqsuM2Y0uvdZR/fUNI= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= +golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod 
h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200425043458-8463f397d07c h1:iHhCR0b26amDCiiO+kBguKZom9aMF+NrFxh9zeKR/XU= golang.org/x/tools v0.0.0-20200425043458-8463f397d07c/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.2.0/go.mod h1:y4OqIKeOV/fWJetJ8bXPU1sEVniLMIyDAZWeHdV+NTA= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.8.0 h1:vSDcovVPld282ceKgDimkRSC8kpaH1dgyc9UMzlt84Y= +golang.org/x/tools v0.8.0/go.mod h1:JxBZ99ISMI5ViVkT1tr6tdNmXeTrcpVSD3vZ1RsRdN4= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/simd/keccakf1600/internal/asm/src.go b/simd/keccakf1600/internal/asm/src.go index acb381e8..1ccfde35 100644 --- a/simd/keccakf1600/internal/asm/src.go +++ b/simd/keccakf1600/internal/asm/src.go @@ -14,7 +14,7 @@ func main() { ConstraintExpr("amd64") // Must be called on 32 byte aligned memory. 
- TEXT("f1600x4AVX2", NOSPLIT, "func(state *uint64, rc *[24]uint64)") + TEXT("f1600x4AVX2", NOSPLIT, "func(state *uint64, rc *[24]uint64, turbo bool)") Pragma("noescape") @@ -31,6 +31,13 @@ func main() { superRound := GP64() MOVQ(U64(6), superRound) // count down. + turbo := Load(Param("turbo"), GP64()) + TESTQ(turbo, turbo) + JZ(LabelRef("loop")) + + MOVQ(U64(3), superRound) // Skip 3 * 4 = 12 rounds + ADDQ(Imm(8*12), rcPtr) + // XXX Because our AVX2 is significantly larger, it might better not // to group four rounds together, but simply loop over the rounds // themselves.