From 759a0afb563dcdb0d6ee1c7241d8cfcb967d6e51 Mon Sep 17 00:00:00 2001 From: armfazh Date: Sun, 12 Feb 2023 11:40:38 -0800 Subject: [PATCH] dh/sidh: Avoid reference to global variable in rdc751. --- dh/sidh/internal/p751/arith_amd64.s | 203 ++++++++++++++++++---------- 1 file changed, 128 insertions(+), 75 deletions(-) diff --git a/dh/sidh/internal/p751/arith_amd64.s b/dh/sidh/internal/p751/arith_amd64.s index 22d6de1b..7d587b75 100644 --- a/dh/sidh/internal/p751/arith_amd64.s +++ b/dh/sidh/internal/p751/arith_amd64.s @@ -1427,48 +1427,61 @@ TEXT ·mulP751(SB), $96-24 RET // This multiplies a 256-bit number pointed to by M0 with p751+1. -// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element. // C points to the place to store the result and should be at least 192 bits. // This should only be used when the BMI2 and ADX instruction set extensions // are available. -#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ +#define mul256x448bmi2adx(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ MOVQ 0+M0, DX \ - MULXQ M1+40(SB), T1, T0 \ - MULXQ M1+48(SB), T3, T2 \ + MOVQ P751P1_5, AX \ + MULXQ AX, T1, T0 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T3, T2 \ MOVQ T1, 0+C \ // C0_final - XORQ AX, AX \ - MULXQ M1+56(SB), T5, T4 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T5, T4 \ ADOXQ T3, T0 \ ADOXQ T5, T2 \ - MULXQ M1+64(SB), T3, T1 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T3, T1 \ ADOXQ T3, T4 \ - MULXQ M1+72(SB), T6, T5 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T6, T5 \ ADOXQ T6, T1 \ - MULXQ M1+80(SB), T7, T3 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T7, T3 \ ADOXQ T7, T5 \ - MULXQ M1+88(SB), T8, T6 \ + MOVQ P751P1_11, AX \ + MULXQ AX, T8, T6 \ ADOXQ T8, T3 \ + MOVL $0, AX \ ADOXQ AX, T6 \ \ MOVQ 8+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ - XORQ AX, AX \ + MOVQ P751P1_5, AX \ + MULXQ AX, T7, T8 \ ADCXQ T7, T0 \ MOVQ T0, 8+C \ // C1_final ADCXQ T8, T2 \ - MULXQ M1+48(SB), T8, T7 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T8, T7 \ ADOXQ T8, T2 \ ADCXQ T7, T4 \ - MULXQ M1+56(SB), T8, T0 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T8, T0 \ ADOXQ T8, T4 \ ADCXQ T1, T0 \ - MULXQ M1+64(SB), T7, T1 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T7, T1 \ ADCXQ T5, T1 \ - MULXQ M1+72(SB), T8, T5 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T8, T5 \ ADCXQ T5, T3 \ - MULXQ M1+80(SB), T9, T5 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T9, T5 \ ADCXQ T5, T6 \ - MULXQ M1+88(SB), DX, T5 \ + MOVQ P751P1_11, AX \ + MULXQ AX, DX, T5 \ + MOVL $0, AX \ ADCXQ AX, T5 \ \ ADOXQ T7, T0 \ @@ -1477,25 +1490,32 @@ TEXT ·mulP751(SB), $96-24 ADOXQ DX, T6 \ ADOXQ AX, T5 \ \ - MOVQ 16+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ - XORQ AX, AX \ + MOVQ 16+M0, DX \ + MOVQ P751P1_5, AX \ + MULXQ AX, T7, T8 \ ADCXQ T7, T2 \ MOVQ T2, 16+C \ // C2_final ADCXQ T8, T4 \ - MULXQ M1+48(SB), T7, T8 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T7, T8 \ ADOXQ T7, T4 \ ADCXQ T8, T0 \ - MULXQ M1+56(SB), T8, T2 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T8, T2 \ ADOXQ T8, T0 \ ADCXQ T2, T1 \ - MULXQ M1+64(SB), T7, T2 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T7, T2 \ ADCXQ T2, T3 \ - MULXQ M1+72(SB), T8, T2 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T8, T2 \ ADCXQ T2, T6 \ - MULXQ M1+80(SB), T9, T2 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T9, T2 \ ADCXQ T2, T5 \ - MULXQ M1+88(SB), DX, T2 \ + MOVQ P751P1_11, AX \ + MULXQ AX, DX, T2 \ + MOVL $0, AX \ ADCXQ AX, T2 \ \ ADOXQ T7, T1 \ @@ -1505,23 +1525,30 @@ TEXT ·mulP751(SB), $96-24 ADOXQ AX, T2 \ \ MOVQ 24+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ - XORQ AX, AX \ + MOVQ P751P1_5, AX \ + MULXQ AX, T7, T8 \ ADCXQ T4, T7 \ ADCXQ T8, T0 \ - MULXQ M1+48(SB), T10, T8 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T10, T8 \ ADOXQ T10, T0 \ ADCXQ T8, T1 \ - MULXQ M1+56(SB), T8, T4 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T8, T4 \ ADOXQ T8, T1 \ ADCXQ T4, T3 \ - MULXQ M1+64(SB), T10, T4 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T10, T4 \ ADCXQ T4, T6 \ - MULXQ M1+72(SB), T8, T4 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T8, T4 \ ADCXQ T4, T5 \ - MULXQ M1+80(SB), T9, T4 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T9, T4 \ ADCXQ T4, T2 \ - MULXQ M1+88(SB), DX, T4 \ + MOVQ P751P1_11, AX \ + MULXQ AX, DX, T4 \ + MOVL $0, AX \ ADCXQ AX, T4 \ \ ADOXQ T10, T3 \ @@ -1531,48 +1558,60 @@ TEXT ·mulP751(SB), $96-24 ADOXQ AX, T4 // This multiplies a 256-bit number pointed to by M0 with p751+1. -// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element. // C points to the place to store the result and should be at least 192 bits. // This should only be used when the BMI2 instruction set extension is // available. -#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ +#define mul256x448bmi2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ MOVQ 0+M0, DX \ - MULXQ M1+40(SB), T1, T0 \ - MULXQ M1+48(SB), T3, T2 \ + MOVQ P751P1_5, AX \ + MULXQ AX, T1, T0 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T3, T2 \ MOVQ T1, 0+C \ // C0_final - XORQ AX, AX \ - MULXQ M1+56(SB), T5, T4 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T5, T4 \ ADDQ T3, T0 \ ADCQ T5, T2 \ - MULXQ M1+64(SB), T3, T1 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T3, T1 \ ADCQ T3, T4 \ - MULXQ M1+72(SB), T6, T5 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T6, T5 \ ADCQ T6, T1 \ - MULXQ M1+80(SB), T7, T3 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T7, T3 \ ADCQ T7, T5 \ - MULXQ M1+88(SB), T8, T6 \ + MOVQ P751P1_11, AX \ + MULXQ AX, T8, T6 \ ADCQ T8, T3 \ - ADCQ AX, T6 \ + ADCQ $0, T6 \ \ MOVQ 8+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ + MOVQ P751P1_5, AX \ + MULXQ AX, T7, T8 \ ADDQ T7, T0 \ MOVQ T0, 8+C \ // C1_final ADCQ T8, T2 \ - MULXQ M1+48(SB), T8, T7 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T8, T7 \ MOVQ T8, 32+C \ ADCQ T7, T4 \ - MULXQ M1+56(SB), T8, T0 \ - MOVQ T8, 40+C \ + MOVQ P751P1_7, AX \ + MULXQ AX, T8, T0 \ + MOVQ T8, 40+C \ ADCQ T1, T0 \ - MULXQ M1+64(SB), T7, T1 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T7, T1 \ ADCQ T5, T1 \ - MULXQ M1+72(SB), T8, T5 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T8, T5 \ ADCQ T5, T3 \ - MULXQ M1+80(SB), T9, T5 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T9, T5 \ ADCQ T5, T6 \ - MULXQ M1+88(SB), DX, T5 \ - ADCQ AX, T5 \ + MOVQ P751P1_11, AX \ + MULXQ AX, DX, T5 \ + ADCQ $0, T5 \ \ XORQ AX, AX \ ADDQ 32+C, T2 \ @@ -1584,24 +1623,31 @@ TEXT ·mulP751(SB), $96-24 ADCQ AX, T5 \ \ MOVQ 16+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ + MOVQ P751P1_5, AX \ + MULXQ AX, T7, T8 \ ADDQ T7, T2 \ MOVQ T2, 16+C \ // C2_final ADCQ T8, T4 \ - MULXQ M1+48(SB), T7, T8 \ + MOVQ P751P1_6, AX \ + MULXQ AX, T7, T8 \ MOVQ T7, 32+C \ ADCQ T8, T0 \ - MULXQ M1+56(SB), T8, T2 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T8, T2 \ MOVQ T8, 40+C \ ADCQ T2, T1 \ - MULXQ M1+64(SB), T7, T2 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T7, T2 \ ADCQ T2, T3 \ - MULXQ M1+72(SB), T8, T2 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T8, T2 \ ADCQ T2, T6 \ - MULXQ M1+80(SB), T9, T2 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T9, T2 \ ADCQ T2, T5 \ - MULXQ M1+88(SB), DX, T2 \ - ADCQ AX, T2 \ + MOVQ P751P1_11, AX \ + MULXQ AX, DX, T2 \ + ADCQ $0, T2 \ \ XORQ AX, AX \ ADDQ 32+C, T4 \ @@ -1613,23 +1659,30 @@ TEXT ·mulP751(SB), $96-24 ADCQ AX, T2 \ \ MOVQ 24+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ + MOVQ P751P1_5, AX \ + MULXQ AX, T7, T8 \ ADDQ T4, T7 \ ADCQ T8, T0 \ - MULXQ M1+48(SB), T10, T8 \ - MOVQ T10, 32+C \ + MOVQ P751P1_6, AX \ + MULXQ AX, T10, T8 \ + MOVQ T10, 32+C \ ADCQ T8, T1 \ - MULXQ M1+56(SB), T8, T4 \ + MOVQ P751P1_7, AX \ + MULXQ AX, T8, T4 \ MOVQ T8, 40+C \ ADCQ T4, T3 \ - MULXQ M1+64(SB), T10, T4 \ + MOVQ P751P1_8, AX \ + MULXQ AX, T10, T4 \ ADCQ T4, T6 \ - MULXQ M1+72(SB), T8, T4 \ + MOVQ P751P1_9, AX \ + MULXQ AX, T8, T4 \ ADCQ T4, T5 \ - MULXQ M1+80(SB), T9, T4 \ + MOVQ P751P1_10, AX \ + MULXQ AX, T9, T4 \ ADCQ T4, T2 \ - MULXQ M1+88(SB), DX, T4 \ - ADCQ AX, T4 \ + MOVQ P751P1_11, AX \ + MULXQ AX, DX, T4 \ + ADCQ $0, T4 \ \ XORQ AX, AX \ ADDQ 32+C, T0 \ @@ -1651,7 +1704,7 @@ TEXT ·mulP751(SB), $96-24 // Output: OUT 768-bit #define REDC(C, M0, MULS) \ \ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - MULS(M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MULS(M0, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ XORQ R15, R15 \ MOVQ 48+C, AX \ MOVQ 56+C, DX \ @@ -1702,7 +1755,7 @@ TEXT ·mulP751(SB), $96-24 MOVQ R13, 176+M0 \ MOVQ R14, 184+M0 \ \ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - MULS(32+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MULS(32+M0, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ XORQ R15, R15 \ MOVQ 48+C, AX \ MOVQ 56+C, DX \ @@ -1741,7 +1794,7 @@ TEXT ·mulP751(SB), $96-24 MOVQ R13, 176+M0 \ MOVQ R14, 184+M0 \ \ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - MULS(64+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MULS(64+M0, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ MOVQ 48+C, AX \ // Final result c1:c11 MOVQ 56+C, DX \ MOVQ 64+C, BX \