Skip to content

Commit

Permalink
dh/sidh: Avoid reference to global variable in rdc751.
Browse files Browse the repository at this point in the history
  • Loading branch information
armfazh committed Feb 22, 2023
1 parent 9988c67 commit 759a0af
Showing 1 changed file with 128 additions and 75 deletions.
203 changes: 128 additions & 75 deletions dh/sidh/internal/p751/arith_amd64.s
Expand Up @@ -1427,48 +1427,61 @@ TEXT ·mulP751(SB), $96-24
RET

// This multiplies a 256-bit number pointed to by M0 with p751+1.
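// p751+1 is supplied through the operands P751P1_5 through P751P1_11, each
// loaded into AX before its MULXQ; the earlier form of this macro instead took
// M1, a pointer to p751+1 stored as a 768-bit Fp751Element.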
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 and ADX instruction set extensions
// are available.
#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
#define mul256x448bmi2adx(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
MULXQ M1+48(SB), T3, T2 \
MOVQ P751P1_5, AX \
MULXQ AX, T1, T0 \
MOVQ P751P1_6, AX \
MULXQ AX, T3, T2 \
MOVQ T1, 0+C \ // C0_final
XORQ AX, AX \
MULXQ M1+56(SB), T5, T4 \
MOVQ P751P1_7, AX \
MULXQ AX, T5, T4 \
ADOXQ T3, T0 \
ADOXQ T5, T2 \
MULXQ M1+64(SB), T3, T1 \
MOVQ P751P1_8, AX \
MULXQ AX, T3, T1 \
ADOXQ T3, T4 \
MULXQ M1+72(SB), T6, T5 \
MOVQ P751P1_9, AX \
MULXQ AX, T6, T5 \
ADOXQ T6, T1 \
MULXQ M1+80(SB), T7, T3 \
MOVQ P751P1_10, AX \
MULXQ AX, T7, T3 \
ADOXQ T7, T5 \
MULXQ M1+88(SB), T8, T6 \
MOVQ P751P1_11, AX \
MULXQ AX, T8, T6 \
ADOXQ T8, T3 \
MOVL $0, AX \
ADOXQ AX, T6 \
\
MOVQ 8+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
MOVQ P751P1_5, AX \
MULXQ AX, T7, T8 \
ADCXQ T7, T0 \
MOVQ T0, 8+C \ // C1_final
ADCXQ T8, T2 \
MULXQ M1+48(SB), T8, T7 \
MOVQ P751P1_6, AX \
MULXQ AX, T8, T7 \
ADOXQ T8, T2 \
ADCXQ T7, T4 \
MULXQ M1+56(SB), T8, T0 \
MOVQ P751P1_7, AX \
MULXQ AX, T8, T0 \
ADOXQ T8, T4 \
ADCXQ T1, T0 \
MULXQ M1+64(SB), T7, T1 \
MOVQ P751P1_8, AX \
MULXQ AX, T7, T1 \
ADCXQ T5, T1 \
MULXQ M1+72(SB), T8, T5 \
MOVQ P751P1_9, AX \
MULXQ AX, T8, T5 \
ADCXQ T5, T3 \
MULXQ M1+80(SB), T9, T5 \
MOVQ P751P1_10, AX \
MULXQ AX, T9, T5 \
ADCXQ T5, T6 \
MULXQ M1+88(SB), DX, T5 \
MOVQ P751P1_11, AX \
MULXQ AX, DX, T5 \
MOVL $0, AX \
ADCXQ AX, T5 \
\
ADOXQ T7, T0 \
Expand All @@ -1477,25 +1490,32 @@ TEXT ·mulP751(SB), $96-24
ADOXQ DX, T6 \
ADOXQ AX, T5 \
\
MOVQ 16+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
MOVQ 16+M0, DX \
MOVQ P751P1_5, AX \
MULXQ AX, T7, T8 \
ADCXQ T7, T2 \
MOVQ T2, 16+C \ // C2_final
ADCXQ T8, T4 \
MULXQ M1+48(SB), T7, T8 \
MOVQ P751P1_6, AX \
MULXQ AX, T7, T8 \
ADOXQ T7, T4 \
ADCXQ T8, T0 \
MULXQ M1+56(SB), T8, T2 \
MOVQ P751P1_7, AX \
MULXQ AX, T8, T2 \
ADOXQ T8, T0 \
ADCXQ T2, T1 \
MULXQ M1+64(SB), T7, T2 \
MOVQ P751P1_8, AX \
MULXQ AX, T7, T2 \
ADCXQ T2, T3 \
MULXQ M1+72(SB), T8, T2 \
MOVQ P751P1_9, AX \
MULXQ AX, T8, T2 \
ADCXQ T2, T6 \
MULXQ M1+80(SB), T9, T2 \
MOVQ P751P1_10, AX \
MULXQ AX, T9, T2 \
ADCXQ T2, T5 \
MULXQ M1+88(SB), DX, T2 \
MOVQ P751P1_11, AX \
MULXQ AX, DX, T2 \
MOVL $0, AX \
ADCXQ AX, T2 \
\
ADOXQ T7, T1 \
Expand All @@ -1505,23 +1525,30 @@ TEXT ·mulP751(SB), $96-24
ADOXQ AX, T2 \
\
MOVQ 24+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
MOVQ P751P1_5, AX \
MULXQ AX, T7, T8 \
ADCXQ T4, T7 \
ADCXQ T8, T0 \
MULXQ M1+48(SB), T10, T8 \
MOVQ P751P1_6, AX \
MULXQ AX, T10, T8 \
ADOXQ T10, T0 \
ADCXQ T8, T1 \
MULXQ M1+56(SB), T8, T4 \
MOVQ P751P1_7, AX \
MULXQ AX, T8, T4 \
ADOXQ T8, T1 \
ADCXQ T4, T3 \
MULXQ M1+64(SB), T10, T4 \
MOVQ P751P1_8, AX \
MULXQ AX, T10, T4 \
ADCXQ T4, T6 \
MULXQ M1+72(SB), T8, T4 \
MOVQ P751P1_9, AX \
MULXQ AX, T8, T4 \
ADCXQ T4, T5 \
MULXQ M1+80(SB), T9, T4 \
MOVQ P751P1_10, AX \
MULXQ AX, T9, T4 \
ADCXQ T4, T2 \
MULXQ M1+88(SB), DX, T4 \
MOVQ P751P1_11, AX \
MULXQ AX, DX, T4 \
MOVL $0, AX \
ADCXQ AX, T4 \
\
ADOXQ T10, T3 \
Expand All @@ -1531,48 +1558,60 @@ TEXT ·mulP751(SB), $96-24
ADOXQ AX, T4

// This multiplies a 256-bit number pointed to by M0 with p751+1.
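// p751+1 is supplied through the operands P751P1_5 through P751P1_11, each
// loaded into AX before its MULXQ; the earlier form of this macro instead took
// M1, a pointer to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result: the low 192 bits are written to
// 0+C, 8+C and 16+C. This variant also spills partial products to 32+C and
// 40+C, so C must provide at least 384 bits of writable space.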
// This should only be used when the BMI2 instruction set extension is
// available.
#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
#define mul256x448bmi2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
MULXQ M1+48(SB), T3, T2 \
MOVQ P751P1_5, AX \
MULXQ AX, T1, T0 \
MOVQ P751P1_6, AX \
MULXQ AX, T3, T2 \
MOVQ T1, 0+C \ // C0_final
XORQ AX, AX \
MULXQ M1+56(SB), T5, T4 \
MOVQ P751P1_7, AX \
MULXQ AX, T5, T4 \
ADDQ T3, T0 \
ADCQ T5, T2 \
MULXQ M1+64(SB), T3, T1 \
MOVQ P751P1_8, AX \
MULXQ AX, T3, T1 \
ADCQ T3, T4 \
MULXQ M1+72(SB), T6, T5 \
MOVQ P751P1_9, AX \
MULXQ AX, T6, T5 \
ADCQ T6, T1 \
MULXQ M1+80(SB), T7, T3 \
MOVQ P751P1_10, AX \
MULXQ AX, T7, T3 \
ADCQ T7, T5 \
MULXQ M1+88(SB), T8, T6 \
MOVQ P751P1_11, AX \
MULXQ AX, T8, T6 \
ADCQ T8, T3 \
ADCQ AX, T6 \
ADCQ $0, T6 \
\
MOVQ 8+M0, DX \
MULXQ M1+40(SB), T7, T8 \
MOVQ P751P1_5, AX \
MULXQ AX, T7, T8 \
ADDQ T7, T0 \
MOVQ T0, 8+C \ // C1_final
ADCQ T8, T2 \
MULXQ M1+48(SB), T8, T7 \
MOVQ P751P1_6, AX \
MULXQ AX, T8, T7 \
MOVQ T8, 32+C \
ADCQ T7, T4 \
MULXQ M1+56(SB), T8, T0 \
MOVQ T8, 40+C \
MOVQ P751P1_7, AX \
MULXQ AX, T8, T0 \
MOVQ T8, 40+C \
ADCQ T1, T0 \
MULXQ M1+64(SB), T7, T1 \
MOVQ P751P1_8, AX \
MULXQ AX, T7, T1 \
ADCQ T5, T1 \
MULXQ M1+72(SB), T8, T5 \
MOVQ P751P1_9, AX \
MULXQ AX, T8, T5 \
ADCQ T5, T3 \
MULXQ M1+80(SB), T9, T5 \
MOVQ P751P1_10, AX \
MULXQ AX, T9, T5 \
ADCQ T5, T6 \
MULXQ M1+88(SB), DX, T5 \
ADCQ AX, T5 \
MOVQ P751P1_11, AX \
MULXQ AX, DX, T5 \
ADCQ $0, T5 \
\
XORQ AX, AX \
ADDQ 32+C, T2 \
Expand All @@ -1584,24 +1623,31 @@ TEXT ·mulP751(SB), $96-24
ADCQ AX, T5 \
\
MOVQ 16+M0, DX \
MULXQ M1+40(SB), T7, T8 \
MOVQ P751P1_5, AX \
MULXQ AX, T7, T8 \
ADDQ T7, T2 \
MOVQ T2, 16+C \ // C2_final
ADCQ T8, T4 \
MULXQ M1+48(SB), T7, T8 \
MOVQ P751P1_6, AX \
MULXQ AX, T7, T8 \
MOVQ T7, 32+C \
ADCQ T8, T0 \
MULXQ M1+56(SB), T8, T2 \
MOVQ P751P1_7, AX \
MULXQ AX, T8, T2 \
MOVQ T8, 40+C \
ADCQ T2, T1 \
MULXQ M1+64(SB), T7, T2 \
MOVQ P751P1_8, AX \
MULXQ AX, T7, T2 \
ADCQ T2, T3 \
MULXQ M1+72(SB), T8, T2 \
MOVQ P751P1_9, AX \
MULXQ AX, T8, T2 \
ADCQ T2, T6 \
MULXQ M1+80(SB), T9, T2 \
MOVQ P751P1_10, AX \
MULXQ AX, T9, T2 \
ADCQ T2, T5 \
MULXQ M1+88(SB), DX, T2 \
ADCQ AX, T2 \
MOVQ P751P1_11, AX \
MULXQ AX, DX, T2 \
ADCQ $0, T2 \
\
XORQ AX, AX \
ADDQ 32+C, T4 \
Expand All @@ -1613,23 +1659,30 @@ TEXT ·mulP751(SB), $96-24
ADCQ AX, T2 \
\
MOVQ 24+M0, DX \
MULXQ M1+40(SB), T7, T8 \
MOVQ P751P1_5, AX \
MULXQ AX, T7, T8 \
ADDQ T4, T7 \
ADCQ T8, T0 \
MULXQ M1+48(SB), T10, T8 \
MOVQ T10, 32+C \
MOVQ P751P1_6, AX \
MULXQ AX, T10, T8 \
MOVQ T10, 32+C \
ADCQ T8, T1 \
MULXQ M1+56(SB), T8, T4 \
MOVQ P751P1_7, AX \
MULXQ AX, T8, T4 \
MOVQ T8, 40+C \
ADCQ T4, T3 \
MULXQ M1+64(SB), T10, T4 \
MOVQ P751P1_8, AX \
MULXQ AX, T10, T4 \
ADCQ T4, T6 \
MULXQ M1+72(SB), T8, T4 \
MOVQ P751P1_9, AX \
MULXQ AX, T8, T4 \
ADCQ T4, T5 \
MULXQ M1+80(SB), T9, T4 \
MOVQ P751P1_10, AX \
MULXQ AX, T9, T4 \
ADCQ T4, T2 \
MULXQ M1+88(SB), DX, T4 \
ADCQ AX, T4 \
MOVQ P751P1_11, AX \
MULXQ AX, DX, T4 \
ADCQ $0, T4 \
\
XORQ AX, AX \
ADDQ 32+C, T0 \
Expand All @@ -1651,7 +1704,7 @@ TEXT ·mulP751(SB), $96-24
// Output: OUT 768-bit
#define REDC(C, M0, MULS) \
\ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
MULS(M0, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
XORQ R15, R15 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
Expand Down Expand Up @@ -1702,7 +1755,7 @@ TEXT ·mulP751(SB), $96-24
MOVQ R13, 176+M0 \
MOVQ R14, 184+M0 \
\ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(32+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
MULS(32+M0, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
XORQ R15, R15 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
Expand Down Expand Up @@ -1741,7 +1794,7 @@ TEXT ·mulP751(SB), $96-24
MOVQ R13, 176+M0 \
MOVQ R14, 184+M0 \
\ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(64+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
MULS(64+M0, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
MOVQ 48+C, AX \ // Final result c1:c11
MOVQ 56+C, DX \
MOVQ 64+C, BX \
Expand Down

0 comments on commit 759a0af

Please sign in to comment.