/
arith_arm64.s
511 lines (462 loc) · 9.87 KB
/
arith_arm64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
// +build arm64,!purego
#include "textflag.h"
TEXT ·fp384Cmov(SB), NOSPLIT, $0
MOVD x+0(FP), R0
MOVD y+8(FP), R1
MOVW b+16(FP), R2
CMP $0, R2
LDP 0(R0), (R3, R5)
LDP 0(R1), (R4, R6)
CSEL NE,R4,R3,R7
CSEL NE,R6,R5,R8
STP (R7, R8), 0(R0)
LDP 16(R0), (R3, R5)
LDP 16(R1), (R4, R6)
CSEL NE,R4,R3,R7
CSEL NE,R6,R5,R8
STP (R7, R8), 16(R0)
LDP 32(R0), (R3, R5)
LDP 32(R1), (R4, R6)
CSEL NE,R4,R3,R7
CSEL NE,R6,R5,R8
STP (R7, R8), 32(R0)
RET
// Compute c = -a mod p
TEXT ·fp384Neg(SB), NOSPLIT, $0-16
MOVD c+0(FP), R0
MOVD a+8(FP), R1
// Load p in R2-R7, a in R8-R13
// Compute p-a in R8-R13
LDP ·p+0(SB), (R2, R3)
LDP 0(R1), (R8, R9)
SUBS R8, R2, R8
SBCS R9, R3, R9
LDP ·p+16(SB), (R4, R5)
LDP 16(R1), (R10, R11)
SBCS R10, R4, R10
SBCS R11, R5, R11
LDP ·p+32(SB), (R6, R7)
LDP 32(R1), (R12, R13)
SBCS R12, R6, R12
SBC R13, R7, R13
// Compute (p-a)-p in R2-R7
SUBS R2, R8, R2
SBCS R3, R9, R3
SBCS R4, R10, R4
SBCS R5, R11, R5
SBCS R6, R12, R6
SBCS R7, R13, R7
// If (p-a)-p < 0 (nearly always), return p-a
// Only return (p-a)-p for a = 0
// Store result in c
CSEL CC, R8, R2, R2
CSEL CC, R9, R3, R3
STP (R2, R3), 0(R0)
CSEL CC, R10, R4, R4
CSEL CC, R11, R5, R5
STP (R4, R5), 16(R0)
CSEL CC, R12, R6, R6
CSEL CC, R13, R7, R7
STP (R6, R7), 32(R0)
RET
// Compute c = a+b mod p
TEXT ·fp384Add(SB), NOSPLIT, $0-24
MOVD c+0(FP), R0
MOVD a+8(FP), R1
MOVD b+16(FP), R2
// Load a in R3-R8, b in R9-R14
// Compute a+b in R3-R9
LDP 0(R1), (R3, R4)
LDP 0(R2), (R9, R10)
ADDS R9, R3
ADCS R10, R4
LDP 16(R1), (R5, R6)
LDP 16(R2), (R11, R12)
ADCS R11, R5
ADCS R12, R6
LDP 32(R1), (R7, R8)
LDP 32(R2), (R13, R14)
ADCS R13, R7
ADCS R14, R8
ADC ZR, ZR, R9
// Load p in R10-R15
LDP ·p+ 0(SB), (R10, R11)
LDP ·p+16(SB), (R12, R13)
LDP ·p+32(SB), (R14, R15)
// Compute a+b-p in R10-R16
SUBS R10, R3, R10
SBCS R11, R4, R11
SBCS R12, R5, R12
SBCS R13, R6, R13
SBCS R14, R7, R14
SBCS R15, R8, R15
SBCS ZR, R9, R16
// If a+b-p is negative, return a+b
// Store result in c
CSEL CC, R3, R10, R3
CSEL CC, R4, R11, R4
STP (R3, R4), 0(R0)
CSEL CC, R5, R12, R5
CSEL CC, R6, R13, R6
STP (R5, R6), 16(R0)
CSEL CC, R7, R14, R7
CSEL CC, R8, R15, R8
STP (R7, R8), 32(R0)
RET
// Compute c = a-b mod p
TEXT ·fp384Sub(SB), NOSPLIT, $0-24
MOVD c+0(FP), R0
MOVD a+8(FP), R1
MOVD b+16(FP), R2
// Load a in R3-R8, b in R9-R14
// Compute a-b in R3-R9
LDP 0(R1), (R3, R4)
LDP 0(R2), (R9, R10)
SUBS R9, R3
SBCS R10, R4
LDP 16(R1), (R5, R6)
LDP 16(R2), (R11, R12)
SBCS R11, R5
SBCS R12, R6
LDP 32(R1), (R7, R8)
LDP 32(R2), (R13, R14)
SBCS R13, R7
SBCS R14, R8
SBC ZR, ZR, R9
// Load p in R10-R15
// If a-b < 0, (a-b)+p to R3-R8
// Store result in c
LDP ·p+ 0(SB), (R10, R11)
AND R9, R10
LDP ·p+16(SB), (R12, R13)
AND R9, R11
AND R9, R12
LDP ·p+32(SB), (R14, R15)
AND R9, R13
AND R9, R14
AND R9, R15
ADDS R10, R3
ADCS R11, R4
STP (R3, R4), 0(R0)
ADCS R12, R5
ADCS R13, R6
STP (R5, R6), 16(R0)
ADCS R14, R7
ADC R15, R8
STP (R7, R8), 32(R0)
RET
// Expects that A0*B0 is already in C0(low),C3(high) and A0*B1 in C1(low),C2(high)
// C0 is not actually touched
// Result of (A0-A2) * (B0-B2) will be in C0-C5
// Inputs remain intact
#define mul192x192comba(A0,A1,A2, B0,B1,B2, C0,C1,C2,C3,C4,C5, S0,S1,S2,S3) \
MUL A1, B0, S2 \
UMULH A1, B0, S3 \
\
ADDS C3, C1 \
ADCS ZR, C2 \
ADC ZR, ZR, C3 \
\
MUL A0, B2, S0 \
UMULH A0, B2, S1 \
\
ADDS S2, C1 \
ADCS S3, C2 \
ADC ZR, C3 \
\
MUL A1, B1, S2 \
UMULH A1, B1, S3 \
\
ADDS S0, C2 \
ADCS S1, C3 \
ADC ZR, ZR, C4 \
\
MUL A2, B0, S0 \
UMULH A2, B0, S1 \
\
ADDS S2, C2 \
ADCS S3, C3 \
ADC ZR, C4 \
\
MUL A1, B2, S2 \
UMULH A1, B2, S3 \
\
ADDS S0, C2 \
ADCS S1, C3 \
ADC ZR, C4 \
\
MUL A2, B1, S0 \
UMULH A2, B1, S1 \
\
ADDS S2, C3 \
ADCS S3, C4 \
ADC ZR, ZR, C5 \
\
MUL A2, B2, S2 \
UMULH A2, B2, S3 \
\
ADDS S0, C3 \
ADCS S1, C4 \
ADC ZR, C5 \
\
ADDS S2, C4 \
ADC S3, C5
// Assumes that there are at least 96 bytes left on the stack
// Expects that X and Y point to input
// X and Y get overwritten, Z0 will be in Y
#define mul384x384karatsuba(X,Y, Z1,Z2,Z3,Z4,Z5,Z6,Z7,Z8,Z9,Z10,Z11, T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12) \
/* Load a in Z1-Z6, b in T12,Z7-Z11 */ \
LDP 0(X), ( Z1, Z2) \
LDP 0(Y), (T12, Z7) \
MUL Z1, Z7, T1 \
UMULH Z1, T12, T3 \
LDP 16(X), ( Z3, Z4) \
LDP 16(Y), ( Z8, Z9) \
MUL Z1, T12, T0 \
UMULH Z1, Z7, T2 \
LDP 32(X), ( Z5, Z6) \
LDP 32(Y), (Z10, Z11) \
\
/* Compute aL*bL in T0-T5 */ \
mul192x192comba(Z1,Z2,Z3, T12,Z7,Z8, T0,T1,T2,T3,T4,T5, T6,T7,T8,T9) \
\
/* Compute aH*bH in T6-T11, destroys aL and bL */ \
MUL Z4, Z10, T7 \
MUL Z4, Z9, T6 \
UMULH Z4, Z9, T9 \
UMULH Z4, Z10, T8 \
mul192x192comba(Z4,Z5,Z6, Z9,Z10,Z11, T6,T7,T8,T9,T10,T11, Z1,Z2,T12,Z7) \
\
/* Compute aL*bL + aH*bH in Z1-Z6,T12, destroys aH */ \
ADDS T0, T6, Z1 \
ADCS T1, T7, Z2 \
ADCS T2, T8, Z3 \
ADCS T3, T9, Z4 \
ADCS T4, T10, Z5 \
ADCS T5, T11, Z6 \
ADC ZR, ZR, T12 \
\
/* Add to T0-T11 and store on stack */ \
STP ( T0, T1), -16(RSP) \
ADDS Z1, T3 \
STP ( T2, T3), -32(RSP) \
ADCS Z2, T4 \
ADCS Z3, T5 \
STP ( T4, T5), -48(RSP) \
ADCS Z4, T6 \
ADCS Z5, T7 \
STP ( T6, T7), -64(RSP) \
ADCS Z6, T8 \
ADC ZR, T12 \
STP ( T8, T9), -80(RSP) \
STP (T10, T11), -96(RSP) \
\
/* Load a to Z1-Z6 */ \
LDP 0(X), (Z1, Z2) \
LDP 16(X), (Z3, Z4) \
LDP 32(X), (Z5, Z6) \
\
/* Compute |aL-aH| to Z1-Z3, keep borrow in X */ \
SUBS Z4, Z1 \
SBCS Z5, Z2 \
SBCS Z6, Z3 \
SBC ZR, ZR, X \
NEGS Z1, Z4 \
NGCS Z2, Z5 \
NGC Z3, Z6 \
ADDS $1, X \
\
/* Load b to Z7-Z11,T0 */ \
LDP 0(Y), ( Z7, Z8) \
LDP 16(Y), ( Z9, Z10) \
LDP 32(Y), (Z11, T0) \
\
CSEL EQ, Z4, Z1, Z1 \
CSEL EQ, Z5, Z2 ,Z2 \
CSEL EQ, Z6, Z3, Z3 \
\
/* Compute |bH-bL| to Z7-Z9, keep borrow in Y */ \
SUBS Z7, Z10 \
SBCS Z8, Z11 \
SBCS Z9, T0 \
SBC ZR, ZR, Y \
NEGS Z10, Z7 \
NGCS Z11, Z8 \
NGC T0, Z9 \
ADDS $1, Y \
CSEL EQ, Z7, Z10, Z7 \
CSEL EQ, Z8, Z11, Z8 \
CSEL EQ, Z9, T0, Z9 \
\
/* Combine borrows */ \
EOR Y, X \
\
/* Compute |aL-aH|*|bH-bL| to Z10,Z11,T0-T3 */ \
MUL Z1, Z8, Z11 \
MUL Z1, Z7, Z10 \
UMULH Z1, Z8, T0 \
UMULH Z1, Z7, T1 \
mul192x192comba(Z1,Z2,Z3, Z7,Z8,Z9, Z10,Z11,T0,T1,T2,T3, T4,T5,T6,T7) \
\
/* The result has to be negated if exactly one of the operands was negative */ \
NEGS Z10, Y \
NGCS Z11, Z1 \
NGCS T0, Z2 \
NGCS T1, Z3 \
NGCS T2, Z4 \
NGCS T3, Z5 \
NGC ZR, T4 \
\
AND T4, X \
CMP $1, X \
CSEL EQ, Y, Z10, Z10 \
CSEL EQ, Z1, Z11, Z11 \
CSEL EQ, Z2, T0, T0 \
CSEL EQ, Z3, T1, T1 \
CSEL EQ, Z4, T2, T2 \
CSEL EQ, Z5, T3, T3 \
\
/* Add that to the middle part */ \
LDP -16(RSP), ( Y, Z1) \
LDP -32(RSP), ( Z2, Z3) \
LDP -48(RSP), ( Z4, Z5) \
ADDS Z10, Z3 \
ADCS Z11, Z4 \
LDP -64(RSP), ( Z6, Z7) \
ADCS T0, Z5 \
ADCS T1, Z6 \
LDP -80(RSP), ( Z8, Z9) \
ADCS T2, Z7 \
ADCS T3, Z8 \
LDP -96(RSP), (Z10, Z11) \
ADCS T12, Z9 \
ADCS ZR, Z10 \
ADC ZR, Z11 \
SUBS X, Z9 \
SBCS ZR, Z10 \
SBC ZR, Z11
// Compute c = a*b*R^-1 mod p
TEXT ·fp384Mul(SB), NOSPLIT, $200-24
MOVD c+0(FP), R0
MOVD a+8(FP), R1
MOVD b+16(FP), R2
// Compute a*b in R2-R13
mul384x384karatsuba(R1, R2, R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,R13, R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26,R27)
// Store a*b on the stack
STP ( R2, R3), -112(RSP)
STP ( R4, R5), -128(RSP)
STP ( R6, R7), -144(RSP)
STP ( R8, R9), -160(RSP)
STP (R10, R11), -176(RSP)
STP (R12, R13), -192(RSP)
// Compute m = a*b*pp mod 2^384 in R19-R24
// Store it temporarily in c
MOVD ·pp+0(SB), R14
MUL R14, R2, R19
UMULH R14, R2, R20
MUL R14, R3, R16
UMULH R14, R3, R21
ADDS R16, R20
ADC ZR, R21
MUL R14, R4, R16
UMULH R14, R4, R22
ADDS R16, R21
ADC ZR, R22
MUL R14, R5, R16
UMULH R14, R5, R23
ADDS R16, R22
ADC ZR, R23
MUL R14, R6, R16
UMULH R14, R6, R24
ADDS R16, R23
ADC ZR, R24
MADD R14, R24, R7, R24
// ·pp+8(SB) = 1, so we can just add
ADDS R2, R20
STP (R19, R20), 0(R0)
ADCS R3, R21
ADCS R4, R22
ADCS R5, R23
ADC R6, R24
LDP ·pp+16(SB), (R14, R15)
MUL R14, R2, R8
UMULH R14, R2, R9
MUL R14, R3, R16
UMULH R14, R3, R10
ADDS R16, R9
ADC ZR, R10
MUL R14, R4, R16
UMULH R14, R4, R11
ADDS R16, R10
ADC ZR, R11
MUL R14, R5, R16
ADD R16, R11
ADDS R8, R21
ADCS R9, R22
ADCS R10, R23
ADC R11, R24
MUL R15, R2, R8
UMULH R15, R2, R9
MUL R15, R3, R16
UMULH R15, R3, R10
ADDS R16, R9
ADC ZR, R10
MADD R15, R10, R4, R10
ADDS R8, R22
STP (R21, R22), 16(R0)
ADCS R9, R23
ADC R10, R24
LDP ·pp+32(SB), (R14, R15)
MUL R14, R2, R8
UMULH R14, R2, R9
MADD R14, R9, R3, R9
ADDS R8, R23
ADC R9, R24
MADD R15, R24, R2, R24
STP (R23, R24), 32(R0)
// Compute m*p in R1-R12
MOVD $·p(SB), R1
mul384x384karatsuba(R0, R1, R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, R13,R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26)
// Add a*b to m*p in R1-R12,R26
LDP -112(RSP), (R13, R14)
ADDS R13, R1
LDP -128(RSP), (R15, R16)
ADCS R14, R2
ADCS R15, R3
LDP -144(RSP), (R17, R19)
ADCS R16, R4
ADCS R17, R5
LDP -160(RSP), (R20, R21)
ADCS R19, R6
ADCS R20, R7
LDP -176(RSP), (R22, R23)
ADCS R21, R8
ADCS R22, R9
LDP -192(RSP), (R24, R25)
ADCS R23, R10
ADCS R24, R11
ADCS R25, R12
ADC ZR, ZR, R26
// Reduce the top half mod p
LDP ·p+ 0(SB), (R13, R14)
SUBS R13, R7, R13
LDP ·p+16(SB), (R15, R16)
SBCS R14, R8, R14
SBCS R15, R9, R15
LDP ·p+32(SB), (R17, R19)
SBCS R16, R10, R16
SBCS R17, R11, R17
SBCS R19, R12, R19
SBCS ZR, R26
// Store result in c
MOVD c+0(FP), R0
CSEL CC, R7, R13, R7
CSEL CC, R8, R14, R8
STP ( R7, R8), 0(R0)
CSEL CC, R9, R15, R9
CSEL CC, R10, R16, R10
STP ( R9, R10), 16(R0)
CSEL CC, R11, R17, R11
CSEL CC, R12, R19, R12
STP (R11, R12), 32(R0)
RET