1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2019 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software: you can redistribute it and/or modify it
13 /// under the terms of the GNU Library General Public License as published
14 /// by the Free Software Foundation; either version 2 of the License, or
15 /// (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful, but
18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 /// Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb. If not, write to the Free Software
24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
27 ///--------------------------------------------------------------------------
31 #include "asm-common.h"
38 ///--------------------------------------------------------------------------
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a multiple-
46 /// precision operand by a single precision factor, and adds the result,
47 /// appropriately shifted, to the result. A `finely integrated operand
48 /// scanning' implementation of Montgomery multiplication also adds the
49 /// product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
61 /// operands, as follows.
64 /// 0 v''_1 v'_1 v''_0 v'_0
65 /// 16 v''_3 v'_3 v''_2 v'_2
67 /// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
68 /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
69 /// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
/// We maintain three `carry' registers, q13--q15, accumulating intermediate
75 /// results; we name them `c0', `c1', and `c2'. Each carry register holds
76 /// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
77 /// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
78 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
79 /// The `vmull' or `vmlal' instruction acting on a scalar operand and an
80 /// operand in the expanded form above produces a result which can be added
81 /// directly to the appropriate carry register.
83 /// Multiplication is performed in product-scanning order, since ARM
84 /// processors commonly implement result forwarding for consecutive multiply-
85 /// and-accumulate instructions specifying the same destination.
86 /// Experimentally, this runs faster than operand-scanning in an attempt to
87 /// hide instruction latencies.
89 /// On 32-bit ARM, we have a reasonable number of registers: the expanded
90 /// operands are kept in registers. The packed operands are read from memory
91 /// into working registers q0 and q1. The following conventional argument
92 /// names and locations are used throughout.
94 /// Arg Format Location Notes
97 /// X packed [r2] In Montgomery multiplication, X = N
99 /// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M
100 /// M expanded q4/q5 -N^{-1} (mod B^4)
101 /// N Modulus, for Montgomery multiplication
102 /// A packed [r0] Destination/accumulator
105 /// The calculation is some variant of
107 /// A' + C' B^4 <- U V + X Y + A + C
109 /// The low-level functions fit into a fairly traditional (finely-integrated)
110 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
113 /// The variants are as follows.
115 /// Function Variant Use i j
117 /// mmul4 A = C = 0 Montgomery 0 0
118 /// dmul4 A = 0 Montgomery 0 +
119 /// mmla4 C = 0 Montgomery + 0
120 /// dmla4 exactly as shown Montgomery + +
122 /// mul4zc U = V = A = C = 0 Plain 0 0
123 /// mul4 U = V = A = 0 Plain 0 +
124 /// mla4zc U = V = C = 0 Plain + 0
125 /// mla4 U = V = 0 Plain + +
127 /// The `mmul4' and `mmla4' functions are also responsible for calculating
128 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
131 ///--------------------------------------------------------------------------
132 /// Macro definitions.
134 .macro mulacc z, u, v, x=nil, y=nil
135 // Set Z = Z + U V + X Y. X may be `nil' to omit the second
136 // operand. Z should be a 128-bit `qN' register; V and Y should be
137 // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]'
138 // scalars; the multiplications produce two 64-bit elementwise
139 // products, which are added elementwise to Z.
147 .macro mulinit z, zinitp, u, v, x, y
148 // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
149 // set Z = U V + X Y. Operand requirements and detailed operation
150 // are as for `mulacc'.
152 .ifeqs "\zinitp", "t"
153 mulacc \z, \u, \v, \x, \y
162 // `MULI': accumulate the B^I and b B^i terms of the polynomial product sum U
163 // V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
164 // x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
165 // B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
166 // added into the low and high halves of the 128-bit register Z (if ZINIT is
167 // `nil' then simply set Z, as if it were initially zero).
// Concrete instances of the `MULI' pattern described above: `MULi'
// accumulates the B^i and b B^i coefficients of U V + X Y into Z.  The
// comment on each definition lists the index pairs contributing to that
// coefficient (read off the QW/D0/D1 operand selections below).

// B^0 coefficient: u_0 v_0 (+ x_0 y_0)
#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0)

// B^1 coefficient: u_0 v_1 + u_1 v_0 (+ likewise for X, Y)
#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \
	mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0)

// B^2 coefficient: u_0 v_2 + u_1 v_1 + u_2 v_0 (+ likewise for X, Y)
#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \
	mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \
	mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0)

// B^3 coefficient: u_0 v_3 + u_1 v_2 + u_2 v_1 + u_3 v_0 (+ likewise
// for X, Y)
#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \
	mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \
	mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \
	mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0)

// B^4 coefficient: u_1 v_3 + u_2 v_2 + u_3 v_1 (+ likewise for X, Y)
#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \
	mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \
	mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0)

// B^5 coefficient: u_2 v_3 + u_3 v_2 (+ likewise for X, Y)
#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \
	mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1)

// B^6 coefficient: u_3 v_3 (+ x_3 y_3)
#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1)
192 // Steps in the process of propagating carry bits from ZLO to ZHI (both
193 // 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register.
194 // Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed
197 // In detail, what happens is as follows. Suppose initially that ZLO =
198 // (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
199 // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
200 // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
201 // a circuit depth of 4; I don't know how to do better.
202 .macro _carry0 zout, zlo0, zlo1, t0, t1
203 // ZLO0 and ZLO1 are the low and high halves of a carry register.
204 // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1
205 // so as to continue processing using `_carry1'. All operands are
206 // 64-bit `dN' registers. If ZOUT is `nil' then no output is
207 // produced; if T1 is `nil' then no further processing will be
209 .ifnes "\zout", "nil"
210 vshl.i64 \t0, \zlo1, #16
213 vshr.u64 \t1, \zlo0, #16
215 .ifnes "\zout", "nil"
216 vadd.i64 \zout, \zlo0, \t0
219 vadd.i64 \t1, \t1, \zlo1
222 .macro _carry1 u, zhi0, t1
223 // ZHI0 is the low half of a carry register, and T1 is the result of
224 // applying `_carry0' to the previous carry register. Set U to the
225 // result of propagating the carry into ZHI0.
226 vshr.u64 \t1, \t1, #16
227 vadd.i64 \u, \zhi0, \t1
// More convenient wrappers for `_carry0' and `_carry1'.
//
// CARRY0(ZOUT, ZLO, T)
//	Store a 32-bit output in ZOUT from carry ZLO, using T as a
//	temporary.  ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit
//	`qN' registers.  T's high half is left holding the continuation
//	value for a following `CARRY1'.
//
// CARRY1(ZHI, T)
//	Propagate carry from T into ZHI.  Both are 128-bit `qN' registers;
//	the low half of ZHI is updated in place (via `_carry1').
#define CARRY0(zout, zlo, t) \
	CASIDE0(zout, D0(zlo), zlo, t)
#define CARRY1(zhi, t) \
	CASIDE1(D0(zhi), zhi, t)

// Versions of `CARRY0' and `CARRY1' which don't mutate their operands.
//
// CASIDE0(ZOUT, U, ZLO, T)
//	As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit
//	`dN' register).
//
// CASIDE0E(ZOUT, U, ZLO, T)
//	As for `CASIDE0', but only calculate the output word, and no
//	continuation value (the `nil' T1 below suppresses the follow-on
//	`_carry1' input).
//
// CASIDE1(U, ZHI, T)
//	As for `CARRY1', but write the updated low half of ZHI to U.
#define CASIDE0(zout, u, zlo, t) \
	_carry0 zout, u, D1(zlo), D0(t), D1(t)
#define CASIDE0E(zout, u, zlo, t) \
	_carry0 zout, u, D1(zlo), D0(t), nil
#define CASIDE1(u, zhi, t) \
	_carry1 u, D0(zhi), D1(t)
264 // Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in
266 #define SPREADACC0(a0, a1, a2, a3) \
270 #define SPREADACC1(a0, a1, a2, a3) \
271 vswp D1(a0), D0(a2); \
272 vtrn.32 D0(a0), D0(a1); \
273 vtrn.32 D0(a2), D0(a3)
275 // Add the carry-format values A0, A1, A2 into the existing carries C0, C1,
276 // C2 (leaving A3 where it is).
277 #define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \
278 vadd.i64 c0, c0, a0; \
279 vadd.i64 c1, c1, a1; \
282 ///--------------------------------------------------------------------------
283 /// Primitive multipliers and related utilities.
286 // On entry, r0 points to a destination, and q13--q15 hold incoming
287 // carries c0--c2. On exit, the low 128 bits of the carry value are
288 // stored at [r0]; the remaining 16 bits of carry are left in d30; r0
289 // is advanced by 16; and q10--q14 are clobbered.
292 CARRY0(D0(q10), q13, q12)
294 CARRY0(D0(q11), q14, q12)
296 CARRY0(D1(q10), q15, q12)
297 vshr.u64 D1(q11), D1(q12), #16
298 vshr.u64 D0(q15), D1(q12), #48
305 // On entry, r0 points to the destination; r1 and r2 point to packed
306 // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y;
307 // and q13--q15 hold incoming carries c0--c2. On exit, the
308 // destination and carries are updated; r0, r1, r2 are each advanced
309 // by 16; q2--q5 are preserved; and the other NEON registers are
313 // Start by loading the operand words from memory.
317 // Do the multiplication.
318 MUL0(q13, t, q0, q2, q3, q1, q4, q5)
319 MUL1(q14, t, q0, q2, q3, q1, q4, q5)
320 CARRY0(D0(q8), q13, q6)
321 MUL2(q15, t, q0, q2, q3, q1, q4, q5)
323 CARRY0(D0(q9), q14, q6)
324 MUL3(q12, nil, q0, q2, q3, q1, q4, q5)
326 CARRY0(D1(q8), q15, q6)
327 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
329 CARRY0(D1(q9), q12, q6)
330 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
332 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
334 // Finish up and store the result.
343 // On entry, r0 points to the destination/accumulator; r1 and r2
344 // point to packed operands U and X; q2/q3 and q4/q5 hold expanded
345 // operands V and Y; and q13--q15 hold incoming carries c0--c2. On
346 // exit, the accumulator and carries are updated; r0, r1, r2 are each
347 // advanced by 16; q2--q5 are preserved; and the other NEON registers
351 // Start by loading the operand words from memory.
353 SPREADACC0(q9, q10, q11, q12)
357 // Add the accumulator input to the incoming carries. Split the
358 // accumulator into four pieces and add the carries onto them.
359 SPREADACC1(q9, q10, q11, q12)
360 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
362 // Do the multiplication.
363 MUL0(q13, t, q0, q2, q3, q1, q4, q5)
364 MUL1(q14, t, q0, q2, q3, q1, q4, q5)
365 CARRY0(D0(q8), q13, q6)
366 MUL2(q15, t, q0, q2, q3, q1, q4, q5)
368 CARRY0(D0(q9), q14, q6)
369 MUL3(q12, t, q0, q2, q3, q1, q4, q5)
371 CARRY0(D1(q8), q15, q6)
372 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
374 CARRY0(D1(q9), q12, q6)
375 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
377 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
379 // Finish up and store the result.
388 // On entry, r0 points to the destination; r2 points to a packed
389 // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold
390 // incoming carries c0--c2. On exit, the destination and carries are
391 // updated; r0 and r2 are each advanced by 16; q4 and q5 are
392 // preserved; and the other NEON registers are clobbered.
395 // Start by loading the operand words from memory.
398 // Do the multiplication.
399 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
400 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
401 CARRY0(D0(q8), q13, q6)
402 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
404 CARRY0(D0(q9), q14, q6)
405 MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
407 CARRY0(D1(q8), q15, q6)
408 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
410 CARRY0(D1(q9), q12, q6)
411 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
413 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
415 // Finish up and store the result.
424 // On entry, r0 points to the destination; r2 points to a packed
425 // operand X; and q4/q5 holds an expanded operand Y. On exit, the
426 // destination is updated; q13--q15 hold outgoing carries c0--c2; r0
427 // and r2 are each advanced by 16; q4 and q5 are preserved; and the
428 // other NEON registers are clobbered.
431 // Start by loading the operand words from memory.
434 // Do the multiplication.
435 MUL0(q13, nil, q1, q4, q5, nil, nil, nil)
436 MUL1(q14, nil, q1, q4, q5, nil, nil, nil)
437 CARRY0(D0(q8), q13, q6)
438 MUL2(q15, nil, q1, q4, q5, nil, nil, nil)
440 CARRY0(D0(q9), q14, q6)
441 MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
443 CARRY0(D1(q8), q15, q6)
444 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
446 CARRY0(D1(q9), q12, q6)
447 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
449 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
451 // Finish up and store the result.
460 // On entry, r0 points to the destination/accumulator; r2 points to a
461 // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15
462 // hold incoming carries c0--c2. On exit, the accumulator and
463 // carries are updated; r0 and r2 are each advanced by 16; q4 and q5
464 // are preserved; and the other NEON registers are clobbered.
467 // Start by loading the operand words from memory.
469 SPREADACC0(q9, q10, q11, q12)
472 // Add the accumulator input to the incoming carries. Split the
473 // accumulator into four pieces and add the carries onto them.
474 SPREADACC1(q9, q10, q11, q12)
475 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
477 // Do the multiplication.
479 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
480 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
481 CARRY0(D0(q8), q13, q6)
482 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
484 CARRY0(D0(q9), q14, q6)
485 MUL3(q12, t, q1, q4, q5, nil, nil, nil)
487 CARRY0(D1(q8), q15, q6)
488 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
490 CARRY0(D1(q9), q12, q6)
491 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
493 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
495 // Finish up and store the result.
504 // On entry, r0 points to the destination/accumulator; r2 points to a
505 // packed operand X; and q4/q5 holds an expanded operand Y. On exit,
506 // the accumulator is updated; q13--q15 hold outgoing carries c0--c2;
507 // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and
508 // the other NEON registers are clobbered.
511 // Start by loading the operand words from memory.
513 SPREADACC0(q13, q14, q15, q12)
516 // Move the accumulator input to the incoming carry slots. Split the
517 // accumulator into four pieces.
518 SPREADACC1(q13, q14, q15, q12)
524 // On entry, r0 points to the destination; r1 points to a packed
525 // operand U; r2 points to a packed operand X (the modulus); q2/q3
526 // holds an expanded operand V; and q4/q5 holds an expanded operand M
527 // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
528 // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod
529 // B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each
530 // advanced by 16; q2 and q3 are preserved; and the other NEON
531 // registers are clobbered.
533 // Start by loading the operand words from memory.
536 // Calculate the low half of W = A + U V, being careful to leave the
538 MUL0(q13, nil, q0, q2, q3, nil, nil, nil)
539 MUL1(q14, nil, q0, q2, q3, nil, nil, nil)
540 CARRY0(D0(q6), q13, q8)
541 MUL2(q15, nil, q0, q2, q3, nil, nil, nil)
542 CASIDE1(D0(q9), q14, q8)
543 CASIDE0(D0(q7), D0(q9), q14, q8)
544 MUL3(q12, nil, q0, q2, q3, nil, nil, nil)
549 // On entry, r0 points to the destination/accumulator A; r1 points to
550 // a packed operand U; r2 points to a packed operand X (the modulus);
551 // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded
552 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
553 // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
554 // = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0,
555 // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and
556 // the other NEON registers are clobbered.
559 // Start by loading the operand words from memory.
561 SPREADACC0(q13, q14, q15, q12)
564 // Move the accumulator input to the incoming carry slots. Split the
565 // accumulator into four pieces.
566 SPREADACC1(q13, q14, q15, q12)
568 // Calculate the low half of W = A + U V, being careful to leave the
570 MUL0(q13, t, q0, q2, q3, nil, nil, nil)
571 MUL1(q14, t, q0, q2, q3, nil, nil, nil)
572 CARRY0(D0(q6), q13, q8)
573 MUL2(q15, t, q0, q2, q3, nil, nil, nil)
574 CASIDE1(D0(q9), q14, q8)
575 CASIDE0(D0(q7), D0(q9), q14, q8)
576 MUL3(q12, t, q0, q2, q3, nil, nil, nil)
578 CASIDE1(D0(q9), q15, q8)
579 CASIDE0(D1(q6), D0(q9), q15, q8)
580 CASIDE1(D0(q9), q12, q8)
581 CASIDE0E(D1(q7), D0(q9), q12, q8)
584 // Calculate the low half of the Montgomery factor Y = W M. At this
585 // point, registers are a little tight.
586 MUL0( q8, nil, q6, q4, q5, nil, nil, nil)
587 MUL1( q9, nil, q6, q4, q5, nil, nil, nil)
588 CARRY0(D0(q8), q8, q1)
589 MUL2(q10, nil, q6, q4, q5, nil, nil, nil)
591 CARRY0(D0(q9), q9, q1)
592 MUL3(q11, nil, q6, q4, q5, nil, nil, nil)
594 CARRY0(D1(q8), q10, q1)
597 CARRY0(D1(q9), q11, q1)
601 // Expand Y. We'll put it in its proper place a bit later.
604 // Build up the product X Y in the carry slots.
605 MUL0(q13, t, q1, q8, q5, nil, nil, nil)
606 MUL1(q14, t, q1, q8, q5, nil, nil, nil)
608 MUL2(q15, t, q1, q8, q5, nil, nil, nil)
612 MUL3(q12, t, q1, q8, q5, nil, nil, nil)
617 // And complete the calculation.
618 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
621 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
623 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
625 // Finish up and store the result.
633 // On entry, r0 points to the destination/accumulator A; r2 points to
634 // a packed operand X (the modulus); and q2/q3 holds an expanded
635 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
636 // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
637 // = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2
638 // are each advanced by 16; q2 and q3 are preserved; and the other
639 // NEON registers are clobbered.
642 // Start by loading the operand words from memory.
646 // Calculate Y = A M (mod B).
647 MUL0(q8, nil, q0, q2, q3, nil, nil, nil)
648 MUL1(q9, nil, q0, q2, q3, nil, nil, nil)
649 CARRY0(D0(q4), q8, q6)
650 MUL2(q10, nil, q0, q2, q3, nil, nil, nil)
653 CARRY0(D0(q7), q9, q6)
654 MUL3(q11, nil, q0, q2, q3, nil, nil, nil)
656 CARRY0(D1(q4), q10, q6)
657 SPREADACC0(q13, q14, q15, q12)
659 CARRY0(D1(q7), q11, q6)
660 SPREADACC1(q13, q14, q15, q12)
665 // Calculate the actual result. Well, the carries, at least.
667 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
668 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
670 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
673 MUL3(q12, t, q1, q4, q5, nil, nil, nil)
676 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
679 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
681 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
683 // Finish up and store the result.
690 ///--------------------------------------------------------------------------
691 /// Bulk multipliers.
693 FUNC(mpx_umul4_arm_neon)
694 // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl,
695 // const mpw *bv, const mpw *bvl);
697 // Establish the arguments and do initial setup.
710 // Prepare for the first iteration.
711 vld1.32 {q4}, [r3]! // = Y = bv[0]
713 // r0 // = dv for inner loop
715 // r3 // = bv for outer loop
716 ldr r4, [sp, #76] // = bv limit
717 mov r12, r2 // = av limit
718 mov r2, r1 // = av for inner loop
719 add r5, r0, #16 // = dv for outer loop
720 vzip.16 q4, q5 // expand Y
722 cmp r2, r12 // all done?
725 // Continue with the first iteration.
727 cmp r2, r12 // all done?
730 // Write out the leftover carry. There can be no tail here.
732 cmp r3, r4 // more passes to do?
735 // Set up for the next pass.
736 1: vld1.32 {q4}, [r3]! // = Y = bv[i]
738 mov r0, r5 // -> dv[i]
739 mov r2, r1 // -> av[0]
741 vzip.16 q4, q5 // expand Y
743 cmp r2, r12 // done yet?
751 // Finish off this pass. There was no tail on the previous pass, and
	// there can be none on this pass.
763 FUNC(mpxmont_mul4_arm_neon)
764 // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv,
765 // const mpw *nv, size_t n, const mpw *mi);
767 // Establish the arguments and do initial setup.
787 // Establish the expanded operands.
788 ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi
789 vld1.32 {q2}, [r2] // = V = bv[0]
792 vld1.32 {q4}, [r5] // = M
794 // Set up the outer loop state and prepare for the first iteration.
795 // r0 // -> dv for inner loop
796 // r1 // -> av for inner loop
797 add r7, r2, #16 // -> bv
799 add r6, r0, #16 // -> dv
801 add r9, r1, r4, lsl #2 // -> av limit
802 add r4, r2, r4, lsl #2 // -> bv limit
803 mov r2, r3 // -> nv for inner loop
806 vzip.16 q2, q3 // expand V
807 vzip.16 q4, q5 // expand M
809 cmp r1, r9 // done already?
812 // Complete the first inner loop.
814 cmp r1, r9 // done yet?
817 // Still have carries left to propagate. Rather than store the tail
818 // end in memory, keep it in a general-purpose register for later.
820 vmov.32 r10, QW(q15, 0)
822 // Embark on the next iteration. (There must be one. If n = 1 then
823 // we would have bailed above, to label 8. Similarly, the subsequent
824 // iterations can fall into the inner loop immediately.)
825 1: vld1.32 {q2}, [r7]! // = V = bv[i]
826 vld1.32 {q4}, [r5] // = M
829 mov r0, r6 // -> dv[i]
831 mov r1, r8 // -> av[0]
832 mov r2, r3 // -> nv[0]
833 vzip.16 q2, q3 // expand V
834 vzip.16 q4, q5 // expand M
837 // Complete the next inner loop.
839 cmp r1, r9 // done yet?
842 // Still have carries left to propagate, and they overlap the
843 // previous iteration's final tail, so read that and add it.
845 vmov.32 D0(q12), r10, r12
846 vadd.i64 D0(q13), D0(q13), D0(q12)
848 vmov.32 r10, QW(q15, 0)
850 // Back again, maybe.
859 // First iteration was short. Write out the carries and we're done.
860 // (This could be folded into the main loop structure, but that would
861 // penalize small numbers more.)
863 vst1.32 {QW(q15, 0)}, [r0]!
869 FUNC(mpxmont_redc4_arm_neon)
870 // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv,
871 // size_t n, const mpw *mi);
873 // Establish the arguments and do initial setup.
878 // blocks-of-4 dv limit r3
881 // outer loop dv limit r5
886 // t0, t1, t2, t3 r2, r8, r9, r10
893 // Set up the outer loop state and prepare for the first iteration.
894 ldr r14, [sp, #96] // -> mi
896 sub r12, r1, r0 // total dv bytes
897 // r0 // -> dv for inner loop
898 // r1 // -> overall dv limit
899 // r2 // -> nv for inner loop
900 // r3 // = n (for now)
901 add r4, r0, #16 // -> dv for outer loop
902 add r5, r0, r3, lsl #2 // -> dv limit
903 bic r12, r12, #15 // dv blocks of 4
904 vld1.32 {q2}, [r14] // = M
906 add r7, r2, r3, lsl #2 // -> nv limit
907 add r3, r0, r12 // -> dv blocks-of-4 limit
908 vzip.16 q2, q3 // expand M
911 cmp r2, r7 // done already?
915 cmp r2, r7 // done yet?
918 // Still have carries left to propagate. Adding the accumulator
919 // block into the carries is a little different this time, because
920 // all four accumulator limbs have to be squished into the three
921 // carry registers for `carryprop' to do its thing.
922 8: vld1.32 {q9}, [r0]
923 SPREADACC0(q9, q10, q11, q12)
924 SPREADACC1(q9, q10, q11, q12)
925 vshl.u64 D0(q12), D0(q12), #16
926 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
927 vadd.u64 D1(q15), D1(q15), D0(q12)
930 vmov.32 r14, QW(q15, 0)
934 // Propagate the first group of carries.
935 ldmia r0, {r2, r8-r10}
940 stmia r0!, {r2, r8-r10}
944 // Continue carry propagation until the end of the buffer.
945 0: ldmia r0, {r2, r8-r10}
950 stmia r0!, {r2, r8-r10}
954 // Deal with the tail end. Note that the actual destination length
	// won't be an exact number of blocks of four, so it's safe to just
956 // drop through here.
969 // All done for this iteration. Start the next.
984 ///--------------------------------------------------------------------------
985 /// Testing and performance measurement.
989 // dmul smul mmul mont
994 // x r2 r3 r2 stk0 r3
995 // vv q2/q3 stk0 -- stk1 stk0
996 // yy q4/q5 stk1 r3 stk2 --
997 // n r5 stk2 stk0 stk3 stk1
998 // cyv r6 stk3 stk1 stk4 stk2
// Address (base, offset) of the I'th stack argument, given the register
// and stack assignments tabulated above.
#define STKARG(i) sp, #80 + 4*(i)
1002 .macro testprologue mode
1007 .ifeqs "\mode", "dmul"
1012 ldr r14, [STKARG(0)] // -> vv
1015 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
1017 ldr r14, [STKARG(1)] // -> yy
1020 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
1022 ldr r5, [STKARG(2)] // = n
1023 ldr r6, [STKARG(3)] // -> cyv
1026 .ifeqs "\mode", "smul"
1032 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
1034 ldr r5, [STKARG(0)] // = n
1035 ldr r6, [STKARG(1)] // -> cyv
1038 .ifeqs "\mode", "mmul"
1042 ldr r2, [STKARG(0)] // -> x
1044 ldr r14, [STKARG(1)] // -> vv
1047 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
1049 ldr r14, [STKARG(2)] // -> yy
1052 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
1054 ldr r5, [STKARG(3)] // = n
1055 ldr r6, [STKARG(4)] // -> cyv
1058 .ifeqs "\mode", "mont"
1065 ldr r14, [STKARG(0)] // -> vv
1068 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
1070 ldr r5, [STKARG(1)] // = n
1071 ldr r6, [STKARG(2)] // -> cyv
1076 vldmia r4, {QQ(q13, q15)} // c0, c1, c2
1088 vstmia r4, {QQ(q13, q15)}
1162 vst1.32 {q4, q5}, [r3]
1172 vst1.32 {q4, q5}, [r3]
1182 vst1.32 {q4, q5}, [r3]
1189 ///----- That's all, folks --------------------------------------------------