1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2019 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software: you can redistribute it and/or modify it
13 /// under the terms of the GNU Library General Public License as published
14 /// by the Free Software Foundation; either version 2 of the License, or
15 /// (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful, but
18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 /// Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb. If not, write to the Free Software
24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
27 ///--------------------------------------------------------------------------
31 #include "asm-common.h"
35 ///--------------------------------------------------------------------------
38 /// We define a number of primitive fixed-size multipliers from which we can
39 /// construct more general variable-length multipliers.
41 /// The basic trick is the same throughout. In an operand-scanning
42 /// multiplication, the inner multiplication loop multiplies a multiple-
43 /// precision operand by a single precision factor, and adds the result,
44 /// appropriately shifted, to the result. A `finely integrated operand
45 /// scanning' implementation of Montgomery multiplication also adds the
46 /// product of a single-precision `Montgomery factor' and the modulus,
47 /// calculated in the same pass. The more common `coarsely integrated
48 /// operand scanning' alternates main multiplication and Montgomery passes,
49 /// which requires additional carry propagation.
51 /// Throughout both plain-multiplication and Montgomery stages, then, one of
52 /// the factors remains constant throughout the operation, so we can afford
53 /// to take a little time to preprocess it. The transformation we perform is
54 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
55 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
56 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
57 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
58 /// operands, as follows.
61 /// 0 v''_1 v'_1 v''_0 v'_0
62 /// 16 v''_3 v'_3 v''_2 v'_2
64 /// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
65 /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
66 /// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
67 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
68 /// many products together before we must deal with carrying; it also allows
69 /// for some calculations to be performed on the above expanded form.
71 /// We maintain three `carry' registers, v28--v30, accumulating intermediate
72 /// results; we name them `c0', `c1', and `c2'. Each carry register holds
73 /// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
74 /// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
75 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
76 /// The `umull' or `umlal' instruction acting on a scalar operand and an
77 /// operand in the expanded form above produces a result which can be added
78 /// directly to the appropriate carry register.
80 /// An unusual feature of this code, as compared to the other `mul4'
81 /// implementations, is that it makes extensive use of the ARM64
82 /// general-purpose registers for carry resolution and output construction.
83 /// As a result, an additional general-purpose register (typically x15) is
84 /// used as an additional carry, with the carry value in bits 16--63.
86 /// Multiplication is performed in product-scanning order, since ARM
87 /// processors commonly implement result forwarding for consecutive multiply-
88 /// and-accumulate instructions specifying the same destination.
89 /// Experimentally, this runs faster than operand-scanning in an attempt to
90 /// hide instruction latencies.
92 /// On 64-bit ARM, we have a vast supply of registers: the expanded
93 /// operands are kept in registers. The packed operands are read from memory
94 /// into working registers v0 and v1. The following conventional argument
95 /// names and locations are used throughout.
97 /// Arg Format Location Notes
100 /// X packed [x2] In Montgomery multiplication, X = N
102 /// Y expanded v4/v5 In Montgomery multiplication, Y = (A + U V) M
103 /// M expanded v6/v7 -N^{-1} (mod B^4)
104 /// N Modulus, for Montgomery multiplication
105 /// A packed [x0] Destination/accumulator
107 /// 0 v31 128-bit zero
109 /// The calculation is some variant of
111 /// A' + C' B^4 <- U V + X Y + A + C
113 /// The low-level functions fit into a fairly traditional (finely-integrated)
114 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
117 /// The variants are as follows.
119 /// Function Variant Use i j
121 /// mmul4 A = C = 0 Montgomery 0 0
122 /// dmul4 A = 0 Montgomery 0 +
123 /// mmla4 C = 0 Montgomery + 0
124 /// dmla4 exactly as shown Montgomery + +
126 /// mul4zc U = V = A = C = 0 Plain 0 0
127 /// mul4 U = V = A = 0 Plain 0 +
128 /// mla4zc U = V = C = 0 Plain + 0
129 /// mla4 U = V = 0 Plain + +
131 /// The `mmul4' and `mmla4' functions are also responsible for calculating
132 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
135 ///--------------------------------------------------------------------------
136 /// Macro definitions.
138 .macro mulacc z, u, v, x=nil, y=nil
139 // Set Z = Z + U V + X Y, using the low halves of V and Y. Y may be
140 // `nil' to omit the second operand. Z, V, and Y should be 128-bit
141 // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
142 // the multiplications produce two 64-bit elementwise products, which
143 // are added elementwise to Z.
	// Z += U * V: widening multiply-accumulate of V's two low 32-bit
	// lanes by the scalar U into Z's two 64-bit lanes.
145 umlal \z\().2d, \v\().2s, \u
	// Z += X * Y, likewise.  NOTE(review): the conditional guard that
	// skips this when Y is `nil', and the closing `.endm', are elided
	// from this view of the file.
147 umlal \z\().2d, \y\().2s, \x
151 .macro mulacc2 z, u, v, x=nil, y=nil
152 // Set Z = Z + U V + X Y, using the high halves of V and Y; see
	// `mulacc' above.  The `umlal2' forms read the high 64 bits
	// (lanes 2 and 3) of the `.4s' operand.
155 umlal2 \z\().2d, \v\().4s, \u
157 umlal2 \z\().2d, \y\().4s, \x
161 .macro mulinit z, zinitp, u, v=nil, x=nil, y=nil
162 // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
163 // set Z = U V + X Y. Operand requirements and detailed operation
164 // are as for `mulacc'.
166 .ifeqs "\zinitp", "t"
	// Z already holds a live value: accumulate into it.
167 mulacc \z, \u, \v, \x, \y
	// Otherwise overwrite Z with the first product, then accumulate
	// the second.  NOTE(review): the `.else'/`.endif'/`.endm' lines
	// are elided from this view.
169 umull \z\().2d, \v\().2s, \u
171 umlal \z\().2d, \y\().2s, \x
176 .macro mulini2 z, zinitp, u, v=nil, x=nil, y=nil
177 // As `mulinit', but with the high halves of V and Y.
179 .ifeqs "\zinitp", "t"
180 mulacc2 \z, \u, \v, \x, \y
182 umull2 \z\().2d, \v\().4s, \u
184 umlal2 \z\().2d, \y\().4s, \x
189 // `mulI': accumulate the B^I and b B^I terms of the polynomial product sum
190 // U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
191 // for x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2)
192 // + B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
193 // added into the low and high halves of the 128-bit register Z (if ZINIT is
194 // `nil' then simply set Z, as if it were initially zero).
195 .macro mul0 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	// B^0 coefficient: only u_0 v_0 (and x_0 y_0) contribute.
196 mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
198 .macro mul1 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	// Higher coefficients sum partial products u_i v_{I-i}; the `2'
	// variants select the high (v'') 16-bit pieces from the expanded
	// operands (see `mulacc2').
199 mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
200 mulacc \z, \u\().s[1], \v0, \x\().s[1], \y0
202 .macro mul2 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
203 mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
204 mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0
205 mulacc \z, \u\().s[2], \v0, \x\().s[2], \y0
207 .macro mul3 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
208 mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
209 mulacc \z, \u\().s[1], \v1, \x\().s[1], \y1
210 mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0
211 mulacc \z, \u\().s[3], \v0, \x\().s[3], \y0
213 .macro mul4 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	// Coefficients B^4 and above: progressively fewer partial
	// products remain.
214 mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1
215 mulacc \z, \u\().s[2], \v1, \x\().s[2], \y1
216 mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0
218 .macro mul5 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
219 mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1
220 mulacc \z, \u\().s[3], \v1, \x\().s[3], \y1
222 .macro mul6 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
223 mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1
226 // Steps in the process of propagating carry bits upwards from ZLO (a 128-bit
227 // `vN' register). Here, T0, T1, and CG are 64-bit `xN' general-purpose
228 // registers clobbered in the process. Set the low 32 bits of the 64-bit
229 // `xN' general-purpose register ZOUT to the completed coefficient z_1,
230 // leaving a carry in CG.
232 // In detail, what happens is as follows. Suppose initially that ZLO =
233 // (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i;
234 // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
235 // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
236 // a circuit depth of 3; I don't know how to do better.
238 // Output words are left in the low half of a 64-bit register, with rubbish
239 // in the high half. Two such results can be combined using the `bfi'
241 .macro carry0 zlo, cg=x15, t0=x16, t1=x17
242 // Capture the values of carry-register ZLO and CG (if not `nil') in
243 // general-purpose registers T0 and T1, suitable for use in `carry1'.
	// T0 = z'_i (low 64-bit lane), T1 = z''_i (high lane).
244 mov \t0, \zlo\().d[0]
245 mov \t1, \zlo\().d[1]
	// Fold in the incoming general-purpose carry, which is kept in
	// bits 16--63 of CG (see the commentary at the top of the file).
	// NOTE(review): the guard for CG = `nil' and the closing `.endm'
	// are elided from this view.
247 add \t0, \t0, \cg, lsr #16
250 .macro carry1 zout, cg=x15, t0=x16, t1=x17
251 // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
252 // rubbish in the high 32 bits), and update CG suitably to continue
253 // processing with the next carry register.
254 .ifnes "\zout", "nil"
	// ZOUT = t0 + (t1 << 16) = z'_i + b z''_i: the completed word is
	// in the low 32 bits, with overflow rubbish above.
255 add \zout, \t0, \t1, lsl #16
	// CG = t1 + floor(t0/b): the outgoing carry, again held in bits
	// 16 upwards for the next `carry0'.
258 add \cg, \t1, \t0, lsr #16
262 .macro expand vlo, vhi, vz=v31
263 // Expand the packed 128-bit operand in VLO to an expanded operand in
264 // VLO and VHI, assuming that VZ is all-bits-zero. All three are
265 // `vN' 128-bit SIMD registers.
	// Interleave the eight 16-bit pieces with zeros, placing each
	// piece in its own 32-bit cell (the expanded form described at
	// the top of the file).
266 zip2 \vhi\().8h, \vlo\().8h, \vz\().8h
267 zip1 \vlo\().8h, \vlo\().8h, \vz\().8h
270 .macro sprdacc a0, a1, a2, a3=nil
271 // Spread the packed 128-bit operand in A0 into carry-format values
272 // in A0, A1, A2, A3. If A3 is `nil', then spread the same value
273 // into A0, A1, A2 only, clobbering x16.
	// NOTE(review): the conditional structure selecting between the
	// three- and four-register forms is elided from this view; only
	// a subset of the `trn' steps is visible here.
277 trn2 \a2\().2d, \a0\().2d, v31.2d
278 trn2 \a1\().2s, \a0\().2s, v31.2s
282 trn1 \a0\().2s, \a0\().2s, v31.2s
286 trn2 \a3\().2s, \a2\().2s, v31.2s
291 .macro crryacc a0, a1, a2, a3, c0, c1, c2
292 // Add the carry-format values A0, A1, A2 into the existing carries
293 // C0, C1, C2 (leaving A3 where it is).
	// Plain 64-bit lane additions; the carry format's sixteen bits of
	// headroom (see file-top commentary) make overflow a non-issue.
294 add \c0\().2d, \c0\().2d, \a0\().2d
295 add \c1\().2d, \c1\().2d, \a1\().2d
296 add \c2\().2d, \c2\().2d, \a2\().2d
299 ///--------------------------------------------------------------------------
300 /// Primitive multipliers and related utilities.
	// NOTE(review): the label for this routine (presumably
	// `carryprop', which is referred to by name later in the file)
	// and the `carry0'/`carry1' sequence which fills x11--x14 are
	// elided from this view.
303 // On entry, x0 points to a destination, and v28--v30 and x15 hold
304 // incoming carries c0--c2 and cg. On exit, the low 128 bits of the
305 // carry value are stored at [x0]; the remaining 16 bits of carry are
306 // left in x10; x0 is advanced by 16; and x11--x17 are clobbered.
	// Pack two 32-bit output words into each of x11 and x12 (the
	// words sit in the low halves, with rubbish above -- see the
	// `carry1' commentary), then store all 128 bits at once.
315 bfi x11, x13, #32, #32
318 bfi x12, x14, #32, #32
319 stp x11, x12, [x0], #16
324 // On entry, x0 points to the destination; x1 and x2 point to packed
325 // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y;
326 // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
327 // zero. On exit, the destination and carries are updated; x0, x1,
328 // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and
329 // x11--x14, x16, x17 and the other SIMD registers are clobbered.
	// NOTE(review): this routine's label (presumably `dmul4', per the
	// variants table above), the loads of U and X into v0/v1, and the
	// interleaved `carry0'/`carry1' steps between the `mulI' calls
	// are elided from this view.
332 // Start by loading the operand words from memory.
336 // Do the multiplication.
	// Product-scanning: coefficient I of U V + X Y is accumulated by
	// `mulI' into the rotating carry registers v28/v29/v30, with v27
	// brought in fresh at mul3.
337 mul0 v28, t, v0, v2, v3, v1, v4, v5
338 mul1 v29, t, v0, v2, v3, v1, v4, v5
340 mul2 v30, t, v0, v2, v3, v1, v4, v5
343 mul3 v27, nil, v0, v2, v3, v1, v4, v5
346 mul4 v28, nil, v0, v2, v3, v1, v4, v5
349 mul5 v29, nil, v0, v2, v3, v1, v4, v5
351 mul6 v30, nil, v0, v2, v3, v1, v4, v5
353 // Finish up and store the result.
	// Pack the four 32-bit output words (low halves of x11--x14) into
	// two 64-bit words and store them.
354 bfi x11, x13, #32, #32
355 bfi x12, x14, #32, #32
356 stp x11, x12, [x0], #16
363 // On entry, x0 points to the destination/accumulator A; x1 and x2
364 // point to packed operands U and X; v2/v3 and v4/v5 hold expanded
365 // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
366 // and cg; and v31 is zero. On exit, the accumulator and carries are
367 // updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15
368 // are preserved; and x11--x14, x16, x17 and the other SIMD registers
	// NOTE(review): this routine's label (presumably `dmla4') and the
	// loads of A, U, X from memory are elided from this view.
372 // Start by loading the operand words from memory.
	// Spread the freshly-loaded accumulator block (presumably in v24)
	// into carry format and fold it into the incoming carries.
376 sprdacc v24, v25, v26, v27
377 crryacc v24, v25, v26, v27, v28, v29, v30
379 // Do the multiplication.
	// As `dmul4', but mul3 accumulates (`t') into v27, which now
	// holds the top accumulator limb from `sprdacc' above.
380 mul0 v28, t, v0, v2, v3, v1, v4, v5
381 mul1 v29, t, v0, v2, v3, v1, v4, v5
383 mul2 v30, t, v0, v2, v3, v1, v4, v5
386 mul3 v27, t, v0, v2, v3, v1, v4, v5
389 mul4 v28, nil, v0, v2, v3, v1, v4, v5
392 mul5 v29, nil, v0, v2, v3, v1, v4, v5
394 mul6 v30, nil, v0, v2, v3, v1, v4, v5
396 // Finish up and store the result.
397 bfi x11, x13, #32, #32
398 bfi x12, x14, #32, #32
399 stp x11, x12, [x0], #16
406 // On entry, x0 points to the destination; x2 points to a packed
407 // operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
408 // hold incoming carries c0--c2 and cg; and v31 is zero. On exit,
409 // the destination and carries are updated; x0 and x2 are each
410 // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
411 // x16, x17 and the other SIMD registers are clobbered.
	// NOTE(review): the register list above previously read
	// `v13--v15' for the incoming carries; that disagreed with both
	// the file-top convention and the `mulI' calls below, which
	// accumulate (`t') into v28--v30 -- fixed.  The routine's label
	// (presumably `mul4') and the load of X into v1 are elided.
414 // Start by loading the operand words from memory.
417 // Do the multiplication.
418 mul0 v28, t, v1, v4, v5
419 mul1 v29, t, v1, v4, v5
421 mul2 v30, t, v1, v4, v5
424 mul3 v27, nil, v1, v4, v5
427 mul4 v28, nil, v1, v4, v5
430 mul5 v29, nil, v1, v4, v5
432 mul6 v30, nil, v1, v4, v5
434 // Finish up and store the result.
435 bfi x11, x13, #32, #32
436 bfi x12, x14, #32, #32
437 stp x11, x12, [x0], #16
444 // On entry, x0 points to the destination; x2 points to a packed
445 // operand X; v4/v5 holds an expanded operand Y; and v31 is zero. On
446 // exit, the destination is updated; v28--v30 and x15 hold outgoing
447 // carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and
448 // v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other
449 // SIMD registers are clobbered.
	// NOTE(review): this routine's label (presumably `mul4zc': the
	// zero-carry variant -- every `mulI' below initialises (`nil')
	// rather than accumulating) and the load of X into v1 are elided
	// from this view.
452 // Start by loading the operand words from memory.
455 // Do the multiplication.
456 mul0 v28, nil, v1, v4, v5
457 mul1 v29, nil, v1, v4, v5
459 mul2 v30, nil, v1, v4, v5
462 mul3 v27, nil, v1, v4, v5
465 mul4 v28, nil, v1, v4, v5
468 mul5 v29, nil, v1, v4, v5
470 mul6 v30, nil, v1, v4, v5
472 // Finish up and store the result.
473 bfi x11, x13, #32, #32
474 bfi x12, x14, #32, #32
475 stp x11, x12, [x0], #16
482 // On entry, x0 points to the destination/accumulator A; x2 points to
483 // a packed operand X; v4/v5 holds an expanded operand Y; v28--v30
484 // and x15 hold incoming carries c0--c2 and cg; and v31 is zero. On
485 // exit, the accumulator and carries are updated; x0 and x2 are each
486 // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
487 // x16, x17 and the other SIMD registers are clobbered.
	// NOTE(review): the register list above previously read
	// `v13--v15' for the incoming carries; the `mulI' calls below
	// accumulate (`t') into v28--v30, matching the file-top
	// convention -- fixed.  The routine's label (presumably `mla4')
	// and the loads of A and X are elided from this view.
490 // Start by loading the operand words from memory.
	// Fold the accumulator block into the incoming carries, as in
	// `dmla4' above.
493 sprdacc v24, v25, v26, v27
494 crryacc v24, v25, v26, v27, v28, v29, v30
496 // Do the multiplication.
497 mul0 v28, t, v1, v4, v5
498 mul1 v29, t, v1, v4, v5
500 mul2 v30, t, v1, v4, v5
503 mul3 v27, t, v1, v4, v5
506 mul4 v28, nil, v1, v4, v5
509 mul5 v29, nil, v1, v4, v5
511 mul6 v30, nil, v1, v4, v5
513 // Finish up and store the result.
514 bfi x11, x13, #32, #32
515 bfi x12, x14, #32, #32
516 stp x11, x12, [x0], #16
523 // On entry, x0 points to the destination/accumulator A; x2 points to
524 // a packed operand X; v4/v5 holds an expanded operand Y; and v31 is
525 // zero. On exit, the accumulator is updated; v28--v30 and x15 hold
526 // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16;
527 // v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the
528 // other SIMD registers are clobbered.
	// NOTE(review): this routine's label (presumably `mla4zc') and
	// the loads of A and X are elided from this view.
531 // Start by loading the operand words from memory.
	// No incoming carries here: spread the accumulator block directly
	// into the carry slots v28--v30/v27, which the `mulI' calls below
	// then accumulate onto.
534 sprdacc v28, v29, v30, v27
536 // Do the multiplication.
537 mul0 v28, t, v1, v4, v5
538 mul1 v29, t, v1, v4, v5
540 mul2 v30, t, v1, v4, v5
543 mul3 v27, t, v1, v4, v5
546 mul4 v28, nil, v1, v4, v5
549 mul5 v29, nil, v1, v4, v5
551 mul6 v30, nil, v1, v4, v5
553 // Finish up and store the result.
554 bfi x11, x13, #32, #32
555 bfi x12, x14, #32, #32
556 stp x11, x12, [x0], #16
563 // On entry, x0 points to the destination; x1 points to a packed
564 // operand U; x2 points to a packed operand X (the modulus); v2/v3
565 // holds an expanded operand V; and v6/v7 holds an expanded operand M
566 // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
567 // is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod
568 // B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1,
569 // and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved;
570 // and x11--x14, x16, x17 and the other SIMD registers are clobbered.
	// NOTE(review): this routine's label (presumably `mmul4') and the
	// loads of U and X are elided; after the four `mulI' calls below
	// the flow presumably joins the shared Montgomery tail that is
	// visible further down -- confirm against the full source.
573 // Start by loading the operand words from memory.
577 // Calculate the low half of W = A + U V, being careful to leave the
	// (continuation of this comment is elided from this view)
579 mul0 v28, nil, v0, v2, v3
580 mul1 v29, nil, v0, v2, v3
582 mul2 v30, nil, v0, v2, v3
585 mul3 v27, nil, v0, v2, v3
590 // On entry, x0 points to the destination/accumulator A; x1 points to
591 // a packed operand U; x2 points to a packed operand X (the modulus);
592 // v2/v3 holds an expanded operand V; and v6/v7 holds an expanded
593 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
594 // accumulator is updated (to zero); v4/v5 hold an expanded factor Y
595 // = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries
596 // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6,
597 // v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the
598 // other SIMD registers are clobbered.
	// NOTE(review): this routine's label (presumably `mmla4'), the
	// operand loads, and the `carry0'/`carry1' steps between the
	// `mulI' groups are elided from this view.
601 // Start by loading the operand words from memory.
	// Spread the accumulator block directly into the carry slots.
605 sprdacc v28, v29, v30, v27
607 // Calculate the low half of W = A + U V, being careful to leave the
609 mul0 v28, t, v0, v2, v3
610 mul1 v29, t, v0, v2, v3
612 mul2 v30, t, v0, v2, v3
615 mul3 v27, t, v0, v2, v3
623 // Piece the result together and ship it back.
624 bfi x11, x13, #32, #32
625 bfi x12, x14, #32, #32
629 // Calculate the low half of the Montgomery factor Y = W M.
	// W (presumably repacked into v16) times the expanded M in v6/v7,
	// into fresh carry registers v18--v21.
630 mul0 v18, nil, v16, v6, v7
631 mul1 v19, nil, v16, v6, v7
633 mul2 v20, nil, v16, v6, v7
636 mul3 v21, nil, v16, v6, v7
643 // Piece the result together, ship it back, and expand.
644 bfi x11, x13, #32, #32
645 bfi x12, x14, #32, #32
650 // Build up the product X Y in the carry slots.
651 mul0 v28, t, v1, v4, v5
652 mul1 v29, t, v1, v4, v5
654 mul2 v30, t, v1, v4, v5
657 mul3 v27, t, v1, v4, v5
661 // And complete the calculation.
662 mul4 v28, nil, v0, v2, v3, v1, v4, v5
665 mul5 v29, nil, v0, v2, v3, v1, v4, v5
667 mul6 v30, nil, v0, v2, v3, v1, v4, v5
669 // Finish up and store the result.
	// The low 128 bits of the output are zero by construction (the
	// header comment above says the accumulator is updated to zero),
	// so store zeros directly.
670 stp xzr, xzr, [x0], #16
677 // On entry, x0 points to the destination/accumulator A; x2 points to
678 // a packed operand X (the modulus); and v6/v7 holds an expanded
679 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
680 // accumulator is updated (to zero); v4/v5 hold an expanded factor Y
681 // = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and
682 // cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are
683 // preserved; and x11--x14, x16, x17 and the other SIMD registers are
	// NOTE(review): this routine's label (presumably `mont4'), the
	// operand loads, and the carry-propagation steps between the
	// `mulI' groups are elided from this view.
687 // Start by loading the operand words from memory.
691 // Calculate Y = A M (mod B).
	// Accumulator (presumably loaded packed into v28) times the
	// expanded M in v6/v7, into fresh carry registers v18--v21.
692 mul0 v18, nil, v28, v6, v7
693 mul1 v19, nil, v28, v6, v7
695 mul2 v20, nil, v28, v6, v7
698 mul3 v21, nil, v28, v6, v7
	// Spread the accumulator into the carry slots for the main pass.
701 sprdacc v28, v29, v30, v27
706 // Piece the result together, ship it back, and expand.
707 bfi x11, x13, #32, #32
708 bfi x12, x14, #32, #32
713 // Calculate the actual result. Well, the carries, at least.
714 mul0 v28, t, v1, v4, v5
715 mul1 v29, t, v1, v4, v5
717 mul2 v30, t, v1, v4, v5
720 mul3 v27, t, v1, v4, v5
724 // And complete the calculation.
725 mul4 v28, nil, v1, v4, v5
728 mul5 v29, nil, v1, v4, v5
730 mul6 v30, nil, v1, v4, v5
732 // Finish up and store the result.
	// Low 128 bits of the output are zero by construction; see the
	// header comment above (`accumulator is updated (to zero)').
733 stp xzr, xzr, [x0], #16
739 ///--------------------------------------------------------------------------
740 /// Bulk multipliers.
742 FUNC(mpx_umul4_arm64_simd)
743 // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
744 // const mpw *bv, const mpw *bvl);
	// Bulk plain multiplication, presumably driving the plain
	// primitives (mul4zc/mul4/mla4zc/mla4 per the variants table
	// above) in an outer loop over bv.  NOTE(review): most of the
	// body -- the inner loops, labels 0/8/9, the tail handling, and
	// ENDFUNC -- is elided from this view; only the loop set-up and
	// restart code is visible.
746 // Establish the arguments and do initial setup.
760 // Prepare for the first iteration.
761 ldr q4, [x3], #16 // Y = bv[0]
763 sub x7, x2, x1 // = inner loop count base
764 // x0 // = dv for inner loop
766 // x3 // = bv for outer loop
767 sub x4, x4, x3 // = outer loop count (decremented)
768 sub x6, x7, #16 // = inner loop count (decremented)
769 mov x2, x1 // = av for inner loop
770 add x5, x0, #16 // = dv for outer loop
771 expand v4, v5 // expand Y
773 cbz x6, 8f // all done?
775 // Continue with the first iteration.
780 // Write out the leftover carry. There can be no tail here.
784 // Set up for the next pass.
785 1: ldr q4, [x3], #16 // Y = bv[i]
786 mov x0, x5 // -> dv[i]
787 mov x2, x1 // -> av[0]
789 sub x6, x7, #16 // = inner loop count (decremented)
790 sub x4, x4, #16 // outer loop count
791 expand v4, v5 // expand Y
800 // Finish off this pass. There was no tail on the previous pass, and
801 // there can be none on this pass.
810 FUNC(mpxmont_mul4_arm64_simd)
811 // void mpxmont_mul4_arm64_simd(mpw *dv,
812 // const mpw *av, const mpw *bv,
813 // const mpw *nv, size_t n,
	// (remaining parameter lines elided)  Bulk Montgomery
	// multiplication, presumably driving the Montgomery primitives
	// (mmul4/dmul4/mmla4/dmla4 per the variants table above).
	// NOTE(review): the inner loops, labels 0/8/9, and ENDFUNC are
	// elided from this view.
816 // Establish the arguments and do initial setup.
834 // Set up the outer loop state and prepare for the first iteration.
835 ldr q2, [x2] // = V = bv[0]
838 // x0 // -> dv for inner loop
839 // x1 // -> av for inner loop
842 add x5, x0, #16 // -> dv
843 add x6, x2, #16 // -> bv
844 mov x2, x3 // -> nv[0]
845 mov x7, x1 // -> av base
846 sub x8, x4, #4 // = inner n (decremented)
847 sub x9, x4, #4 // = outer n (decremented)
848 expand v2, v3 // expand V
849 expand v6, v7 // expand M
851 cbz x8, 8f // done already?
853 // Complete the first inner loop.
856 cbnz x8, 0b // done yet?
858 // Still have carries left to propagate. Rather than store the tail
859 // end in memory, keep it in x10 for later.
862 // Embark on the next iteration. (There must be one. If n = 1 then
863 // we would have bailed above, to label 8. Similarly, the subsequent
864 // iterations can fall into the inner loop immediately.)
	// NOTE(review): the comment on the next line previously read
	// `= Y = bv[i]'; but bv words load into q2, which holds operand V
	// (see `= V = bv[0]' and `expand V' above; Y lives in v4/v5 per
	// the file-top table) -- fixed.
865 1: ldr q2, [x6], #16 // = V = bv[i]
866 mov x0, x5 // -> dv[i]
867 mov x1, x7 // -> av[0]
868 mov x2, x3 // -> nv[0]
875 // Complete the next inner loop.
880 // Still have carries left to propagate, and they overlap the
881 // previous iteration's final tail, so read that and add it.
	// The tail word saved in x10 is shifted into the cg convention
	// (carry in bits 16 and up; see the file-top commentary).
882 add x15, x15, x10, lsl #16
885 // Back again, maybe.
893 // First iteration was short. Write out the carries and we're done.
894 // (This could be folded into the main loop structure, but that would
895 // penalize small numbers more.)
902 FUNC(mpxmont_redc4_arm64_simd)
903 // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
904 // size_t n, const mpw *mi);
	// Bulk Montgomery reduction of dv against the modulus nv, with mi
	// presumably the factor M = -N^{-1} (mod B^4) expanded below.
	// NOTE(review): the register-assignment table and most of the
	// loop bodies (labels 5/8/9, ENDFUNC) are elided from this view.
906 // Establish the arguments and do initial setup.
910 // blocks-of-4 count x6
914 // outer loop count x8
916 // inner loop count x1
925 // Set up the outer loop state and prepare for the first iteration.
928 // x0 // -> dv for inner loop
929 sub x6, x1, x0 // total dv bytes
930 sub x1, x3, #4 // inner loop counter
931 // x2 // -> nv for inner loop
933 add x4, x0, #16 // -> dv for outer loop
934 mov x5, x2 // -> nv base
935 sub x6, x6, x3, lsl #2 // dv carry range bytes
936 sub x8, x3, #4 // outer loop counter
937 sub x6, x6, #16 // dv steam-powered carry bytes
938 expand v6, v7 // expand M
939 and x7, x6, #15 // dv tail length in bytes
940 bic x6, x6, #15 // dv blocks-of-four length in bytes
943 cbz x1, 8f // done already?
947 cbnz x1, 5b // done yet?
949 // Still have carries left to propagate. Adding the accumulator
950 // block into the carries is a little different this time, because
951 // all four accumulator limbs have to be squished into the three
952 // carry registers for `carryprop' to do its thing.
	// Three-register `sprdacc' form (A3 omitted), then fold into the
	// carry registers v28--v30 by plain lane additions.
954 sprdacc v24, v25, v26
955 add v28.2d, v28.2d, v24.2d
956 add v29.2d, v29.2d, v25.2d
957 add v30.2d, v30.2d, v26.2d
961 // Propagate the first group of carries.
966 stp x16, x17, [x0], #16
969 // Continue carry propagation until the end of the buffer.
970 0: ldp x16, x17, [x0]
974 stp x16, x17, [x0], #16
977 // Deal with the tail end. Note that the actual destination length
978 // won't be an exact number of blocks of four, so it's safe to just
979 // drop through here.
992 // All done for this iteration. Start the next.
1008 ///--------------------------------------------------------------------------
1009 /// Testing and performance measurement.
	// NOTE(review): this section is heavily elided in this view.  The
	// table below maps the test harness's operands to registers and
	// stack slots for each test mode.
1013 // dmul smul mmul mont
1019 // vv v2/v3 x4 -- x5 --
1020 // yy v4/v5 x5 x3 x6 --
1021 // mm v6/v7 -- -- -- x4
1023 // cyv x6 x7 x5 stk0 x6
1025 #define STKARG(i) sp, #16 + i
1027 .macro testprologue mode
	// Load and expand the operands needed by MODE.  The `ldr' lines
	// fetching the packed values, and the `.endif'/`.endm' structure,
	// are elided from this view; the `zip' pairs below perform the
	// same expansion as the `expand' macro.
1033 .ifeqs "\mode", "dmul"
1035 zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
1036 zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
1039 zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
1040 zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
1048 mov x6, x7 // -> cyv
1051 .ifeqs "\mode", "smul"
1053 zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
1054 zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
1058 mov x6, x5 // -> cyv
1062 .ifeqs "\mode", "mmul"
1064 zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
1065 zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
1068 zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
1069 zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
1080 ldr x6, [STKARG(0)] // -> cyv
1083 .ifeqs "\mode", "mont"
1085 zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2)
1086 zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0)
	// Load the carry registers c0--c2 -- presumably from the test's
	// carry vector at x3; confirm against the full source.
1098 ld1 {v28.2d-v30.2d}, [x3]
1111 // More complicated than usual because we must mix the general-
1112 // purpose carry back in.
1116 add v28.2d, v28.2d, v0.2d
1117 st1 {v28.2d-v30.2d}, [x3]
1217 ///----- That's all, folks --------------------------------------------------