1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
31 #include "asm-common.h"
37 ///--------------------------------------------------------------------------
40 /// We define a number of primitive fixed-size multipliers from which we can
41 /// construct more general variable-length multipliers.
43 /// The basic trick is the same throughout. In an operand-scanning
44 /// multiplication, the inner multiplication loop multiplies a multiple-
45 /// precision operand by a single precision factor, and adds the result,
46 /// appropriately shifted, to the result. A `finely integrated operand
47 /// scanning' implementation of Montgomery multiplication also adds the
48 /// product of a single-precision `Montgomery factor' and the modulus,
49 /// calculated in the same pass. The more common `coarsely integrated
50 /// operand scanning' alternates main multiplication and Montgomery passes,
51 /// which requires additional carry propagation.
53 /// Throughout both plain-multiplication and Montgomery stages, then, one of
54 /// the factors remains constant throughout the operation, so we can afford
55 /// to take a little time to preprocess it. The transformation we perform is
56 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
57 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
58 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
59 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
60 /// operands, as follows.
63 /// 0 v'_0 v'_1 v''_0 v''_1
64 /// 16 v'_2 v'_3 v''_2 v''_3
66 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
67 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
68 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
69 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
74 /// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
75 /// results. The registers' precise roles rotate during the computation; we
76 /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two
77 /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
78 /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
79 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
80 /// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand
81 /// (broadcast across all lanes of its vector) and an operand in the expanded
82 /// form above produces a result which can be added directly to the
83 /// appropriate carry register. Following a pass of four multiplications, we
84 /// perform some limited carry propagation: let t = c''_0 mod B, and let d =
85 /// c'_0 + t b; then we output z = d mod B, add (floor(d/B), floor(c''_0/B))
86 /// to c1, and cycle the carry registers around, so that c1 becomes c0, and
87 /// the old (implicitly) zeroed c0 becomes c3.
89 /// On 64-bit AMD64, we have a reasonable number of registers: the expanded
90 /// operands are kept in registers. The packed operands are read from memory
91 /// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
92 /// multiplications; and XMM6 and XMM7 are used for combining the results.
93 /// The following conventional argument names and locations are used
96 /// Arg Format Location Notes
99 /// X packed [RBX] In Montgomery multiplication, X = N
100 /// V expanded XMM8/XMM9
101 /// Y expanded XMM10/XMM11 In Montgomery multiplication, Y = (A + U V) M
102 /// M expanded (see below) Montgomery factor, M = -N^{-1} (mod B^4)
103 /// N Modulus, for Montgomery multiplication
104 /// A packed [RDI] Destination/accumulator
105 /// C carry XMM12--XMM15
107 /// The calculation is some variant of
109 /// A' + C' B^4 <- U V + X Y + A + C
111 /// The low-level functions fit into a fairly traditional (finely-integrated)
112 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
115 /// The variants are as follows.
117 /// Function Variant Use i j
119 /// mmul4 A = C = 0, Y = M Montgomery 0 0
120 /// dmul4 A = 0 Montgomery 0 +
121 /// mmla4 C = 0, Y = M Montgomery + 0
122 /// dmla4 exactly as shown Montgomery + +
123 /// mont4 U = C = 0, V = M Montgomery any 0
125 /// mul4zc U = V = A = C = 0 Plain 0 0
126 /// mul4 U = V = A = 0 Plain 0 +
127 /// mla4zc U = V = C = 0 Plain + 0
128 /// mla4 U = V = 0 Plain + +
130 /// The `mmul4' and `mmla4' functions are also responsible for calculating
131 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
134 ///--------------------------------------------------------------------------
135 /// Macro definitions.
137 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
138 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
139 // of the product in registers D0, D1, D2, D3.
// Any of D1--D3 may be left as `nil' if that piece of the product is
// not wanted.
140 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
142 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
145 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
148 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
151 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
154 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
157 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
160 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
163 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
165 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
168 .macro accum c0, c1=nil, c2=nil, c3=nil
169 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
170 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
171 // updating that register.
// Typically used immediately after `mulcore' has left the product
// pieces in XMM0--XMM3 (see `mulacc' below).
184 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
185 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
186 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
187 // contains zero, but needs clearing; in practice, we store the
188 // product directly rather than attempting to add. On completion,
189 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
// Z3P = `t' branch: the top product piece is written straight into
// C3, overwriting the stale contents rather than adding.
191 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
// Otherwise: compute all four pieces into scratch registers and fold
// them into the carries with `accum'.
194 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
195 accum \c0, \c1, \c2, \c3
199 .macro propout d, pos, c, cc=nil
200 // Calculate an output word from C, and store it at POS in D;
201 // propagate carries out from C to CC in preparation for a rotation
202 // of the carry registers. D is an XMM register; the POS is either
203 // `lo' or `hi' according to whether the output word should be in
204 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
205 // completion, XMM3 is clobbered. If CC is `nil', then the
206 // contribution which would have been added to it is left in C.
// Fold the low 16 bits of c'' into c', so that the low 32 bits of
// the register hold the output word d mod B (see the discussion of
// carry propagation at the top of the file: t = c'' mod B, and
// d = c' + t b).
207 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
208 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
209 pslldq xmm3, 2 // (t b; 0)
210 paddq \c, xmm3 // (c' + t b; c'')
// Push the remaining high bits up into the next carry register.
216 psrlq \c, 32 // floor(c/B)
218 paddq \cc, \c // propagate up
222 .macro endprop d, pos, c, t
223 // On entry, C contains a carry register. On exit, the low 32 bits
224 // of the value represented in C are written at POS in D, and the
225 // remaining bits are left at the bottom of T.
// Assemble the full represented value c = c' + c'' b in one 64-bit
// lane of T.
227 psllq \t, 16 // (?; c'' b)
228 pslldq \c, 8 // (0; c')
229 paddq \t, \c // (?; c' + c'' b)
230 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
// Discard the 32 bits just output, leaving the residual carry.
236 psrldq \t, 4 // (floor(c/B); 0)
239 .macro expand z, a, b, c=nil, d=nil
240 // On entry, A and C hold packed 128-bit values, and Z is zero. On
241 // exit, A:B and C:D together hold the same values in expanded
242 // form. If C is `nil', then only expand A to A:B.
243 movdqa \b, \a // (a_0, a_1; a_2, a_3)
245 movdqa \d, \c // (c_0, c_1; c_2, c_3)
// Interleave with the zero register to split each 32-bit piece into
// two 16-bit halves, each sitting in its own 32-bit cell.
247 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
248 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
250 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
251 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
// Rearrange into the expanded layout described at the top of the
// file: low halves in lanes 0 and 1, high halves in lanes 2 and 3.
253 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
254 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
256 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
257 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
261 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
262 // On entry, C0, C1, C2, C3 are carry registers representing a value
263 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
264 // C3, T, and U are clobbered; and the high bits of Y are stored in
265 // HI, if this is not `nil'.
267 // The first step is to eliminate the `double-prime' pieces -- i.e.,
268 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
269 // them into the 32-bit-aligned pieces above and below. But before
270 // we can do that, we must gather them together.
273 punpcklqdq \t, \c2 // (y'_0; y'_2)
274 punpckhqdq \c0, \c2 // (y''_0; y''_2)
275 punpcklqdq \u, \c3 // (y'_1; y'_3)
276 punpckhqdq \c1, \c3 // (y''_1; y''_3)
278 // Now split the double-prime pieces. The high (up to) 48 bits will
279 // go up; the low 16 bits go down.
284 psrlq \c0, 16 // high parts of (y''_0; y''_2)
285 psrlq \c1, 16 // high parts of (y''_1; y''_3)
286 psrlq \c2, 32 // low parts of (y''_0; y''_2)
287 psrlq \c3, 32 // low parts of (y''_1; y''_3)
291 pslldq \c1, 8 // high part of (0; y''_1)
293 paddq \t, \c2 // propagate down
295 paddq \t, \c1 // and up: (y_0; y_2)
296 paddq \u, \c0 // (y_1; y_3)
298 psrldq \hi, 8 // high part of (y''_3; 0)
301 // Finally extract the answer. This complicated dance is better than
302 // storing to memory and loading, because the piecemeal stores
303 // inhibit store forwarding.
// (Below, y^*_i = y_i mod B denotes a 32-bit output piece.)
304 movdqa \c3, \t // (y_0; ?)
305 movdqa \lo, \t // (y^*_0, ?; ?, ?)
306 psrldq \t, 8 // (y_2; 0)
307 psrlq \c3, 32 // (floor(y_0/B); ?)
308 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
309 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
310 psrldq \u, 8 // (y_3; 0)
311 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
312 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
313 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
314 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
315 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
320 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
322 psrlq \t, 32 // very high bits of y
324 punpcklqdq \hi, \u // carry up
326 punpckldq \lo, \c1 // y mod B^4
330 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
331 // hold the incoming carry registers c0, c1, and c2 representing a
334 // On exit, the carry registers, including XMM15, are updated to hold
335 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
336 // registers are preserved.
337 movd xmm0, [rdi + 0] // (a_0; 0)
338 movd xmm1, [rdi + 4] // (a_1; 0)
339 movd xmm2, [rdi + 8] // (a_2; 0)
340 movd xmm15, [rdi + 12] // (a_3; 0)
341 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
342 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
343 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
346 ///--------------------------------------------------------------------------
347 /// Primitive multipliers and related utilities.
350 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
351 // expanded form. Store the low 128 bits of the represented carry to
352 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
353 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
354 // XMM14 are clobbered.
357 propout xmm0, lo, xmm12, xmm13
358 propout xmm1, lo, xmm13, xmm14
359 propout xmm0, hi, xmm14, nil
360 endprop xmm1, hi, xmm14, xmm12
368 // On entry, RDI points to the destination buffer; RAX and RBX point
369 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
370 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
371 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
373 // On exit, we write the low 128 bits of the sum C + U V + X Y to
374 // [RDI], and update the carry registers with the carry out. The
375 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
376 // registers are preserved.
382 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
383 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
384 propout xmm6, lo, xmm12, xmm13
386 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
387 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
388 propout xmm7, lo, xmm13, xmm14
390 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
391 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
392 propout xmm6, hi, xmm14, xmm15
394 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
395 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
396 propout xmm7, hi, xmm15, xmm12
405 // On entry, RDI points to the destination buffer, which also
406 // contains an addend A to accumulate; RAX and RBX point to the
407 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
408 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
409 // incoming carry registers c0, c1, and c2 representing a carry-in C;
410 // c3 is assumed to be zero.
412 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
413 // [RDI], and update the carry registers with the carry out. The
414 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
415 // registers are preserved.
422 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
423 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
424 propout xmm6, lo, xmm12, xmm13
426 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
427 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
428 propout xmm7, lo, xmm13, xmm14
430 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
431 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
432 propout xmm6, hi, xmm14, xmm15
434 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
435 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
436 propout xmm7, hi, xmm15, xmm12
445 // On entry, RDI points to the destination buffer; RBX points to a
446 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
448 // On exit, we write the low 128 bits of the product X Y to [RDI],
449 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
450 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
451 // general-purpose registers are preserved.
456 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
457 propout xmm6, lo, xmm12, xmm13
459 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
460 propout xmm7, lo, xmm13, xmm14
462 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
463 propout xmm6, hi, xmm14, xmm15
465 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
466 propout xmm7, hi, xmm15, xmm12
475 // On entry, RDI points to the destination buffer; RBX points to a
476 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
477 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
478 // c2, representing a carry-in C; c3 is assumed to be zero.
480 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
481 // and update the carry registers with the carry out. The registers
482 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
483 // general-purpose registers are preserved.
488 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
489 propout xmm6, lo, xmm12, xmm13
491 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
492 propout xmm7, lo, xmm13, xmm14
494 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
495 propout xmm6, hi, xmm14, xmm15
497 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
498 propout xmm7, hi, xmm15, xmm12
507 // On entry, RDI points to the destination buffer, which also
508 // contains an addend A to accumulate; RBX points to a packed operand
509 // X; and XMM10/XMM11 hold an expanded operand Y.
511 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
512 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
513 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
514 // general-purpose registers are preserved.
518 movd xmm12, [rdi + 0]
519 movd xmm13, [rdi + 4]
520 movd xmm14, [rdi + 8]
521 movd xmm15, [rdi + 12]
523 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
524 propout xmm6, lo, xmm12, xmm13
526 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
527 propout xmm7, lo, xmm13, xmm14
529 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
530 propout xmm6, hi, xmm14, xmm15
532 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
533 propout xmm7, hi, xmm15, xmm12
542 // On entry, RDI points to the destination buffer, which also
543 // contains an addend A to accumulate; RBX points to a packed operand
544 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
545 // XMM14 hold the incoming carry registers c0, c1, and c2,
546 // representing a carry-in C; c3 is assumed to be zero.
548 // On exit, we write the low 128 bits of the sum A + C + X Y to
549 // [RDI], and update the carry registers with the carry out. The
550 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
551 // general-purpose registers are preserved.
557 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
558 propout xmm6, lo, xmm12, xmm13
560 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
561 propout xmm7, lo, xmm13, xmm14
563 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
564 propout xmm6, hi, xmm14, xmm15
566 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
567 propout xmm7, hi, xmm15, xmm12
576 // On entry, RDI points to the destination buffer; RAX and RBX point
577 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
578 // the expanded operands V and M. The stack pointer must be 8 modulo
579 // 16 (as usual for AMD64 ABIs).
581 // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
582 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
583 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
584 // XMM15 are clobbered; the general-purpose registers are preserved.
587 stalloc 48 + 8 // space for the carries
591 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
593 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
594 propout xmm7, lo, xmm12, xmm13
599 // On entry, RDI points to the destination buffer, which also
600 // contains an addend A to accumulate; RAX and RBX point to the
601 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
602 // expanded operands V and M. The stack pointer must be 8 modulo 16
603 // (as usual for AMD64 ABIs).
605 // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
606 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
607 // remaining carry in XMM12, XMM13, and XMM14. The registers
608 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
612 stalloc 48 + 8 // space for the carries
613 # define STKTMP(i) [SP + i]
616 # define STKTMP(i) [SP + i - 48 - 8] // use red zone
620 movd xmm12, [rdi + 0]
621 movd xmm13, [rdi + 4]
622 movd xmm14, [rdi + 8]
623 movd xmm15, [rdi + 12]
625 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
627 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
628 propout xmm7, lo, xmm12, xmm13
630 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
631 propout xmm6, lo, xmm13, xmm14
633 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
634 propout xmm7, hi, xmm14, xmm15
636 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
637 propout xmm6, hi, xmm15, xmm12
639 // Prepare W, and stash carries for later.
641 movdqa STKTMP( 0), xmm12
642 movdqa STKTMP(16), xmm13
643 movdqa STKTMP(32), xmm14
645 // Calculate Y = W M. We just about have enough spare registers to
647 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
649 // Start expanding W back into the main carry registers...
654 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
655 accum xmm4, xmm5, xmm6
657 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
658 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
660 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
667 mulcore xmm7, 3, xmm10, xmm11, xmm0
670 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
671 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
672 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
673 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
675 // That's lots of pieces. Now we have to assemble the answer.
676 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
680 expand xmm2, xmm10, xmm11
682 // Finish the calculation by adding the Montgomery product.
683 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
684 propout xmm6, lo, xmm12, xmm13
686 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
687 propout xmm7, lo, xmm13, xmm14
689 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
690 propout xmm6, hi, xmm14, xmm15
692 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
693 propout xmm7, hi, xmm15, xmm12
697 // And add on the carry we calculated earlier.
698 paddq xmm12, STKTMP( 0)
699 paddq xmm13, STKTMP(16)
700 paddq xmm14, STKTMP(32)
702 // And, with that, we're done.
714 // On entry, RDI points to the destination buffer holding a packed
715 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
716 // expanded operand M.
718 // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
719 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
720 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
721 // and XMM15 are clobbered; the general-purpose registers are
727 // Calculate Y = W M. Avoid the standard carry registers, because
728 // we're setting something else up there.
729 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
731 // Start expanding W back into the main carry registers...
736 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
737 accum xmm4, xmm5, xmm6
739 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
740 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
742 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
749 mulcore xmm7, 3, xmm8, xmm9, xmm0
752 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
753 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
754 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
755 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
757 // That's lots of pieces. Now we have to assemble the answer.
758 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
762 expand xmm2, xmm10, xmm11
764 // Finish the calculation by adding the Montgomery product.
765 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
766 propout xmm6, lo, xmm12, xmm13
768 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
769 propout xmm7, lo, xmm13, xmm14
771 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
772 propout xmm6, hi, xmm14, xmm15
774 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
775 propout xmm7, hi, xmm15, xmm12
779 // And, with that, we're done.
784 ///--------------------------------------------------------------------------
785 /// Bulk multipliers.
787 FUNC(mpx_umul4_amd64_avx)
794 FUNC(mpx_umul4_amd64_sse2)
795 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
796 // const mpw *bv, const mpw *bvl);
798 // Establish the arguments and do initial setup.
801 // inner loop dv rdi rdi*
802 // inner loop av rbx* rbx*
803 // outer loop dv r10 rcx
804 // outer loop bv rcx r9
850 // Prepare for the first iteration.
852 movdqu xmm10, [BV] // bv[0]
856 expand xmm0, xmm10, xmm11
860 cmp rbx, AVL // all done?
864 // Continue with the first iteration.
868 cmp rbx, AVL // all done?
871 // Write out the leftover carry. There can be no tail here.
873 cmp BV, BVL // more passes to do?
877 // Set up for the next pass.
878 1: movdqu xmm10, [BV] // bv[i]
879 mov rdi, DV // -> dv[i]
881 expand xmm0, xmm10, xmm11
882 mov rbx, AV // -> av[0]
888 cmp rbx, AVL // done yet?
899 // Finish off this pass. There was no tail on the previous pass, and
900 // there can be none on this pass.
939 FUNC(mpxmont_mul4_amd64_avx)
946 FUNC(mpxmont_mul4_amd64_sse2)
947 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
948 // const mpw *nv, size_t n, const mpw *mi);
950 // Establish the arguments and do initial setup.
953 // inner loop dv rdi rdi*
954 // inner loop av rax rax
955 // inner loop nv rbx* rbx*
957 // outer loop dv r10 rcx
958 // outer loop bv rdx r8
1014 // Establish the expanded operands.
1016 movdqu xmm8, [BV] // bv[0]
1017 movdqu xmm10, [MI] // mi
1018 expand xmm0, xmm8, xmm9, xmm10, xmm11
1020 // Set up the outer loop state and prepare for the first iteration.
1021 mov rax, AV // -> U = av[0]
1022 mov rbx, NV // -> X = nv[0]
1023 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
1024 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
1031 cmp rax, AVL // done already?
1035 // Complete the first inner loop.
1040 cmp rax, AVL // done yet?
1043 // Still have carries left to propagate.
1045 movd [rdi + 16], xmm12
1048 // Embark on the next iteration. (There must be one. If n = 1, then
1049 // we would have bailed above, to label 8. Similarly, the subsequent
1050 // iterations can fall into the inner loop immediately.)
1052 movdqu xmm8, [BV] // bv[i]
1053 movdqu xmm10, [MI] // mi
1054 mov rdi, DV // -> Z = dv[i]
1055 mov rax, AV // -> U = av[0]
1056 mov rbx, NV // -> X = nv[0]
1057 expand xmm0, xmm8, xmm9, xmm10, xmm11
1066 // Complete the next inner loop.
1074 // Still have carries left to propagate, and they overlap the
1075 // previous iteration's final tail, so read that in and add it.
1079 movd [rdi + 16], xmm12
1081 // Back again, maybe.
1112 // First iteration was short. Write out the carries and we're done.
1113 // (This could be folded into the main loop structure, but that would
1114 // penalize small numbers more.)
1116 movd [rdi + 16], xmm12
1136 FUNC(mpxmont_redc4_amd64_avx)
1143 FUNC(mpxmont_redc4_amd64_sse2)
1144 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1145 // size_t n, const mpw *mi);
1147 // Establish the arguments and do initial setup.
1150 // inner loop dv rdi rdi*
1152 // blocks-of-4 dv limit rsi rdx
1153 // inner loop nv rbx* rbx*
1155 // outer loop dv r10 rcx
1156 // outer loop dv limit r11 r11
1211 // Establish the expanded operands and the blocks-of-4 dv limit.
1213 mov DVL, DVL4 // -> dv[n] = dv limit
1214 sub DVL4, DV // length of dv in bytes
1215 movdqu xmm8, [MI] // mi
1216 and DVL4, ~15 // mask off the tail end
1217 expand xmm0, xmm8, xmm9
1218 add DVL4, DV // find limit
1220 // Set up the outer loop state and prepare for the first iteration.
1221 mov rbx, NV // -> X = nv[0]
1222 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1223 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1228 cmp rbx, NVL // done already?
1232 // Complete the first inner loop.
1236 cmp rbx, NVL // done yet?
1239 // Still have carries left to propagate.
1251 // Continue carry propagation until the end of the buffer.
1253 mov C, 0 // preserves flags
1254 adc dword ptr [rdi + 4], 0
1255 adc dword ptr [rdi + 8], 0
1256 adc dword ptr [rdi + 12], 0
1262 // Deal with the tail end. Note that the actual destination length
1263 // won't be an exact number of blocks of four, so it's safe to just
1264 // drop through here.
1272 // All done for this iteration. Start the next.
1273 cmp DV, DVLO // all done yet?
1275 mov rdi, DV // -> Z = dv[i]
1276 mov rbx, NV // -> X = nv[0]
1321 ///--------------------------------------------------------------------------
1322 /// Testing and performance measurement.
1333 # define ARG6 STKARG(0)
1334 # define ARG7 STKARG(1)
1335 # define ARG8 STKARG(2)
1336 # define STKARG_OFFSET 16
1343 # define ARG4 STKARG(0)
1344 # define ARG5 STKARG(1)
1345 # define ARG6 STKARG(2)
1346 # define ARG7 STKARG(3)
1347 # define ARG8 STKARG(4)
1348 # define STKARG_OFFSET 224
1350 #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
1353 // dmul smul mmul mont dmul smul mmul mont
1356 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1357 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1358 // y r10 -- -- rdx rdx -- -- r8 r8
1359 // u r11 rdx -- rcx -- r8 -- r9 --
1360 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1361 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1362 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1363 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1364 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1370 mov [\v + 8*\n - 8], rax
1377 sub rax, [\v + 8*\n - 8]
1378 mov [\v + 8*\n - 8], rax
1382 .macro testprologue mode
1386 .ifeqs "\mode", "dmul"
1395 .ifeqs "\mode", "smul"
1400 .ifeqs "\mode", "mmul"
1411 .ifeqs "\mode", "mont"
1434 .ifeqs "\mode", "dmul"
1446 .ifeqs "\mode", "smul"
1454 .ifeqs "\mode", "mmul"
1467 .ifeqs "\mode", "mont"
1480 .ifeqs "\mode", "dmul"
1481 expand xmm0, xmm8, xmm9, xmm10, xmm11
1483 .ifeqs "\mode", "smul"
1484 expand xmm0, xmm10, xmm11
1486 .ifeqs "\mode", "mmul"
1487 expand xmm0, xmm8, xmm9, xmm10, xmm11
1489 .ifeqs "\mode", "mont"
1490 expand xmm0, xmm8, xmm9
1514 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1515 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1516 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1519 .macro testtop u=nil
1534 movdqu [rcx + 0], xmm12
1535 movdqu [rcx + 16], xmm13
1536 movdqu [rcx + 32], xmm14
1604 pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
1605 pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
1606 movdqu [r10 + 0], xmm10
1607 movdqu [r10 + 16], xmm11
1617 pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
1618 pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
1619 movdqu [r10 + 0], xmm10
1620 movdqu [r10 + 16], xmm11
1630 pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
1631 pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
1632 movdqu [r10 + 0], xmm10
1633 movdqu [r10 + 16], xmm11
1640 ///----- That's all, folks --------------------------------------------------