/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.
///
///--------------------------------------------------------------------------

#include "asm-common.h"
///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the result.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
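///
/// To see the Montgomery factor at work on a single digit, here is a
/// standalone C sketch (not part of this file's build; the Newton--Hensel
/// loop for the inverse is an assumption of the example, not something the
/// code below relies on):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        int main(void)
///        {
///            uint32_t n = 0x89abcdefu;   /* modulus digit; must be odd */
///            uint32_t mi = 1;            /* Newton--Hensel: mi = n^{-1} */
///            for (int i = 0; i < 5; i++) mi *= 2 - n*mi;
///            mi = -mi;                   /* negate: mi = -n^{-1} mod 2^32 */
///            uint32_t a = 0x12345678u;   /* digit we want to clear */
///            uint32_t q = a*mi;          /* the Montgomery factor */
///            uint64_t t = a + (uint64_t)q*n; /* a + q n == 0 (mod 2^32) */
///            assert((uint32_t)t == 0);   /* low digit vanishes */
///            return 0;
///        }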
/// In both the plain-multiplication and Montgomery stages, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///          0  v'_0    v'_1    v''_0   v''_1
///         16  v'_2    v'_3    v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
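///
/// As a quick check of the headroom claim, in standalone C (a sketch, not
/// part of this file's build):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        int main(void)
///        {
///            uint32_t r = 0xffffffffu;   /* worst-case 32-bit factor */
///            uint32_t v = 0x89abcdefu;   /* arbitrary piece v_i */
///            uint32_t vlo = v & 0xffffu; /* v'_i */
///            uint32_t vhi = v >> 16;     /* v''_i */
///            uint64_t plo = (uint64_t)r*vlo, phi = (uint64_t)r*vhi;
///            assert(!(plo >> 48) && !(phi >> 48));   /* 16 bits spare */
///            assert(plo + (phi << 16) == (uint64_t)r*v); /* v = v' + v'' b */
///            return 0;
///        }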
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
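///
/// In scalar terms, one such propagation step looks like this (a standalone
/// C sketch; the concrete values are arbitrary 48-bit-ish halves):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        int main(void)
///        {
///            uint64_t c0lo = 0x000123456789abcdull;  /* c'_0 */
///            uint64_t c0hi = 0x0000fedcba987654ull;  /* c''_0 */
///            uint64_t c1lo = 42, c1hi = 17;          /* c'_1, c''_1 */
///            uint64_t t = c0hi & 0xffffffffu;        /* c''_0 mod B */
///            uint64_t d = c0lo + (t << 16);          /* c'_0 + t b */
///            uint32_t z = (uint32_t)d;               /* output word */
///
///            c1lo += d >> 32;                        /* floor(d/B) */
///            c1hi += c0hi >> 32;                     /* floor(c''_0/B) */
///
///            /* check against arithmetic on the represented values */
///            uint64_t c0 = c0lo + (c0hi << 16);
///            assert(z == (uint32_t)c0);
///            assert(c1lo + (c1hi << 16) ==
///                   42 + (17ull << 16) + (c0 >> 32));
///            return 0;
///        }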
///--------------------------------------------------------------------------
/// Macro definitions.
.macro  mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
        // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
        // of the product in registers D0, D1, D2, D3.
        pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
        movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
        movdqa  \d3, \shi               // (s'_2, s'_3; s''_2, s''_3)
  .endif
  .ifnes "\d1", "nil"
        psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
        movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
  .endif
  .ifnes "\d3", "nil"
        psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
        pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
        pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
        pmuludq \d2, \shi               // (r_i s'_2; r_i s''_2)
  .endif
        pmuludq \d0, \slo               // (r_i s'_0; r_i s''_0)
.endm
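
/// For reference, the same dance in SSE2 intrinsics (a sketch: R_I arrives
/// ready-broadcast rather than extracted with `pshufd', SLO/SHI are the two
/// expanded halves, and the function name is illustrative, not Catacomb's):
///
///        #include <emmintrin.h>
///        #include <stdint.h>
///        #include <assert.h>
///
///        static void mulcore_c(uint32_t r_i, __m128i slo, __m128i shi,
///                              __m128i d[4])
///        {
///            __m128i r = _mm_set1_epi32((int)r_i);
///            d[0] = _mm_mul_epu32(r, slo);   /* (r_i s'_0; r_i s''_0) */
///            d[1] = _mm_mul_epu32(r, _mm_srli_si128(slo, 4));
///                                            /* (r_i s'_1; r_i s''_1) */
///            d[2] = _mm_mul_epu32(r, shi);   /* (r_i s'_2; r_i s''_2) */
///            d[3] = _mm_mul_epu32(r, _mm_srli_si128(shi, 4));
///                                            /* (r_i s'_3; r_i s''_3) */
///        }
///
///        int main(void)
///        {
///            __m128i slo = _mm_setr_epi32(0x2222, 0x4444, 0x1111, 0x3333);
///            __m128i shi = _mm_setr_epi32(0x6666, 0x8888, 0x5555, 0x7777);
///            __m128i d[4];
///            uint64_t p[2];
///            mulcore_c(3, slo, shi, d);
///            _mm_storeu_si128((__m128i *)p, d[0]);
///            assert(p[0] == 3*0x2222 && p[1] == 3*0x1111);
///            return 0;
///        }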
.macro  accum   c0, c1=nil, c2=nil, c3=nil
        // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
        // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
        // updating that register.
        paddq   \c0, xmm0
  .ifnes "\c1", "nil"
        paddq   \c1, xmm1
  .endif
  .ifnes "\c2", "nil"
        paddq   \c2, xmm2
  .endif
  .ifnes "\c3", "nil"
        paddq   \c3, xmm3
  .endif
.endm
.macro  mulacc  r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
        // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
        // carry registers C0, C1, C2, C3.  If Z3P is `t' then C3 notionally
        // contains zero, but needs clearing; in practice, we store the
        // product directly rather than attempting to add.  On completion,
        // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
  .ifeqs "\z3p", "t"
        mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
        accum   \c0, \c1, \c2
  .else
        mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
        accum   \c0, \c1, \c2, \c3
  .endif
.endm
.macro  propout d, pos, c, cc=nil
        // Calculate an output word from C, and store it at POS in D;
        // propagate carries out from C to CC in preparation for a rotation
        // of the carry registers.  D is an XMM register; the POS is either
        // `lo' or `hi' according to whether the output word should be in
        // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
        // completion, XMM3 is clobbered.  If CC is `nil', then the
        // contribution which would have been added to it is left in C.
        pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
        psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
        pslldq  xmm3, 2                 // (t b; 0)
        paddq   \c, xmm3                // (c' + t b; c'')
  .ifeqs "\pos", "lo"
        movdqa  \d, \c                  // copy the word into lane 0
  .else
        punpckldq \d, \c                // splice the word into lane 1
  .endif
        psrlq   \c, 32                  // floor(c/B)
  .ifnes "\cc", "nil"
        paddq   \cc, \c                 // propagate up
  .endif
.endm
.macro  endprop d, pos, c, t
        // On entry, C contains a carry register.  On exit, the low 32 bits
        // of the value represented in C are written at POS in D, and the
        // remaining bits are left at the bottom of T.
        movdqa  \t, \c                  // (c'; c'')
        psllq   \t, 16                  // (?; c'' b)
        pslldq  \c, 8                   // (0; c')
        paddq   \t, \c                  // (?; c' + c'' b)
        psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
  .ifeqs "\pos", "lo"
        movdqa  \d, \t                  // copy the word into lane 0
  .else
        punpckldq \d, \t                // splice the word into lane 1
  .endif
        psrldq  \t, 4                   // (floor(c/B); 0)
.endm
.macro  expand  z, a, b, c=nil, d=nil
        // On entry, A and C hold packed 128-bit values, and Z is zero.  On
        // exit, A:B and C:D together hold the same values in expanded
        // form.  If C is `nil', then only expand A to A:B.
        movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
        movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
  .endif
        punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
        punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
        punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
        punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
  .endif
        pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
        pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
        pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
        pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm
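
/// The same transformation in SSE2 intrinsics, for a single operand (a
/// standalone sketch; the helper name is illustrative):
///
///        #include <emmintrin.h>
///        #include <stdint.h>
///        #include <assert.h>
///
///        static void expand1(__m128i a, __m128i *lo, __m128i *hi)
///        {
///            __m128i z = _mm_setzero_si128();
///            __m128i l = _mm_unpacklo_epi16(a, z);
///            __m128i h = _mm_unpackhi_epi16(a, z);
///            /* (a'_0, a''_0; a'_1, a''_1) -> (a'_0, a'_1; a''_0, a''_1) */
///            *lo = _mm_shuffle_epi32(l, _MM_SHUFFLE(3, 1, 2, 0));
///            /* (a'_2, a''_2; a'_3, a''_3) -> (a'_2, a'_3; a''_2, a''_3) */
///            *hi = _mm_shuffle_epi32(h, _MM_SHUFFLE(3, 1, 2, 0));
///        }
///
///        int main(void)
///        {
///            uint32_t a[4] = { 0x11112222u, 0x33334444u,
///                              0x55556666u, 0x77778888u };
///            uint32_t lo[4], hi[4];
///            __m128i l, h;
///            expand1(_mm_loadu_si128((const __m128i *)a), &l, &h);
///            _mm_storeu_si128((__m128i *)lo, l);
///            _mm_storeu_si128((__m128i *)hi, h);
///            assert(lo[0] == 0x2222 && lo[1] == 0x4444 &&
///                   lo[2] == 0x1111 && lo[3] == 0x3333);
///            assert(hi[0] == 0x6666 && hi[1] == 0x8888 &&
///                   hi[2] == 0x5555 && hi[3] == 0x7777);
///            return 0;
///        }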
.macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
        // On entry, C0, C1, C2, C3 are carry registers representing a value
        // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
        // C3, T, and U are clobbered; and the high bits of Y are stored in
        // HI, if this is not `nil'.

        // The first step is to eliminate the `double-prime' pieces -- i.e.,
        // the ones offset by 16 bits from a 32-bit boundary -- by carrying
        // them into the 32-bit-aligned pieces above and below.  But before
        // we can do that, we must gather them together.
        movdqa  \t, \c0
        movdqa  \u, \c1
        punpcklqdq \t, \c2              // (y'_0; y'_2)
        punpckhqdq \c0, \c2             // (y''_0; y''_2)
        punpcklqdq \u, \c3              // (y'_1; y'_3)
        punpckhqdq \c1, \c3             // (y''_1; y''_3)

        // Now split the double-prime pieces.  The high (up to) 48 bits will
        // go up; the low 16 bits go down.
        movdqa  \c2, \c0
        movdqa  \c3, \c1
        psllq   \c2, 48
        psllq   \c3, 48
        psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
        psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
        psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
        psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
        movdqa  \hi, \c1
  .endif
        pslldq  \c1, 8                  // high part of (0; y''_1)

        paddq   \t, \c2                 // propagate down
        paddq   \u, \c3
        paddq   \t, \c1                 // and up: (y_0; y_2)
        paddq   \u, \c0                 // (y_1; y_3)
  .ifnes "\hi", "nil"
        psrldq  \hi, 8                  // high part of (y''_3; 0)
  .endif
        // Finally extract the answer.  This complicated dance is better than
        // storing to memory and loading, because the piecemeal stores
        // inhibit store forwarding.
        movdqa  \c3, \t                 // (y_0; ?)
        movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
        psrldq  \t, 8                   // (y_2; 0)
        psrlq   \c3, 32                 // (floor(y_0/B); ?)
        paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
        movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
        psrldq  \u, 8                   // (y_3; 0)
        psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2); ?)
        paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2); ?)
        punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
        psrlq   \c3, 32                 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
        paddq   \c3, \u                 // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
        movdqa  \t, \c3
        pxor    \u, \u
  .endif
        punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
        psrlq   \t, 32                  // very high bits of y
        paddq   \hi, \t
        punpcklqdq \hi, \u              // carry up
  .endif
        punpckldq \lo, \c1              // y mod B^4
.endm
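
/// In scalar terms, `squash' amounts to the following (a standalone C
/// sketch over the (lo, hi) pair representation, where each carry register
/// has value lo + hi*2^16; the names are illustrative):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        struct creg { uint64_t lo, hi; };   /* value = lo + hi*2^16 */
///
///        static uint64_t squash_c(uint32_t out[4], const struct creg c[4])
///        {
///            uint64_t acc = 0, up = 0;
///            for (int j = 0; j < 4; j++) {
///                /* double-prime piece: low 16 bits go down, ... */
///                acc += c[j].lo + ((c[j].hi & 0xffffu) << 16) + up;
///                up = c[j].hi >> 16;     /* ... the rest goes up */
///                out[j] = (uint32_t)acc; /* y^*_j */
///                acc >>= 32;
///            }
///            return acc + up;            /* high bits of Y */
///        }
///
///        int main(void)
///        {
///            struct creg c[4] = { { 5, 7 }, { 1, 2 }, { 3, 0 }, { 0, 0 } };
///            uint32_t out[4];
///            uint64_t hi = squash_c(out, c);
///            assert(out[0] == 5 + (7u << 16)); /* y_0 = y'_0 + y''_0 b */
///            assert(out[1] == 1 + (2u << 16) && out[2] == 3 && !out[3]);
///            assert(!hi);
///            return 0;
///        }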
        // On entry, RDI points to a packed addend A, and XMM12, XMM13, and
        // XMM14 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C.
        //
        // On exit, the carry registers, including XMM15, are updated to hold
        // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
        // registers are preserved.
        movd    xmm0, [rdi +  0]        // (a_0; 0)
        movd    xmm1, [rdi +  4]        // (a_1; 0)
        movd    xmm2, [rdi +  8]        // (a_2; 0)
        movd    xmm15, [rdi + 12]       // (a_3; 0)
        paddq   xmm12, xmm0             // (c'_0 + a_0; c''_0)
        paddq   xmm13, xmm1             // (c'_1 + a_1; c''_1)
        paddq   xmm14, xmm2             // (c'_2 + a_2; c''_2)
///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

        // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
        // expanded form.  Store the low 128 bits of the represented carry to
        // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
        // in the low 32 bits of XMM12.  On exit, XMM0, XMM1, XMM3, XMM13,
        // and XMM14 are clobbered.
        propout xmm0, lo, xmm12, xmm13
        propout xmm1, lo, xmm13, xmm14
        propout xmm0, hi, xmm14, nil
        endprop xmm1, hi, xmm14, xmm12
        // On entry, RDI points to the destination buffer; RAX and RBX point
        // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
        // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
        // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + U V + X Y to
        // [RDI], and update the carry registers with the carry out.  The
        // registers XMM0--XMM7 and XMM15 are clobbered; the general-purpose
        // registers are preserved.

        mulacc  xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
        propout xmm7, hi, xmm15, xmm12
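
/// Putting the macros together, this step amounts to the following scalar
/// model (standalone C; the helper names are illustrative, and an array
/// stands in for the register rotation):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        struct creg { uint64_t lo, hi; };   /* value = lo + hi*2^16 */
///
///        static uint32_t propout_c(struct creg *c, struct creg *cc)
///        {
///            uint64_t t = c->hi & 0xffffffffu, d = c->lo + (t << 16);
///            cc->lo += d >> 32; cc->hi += c->hi >> 32;
///            return (uint32_t)d;             /* output word */
///        }
///
///        static void mulacc_c(uint32_t r, const uint32_t s[4],
///                             struct creg c[4])
///        {
///            for (int j = 0; j < 4; j++) {
///                c[j].lo += (uint64_t)r*(s[j] & 0xffffu); /* r s'_j */
///                c[j].hi += (uint64_t)r*(s[j] >> 16);     /* r s''_j */
///            }
///        }
///
///        static void dmul4_c(uint32_t z[4],
///                            const uint32_t u[4], const uint32_t x[4],
///                            const uint32_t v[4], const uint32_t y[4],
///                            struct creg c[4])  /* c[3] zero on entry */
///        {
///            for (int i = 0; i < 4; i++) {
///                mulacc_c(u[i], v, c);
///                mulacc_c(x[i], y, c);
///                z[i] = propout_c(&c[0], &c[1]);
///                c[0] = c[1]; c[1] = c[2]; c[2] = c[3]; /* rotate */
///                c[3].lo = c[3].hi = 0;
///            }
///        }
///
///        int main(void)
///        {
///            uint32_t u[4] = { 1, 2, 3, 4 }, x[4] = { 5, 6, 7, 8 };
///            uint32_t v[4] = { ~0u, ~0u, ~0u, ~0u };
///            uint32_t y[4] = { 0x0f0f0f0fu, 0, ~0u, 0x12345678u };
///            struct creg c[4] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } };
///            uint32_t z[4];
///            dmul4_c(z, u, x, v, y, c);
///            /* spot-check the low digit: (u_0 v_0 + x_0 y_0) mod B */
///            assert(z[0] == (uint32_t)((uint64_t)u[0]*v[0] +
///                                      (uint64_t)x[0]*y[0]));
///            return 0;
///        }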
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RAX and RBX point to the
        // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
        // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
        // incoming carry registers c0, c1, and c2 representing a carry-in C;
        // c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
        // [RDI], and update the carry registers with the carry out.  The
        // registers XMM0--XMM7 and XMM15 are clobbered; the general-purpose
        // registers are preserved.

        mulacc  xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer; RBX points to a
        // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the product X Y to [RDI],
        // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
        // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer; RBX points to a
        // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
        // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
        // c2, representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
        // and update the carry registers with the carry out.  The registers
        // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RBX points to a packed operand
        // X; and XMM10/XMM11 hold an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
        // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
        // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        movd    xmm12, [rdi +  0]
        movd    xmm13, [rdi +  4]
        movd    xmm14, [rdi +  8]
        movd    xmm15, [rdi + 12]

        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RBX points to a packed operand
        // X; XMM10/XMM11 hold an expanded operand Y; and XMM12, XMM13,
        // XMM14 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + X Y to
        // [RDI], and update the carry registers with the carry out.  The
        // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer; RAX and RBX point
        // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
        // the expanded operands V and M.  The stack pointer must be 8 modulo
        // 16 (as usual for AMD64 ABIs).
        //
        // On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
        // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
        // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7 and
        // XMM15 are clobbered; the general-purpose registers are preserved.
        stalloc 48 + 8                  // space for the carries

        // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
        // for later.
        mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
        propout xmm7, lo, xmm12, xmm13
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RAX and RBX point to the
        // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
        // expanded operands V and M.  The stack pointer must be 8 modulo 16
        // (as usual for AMD64 ABIs).
        //
        // On exit, we store Y = (A + U V) M mod B^4 in XMM10/XMM11, and
        // write the low 128 bits of the sum A + U V + N Y to [RDI], leaving
        // the remaining carry in XMM12, XMM13, and XMM14.  The registers
        // XMM0--XMM7 and XMM15 are clobbered; the general-purpose registers
        // are preserved.
#if ABI_WIN
        stalloc 48 + 8                  // space for the carries
#  define STKTMP(i) [rsp + i]
#else
#  define STKTMP(i) [rsp + i - 48 - 8]  // use red zone
#endif
        movd    xmm12, [rdi +  0]
        movd    xmm13, [rdi +  4]
        movd    xmm14, [rdi +  8]
        movd    xmm15, [rdi + 12]

        // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
        // for later.
        mulacc  xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
        propout xmm7, lo, xmm12, xmm13

5:      mulacc  xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
        propout xmm6, lo, xmm13, xmm14

        mulacc  xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
        propout xmm7, hi, xmm14, xmm15

        mulacc  xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
        propout xmm6, hi, xmm15, xmm12

        // Prepare W, and stash carries for later.
        punpckldq xmm7, xmm6            // (w_0, w_1; w_2, w_3)
        movdqa  STKTMP( 0), xmm12
        movdqa  STKTMP(16), xmm13
        movdqa  STKTMP(32), xmm14
        // Calculate Y = W M.  We just about have enough spare registers to
        // make this work.
        mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6

        // Start expanding W back into the main carry registers...
        pxor    xmm15, xmm15
        movdqa  xmm12, xmm7
        movdqa  xmm14, xmm7

        mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
        accum   xmm4, xmm5, xmm6

        punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
        punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)

        mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
        accum   xmm5, xmm6

        mulcore xmm7, 3, xmm10, xmm11, xmm0
        accum   xmm6

        punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
        punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
        punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
        punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

        expand  xmm2, xmm10, xmm11
        // Finish the calculation by adding the Montgomery product.
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12

        // Add on the carry we calculated earlier.
        paddq   xmm12, STKTMP( 0)
        paddq   xmm13, STKTMP(16)
        paddq   xmm14, STKTMP(32)

        // And, with that, we're done.
        // On entry, RDI points to the destination buffer holding a packed
        // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
        // expanded operand M.
        //
        // On exit, we store Y = W M mod B^4 in XMM10/XMM11, and write the
        // low 128 bits of the sum W + N Y to [RDI], leaving the remaining
        // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3,
        // XMM5--XMM7, and XMM15 are clobbered; the general-purpose registers
        // are preserved.

        // Calculate Y = W M.  Avoid the standard carry registers, because
        // we're setting something else up there.
        mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6

        // Start expanding W back into the main carry registers...
        pxor    xmm15, xmm15
        movdqa  xmm12, xmm7
        movdqa  xmm14, xmm7

        mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
        accum   xmm4, xmm5, xmm6

        punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
        punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)

        mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
        accum   xmm5, xmm6

        mulcore xmm7, 3, xmm8, xmm9, xmm0
        accum   xmm6

        punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
        punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
        punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
        punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

        expand  xmm2, xmm10, xmm11

        // Finish the calculation by adding the Montgomery product.
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12

        // And, with that, we're done.
///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_amd64_avx)

FUNC(mpx_umul4_amd64_sse2)
        // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                           const mpw *bv, const mpw *bvl);

        // Establish the arguments and do initial setup.
        //
        //                              sysv    win
        // inner loop dv                rdi     rdi*
        // inner loop av                rbx*    rbx*
        // outer loop dv                r10     rcx
        // outer loop bv                rcx     r9
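
        // For orientation, this is what the routine computes, written as a
        // plain operand-scanning reference in C (a sketch: scalar and one
        // word at a time, where the code below does four words per pass;
        // the function name is illustrative, not part of Catacomb):
        //
        //      #include <stddef.h>
        //      #include <stdint.h>
        //
        //      static void umul_ref(uint32_t *dv, const uint32_t *av,
        //                           size_t n, const uint32_t *bv, size_t m)
        //      {
        //          uint64_t c;
        //          size_t i, j;
        //
        //          for (i = 0; i < n + m; i++) dv[i] = 0;
        //          for (j = 0; j < m; j++) {   /* one pass per word of bv */
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += dv[i + j] + (uint64_t)av[i]*bv[j];
        //                  dv[i + j] = (uint32_t)c;
        //                  c >>= 32;
        //              }
        //              dv[j + n] = (uint32_t)c; /* leftover carry */
        //          }
        //      }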
        // Prepare for the first iteration.
        pxor    xmm0, xmm0
        movdqu  xmm10, [BV]             // bv[0]
        expand  xmm0, xmm10, xmm11

        cmp     rbx, AVL                // all done?

        // Continue with the first iteration.
        cmp     rbx, AVL                // all done?

        // Write out the leftover carry.  There can be no tail here.
        cmp     BV, BVL                 // more passes to do?

        // Set up for the next pass.
1:      movdqu  xmm10, [BV]             // bv[i]
        mov     rdi, DV                 // -> dv[i]
        pxor    xmm0, xmm0
        expand  xmm0, xmm10, xmm11
        mov     rbx, AV                 // -> av[0]

        cmp     rbx, AVL                // done yet?

        // Finish off this pass.  There was no tail on the previous pass, and
        // there can be none on this pass.
FUNC(mpxmont_mul4_amd64_avx)

FUNC(mpxmont_mul4_amd64_sse2)
        // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                              const mpw *nv, size_t n,
        //                              const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //                              sysv    win
        // inner loop dv                rdi     rdi*
        // inner loop av                rax     rax
        // inner loop nv                rbx*    rbx*
        // outer loop dv                r10     rcx
        // outer loop bv                rdx     r8
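
        // The `coarsely integrated operand scanning' structure described in
        // the notes above, as a plain scalar C reference (a sketch: one
        // word at a time, with no final conditional subtraction; ZV is an
        // (n + 2)-word working buffer, initially zero, and MI is
        // -NV[0]^{-1} mod B; the names are illustrative):
        //
        //      #include <stddef.h>
        //      #include <stdint.h>
        //
        //      static void mont_ref(uint32_t *zv, const uint32_t *av,
        //                           const uint32_t *bv, const uint32_t *nv,
        //                           size_t n, uint32_t mi)
        //      {
        //          uint64_t c;
        //          uint32_t y;
        //          size_t i, j;
        //
        //          for (j = 0; j < n; j++) {
        //              /* main multiplication pass: zv += av bv[j] */
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += zv[i] + (uint64_t)av[i]*bv[j];
        //                  zv[i] = (uint32_t)c; c >>= 32;
        //              }
        //              c += zv[n]; zv[n] = (uint32_t)c;
        //              zv[n + 1] += (uint32_t)(c >> 32);
        //
        //              /* Montgomery pass: y clears the bottom word */
        //              y = zv[0]*mi;
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += zv[i] + (uint64_t)nv[i]*y;
        //                  zv[i] = (uint32_t)c; c >>= 32;
        //              }
        //              c += zv[n]; zv[n] = (uint32_t)c;
        //              zv[n + 1] += (uint32_t)(c >> 32);
        //
        //              /* divide by B: shift everything down a word */
        //              for (i = 0; i <= n; i++) zv[i] = zv[i + 1];
        //              zv[n + 1] = 0;
        //          }
        //      }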
        // Establish the expanded operands.
        pxor    xmm0, xmm0
        movdqu  xmm8, [BV]              // bv[0]
        movdqu  xmm10, [MI]             // mi
        expand  xmm0, xmm8, xmm9, xmm10, xmm11

        // Set up the outer loop state and prepare for the first iteration.
        mov     rax, AV                 // -> U = av[0]
        mov     rbx, NV                 // -> X = nv[0]
        lea     AVL, [AV + 4*N]         // -> av[n/4] = av limit
        lea     BVL, [BV + 4*N]         // -> bv[n/4] = bv limit

        cmp     rax, AVL                // done already?

        // Complete the first inner loop.
        cmp     rax, AVL                // done yet?

        // Still have carries left to propagate.
        movd    [rdi + 16], xmm12

        // Embark on the next iteration.  (There must be one.  If n = 1, then
        // we would have bailed above, to label 8.  Similarly, the subsequent
        // iterations can fall into the inner loop immediately.)
        movdqu  xmm8, [BV]              // bv[i]
        movdqu  xmm10, [MI]             // mi
        mov     rdi, DV                 // -> Z = dv[i]
        mov     rax, AV                 // -> U = av[0]
        mov     rbx, NV                 // -> X = nv[0]
        expand  xmm0, xmm8, xmm9, xmm10, xmm11

        // Complete the next inner loop.

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that in and add it.
        movd    [rdi + 16], xmm12

        // Back again, maybe.

        // First iteration was short.  Write out the carries and we're done.
        // (This could be folded into the main loop structure, but that would
        // penalize small numbers more.)
        movd    [rdi + 16], xmm12
FUNC(mpxmont_redc4_amd64_avx)

FUNC(mpxmont_redc4_amd64_sse2)
        // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                               size_t n, const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //                              sysv    win
        // inner loop dv                rdi     rdi*
        // blocks-of-4 dv limit         rsi     rdx
        // inner loop nv                rbx*    rbx*
        // outer loop dv                r10     rcx
        // outer loop dv limit          r11     r11
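
        // And the reduction-only case, as a scalar C reference (a sketch
        // with illustrative names; DV is a (2n + 1)-word buffer whose top
        // word absorbs carries, MI is -NV[0]^{-1} mod B, and the final
        // conditional subtraction is omitted):
        //
        //      #include <stddef.h>
        //      #include <stdint.h>
        //
        //      static void redc_ref(uint32_t *dv, const uint32_t *nv,
        //                           size_t n, uint32_t mi)
        //      {
        //          uint64_t c;
        //          uint32_t y;
        //          size_t i, j;
        //
        //          for (j = 0; j < n; j++) {
        //              y = dv[j]*mi;   /* clear word j */
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += dv[j + i] + (uint64_t)nv[i]*y;
        //                  dv[j + i] = (uint32_t)c; c >>= 32;
        //              }
        //              /* carry on propagating to the end of the buffer */
        //              for (i = j + n; c && i < 2*n + 1; i++) {
        //                  c += dv[i]; dv[i] = (uint32_t)c; c >>= 32;
        //              }
        //          }
        //          for (i = 0; i <= n; i++)    /* divide by B^n */
        //              dv[i] = dv[n + i];
        //      }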
        // Establish the expanded operands and the blocks-of-4 dv limit.
        pxor    xmm0, xmm0
        mov     DVL, DVL4               // -> dv[n] = dv limit
        sub     DVL4, DV                // length of dv in bytes
        movdqu  xmm8, [MI]              // mi
        and     DVL4, ~15               // mask off the tail end
        expand  xmm0, xmm8, xmm9
        add     DVL4, DV                // find limit

        // Set up the outer loop state and prepare for the first iteration.
        mov     rbx, NV                 // -> X = nv[0]
        lea     DVLO, [DV + 4*N]        // -> dv[n/4] = outer dv limit
        lea     NVL, [NV + 4*N]         // -> nv[n/4] = nv limit

        cmp     rbx, NVL                // done already?

        // Complete the first inner loop.
        cmp     rbx, NVL                // done yet?

        // Still have carries left to propagate.

        // Continue carry propagation until the end of the buffer.
        mov     C, 0                    // preserves flags

        // Deal with the tail end.
        mov     C, 0                    // preserves flags

        // All done for this iteration.  Start the next.  (This must have at
        // least one follow-on iteration, or we'd not have started this outer
        // iteration.)
8:      mov     rdi, DV                 // -> Z = dv[i]
        mov     rbx, NV                 // -> X = nv[0]
        cmp     rdi, DVLO               // all done yet?
///--------------------------------------------------------------------------
/// Testing and performance measurement.

#if ABI_SYSV
#  define ARG6 STKARG(0)
#  define ARG7 STKARG(1)
#  define ARG8 STKARG(2)
#  define STKARG_OFFSET 16
#endif
#if ABI_WIN
#  define ARG4 STKARG(0)
#  define ARG5 STKARG(1)
#  define ARG6 STKARG(2)
#  define ARG7 STKARG(3)
#  define ARG8 STKARG(4)
#  define STKARG_OFFSET 224
#endif
#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
        //                      sysv                            win
        //              dmul    smul    mmul    mont    dmul    smul    mmul    mont
        // z    rdi     rdi     rdi     rdi     rdi     rcx     rcx     rcx     rcx
        // c    rcx     rsi     rsi     rsi     rsi     rdx     rdx     rdx     rdx
        // y    r10     --      --      rdx     rdx     --      --      r8      r8
        // u    r11     rdx     --      rcx     --      r8      --      r9      --
        // x    rbx     rcx     rdx     r8      rcx     r9      r8      stk0    r9
        // vv   xmm8/9  r8      --      r9      r8      stk0    --      stk1    stk0
        // yy   xmm10/11 r9     rcx     stk0    --      stk1    r9      stk2    --
        // n    r8      stk0    r8      stk1    r9      stk2    stk0    stk3    stk1
        // cyv  r9      stk1    r9      stk2    stk0    stk3    stk1    stk4    stk2
        mov     [\v + 8*\n - 8], rax

        sub     rax, [\v + 8*\n - 8]
        mov     [\v + 8*\n - 8], rax
.macro  testprologue mode
  .ifeqs "\mode", "dmul"
  .ifeqs "\mode", "smul"
  .ifeqs "\mode", "mmul"
  .ifeqs "\mode", "mont"

  .ifeqs "\mode", "dmul"
  .ifeqs "\mode", "smul"
  .ifeqs "\mode", "mmul"
  .ifeqs "\mode", "mont"

  .ifeqs "\mode", "dmul"
        expand  xmm0, xmm8, xmm9, xmm10, xmm11
  .endif
  .ifeqs "\mode", "smul"
        expand  xmm0, xmm10, xmm11
  .endif
  .ifeqs "\mode", "mmul"
        expand  xmm0, xmm8, xmm9, xmm10, xmm11
  .endif
  .ifeqs "\mode", "mont"
        expand  xmm0, xmm8, xmm9
  .endif

        movdqu  xmm12, [rcx +  0]       // (c'_0; c''_0)
        movdqu  xmm13, [rcx + 16]       // (c'_1; c''_1)
        movdqu  xmm14, [rcx + 32]       // (c'_2; c''_2)
.macro  testtop u=nil

        movdqu  [rcx +  0], xmm12
        movdqu  [rcx + 16], xmm13
        movdqu  [rcx + 32], xmm14

        movdqu  [r10 +  0], xmm10
        movdqu  [r10 + 16], xmm11

        movdqu  [r10 +  0], xmm10
        movdqu  [r10 + 16], xmm11

        movdqu  [r10 +  0], xmm10
        movdqu  [r10 + 16], xmm11

///----- That's all, folks --------------------------------------------------