/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.
///
///--------------------------------------------------------------------------

#include "asm-common.h"
///--------------------------------------------------------------------------
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the accumulating result.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
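///
/// For orientation, the word-level shape of a coarsely integrated
/// operand-scanning Montgomery multiplication is roughly as follows in C.
/// (This is an illustrative sketch only, not part of the build; it assumes
/// 32-bit words and mi = -n[0]^{-1} mod 2^32, and omits the final
/// conditional subtraction of the modulus.)
///
///	#include <stdint.h>
///	#include <string.h>
///
///	/* z = u v 2^(-32 len) mod n, possibly still >= n on exit. */
///	static void cios(uint32_t *z, const uint32_t *u, const uint32_t *v,
///			 const uint32_t *n, uint32_t mi, size_t len)
///	{
///	  uint32_t t[len + 2], y;
///	  uint64_t s, c;
///	  size_t i, j;
///
///	  memset(t, 0, sizeof(t));
///	  for (i = 0; i < len; i++) {
///	    /* Main pass: t += u[i] v. */
///	    c = 0;
///	    for (j = 0; j < len; j++) {
///	      s = t[j] + (uint64_t)u[i]*v[j] + c;
///	      t[j] = (uint32_t)s; c = s >> 32;
///	    }
///	    s = t[len] + c; t[len] = (uint32_t)s;
///	    t[len + 1] = (uint32_t)(s >> 32);
///
///	    /* Montgomery pass: pick y so that t + y n == 0 (mod 2^32),
///	     * add y n, and shift everything down one word. */
///	    y = (uint32_t)(t[0]*mi);
///	    s = t[0] + (uint64_t)y*n[0]; c = s >> 32;
///	    for (j = 1; j < len; j++) {
///	      s = t[j] + (uint64_t)y*n[j] + c;
///	      t[j - 1] = (uint32_t)s; c = s >> 32;
///	    }
///	    s = t[len] + c; t[len - 1] = (uint32_t)s;
///	    t[len] = t[len + 1] + (uint32_t)(s >> 32);
///	  }
///	  memcpy(z, t, len*sizeof(uint32_t));
///	}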
///
/// In both the plain-multiplication and Montgomery stages, then, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	    0	v'_0	v'_1	v''_0	v''_1
///	   16	v'_2	v'_3	v''_2	v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// some calculations to be performed directly on the expanded form above.
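///
/// In C, the expansion step might be sketched like this (illustrative only,
/// not part of the build; compare the `expand' macro below):
///
///	#include <stdint.h>
///
///	/* Split v = v[0] + v[1] 2^32 + v[2] 2^64 + v[3] 2^96 into the two
///	 * expanded operands shown in the table above, each held as four
///	 * 32-bit lanes. */
///	static void expand_ref(const uint32_t v[4],
///			       uint32_t op0[4], uint32_t op1[4])
///	{
///	  op0[0] = v[0] & 0xffff; op0[1] = v[1] & 0xffff;  /* v'_0, v'_1 */
///	  op0[2] = v[0] >> 16;    op0[3] = v[1] >> 16;     /* v''_0, v''_1 */
///	  op1[0] = v[2] & 0xffff; op1[1] = v[3] & 0xffff;  /* v'_2, v'_3 */
///	  op1[2] = v[2] >> 16;    op1[3] = v[3] >> 16;     /* v''_2, v''_3 */
///	}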
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
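///
/// To see that this step loses nothing, note that
///
///	c_0 = c'_0 + c''_0 b
///	    = d + floor(c''_0/B) B b
///	    = (d mod B) + B (floor(d/B) + floor(c''_0/B) b),
///
/// so the output word z = d mod B, together with B times the quantity folded
/// into c1, accounts for c_0 exactly.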
///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
	// of the product in registers D0, D1, D2, D3.
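	//
	// Writing d_j = d'_j + d''_j b for the low and high 64-bit halves of
	// D_j, the pieces together satisfy
	//
	//	r_i s = d_0 + d_1 B + d_2 B^2 + d_3 B^3,
	//
	// where s is the 128-bit value represented by SLO/SHI.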
	pshufd	\d0, \r, SHUF(\i, 3, \i, 3)	// (r_i, ?; r_i, ?)
	movdqa	\d1, \slo			// (s'_0, s'_1; s''_0, s''_1)
	movdqa	\d3, \shi			// (s'_2, s'_3; s''_2, s''_3)
	psrldq	\d1, 4				// (s'_1, s''_0; s''_1, 0)
	movdqa	\d2, \d0			// another copy of (r_i, ?; r_i, ?)
	psrldq	\d3, 4				// (s'_3, s''_2; s''_3, 0)
	pmuludq	\d1, \d0			// (r_i s'_1; r_i s''_1)
	pmuludq	\d3, \d0			// (r_i s'_3; r_i s''_3)
	pmuludq	\d2, \shi			// (r_i s'_2; r_i s''_2)
	pmuludq	\d0, \slo			// (r_i s'_0; r_i s''_0)

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.

.macro	mulacc	r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
	// Multiply R_I by the expanded operand SLO/SHI, and accumulate in
	// carry registers C0, C1, C2, C3.  If Z3P is `t' then C3 notionally
	// contains zero, but needs clearing; in practice, we store the
	// product directly rather than attempting to add.  On completion,
	// XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
	mulcore	\r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
	mulcore	\r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
.macro	propout	d, pos, c, cc=nil
	// Calculate an output word from C, and store it at POS in D;
	// propagate carries out from C to CC in preparation for a rotation
	// of the carry registers.  D is an XMM register; the POS is either
	// `lo' or `hi' according to whether the output word should be in
	// lane 0 or 1 of D; the high two lanes of D are clobbered.  On
	// completion, XMM3 is clobbered.  If CC is `nil', then the
	// contribution which would have been added to it is left in C.
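	//
	// In terms of the represented values, the effect is roughly this (an
	// illustrative C sketch, not part of the build; each carry is a pair
	// of 64-bit halves lo + hi b, with types from <stdint.h>):
	//
	//	uint32_t propout_ref(uint64_t *clo, uint64_t *chi,
	//			     uint64_t *cclo, uint64_t *cchi)
	//	{
	//	  uint64_t t = *chi & 0xffffffff;	/* c'' mod B */
	//	  uint64_t d = *clo + (t << 16);	/* c' + t b */
	//	  *cclo += d >> 32;			/* floor(d/B) */
	//	  *cchi += *chi >> 32;			/* floor(c''/B) */
	//	  return (uint32_t)d;			/* output word */
	//	}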
	pshufd	xmm3, \c, SHUF(3, 3, 3, 2)	// (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12			// (t, 0; 0, 0) = (t; 0)
	pslldq	xmm3, 2				// (t b; 0)
	paddq	\c, xmm3			// (c' + t b; c'')
	psrlq	\c, 32				// floor(c/B)
	paddq	\cc, \c				// propagate up
.macro	endprop	d, pos, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written at POS in D, and the
	// remaining bits are left at the bottom of T.
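	//
	// That is, with C representing c = c' + c'' b, we write c mod B at
	// POS and leave floor(c/B) in the low lane of T.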
	psllq	\t, 16				// (?; c'' b)
	pslldq	\c, 8				// (0; c')
	paddq	\t, \c				// (?; c' + c'' b)
	psrldq	\t, 8				// (c' + c'' b; 0) = (c; 0)
	psrldq	\t, 4				// (floor(c/B); 0)

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a				// (a_0, a_1; a_2, a_3)
	movdqa	\d, \c				// (c_0, c_1; c_2, c_3)
	punpcklwd \a, \z			// (a'_0, a''_0; a'_1, a''_1)
	punpckhwd \b, \z			// (a'_2, a''_2; a'_3, a''_3)
	punpcklwd \c, \z			// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z			// (c'_2, c''_2; c'_3, c''_3)
	pshufd	\a, \a, SHUF(0, 2, 1, 3)	// (a'_0, a'_1; a''_0, a''_1)
	pshufd	\b, \b, SHUF(0, 2, 1, 3)	// (a'_2, a'_3; a''_2, a''_3)
	pshufd	\c, \c, SHUF(0, 2, 1, 3)	// (c'_0, c'_1; c''_0, c''_1)
	pshufd	\d, \d, SHUF(0, 2, 1, 3)	// (c'_2, c'_3; c''_2, c''_3)

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.
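	//
	// That is, if the carry registers represent
	// y = y_0 + y_1 B + y_2 B^2 + y_3 B^3 with y_i = y'_i + y''_i b,
	// then we end up with LO = y mod B^4 and HI = floor(y/B^4).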

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	punpcklqdq \t, \c2			// (y'_0; y'_2)
	punpckhqdq \c0, \c2			// (y''_0; y''_2)
	punpcklqdq \u, \c3			// (y'_1; y'_3)
	punpckhqdq \c1, \c3			// (y''_1; y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	psrlq	\c0, 16				// high parts of (y''_0; y''_2)
	psrlq	\c1, 16				// high parts of (y''_1; y''_3)
	psrlq	\c2, 32				// low parts of (y''_0; y''_2)
	psrlq	\c3, 32				// low parts of (y''_1; y''_3)
	pslldq	\c1, 8				// high part of (0; y''_1)
	paddq	\t, \c2				// propagate down
	paddq	\t, \c1				// and up: (y_0; y_2)
	paddq	\u, \c0				// (y_1; y_3)
	psrldq	\hi, 8				// high part of (y''_3; 0)

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t				// (y_0; ?)
	movdqa	\lo, \t				// (y^*_0, ?; ?, ?)
	psrldq	\t, 8				// (y_2; 0)
	psrlq	\c3, 32				// (floor(y_0/B); ?)
	paddq	\c3, \u				// (y_1 + floor(y_0/B); ?)
	movdqa	\c1, \c3			// (y^*_1, ?; ?, ?)
	psrldq	\u, 8				// (y_3; 0)
	psrlq	\c3, 32				// (floor((y_1 B + y_0)/B^2); ?)
	paddq	\c3, \t				// (y_2 + floor((y_1 B + y_0)/B^2); ?)
	punpckldq \lo, \c3			// (y^*_0, y^*_2; ?, ?)
	psrlq	\c3, 32				// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	paddq	\c3, \u				// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	punpckldq \c1, \c3			// (y^*_1, y^*_3; ?, ?)
	psrlq	\t, 32				// very high bits of y
	punpcklqdq \hi, \u			// carry up
	punpckldq \lo, \c1			// y mod B^4
	// On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM15, are updated to hold
	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
	// registers are preserved.
	movd	xmm0, [rdi +  0]		// (a_0; 0)
	movd	xmm1, [rdi +  4]		// (a_1; 0)
	movd	xmm2, [rdi +  8]		// (a_2; 0)
	movd	xmm15, [rdi + 12]		// (a_3; 0)
	paddq	xmm12, xmm0			// (c'_0 + a_0; c''_0)
	paddq	xmm13, xmm1			// (c'_1 + a_1; c''_1)
	paddq	xmm14, xmm2			// (c'_2 + a_2; c''_2)
///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

	// On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
	// expanded form.  Store the low 128 bits of the represented carry to
	// [RDI] as a packed 128-bit value, and leave the remaining 16 bits
	// in the low 32 bits of XMM12.  On exit, XMM0, XMM1, XMM3, XMM13 and
	// XMM14 are clobbered.
	propout	xmm0, lo, xmm12, xmm13
	propout	xmm1, lo, xmm13, xmm14
	propout	xmm0, hi, xmm14, nil
	endprop	xmm1, hi, xmm14, xmm12

	// On entry, RDI points to the destination buffer; RAX and RBX point
	// to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
	// expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
	// incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [RDI], and update the carry registers with the carry out.  The
	// registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
	// registers are preserved.
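	//
	// A plain C reference for the same computation might look like this
	// (illustrative only, not part of the build; it relies on the
	// GCC/Clang `unsigned __int128' extension, and shows the carries as
	// flat word arrays rather than the split register form used here):
	//
	//	#include <stdint.h>
	//
	//	/* z = low 128 bits of cin + u v + x y; cout = the rest. */
	//	static void dmul4_ref(uint32_t z[4], uint32_t cout[5],
	//			      const uint32_t cin[4],
	//			      const uint32_t u[4], const uint32_t v[4],
	//			      const uint32_t x[4], const uint32_t y[4])
	//	{
	//	  unsigned __int128 acc = 0;
	//	  uint32_t w[9];
	//	  unsigned i, k;
	//
	//	  for (k = 0; k < 9; k++) {
	//	    if (k < 4) acc += cin[k];
	//	    for (i = 0; i < 4; i++)
	//	      if (i <= k && k - i < 4) {
	//	        acc += (uint64_t)u[i]*v[k - i];
	//	        acc += (uint64_t)x[i]*y[k - i];
	//	      }
	//	    w[k] = (uint32_t)acc; acc >>= 32;
	//	  }
	//	  for (k = 0; k < 4; k++) z[k] = w[k];
	//	  for (k = 0; k < 5; k++) cout[k] = w[k + 4];
	//	}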
	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
	propout	xmm7, hi, xmm15, xmm12

	// On entry, RDI points to the destination buffer, which also
	// contains an addend A to accumulate; RAX and RBX point to the
	// packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
	// expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
	// incoming carry registers c0, c1, and c2 representing a carry-in C;
	// c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [RDI], and update the carry registers with the carry out.  The
	// registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
	// registers are preserved.
	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
	propout	xmm7, hi, xmm15, xmm12

	// On entry, RDI points to the destination buffer; RBX points to a
	// packed operand X; and XMM10/XMM11 hold an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [RDI],
	// and set the carry registers XMM12, XMM13, XMM14 to the carry out.
	// The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	// general-purpose registers are preserved.
	mulcore	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	// On entry, RDI points to the destination buffer; RBX points to a
	// packed operand X; XMM10/XMM11 hold an expanded operand Y; and
	// XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
	// c2, representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [RDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	// general-purpose registers are preserved.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12
	// On entry, RDI points to the destination buffer, which also
	// contains an addend A to accumulate; RBX points to a packed operand
	// X; and XMM10/XMM11 hold an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [RDI],
	// and set the carry registers XMM12, XMM13, XMM14 to the carry out.
	// The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	// general-purpose registers are preserved.
	movd	xmm12, [rdi +  0]
	movd	xmm13, [rdi +  4]
	movd	xmm14, [rdi +  8]
	movd	xmm15, [rdi + 12]

	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12
	// On entry, RDI points to the destination buffer, which also
	// contains an addend A to accumulate; RBX points to a packed operand
	// X; XMM10/XMM11 hold an expanded operand Y; and XMM12, XMM13,
	// XMM14 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [RDI], and update the carry registers with the carry out.  The
	// registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	// general-purpose registers are preserved.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12
	// On entry, RDI points to the destination buffer; RAX and RBX point
	// to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
	// the expanded operands V and M.  The stack pointer must be 8 modulo
	// 16 (as usual for the AMD64 ABIs).
	//
	// On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
	// low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
	// carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7, and
	// XMM15 are clobbered; the general-purpose registers are preserved.
	stalloc	48 + 8				// space for the carries

	// Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
	// for later.
	mulcore	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	propout	xmm7, lo, xmm12, xmm13
	// On entry, RDI points to the destination buffer, which also
	// contains an addend A to accumulate; RAX and RBX point to the
	// packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
	// expanded operands V and M.  The stack pointer must be 8 modulo 16
	// (as usual for the AMD64 ABIs).
	//
	// On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
	// the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
	// remaining carry in XMM12, XMM13, and XMM14.  The registers
	// XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
	// are preserved.
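	//
	// In effect, this is one outer-loop step of the coarsely integrated
	// operand scanning scheme sketched at the top of the file, working
	// on four 32-bit words at a time: we accumulate A + U V, derive the
	// Montgomery factor Y from its low 128 bits, and then add on N Y,
	// which (assuming, as usual for Montgomery arithmetic, that
	// M = -N^{-1} mod 2^128) makes the sum divisible by 2^128.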
	stalloc	48 + 8				// space for the carries
# define STKTMP(i) [SP + i]
# define STKTMP(i) [SP + i - 48 - 8]	// use red zone

	movd	xmm12, [rdi +  0]
	movd	xmm13, [rdi +  4]
	movd	xmm14, [rdi +  8]
	movd	xmm15, [rdi + 12]

	// Calculate W = A + U V, and leave it in XMM7.  Stash the carry
	// pieces for later.
	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	propout	xmm7, lo, xmm12, xmm13

5:	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm6, lo, xmm13, xmm14

	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm7, hi, xmm14, xmm15

	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm6, hi, xmm15, xmm12

	// Prepare W, and stash carries for later.
	movdqa	STKTMP( 0), xmm12
	movdqa	STKTMP(16), xmm13
	movdqa	STKTMP(32), xmm14
	// Calculate Y = W M.  We just about have enough spare registers to
	// do this.
	mulcore	xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6

	// Start expanding W back into the main carry registers...
	mulcore	xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
	accum	xmm4, xmm5, xmm6

	punpckldq xmm12, xmm15			// (w_0, 0; w_1, 0)
	punpckhdq xmm14, xmm15			// (w_2, 0; w_3, 0)

	mulcore	xmm7, 2, xmm10, xmm11, xmm0, xmm1
	mulcore	xmm7, 3, xmm10, xmm11, xmm0

	punpckldq xmm12, xmm2			// (w_0, 0; 0, 0)
	punpckldq xmm14, xmm2			// (w_2, 0; 0, 0)
	punpckhdq xmm13, xmm2			// (w_1, 0; 0, 0)
	punpckhdq xmm15, xmm2			// (w_3, 0; 0, 0)

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

	expand	xmm2, xmm10, xmm11

	// Finish the calculation by adding the Montgomery product.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	// Add on the carry we calculated earlier.
	paddq	xmm12, STKTMP( 0)
	paddq	xmm13, STKTMP(16)
	paddq	xmm14, STKTMP(32)

	// And, with that, we're done.
	// On entry, RDI points to the destination buffer holding a packed
	// value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
	// expanded operand M.
	//
	// On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
	// 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
	// in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
	// and XMM15 are clobbered; the general-purpose registers are
	// preserved.

	// Calculate Y = W M.  Avoid the standard carry registers, because
	// we're setting something else up there.
	mulcore	xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6

	// Start expanding W back into the main carry registers...
	mulcore	xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
	accum	xmm4, xmm5, xmm6

	punpckldq xmm12, xmm15			// (w_0, 0; w_1, 0)
	punpckhdq xmm14, xmm15			// (w_2, 0; w_3, 0)

	mulcore	xmm7, 2, xmm8, xmm9, xmm0, xmm1
	mulcore	xmm7, 3, xmm8, xmm9, xmm0

	punpckldq xmm12, xmm2			// (w_0, 0; 0, 0)
	punpckldq xmm14, xmm2			// (w_2, 0; 0, 0)
	punpckhdq xmm13, xmm2			// (w_1, 0; 0, 0)
	punpckhdq xmm15, xmm2			// (w_3, 0; 0, 0)

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

	expand	xmm2, xmm10, xmm11

	// Finish the calculation by adding the Montgomery product.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	// And, with that, we're done.
///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_amd64_avx)
FUNC(mpx_umul4_amd64_sse2)
	// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			     const mpw *bv, const mpw *bvl);
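	//
	// The overall effect is plain schoolbook multiplication; a C
	// reference might look like this (illustrative only, not part of
	// the build; it assumes mpw is a 32-bit word and ignores the
	// blocking by fours that the code below relies on):
	//
	//	#include <stddef.h>
	//	#include <stdint.h>
	//
	//	static void umul_ref(uint32_t *dv,
	//			     const uint32_t *av, const uint32_t *avl,
	//			     const uint32_t *bv, const uint32_t *bvl)
	//	{
	//	  size_t m = avl - av, n = bvl - bv;
	//	  uint64_t t, c;
	//
	//	  for (size_t j = 0; j < n; j++) {
	//	    c = 0;
	//	    for (size_t i = 0; i < m; i++) {
	//	      t = (uint64_t)av[i]*bv[j] + c + (j ? dv[i + j] : 0);
	//	      dv[i + j] = (uint32_t)t; c = t >> 32;
	//	    }
	//	    dv[m + j] = (uint32_t)c;
	//	  }
	//	}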
	// Establish the arguments and do initial setup.
	//
	//				sysv	win
	//	inner loop dv		rdi	rdi*
	//	inner loop av		rbx*	rbx*
	//	outer loop dv		r10	rcx
	//	outer loop bv		rcx	r9

	// Prepare for the first iteration.
	movdqu	xmm10, [BV]			// bv[0]
	expand	xmm0, xmm10, xmm11
	cmp	rbx, AVL			// all done?

	// Continue with the first iteration.
	cmp	rbx, AVL			// all done?

	// Write out the leftover carry.  There can be no tail here.
	cmp	BV, BVL				// more passes to do?

	// Set up for the next pass.
1:	movdqu	xmm10, [BV]			// bv[i]
	mov	rdi, DV				// -> dv[i]
	expand	xmm0, xmm10, xmm11
	mov	rbx, AV				// -> av[0]
	cmp	rbx, AVL			// done yet?

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
FUNC(mpxmont_mul4_amd64_avx)
FUNC(mpxmont_mul4_amd64_sse2)
	// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//				const mpw *nv, size_t n, const mpw *mi);
	//
	// Establish the arguments and do initial setup.
	//
	//				sysv	win
	//	inner loop dv		rdi	rdi*
	//	inner loop av		rax	rax
	//	inner loop nv		rbx*	rbx*
	//	outer loop dv		r10	rcx
	//	outer loop bv		rdx	r8
	// Establish the expanded operands.
	movdqu	xmm8, [BV]			// bv[0]
	movdqu	xmm10, [MI]			// mi
	expand	xmm0, xmm8, xmm9, xmm10, xmm11

	// Set up the outer loop state and prepare for the first iteration.
	mov	rax, AV				// -> U = av[0]
	mov	rbx, NV				// -> X = nv[0]
	lea	AVL, [AV + 4*N]			// -> av[n/4] = av limit
	lea	BVL, [BV + 4*N]			// -> bv[n/4] = bv limit
	cmp	rax, AVL			// done already?

	// Complete the first inner loop.
	cmp	rax, AVL			// done yet?

	// Still have carries left to propagate.
	movd	[rdi + 16], xmm12

	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
	movdqu	xmm8, [BV]			// bv[i]
	movdqu	xmm10, [MI]			// mi
	mov	rdi, DV				// -> Z = dv[i]
	mov	rax, AV				// -> U = av[0]
	mov	rbx, NV				// -> X = nv[0]
	expand	xmm0, xmm8, xmm9, xmm10, xmm11

	// Complete the next inner loop.

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	[rdi + 16], xmm12

	// Back again, maybe.

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
	movd	[rdi + 16], xmm12
FUNC(mpxmont_redc4_amd64_avx)
FUNC(mpxmont_redc4_amd64_sse2)
	// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//				 size_t n, const mpw *mi);
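	//
	// For orientation, a word-at-a-time Montgomery reduction looks like
	// this in C (illustrative only, not part of the build; it assumes
	// mpw is a 32-bit word and uses a single-word factor
	// mi0 = -nv[0]^{-1} mod 2^32, where the code below works four words
	// at a time with a four-word factor):
	//
	//	#include <stddef.h>
	//	#include <stdint.h>
	//
	//	static void redc_ref(uint32_t *dv, uint32_t *dvl,
	//			     const uint32_t *nv, size_t n, uint32_t mi0)
	//	{
	//	  uint64_t t, c;
	//
	//	  for (size_t i = 0; i < n; i++) {
	//	    uint32_t y = (uint32_t)(dv[i]*mi0);	/* kill dv[i] */
	//	    c = 0;
	//	    for (size_t j = 0; j < n; j++) {
	//	      t = dv[i + j] + (uint64_t)y*nv[j] + c;
	//	      dv[i + j] = (uint32_t)t; c = t >> 32;
	//	    }
	//	    for (size_t k = i + n; c && &dv[k] < dvl; k++) {
	//	      t = dv[k] + c; dv[k] = (uint32_t)t; c = t >> 32;
	//	    }
	//	  }
	//	  /* The reduced value now starts at dv[n]; it may still need
	//	   * one final subtraction of the modulus. */
	//	}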
	// Establish the arguments and do initial setup.
	//
	//				sysv	win
	//	inner loop dv		rdi	rdi*
	//	blocks-of-4 dv limit	rsi	rdx
	//	inner loop nv		rbx*	rbx*
	//	outer loop dv		r10	rcx
	//	outer loop dv limit	r11	r11

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	DVL, DVL4			// -> dv[n] = dv limit
	sub	DVL4, DV			// length of dv in bytes
	movdqu	xmm8, [MI]			// mi
	and	DVL4, ~15			// mask off the tail end
	expand	xmm0, xmm8, xmm9
	add	DVL4, DV			// find limit

	// Set up the outer loop state and prepare for the first iteration.
	mov	rbx, NV				// -> X = nv[0]
	lea	DVLO, [DV + 4*N]		// -> dv[n/4] = outer dv limit
	lea	NVL, [NV + 4*N]			// -> nv[n/4] = nv limit
	cmp	rbx, NVL			// done already?

	// Complete the first inner loop.
	cmp	rbx, NVL			// done yet?

	// Still have carries left to propagate.

	// Continue carry propagation until the end of the buffer.
	mov	C, 0				// preserves flags

	// Deal with the tail end.
	mov	C, 0				// preserves flags

	// All done for this iteration.  Start the next.  (There must be at
	// least one more, or we'd not have started this outer pass.)
8:	mov	rdi, DV				// -> Z = dv[i]
	mov	rbx, NV				// -> X = nv[0]
	cmp	rdi, DVLO			// all done yet?
///--------------------------------------------------------------------------
/// Testing and performance measurement.

# define ARG6 STKARG(0)
# define ARG7 STKARG(1)
# define ARG8 STKARG(2)
# define STKARG_OFFSET 16
# define ARG4 STKARG(0)
# define ARG5 STKARG(1)
# define ARG6 STKARG(2)
# define ARG7 STKARG(3)
# define ARG8 STKARG(4)
# define STKARG_OFFSET 224
#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]

	//				sysv				win
	//		reg	dmul	smul	mmul	mont	dmul	smul	mmul	mont
	//	z	rdi	rdi	rdi	rdi	rdi	rcx	rcx	rcx	rcx
	//	c	rcx	rsi	rsi	rsi	rsi	rdx	rdx	rdx	rdx
	//	y	r10	--	--	rdx	rdx	--	--	r8	r8
	//	u	r11	rdx	--	rcx	--	r8	--	r9	--
	//	x	rbx	rcx	rdx	r8	rcx	r9	r8	stk0	r9
	//	vv	xmm8/9	r8	--	r9	r8	stk0	--	stk1	stk0
	//	yy	xmm10/11 r9	rcx	stk0	--	stk1	r9	stk2	--
	//	n	r8	stk0	r8	stk1	r9	stk2	stk0	stk3	stk1
	//	cyv	r9	stk1	r9	stk2	stk0	stk3	stk1	stk4	stk2

	mov	[\v + 8*\n - 8], rax
	sub	rax, [\v + 8*\n - 8]
	mov	[\v + 8*\n - 8], rax
.macro	testprologue mode
  .ifeqs "\mode", "dmul"
  .ifeqs "\mode", "smul"
  .ifeqs "\mode", "mmul"
  .ifeqs "\mode", "mont"

  .ifeqs "\mode", "dmul"
  .ifeqs "\mode", "smul"
  .ifeqs "\mode", "mmul"
  .ifeqs "\mode", "mont"

  .ifeqs "\mode", "dmul"
	expand	xmm0, xmm8, xmm9, xmm10, xmm11
  .ifeqs "\mode", "smul"
	expand	xmm0, xmm10, xmm11
  .ifeqs "\mode", "mmul"
	expand	xmm0, xmm8, xmm9, xmm10, xmm11
  .ifeqs "\mode", "mont"
	expand	xmm0, xmm8, xmm9

	movdqu	xmm12, [rcx +  0]		// (c'_0; c''_0)
	movdqu	xmm13, [rcx + 16]		// (c'_1; c''_1)
	movdqu	xmm14, [rcx + 32]		// (c'_2; c''_2)

.macro	testtop u=nil
	movdqu	[rcx +  0], xmm12
	movdqu	[rcx + 16], xmm13
	movdqu	[rcx + 32], xmm14

	movdqu	[r10 +  0], xmm10
	movdqu	[r10 + 16], xmm11

	movdqu	[r10 +  0], xmm10
	movdqu	[r10 + 16], xmm11

	movdqu	[r10 +  0], xmm10
	movdqu	[r10 + 16], xmm11

///----- That's all, folks --------------------------------------------------