1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
30 #include "asm-common.h"
36 ///--------------------------------------------------------------------------
39 /// We define a number of primitive fixed-size multipliers from which we can
40 /// construct more general variable-length multipliers.
42 /// The basic trick is the same throughout. In an operand-scanning
43 /// multiplication, the inner multiplication loop multiplies a multiple-
44 /// precision operand by a single precision factor, and adds the result,
45 /// appropriately shifted, to the result. A `finely integrated operand
46 /// scanning' implementation of Montgomery multiplication also adds the
47 /// product of a single-precision `Montgomery factor' and the modulus,
48 /// calculated in the same pass. The more common `coarsely integrated
49 /// operand scanning' alternates main multiplication and Montgomery passes,
50 /// which requires additional carry propagation.
52 /// Throughout both plain-multiplication and Montgomery stages, then, one of
53 /// the factors remains constant throughout the operation, so we can afford
54 /// to take a little time to preprocess it. The transformation we perform is
55 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59 /// operands, as follows.
62 /// 0 v'_0 v'_1 v''_0 v''_1
63 /// 16 v'_2 v'_3 v''_2 v''_3
65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
66 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
70 /// many products together before we must deal with carrying; it also allows
71 /// for some calculations to be performed on the above expanded form.
73 /// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate
74 /// results. The registers' precise roles rotate during the computation; we
75 /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two
76 /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
77 /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
78 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
79 /// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand (broadcast
80 /// across all lanes of its vector) and an operand in the expanded form above
81 /// produces a result which can be added directly to the appropriate carry
82 /// register. Following a pass of four multiplications, we perform some
83 /// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b;
84 /// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and
85 /// cycle the carry registers around, so that c1 becomes c0, and the old
86 /// (implicitly) zeroed c0 becomes c3.
88 /// On 32-bit x86, we are register starved: the expanded operands are kept in
89 /// memory, typically in warm L1 cache. The packed operands are read from
90 /// memory into working registers XMM0--XMM3 and processed immediately.
91 /// The following conventional argument names and locations are used
94 /// Arg Format Location Notes
97 /// X packed [EBX] In Montgomery multiplication, X = N
99 /// Y expanded [EDX] In Montgomery multiplication, Y = (A + U V) M
100 /// M expanded [ESI] -N^{-1} (mod B^4)
101 /// N Modulus, for Montgomery multiplication
102 /// A packed [EDI] Destination/accumulator
103 /// C carry XMM4--XMM7
105 /// The calculation is some variant of
107 /// A' + C' B^4 <- U V + X Y + A + C
109 /// The low-level functions fit into a fairly traditional (finely-integrated)
110 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
113 /// The variants are as follows.
115 /// Function Variant Use i j
117 /// mmul4 A = C = 0 Montgomery 0 0
118 /// dmul4 A = 0 Montgomery 0 +
119 /// mmla4 C = 0 Montgomery + 0
120 /// dmla4 exactly as shown Montgomery + +
121 /// mont4 U = V = C = 0 Montgomery any 0
123 /// mul4zc U = V = A = C = 0 Plain 0 0
124 /// mul4 U = V = A = 0 Plain 0 +
125 /// mla4zc U = V = C = 0 Plain + 0
126 /// mla4 U = V = 0 Plain + +
128 /// The `mmul4' and `mmla4' functions are also responsible for calculating
129 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
132 ///--------------------------------------------------------------------------
133 /// Macro definitions.
135 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
136 // Load a word r_i from R, multiply by the expanded operand [S], and
137 // leave the pieces of the product in registers D0, D1, D2, D3.
138 movd \d0, \r // (r_i, 0; 0, 0)
140 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
143 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
145 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
147 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
151 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
153 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
157 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
160 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
163 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
167 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
169 pmuludq \d2, [\s + 16]
172 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
175 .macro accum c0, c1=nil, c2=nil, c3=nil
176 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
177 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
178 // updating that register.
191 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
192 // Load a word r_i from R, multiply by the expanded operand [S],
193 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
194 // then C3 notionally contains zero, but needs clearing; in practice,
195 // we store the product directly rather than attempting to add. On
196 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
199 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
202 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
203 accum \c0, \c1, \c2, \c3
207 .macro propout d, c, cc=nil
208 // Calculate an output word from C, and store it in D; propagate
209 // carries out from C to CC in preparation for a rotation of the
210 // carry registers. On completion, XMM3 is clobbered. If CC is
211 // `nil', then the contribution which would have been added to it is
213 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
214 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
215 pslldq xmm3, 2 // (t b; 0)
216 paddq \c, xmm3 // (c' + t b; c'')
218 psrlq \c, 32 // floor(c/B)
220 paddq \cc, \c // propagate up
224 .macro endprop d, c, t
225 // On entry, C contains a carry register. On exit, the low 32 bits
226 // of the value represented in C are written to D, and the remaining
227 // bits are left at the bottom of T.
229 psllq \t, 16 // (?; c'' b)
230 pslldq \c, 8 // (0; c')
231 paddq \t, \c // (?; c' + c'' b)
232 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
234 psrldq \t, 4 // (floor(c/B); 0)
237 .macro expand z, a, b, c=nil, d=nil
238 // On entry, A and C hold packed 128-bit values, and Z is zero. On
239 // exit, A:B and C:D together hold the same values in expanded
240 // form. If C is `nil', then only expand A to A:B.
241 movdqa \b, \a // (a_0, a_1; a_2, a_3)
243 movdqa \d, \c // (c_0, c_1; c_2, c_3)
245 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
246 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
248 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
249 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
251 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
252 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
254 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
255 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
259 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
260 // On entry, C0, C1, C2, C3 are carry registers representing a value
261 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
262 // C3, T, and U are clobbered; and the high bits of Y are stored in
263 // HI, if this is not `nil'.
265 // The first step is to eliminate the `double-prime' pieces -- i.e.,
266 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
267 // them into the 32-bit-aligned pieces above and below. But before
268 // we can do that, we must gather them together.
271 punpcklqdq \t, \c2 // (y'_0; y'_2)
272 punpckhqdq \c0, \c2 // (y''_0; y''_2)
273 punpcklqdq \u, \c3 // (y'_1; y'_3)
274 punpckhqdq \c1, \c3 // (y''_1; y''_3)
276 // Now split the double-prime pieces. The high (up to) 48 bits will
277 // go up; the low 16 bits go down.
282 psrlq \c0, 16 // high parts of (y''_0; y''_2)
283 psrlq \c1, 16 // high parts of (y''_1; y''_3)
284 psrlq \c2, 32 // low parts of (y''_0; y''_2)
285 psrlq \c3, 32 // low parts of (y''_1; y''_3)
289 pslldq \c1, 8 // high part of (0; y''_1)
291 paddq \t, \c2 // propagate down
293 paddq \t, \c1 // and up: (y_0; y_2)
294 paddq \u, \c0 // (y_1; y_3)
296 psrldq \hi, 8 // high part of (y''_3; 0)
299 // Finally extract the answer. This complicated dance is better than
300 // storing to memory and loading, because the piecemeal stores
301 // inhibit store forwarding.
302 movdqa \c3, \t // (y_0; ?)
303 movdqa \lo, \t // (y^*_0, ?; ?, ?)
304 psrldq \t, 8 // (y_2; 0)
305 psrlq \c3, 32 // (floor(y_0/B); ?)
306 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
307 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
308 psrldq \u, 8 // (y_3; 0)
309 	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
310 	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
311 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
312 	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
313 	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
318 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
320 psrlq \t, 32 // very high bits of y
322 punpcklqdq \hi, \u // carry up
324 punpckldq \lo, \c1 // y mod B^4
328 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
329 // hold the incoming carry registers c0, c1, and c2 representing a
332 // On exit, the carry registers, including XMM7, are updated to hold
333 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
334 // registers are preserved.
335 movd xmm0, [edi + 0] // (a_0; 0)
336 movd xmm1, [edi + 4] // (a_1; 0)
337 movd xmm2, [edi + 8] // (a_2; 0)
338 movd xmm7, [edi + 12] // (a_3; 0)
340 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
341 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
342 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
345 ///--------------------------------------------------------------------------
346 /// Primitive multipliers and related utilities.
349 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
350 // form. Store the low 128 bits of the represented carry to [EDI] as
351 // a packed 128-bit value, and leave the remaining 16 bits in the low
352 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
355 propout [edi + 0], xmm4, xmm5
356 propout [edi + 4], xmm5, xmm6
357 propout [edi + 8], xmm6, nil
358 endprop [edi + 12], xmm6, xmm4
363 // On entry, EDI points to the destination buffer; EAX and EBX point
364 // to the packed operands U and X; ECX and EDX point to the expanded
365 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
366 // registers c0, c1, and c2; c3 is assumed to be zero.
368 // On exit, we write the low 128 bits of the sum C + U V + X Y to
369 // [EDI], and update the carry registers with the carry out. The
370 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
371 // general-purpose registers are preserved.
374 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
375 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
376 propout [edi + 0], xmm4, xmm5
378 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
379 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
380 propout [edi + 4], xmm5, xmm6
382 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
383 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
384 propout [edi + 8], xmm6, xmm7
386 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
387 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
388 propout [edi + 12], xmm7, xmm4
394 // On entry, EDI points to the destination buffer, which also
395 // contains an addend A to accumulate; EAX and EBX point to the
396 // packed operands U and X; ECX and EDX point to the expanded
397 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
398 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
401 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
402 // [EDI], and update the carry registers with the carry out. The
403 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
404 // general-purpose registers are preserved.
409 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
410 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
411 propout [edi + 0], xmm4, xmm5
413 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
414 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
415 propout [edi + 4], xmm5, xmm6
417 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
418 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
419 propout [edi + 8], xmm6, xmm7
421 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
422 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
423 propout [edi + 12], xmm7, xmm4
429 // On entry, EDI points to the destination buffer; EBX points to a
430 // packed operand X; and EDX points to an expanded operand Y.
432 // On exit, we write the low 128 bits of the product X Y to [EDI],
433 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
434 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
435 // general-purpose registers are preserved.
438 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
439 propout [edi + 0], xmm4, xmm5
441 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
442 propout [edi + 4], xmm5, xmm6
444 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
445 propout [edi + 8], xmm6, xmm7
447 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
448 propout [edi + 12], xmm7, xmm4
454 // On entry, EDI points to the destination buffer; EBX points to a
455 // packed operand X; EDX points to an expanded operand Y; and XMM4,
456 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
457 // representing a carry-in C; c3 is assumed to be zero.
459 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
460 // and update the carry registers with the carry out. The registers
461 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
462 // general-purpose registers are preserved.
465 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
466 propout [edi + 0], xmm4, xmm5
468 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
469 propout [edi + 4], xmm5, xmm6
471 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
472 propout [edi + 8], xmm6, xmm7
474 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
475 propout [edi + 12], xmm7, xmm4
481 // On entry, EDI points to the destination buffer, which also
482 // contains an addend A to accumulate; EBX points to a packed operand
483 // X; and EDX points to an expanded operand Y.
485 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
486 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
487 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
488 // general-purpose registers are preserved.
494 movd xmm7, [edi + 12]
496 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
497 propout [edi + 0], xmm4, xmm5
499 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
500 propout [edi + 4], xmm5, xmm6
502 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
503 propout [edi + 8], xmm6, xmm7
505 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
506 propout [edi + 12], xmm7, xmm4
512 // On entry, EDI points to the destination buffer, which also
513 // contains an addend A to accumulate; EBX points to a packed operand
514 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
515 // the incoming carry registers c0, c1, and c2, representing a
516 // carry-in C; c3 is assumed to be zero.
518 // On exit, we write the low 128 bits of the sum A + C + X Y to
519 // [EDI], and update the carry registers with the carry out. The
520 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
521 // general-purpose registers are preserved.
526 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
527 propout [edi + 0], xmm4, xmm5
529 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
530 propout [edi + 4], xmm5, xmm6
532 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
533 propout [edi + 8], xmm6, xmm7
535 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
536 propout [edi + 12], xmm7, xmm4
542 // On entry, EDI points to the destination buffer; EAX and EBX point
543 // to the packed operands U and N; ECX and ESI point to the expanded
544 // operands V and M; and EDX points to a place to store an expanded
545 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
546 // must be 12 modulo 16, as is usual for modern x86 ABIs.
548 	// On exit, we write Y = U V M mod B^4 to [EDX], and the low 128 bits
549 // of the sum U V + N Y to [EDI], leaving the remaining carry in
550 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
551 // XMM7 are clobbered; the general-purpose registers are preserved.
552 stalloc 48 + 12 // space for the carries
555 // Calculate W = U V, and leave it in the destination. Stash the
556 // carry pieces for later.
557 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
558 propout [edi + 0], xmm4, xmm5
563 // On entry, EDI points to the destination buffer, which also
564 // contains an addend A to accumulate; EAX and EBX point to the
565 // packed operands U and N; ECX and ESI point to the expanded
566 // operands V and M; and EDX points to a place to store an expanded
567 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
568 // must be 12 modulo 16, as is usual for modern x86 ABIs.
570 	// On exit, we write Y = (A + U V) M mod B^4 to [EDX], and the low 128
571 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
572 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
573 // XMM3, and XMM7 are clobbered; the general-purpose registers are
575 stalloc 48 + 12 // space for the carries
581 movd xmm7, [edi + 12]
583 // Calculate W = U V, and leave it in the destination. Stash the
584 // carry pieces for later.
585 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
586 propout [edi + 0], xmm4, xmm5
588 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
589 propout [edi + 4], xmm5, xmm6
591 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
592 propout [edi + 8], xmm6, xmm7
594 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
595 propout [edi + 12], xmm7, xmm4
597 movdqa [SP + 0], xmm4
598 movdqa [SP + 16], xmm5
599 movdqa [SP + 32], xmm6
601 // Calculate Y = W M.
602 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
604 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
605 accum xmm5, xmm6, xmm7
607 mulcore [edi + 8], esi, xmm0, xmm1
610 mulcore [edi + 12], esi, xmm0
613 // That's lots of pieces. Now we have to assemble the answer.
614 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
618 expand xmm2, xmm4, xmm1
619 movdqa [edx + 0], xmm4
620 movdqa [edx + 16], xmm1
622 // Initialize the carry from the value for W we calculated earlier.
626 movd xmm7, [edi + 12]
628 // Finish the calculation by adding the Montgomery product.
629 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
630 propout [edi + 0], xmm4, xmm5
632 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
633 propout [edi + 4], xmm5, xmm6
635 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
636 propout [edi + 8], xmm6, xmm7
638 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
639 propout [edi + 12], xmm7, xmm4
641 	// Add on the carry we calculated earlier.
643 paddq xmm5, [SP + 16]
644 paddq xmm6, [SP + 32]
646 // And, with that, we're done.
652 // On entry, EDI points to the destination buffer holding a packed
653 // value W; EBX points to a packed operand N; ESI points to an
654 // expanded operand M; and EDX points to a place to store an expanded
655 // result Y (32 bytes, at a 16-byte boundary).
657 	// On exit, we write Y = W M mod B^4 to [EDX], and the low 128 bits
658 // of the sum W + N Y to [EDI], leaving the remaining carry in
659 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
660 // XMM7 are clobbered; the general-purpose registers are preserved.
663 // Calculate Y = W M.
664 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
666 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
667 accum xmm5, xmm6, xmm7
669 mulcore [edi + 8], esi, xmm0, xmm1
672 mulcore [edi + 12], esi, xmm0
675 // That's lots of pieces. Now we have to assemble the answer.
676 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
680 expand xmm2, xmm4, xmm1
681 movdqa [edx + 0], xmm4
682 movdqa [edx + 16], xmm1
684 // Initialize the carry from W.
688 movd xmm7, [edi + 12]
690 // Finish the calculation by adding the Montgomery product.
691 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
692 propout [edi + 0], xmm4, xmm5
694 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
695 propout [edi + 4], xmm5, xmm6
697 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
698 propout [edi + 8], xmm6, xmm7
700 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
701 propout [edi + 12], xmm7, xmm4
703 // And, with that, we're done.
707 ///--------------------------------------------------------------------------
708 /// Bulk multipliers.
710 FUNC(mpx_umul4_x86_avx)
714 // and drop through...
718 FUNC(mpx_umul4_x86_sse2)
719 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
720 // const mpw *bv, const mpw *bvl);
722 // Build a stack frame. Arguments will be relative to BP, as
731 // Locals are relative to SP, as follows.
733 // SP + 0 expanded Y (32 bytes)
734 // SP + 32 (top of locals)
744 // Prepare for the first iteration.
745 mov esi, [BP + 32] // -> bv[0]
747 movdqu xmm0, [esi] // bv[0]
748 mov edi, [BP + 20] // -> dv[0]
749 mov ecx, edi // outer loop dv cursor
750 expand xmm7, xmm0, xmm1
751 mov ebx, [BP + 24] // -> av[0]
752 mov eax, [BP + 28] // -> av[m] = av limit
753 mov edx, SP // -> expanded Y = bv[0]
754 movdqa [SP + 0], xmm0 // bv[0] expanded low
755 movdqa [SP + 16], xmm1 // bv[0] expanded high
761 cmp ebx, eax // all done?
765 // Continue with the first iteration.
769 cmp ebx, eax // all done?
772 // Write out the leftover carry. There can be no tail here.
774 cmp esi, [BP + 36] // more passes to do?
778 // Set up for the next pass.
779 1: movdqu xmm0, [esi] // bv[i]
780 mov edi, ecx // -> dv[i]
782 expand xmm7, xmm0, xmm1
783 mov ebx, [BP + 24] // -> av[0]
784 movdqa [SP + 0], xmm0 // bv[i] expanded low
785 movdqa [SP + 16], xmm1 // bv[i] expanded high
791 cmp ebx, eax // done yet?
802 // Finish off this pass. There was no tail on the previous pass, and
803 // there can be none on this pass.
817 FUNC(mpxmont_mul4_x86_avx)
821 // and drop through...
825 FUNC(mpxmont_mul4_x86_sse2)
826 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
827 // const mpw *nv, size_t n, const mpw *mi);
829 // Build a stack frame. Arguments will be relative to BP, as
836 // BP + 36 n (nonzero multiple of 4)
839 	// Locals are relative to SP, which is 16-byte aligned, as follows.
841 // SP + 0 expanded V (32 bytes)
842 // SP + 32 expanded M (32 bytes)
843 // SP + 64 expanded Y (32 bytes)
844 // SP + 96 outer loop dv
845 // SP + 100 outer loop bv
846 // SP + 104 av limit (mostly in ESI)
848 // SP + 112 (top of locals)
858 // Establish the expanded operands.
860 mov ecx, [BP + 28] // -> bv
861 mov edx, [BP + 40] // -> mi
862 movdqu xmm0, [ecx] // bv[0]
863 movdqu xmm2, [edx] // mi
864 expand xmm7, xmm0, xmm1, xmm2, xmm3
865 movdqa [SP + 0], xmm0 // bv[0] expanded low
866 movdqa [SP + 16], xmm1 // bv[0] expanded high
867 movdqa [SP + 32], xmm2 // mi expanded low
868 movdqa [SP + 48], xmm3 // mi expanded high
870 // Set up the outer loop state and prepare for the first iteration.
871 mov edx, [BP + 36] // n
872 mov eax, [BP + 24] // -> U = av[0]
873 mov ebx, [BP + 32] // -> X = nv[0]
874 mov edi, [BP + 20] // -> Z = dv[0]
876 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
877 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
881 lea ecx, [SP + 0] // -> expanded V = bv[0]
882 lea esi, [SP + 32] // -> expanded M = mi
883 lea edx, [SP + 64] // -> space for Y
885 mov esi, [SP + 104] // recover av limit
889 cmp eax, esi // done already?
894 // Complete the first inner loop.
899 cmp eax, esi // done yet?
902 // Still have carries left to propagate.
904 movd [edi + 16], xmm4
907 // Embark on the next iteration. (There must be one. If n = 1, then
908 // we would have bailed above, to label 8. Similarly, the subsequent
909 // iterations can fall into the inner loop immediately.)
910 1: mov eax, [SP + 100] // -> bv[i - 1]
911 mov edi, [SP + 96] // -> Z = dv[i]
912 add eax, 16 // -> bv[i]
915 cmp eax, [SP + 108] // done yet?
917 movdqu xmm0, [eax] // bv[i]
918 mov ebx, [BP + 32] // -> X = nv[0]
919 lea esi, [SP + 32] // -> expanded M = mi
920 mov eax, [BP + 24] // -> U = av[0]
921 expand xmm7, xmm0, xmm1
922 movdqa [SP + 0], xmm0 // bv[i] expanded low
923 movdqa [SP + 16], xmm1 // bv[i] expanded high
925 mov esi, [SP + 104] // recover av limit
932 // Complete the next inner loop.
940 // Still have carries left to propagate, and they overlap the
941 // previous iteration's final tail, so read that in and add it.
945 movd [edi + 16], xmm4
950 // First iteration was short. Write out the carries and we're done.
951 // (This could be folded into the main loop structure, but that would
952 // penalize small numbers more.)
954 movd [edi + 16], xmm4
965 FUNC(mpxmont_redc4_x86_avx)
969 // and drop through...
973 FUNC(mpxmont_redc4_x86_sse2)
974 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
975 // size_t n, const mpw *mi);
977 // Build a stack frame. Arguments will be relative to BP, as
983 // BP + 32 n (nonzero multiple of 4)
986 // Locals are relative to SP, as follows.
988 // SP + 0 outer loop dv
989 // SP + 4 outer dv limit
990 // SP + 8 blocks-of-4 dv limit
991 // SP + 12 expanded M (32 bytes)
992 // SP + 44 expanded Y (32 bytes)
993 // SP + 76 (top of locals)
1003 // Establish the expanded operands and the blocks-of-4 dv limit.
1004 mov edi, [BP + 20] // -> Z = dv[0]
1006 mov eax, [BP + 24] // -> dv[n] = dv limit
1007 sub eax, edi // length of dv in bytes
1008 mov edx, [BP + 36] // -> mi
1009 movdqu xmm0, [edx] // mi
1010 and eax, ~15 // mask off the tail end
1011 expand xmm7, xmm0, xmm1
1012 add eax, edi // find limit
1013 movdqa [SP + 12], xmm0 // mi expanded low
1014 movdqa [SP + 28], xmm1 // mi expanded high
1017 // Set up the outer loop state and prepare for the first iteration.
1018 mov ecx, [BP + 32] // n
1019 mov ebx, [BP + 28] // -> X = nv[0]
1020 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
1021 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
1024 lea esi, [SP + 12] // -> expanded M = mi
1025 lea edx, [SP + 44] // -> space for Y
1029 cmp ebx, ecx // done already?
1033 // Complete the first inner loop.
1037 cmp ebx, ecx // done yet?
1040 // Still have carries left to propagate.
1042 mov esi, [SP + 8] // -> dv blocks limit
1043 mov edx, [BP + 24] // dv limit
1054 // Continue carry propagation until the end of the buffer.
1056 mov eax, 0 // preserves flags
1057 adc dword ptr [edi + 4], 0
1058 adc dword ptr [edi + 8], 0
1059 adc dword ptr [edi + 12], 0
1065 // Deal with the tail end. Note that the actual destination length
1066 // won't be an exact number of blocks of four, so it's safe to just
1067 // drop through here.
1075 // All done for this iteration. Start the next.
1076 8: mov edi, [SP + 0] // -> dv[i - 1]
1077 mov ebx, [BP + 28] // -> X = nv[0]
1078 lea edx, [SP + 44] // -> space for Y
1079 lea esi, [SP + 12] // -> expanded M = mi
1080 add edi, 16 // -> Z = dv[i]
1081 cmp edi, [SP + 4] // all done yet?
1098 ///--------------------------------------------------------------------------
1099 /// Testing and performance measurement.
1109 .macro cystore c, v, n
1117 mov [ebx + ecx*8], eax
1118 mov [ebx + ecx*8 + 4], edx
1121 .macro testprologue n
1133 // SP + 0 = v expanded
1134 // SP + 32 = y expanded
1135 // SP + 64 = ? expanded
1149 .macro testldcarry c
1151 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1152 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1153 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
1156 .macro testexpand v=nil, y=nil
1161 expand xmm7, xmm0, xmm1
1162 movdqa [SP + 0], xmm0
1163 movdqa [SP + 16], xmm1
1168 expand xmm7, xmm2, xmm3
1169 movdqa [SP + 32], xmm2
1170 movdqa [SP + 48], xmm3
1174 .macro testtop u=nil, x=nil, mode=nil
1181 .ifeqs "\mode", "mont"
1188 .ifeqs "\mode", "mont"
1196 cystore SP + 96, \cyv, SP + 104
1200 .macro testcarryout c
1202 movdqu [ecx + 0], xmm4
1203 movdqu [ecx + 16], xmm5
1204 movdqu [ecx + 32], xmm6
1208 testprologue [BP + 44]
1209 testldcarry [BP + 24]
1210 testexpand [BP + 36], [BP + 40]
1212 testtop [BP + 28], [BP + 32]
1215 testcarryout [BP + 24]
1220 testprologue [BP + 44]
1221 testldcarry [BP + 24]
1222 testexpand [BP + 36], [BP + 40]
1224 testtop [BP + 28], [BP + 32]
1227 testcarryout [BP + 24]
1232 testprologue [BP + 36]
1233 testldcarry [BP + 24]
1234 testexpand nil, [BP + 32]
1236 testtop nil, [BP + 28]
1239 testcarryout [BP + 24]
1244 testprologue [BP + 36]
1245 testldcarry [BP + 24]
1246 testexpand nil, [BP + 32]
1248 testtop nil, [BP + 28]
1251 testcarryout [BP + 24]
1256 testprologue [BP + 36]
1257 testldcarry [BP + 24]
1258 testexpand nil, [BP + 32]
1260 testtop nil, [BP + 28]
1263 testcarryout [BP + 24]
1268 testprologue [BP + 36]
1269 testldcarry [BP + 24]
1270 testexpand nil, [BP + 32]
1272 testtop nil, [BP + 28]
1275 testcarryout [BP + 24]
1280 testprologue [BP + 48]
1281 testexpand [BP + 40], [BP + 44]
1283 testtop [BP + 32], [BP + 36], mont
1287 movdqa xmm0, [SP + 64]
1288 movdqa xmm1, [SP + 80]
1289 pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
1290 pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
1292 movdqu [edi + 16], xmm1
1293 testcarryout [BP + 24]
1298 testprologue [BP + 48]
1299 testexpand [BP + 40], [BP + 44]
1301 testtop [BP + 32], [BP + 36], mont
1305 movdqa xmm0, [SP + 64]
1306 movdqa xmm1, [SP + 80]
1307 pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
1308 pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
1310 movdqu [edi + 16], xmm1
1311 testcarryout [BP + 24]
1316 testprologue [BP + 40]
1317 testexpand nil, [BP + 36]
1319 testtop nil, [BP + 32], mont
1323 movdqa xmm0, [SP + 64]
1324 movdqa xmm1, [SP + 80]
1325 pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
1326 pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
1328 movdqu [edi + 16], xmm1
1329 testcarryout [BP + 24]
1335 ///----- That's all, folks --------------------------------------------------