1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
30 #include "asm-common.h"
36 ///--------------------------------------------------------------------------
39 /// We define a number of primitive fixed-size multipliers from which we can
40 /// construct more general variable-length multipliers.
42 /// The basic trick is the same throughout. In an operand-scanning
43 /// multiplication, the inner multiplication loop multiplies a
44 /// multiple-precision operand by a single precision factor, and adds the
45 /// result, appropriately shifted, to the result. A `finely integrated
46 /// operand scanning' implementation of Montgomery multiplication also adds
47 /// the product of a single-precision `Montgomery factor' and the modulus,
48 /// calculated in the same pass. The more common `coarsely integrated
49 /// operand scanning' alternates main multiplication and Montgomery passes,
50 /// which requires additional carry propagation.
52 /// Throughout both plain-multiplication and Montgomery stages, then, one of
53 /// the factors remains constant throughout the operation, so we can afford
54 /// to take a little time to preprocess it. The transformation we perform is
55 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59 /// operands, as follows.
62 /// 0 v'_0 v'_1 v''_0 v''_1
63 /// 16 v'_2 v'_3 v''_2 v''_3
65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
66 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
70 /// many products together before we must deal with carrying; it also allows
71 /// for some calculations to be performed on the above expanded form.
73 /// On 32-bit x86, we are register starved: the expanded operands are kept in
74 /// memory, typically in warm L1 cache.
76 /// We maintain four `carry' registers accumulating intermediate results.
77 /// The registers' precise roles rotate during the computation; we name them
78 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
83 /// lanes of its vector) and an operand in the expanded form above produces a
84 /// result which can be added directly to the appropriate carry register.
85 /// Following a pass of four multiplications, we perform some limited carry
86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
89 /// zeroed and becomes c3.
91 ///--------------------------------------------------------------------------
92 /// Macro definitions.
// NOTE(review): this view of `mulcore' is incomplete -- the `.ifnes'
// guards implied by the nil-able D1--D3 arguments and the closing
// `.endm' appear to be elided; confirm against the complete source.
// The two writes to \d2 below presumably belong to different
// conditional branches (memory-operand vs register-operand multiply).
94 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
95 // Load a word r_i from R, multiply by the expanded operand [S], and
96 // leave the pieces of the product in registers D0, D1, D2, D3.
97 movd \d0, \r // (r_i, 0; 0, 0)
99 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
102 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
104 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
106 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
110 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
112 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
116 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
119 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
122 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
126 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
128 pmuludq \d2, [\s + 16]
131 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
134 .macro accum c0, c1=nil, c2=nil, c3=nil
135 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
136 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
137 // updating that register.
// NOTE(review): the body of this macro (presumably `paddq' of
// XMM0--XMM3 into C0--C3, under `.ifnes' nil-guards) and its `.endm'
// are elided in this view -- confirm against the complete source.
150 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
151 // Load a word r_i from R, multiply by the expanded operand [S],
152 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
153 // then C3 notionally contains zero, but needs clearing; in practice,
154 // we store the product directly rather than attempting to add. On
155 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
// NOTE(review): the two `mulcore' invocations below are presumably
// alternative branches of an elided `.ifeqs "\z3p", "t"' conditional
// (product stored into \c3 directly vs. accumulated via XMM3); the
// conditional directives and `.endm' are not visible here -- confirm.
158 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
161 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
162 accum \c0, \c1, \c2, \c3
166 .macro propout d, c, cc=nil
167 // Calculate an output word from C, and store it in D; propagate
168 // carries out from C to CC in preparation for a rotation of the
169 // carry registers. On completion, XMM3 is clobbered. If CC is
170 // `nil', then the contribution which would have been added to it is
// NOTE(review): the header comment is cut off mid-sentence and the
// store of the output word to D is not visible -- lines appear to be
// elided from this view; confirm against the complete source.
172 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
173 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
174 pslldq xmm3, 2 // (t b; 0)
175 paddq \c, xmm3 // (c' + t b; c'')
177 psrlq \c, 32 // floor(c/B)
179 paddq \cc, \c // propagate up
183 .macro endprop d, c, t
184 // On entry, C contains a carry register. On exit, the low 32 bits
185 // of the value represented in C are written to D, and the remaining
186 // bits are left at the bottom of T.
// NOTE(review): the store of the low word to D and the closing `.endm'
// are not visible in this view -- confirm against the complete source.
188 psllq \t, 16 // (?; c'' b)
189 pslldq \c, 8 // (0; c')
190 paddq \t, \c // (?; c' + c'' b)
191 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
193 psrldq \t, 4 // (floor(c/B); 0)
196 .macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
// NOTE(review): the `.ifnes "\c", "nil"' guards around the C/D lines
// and the `.endm' appear to be elided from this view -- confirm.
200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
218 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
// NOTE(review): several lines of this macro appear elided in this view
// (e.g. the initial copies of C0/C1 into T/U, and shifts preparing the
// low parts before the `psrlq ..., 32' pair below) -- the comments on
// the surviving lines describe the original intent; confirm against
// the complete source before editing.
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
225 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
248 pslldq \c1, 8 // high part of (0; y''_1)
250 paddq \t, \c2 // propagate down
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
255 psrldq \hi, 8 // high part of (y''_3; 0)
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
271 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
272 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
279 psrlq \t, 32 // very high bits of y
281 punpcklqdq \hi, \u // carry up
283 punpckldq \lo, \c1 // y mod B^4
// NOTE(review): the `.macro' header line is elided from this view --
// from the comment this is presumably the carry-add helper macro
// (upstream name `carryadd'); confirm against the complete source.
// A shift of XMM7 and its accumulation into XMM6 (giving the
// `c''_2 + a_3 b' noted below) also appear to be elided.
287 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
288 // hold the incoming carry registers c0, c1, and c2 representing a
291 // On exit, the carry registers, including XMM7, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
294 movd xmm0, [edi + 0] // (a_0; 0)
295 movd xmm1, [edi + 4] // (a_1; 0)
296 movd xmm2, [edi + 8] // (a_2; 0)
297 movd xmm7, [edi + 12] // (a_3; 0)
299 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
300 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
301 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
304 ///--------------------------------------------------------------------------
305 /// Primitive multipliers and related utilities.
// NOTE(review): the function label (presumably `INTFUNC(carryprop)'),
// `endprologue', and the trailing `ret'/`ENDFUNC' are elided from this
// view -- confirm against the complete source.
308 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
309 // form. Store the low 128 bits of the represented carry to [EDI] as
310 // a packed 128-bit value, and leave the remaining 16 bits in the low
311 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
314 propout [edi + 0], xmm4, xmm5
315 propout [edi + 4], xmm5, xmm6
316 propout [edi + 8], xmm6, nil
317 endprop [edi + 12], xmm6, xmm4
// NOTE(review): the function label (presumably `INTFUNC(dmul4)': dual
// four-word multiply-accumulate, C + U V + X Y) and its prologue/
// epilogue lines are elided from this view -- confirm upstream.
// Each word of output follows the same three-step rhythm: multiply by
// V (with `t' marking the notionally-zero rotating carry register),
// multiply by Y, then emit one word and rotate the carries.
323 // On entry, EDI points to the destination buffer; EAX and EBX point
324 // to the packed operands U and X; ECX and EDX point to the expanded
325 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
326 // registers c0, c1, and c2; c3 is assumed to be zero.
328 // On exit, we write the low 128 bits of the sum C + U V + X Y to
329 // [EDI], and update the carry registers with the carry out. The
330 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
331 // general-purpose registers are preserved.
334 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
335 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
336 propout [edi + 0], xmm4, xmm5
338 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
339 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
340 propout [edi + 4], xmm5, xmm6
342 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
343 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
344 propout [edi + 8], xmm6, xmm7
346 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
347 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
348 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(dmla4)') is
// elided, as is the carry-add of the addend A into the carry registers
// before the first `mulacc' -- which is why, unlike the sibling
// routine above, the first `mulacc' here has no `t' flag (XMM7 already
// holds live data).  Confirm against the complete source.
355 // On entry, EDI points to the destination buffer, which also
356 // contains an addend A to accumulate; EAX and EBX point to the
357 // packed operands U and X; ECX and EDX point to the expanded
358 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
359 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
362 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
363 // [EDI], and update the carry registers with the carry out. The
364 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
365 // general-purpose registers are preserved.
370 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
371 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
372 propout [edi + 0], xmm4, xmm5
374 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
375 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
376 propout [edi + 4], xmm5, xmm6
378 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
379 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
380 propout [edi + 8], xmm6, xmm7
382 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
383 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
384 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mul4zc)':
// four-word multiply with zero carry-in) and prologue/epilogue lines
// are elided from this view -- confirm against the complete source.
// `mulcore' (not `mulacc') starts the pipeline since there is no
// incoming carry to accumulate.
391 // On entry, EDI points to the destination buffer; EBX points to a
392 // packed operand X; and EDX points to an expanded operand Y.
394 // On exit, we write the low 128 bits of the product X Y to [EDI],
395 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
396 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
397 // general-purpose registers are preserved.
400 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
401 propout [edi + 0], xmm4, xmm5
403 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
404 propout [edi + 4], xmm5, xmm6
406 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
407 propout [edi + 8], xmm6, xmm7
409 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
410 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mul4)') and
// prologue/epilogue lines are elided from this view -- confirm.
417 // On entry, EDI points to the destination buffer; EBX points to a
418 // packed operand X; EDX points to an expanded operand Y; and XMM4,
419 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
420 // representing a carry-in C; c3 is assumed to be zero.
422 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
423 // and update the carry registers with the carry out. The registers
424 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
425 // general-purpose registers are preserved.
428 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
429 propout [edi + 0], xmm4, xmm5
431 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
432 propout [edi + 4], xmm5, xmm6
434 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
435 propout [edi + 8], xmm6, xmm7
437 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
438 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mla4zc)') is
// elided, as are the `movd' loads of A's first three words into
// XMM4--XMM6 that must precede the visible load of [edi + 12] into
// XMM7 -- which is why the first `mulacc' carries no `t' flag.
// Confirm against the complete source.
445 // On entry, EDI points to the destination buffer, which also
446 // contains an addend A to accumulate; EBX points to a packed operand
447 // X; and EDX points to an expanded operand Y.
449 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
450 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
451 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
452 // general-purpose registers are preserved.
458 movd xmm7, [edi + 12]
460 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
461 propout [edi + 0], xmm4, xmm5
463 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
464 propout [edi + 4], xmm5, xmm6
466 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
467 propout [edi + 8], xmm6, xmm7
469 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
470 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mla4)') and
// the carry-add of the addend A into the carry registers (loading
// XMM7, hence no `t' flag on the first `mulacc') appear to be elided
// from this view -- confirm against the complete source.
477 // On entry, EDI points to the destination buffer, which also
478 // contains an addend A to accumulate; EBX points to a packed operand
479 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
480 // the incoming carry registers c0, c1, and c2, representing a
481 // carry-in C; c3 is assumed to be zero.
483 // On exit, we write the low 128 bits of the sum A + C + X Y to
484 // [EDI], and update the carry registers with the carry out. The
485 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
486 // general-purpose registers are preserved.
491 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
492 propout [edi + 0], xmm4, xmm5
494 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
495 propout [edi + 4], xmm5, xmm6
497 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
498 propout [edi + 8], xmm6, xmm7
500 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
501 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mmul4)') is
// elided, and only the first word of the U V pass is visible here --
// the remainder of the routine (upstream it jumps into the shared tail
// of the following routine) is not shown in this view; confirm against
// the complete source.
508 // On entry, EDI points to the destination buffer; EAX and EBX point
509 // to the packed operands U and N; ECX and ESI point to the expanded
510 // operands V and M; and EDX points to a place to store an expanded
511 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
512 // must be 12 modulo 16, as is usual for modern x86 ABIs.
514 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
515 // of the sum U V + N Y to [EDI], leaving the remaining carry in
516 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
517 // XMM7 are clobbered; the general-purpose registers are preserved.
518 stalloc 48 + 12 // space for the carries
521 // Calculate W = U V, and leave it in the destination. Stash the
522 // carry pieces for later.
523 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
524 propout [edi + 0], xmm4, xmm5
// NOTE(review): the function label (presumably `INTFUNC(mmla4)') is
// elided, as are the loads of XMM4--XMM6 before the visible loads of
// [edi + 12] into XMM7, the `stfree'/`ret' epilogue, and assorted
// interior lines -- confirm against the complete source.  The `5:'
// label below is presumably the shared tail entered by the preceding
// routine.
530 // On entry, EDI points to the destination buffer, which also
531 // contains an addend A to accumulate; EAX and EBX point to the
532 // packed operands U and N; ECX and ESI point to the expanded
533 // operands V and M; and EDX points to a place to store an expanded
534 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
535 // must be 12 modulo 16, as is usual for modern x86 ABIs.
537 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
538 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
539 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
540 // XMM3, and XMM7 are clobbered; the general-purpose registers are
542 stalloc 48 + 12 // space for the carries
548 movd xmm7, [edi + 12]
550 // Calculate W = U V, and leave it in the destination. Stash the
551 // carry pieces for later.
552 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
553 propout [edi + 0], xmm4, xmm5
555 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
556 propout [edi + 4], xmm5, xmm6
558 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
559 propout [edi + 8], xmm6, xmm7
561 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
562 propout [edi + 12], xmm7, xmm4
564 movdqa [esp + 0], xmm4 // stash carry c0 for after the M pass
565 movdqa [esp + 16], xmm5 // ... c1 ...
566 movdqa [esp + 32], xmm6 // ... c2 ...
568 // Calculate Y = W M.
569 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
571 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
572 accum xmm5, xmm6, xmm7
574 mulcore [edi + 8], esi, xmm0, xmm1
577 mulcore [edi + 12], esi, xmm0
580 // That's lots of pieces. Now we have to assemble the answer.
581 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
585 expand xmm2, xmm4, xmm1
586 movdqa [edx + 0], xmm4 // write out expanded Y, low half
587 movdqa [edx + 16], xmm1 // ... and high half
589 // Initialize the carry from the value for W we calculated earlier.
593 movd xmm7, [edi + 12]
595 // Finish the calculation by adding the Montgomery product.
596 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
597 propout [edi + 0], xmm4, xmm5
599 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
600 propout [edi + 4], xmm5, xmm6
602 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
603 propout [edi + 8], xmm6, xmm7
605 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
606 propout [edi + 12], xmm7, xmm4
608 // Add on the carry we calculated earlier.
609 paddq xmm4, [esp + 0]
610 paddq xmm5, [esp + 16]
611 paddq xmm6, [esp + 32]
613 // And, with that, we're done.
// NOTE(review): the function label (presumably `INTFUNC(mont4)':
// Montgomery reduction step, no multiplication pass) is elided, as are
// the loads of XMM4--XMM6 before the visible load of [edi + 12], and
// the `ret'/`ENDFUNC' epilogue -- confirm against the complete source.
620 // On entry, EDI points to the destination buffer holding a packed
621 // value W; EBX points to a packed operand N; ESI points to an
622 // expanded operand M; and EDX points to a place to store an expanded
623 // result Y (32 bytes, at a 16-byte boundary).
625 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
626 // of the sum W + N Y to [EDI], leaving the remaining carry in
627 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
628 // XMM7 are clobbered; the general-purpose registers are preserved.
631 // Calculate Y = W M.
632 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
634 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
635 accum xmm5, xmm6, xmm7
637 mulcore [edi + 8], esi, xmm0, xmm1
640 mulcore [edi + 12], esi, xmm0
643 // That's lots of pieces. Now we have to assemble the answer.
644 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
648 expand xmm2, xmm4, xmm1
649 movdqa [edx + 0], xmm4 // write out expanded Y, low half
650 movdqa [edx + 16], xmm1 // ... and high half
652 // Initialize the carry from W.
656 movd xmm7, [edi + 12]
658 // Finish the calculation by adding the Montgomery product.
659 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
660 propout [edi + 0], xmm4, xmm5
662 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
663 propout [edi + 4], xmm5, xmm6
665 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
666 propout [edi + 8], xmm6, xmm7
668 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
669 propout [edi + 12], xmm7, xmm4
671 // And, with that, we're done.
676 ///--------------------------------------------------------------------------
677 /// Bulk multipliers.
// NOTE(review): large portions of these two functions are elided from
// this view -- the AVX entry's body (presumably `vzeroupper' plus fall
// through), the stack-frame setup, the inner-loop `call' sites,
// conditional branches, and the epilogue.  Only the data-flow skeleton
// survives; confirm against the complete source before changing.
679 FUNC(mpx_umul4_x86_avx)
683 // and drop through...
687 FUNC(mpx_umul4_x86_sse2)
688 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
689 // const mpw *bv, const mpw *bvl);
691 // Build a stack frame. Arguments will be relative to EBP, as
700 // Locals are relative to ESP, as follows.
702 // esp + 0 expanded Y (32 bytes)
703 // esp + 32 (top of locals)
713 // Prepare for the first iteration.
714 mov esi, [ebp + 32] // -> bv[0]
716 movdqu xmm0, [esi] // bv[0]
717 mov edi, [ebp + 20] // -> dv[0]
718 mov ecx, edi // outer loop dv cursor
719 expand xmm7, xmm0, xmm1
720 mov ebx, [ebp + 24] // -> av[0]
721 mov eax, [ebp + 28] // -> av[m] = av limit
722 mov edx, esp // -> expanded Y = bv[0]
723 movdqa [esp + 0], xmm0 // bv[0] expanded low
724 movdqa [esp + 16], xmm1 // bv[0] expanded high
730 cmp ebx, eax // all done?
734 // Continue with the first iteration.
738 cmp ebx, eax // all done?
741 // Write out the leftover carry. There can be no tail here.
743 cmp esi, [ebp + 36] // more passes to do?
747 // Set up for the next pass.
748 1: movdqu xmm0, [esi] // bv[i]
749 mov edi, ecx // -> dv[i]
751 expand xmm7, xmm0, xmm1
752 mov ebx, [ebp + 24] // -> av[0]
753 movdqa [esp + 0], xmm0 // bv[i] expanded low
754 movdqa [esp + 16], xmm1 // bv[i] expanded high
760 cmp ebx, eax // done yet?
771 // Finish off this pass. There was no tail on the previous pass, and
772 // there can be none on this pass.
// NOTE(review): large portions of these two functions are elided from
// this view -- the AVX entry's body, the stack-frame setup, the stores
// that initialize the `outer loop'/`limit' locals read back below, the
// inner-loop `call' sites, conditional branches, labels, and the
// epilogue.  Confirm against the complete source before changing.
787 FUNC(mpxmont_mul4_x86_avx)
791 // and drop through...
795 FUNC(mpxmont_mul4_x86_sse2)
796 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
797 // const mpw *nv, size_t n, const mpw *mi);
799 // Build a stack frame. Arguments will be relative to EBP, as
806 // ebp + 36 n (nonzero multiple of 4)
809 // Locals are relative to ESP, which is 16-byte aligned, as follows.
811 // esp + 0 expanded V (32 bytes)
812 // esp + 32 expanded M (32 bytes)
813 // esp + 64 expanded Y (32 bytes)
814 // esp + 96 outer loop dv
815 // esp + 100 outer loop bv
816 // esp + 104 av limit (mostly in ESI)
817 // esp + 108 bv limit
818 // esp + 112 (top of locals)
828 // Establish the expanded operands.
830 mov ecx, [ebp + 28] // -> bv
831 mov edx, [ebp + 40] // -> mi
832 movdqu xmm0, [ecx] // bv[0]
833 movdqu xmm2, [edx] // mi
834 expand xmm7, xmm0, xmm1, xmm2, xmm3
835 movdqa [esp + 0], xmm0 // bv[0] expanded low
836 movdqa [esp + 16], xmm1 // bv[0] expanded high
837 movdqa [esp + 32], xmm2 // mi expanded low
838 movdqa [esp + 48], xmm3 // mi expanded high
840 // Set up the outer loop state and prepare for the first iteration.
841 mov edx, [ebp + 36] // n
842 mov eax, [ebp + 24] // -> U = av[0]
843 mov ebx, [ebp + 32] // -> X = nv[0]
844 mov edi, [ebp + 20] // -> Z = dv[0]
846 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
847 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
851 lea ecx, [esp + 0] // -> expanded V = bv[0]
852 lea esi, [esp + 32] // -> expanded M = mi
853 lea edx, [esp + 64] // -> space for Y
855 mov esi, [esp + 104] // recover av limit
859 cmp eax, esi // done already?
864 // Complete the first inner loop.
869 cmp eax, esi // done yet?
872 // Still have carries left to propagate.
874 movd [edi + 16], xmm4
877 // Embark on the next iteration. (There must be one. If n = 1, then
878 // we would have bailed above, to label 8. Similarly, the subsequent
879 // iterations can fall into the inner loop immediately.)
880 1: mov eax, [esp + 100] // -> bv[i - 1]
881 mov edi, [esp + 96] // -> Z = dv[i]
882 add eax, 16 // -> bv[i]
885 cmp eax, [esp + 108] // done yet?
887 movdqu xmm0, [eax] // bv[i]
888 mov ebx, [ebp + 32] // -> X = nv[0]
889 lea esi, [esp + 32] // -> expanded M = mi
890 mov eax, [ebp + 24] // -> U = av[0]
891 expand xmm7, xmm0, xmm1
892 movdqa [esp + 0], xmm0 // bv[i] expanded low
893 movdqa [esp + 16], xmm1 // bv[i] expanded high
895 mov esi, [esp + 104] // recover av limit
902 // Complete the next inner loop.
910 // Still have carries left to propagate, and they overlap the
911 // previous iteration's final tail, so read that in and add it.
915 movd [edi + 16], xmm4
920 // First iteration was short. Write out the carries and we're done.
921 // (This could be folded into the main loop structure, but that would
922 // penalize small numbers more.)
924 movd [edi + 16], xmm4
// NOTE(review): large portions of these two functions are elided from
// this view -- the AVX entry's body, the stack-frame setup, the stores
// initializing the `outer loop'/`limit' locals read back below, the
// inner-loop `call' sites, the carry-propagation loop bodies around
// the `mov eax, 0' lines, labels, and the epilogue.  Confirm against
// the complete source before changing.
936 FUNC(mpxmont_redc4_x86_avx)
940 // and drop through...
944 FUNC(mpxmont_redc4_x86_sse2)
945 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
946 // size_t n, const mpw *mi);
948 // Build a stack frame. Arguments will be relative to EBP, as
954 // ebp + 32 n (nonzero multiple of 4)
957 // Locals are relative to ESP, as follows.
959 // esp + 0 outer loop dv
960 // esp + 4 outer dv limit
961 // esp + 8 blocks-of-4 dv limit
962 // esp + 12 expanded M (32 bytes)
963 // esp + 44 expanded Y (32 bytes)
964 // esp + 76 (top of locals)
974 // Establish the expanded operands and the blocks-of-4 dv limit.
975 mov edi, [ebp + 20] // -> Z = dv[0]
977 mov eax, [ebp + 24] // -> dv[n] = dv limit
978 sub eax, edi // length of dv in bytes
979 mov edx, [ebp + 36] // -> mi
980 movdqu xmm0, [edx] // mi
981 and eax, ~15 // mask off the tail end
982 expand xmm7, xmm0, xmm1
983 add eax, edi // find limit
984 movdqa [esp + 12], xmm0 // mi expanded low
985 movdqa [esp + 28], xmm1 // mi expanded high
988 // Set up the outer loop state and prepare for the first iteration.
989 mov ecx, [ebp + 32] // n
990 mov ebx, [ebp + 28] // -> X = nv[0]
991 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
992 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
995 lea esi, [esp + 12] // -> expanded M = mi
996 lea edx, [esp + 44] // -> space for Y
1000 cmp ebx, ecx // done already?
1004 // Complete the first inner loop.
1008 cmp ebx, ecx // done yet?
1011 // Still have carries left to propagate.
1013 mov esi, [esp + 8] // -> dv blocks limit
1014 mov edx, [ebp + 24] // dv limit
1025 // Continue carry propagation until the end of the buffer.
1027 mov eax, 0 // preserves flags
1036 // Deal with the tail end.
1038 mov eax, 0 // preserves flags
1044 // All done for this iteration. Start the next. (This must have at
1045 // least one follow-on iteration, or we'd not have started this outer
1047 8: mov edi, [esp + 0] // -> dv[i - 1]
1048 mov ebx, [ebp + 28] // -> X = nv[0]
1049 lea edx, [esp + 44] // -> space for Y
1050 lea esi, [esp + 12] // -> expanded M = mi
1051 add edi, 16 // -> Z = dv[i]
1052 cmp edi, [esp + 4] // all done yet?
1070 ///--------------------------------------------------------------------------
1071 /// Testing and performance measurement.
1081 .macro cystore c, v, n
// NOTE(review): most of this test-harness macro is elided (presumably
// a cycle-counter read whose EDX:EAX result is stored below, plus the
// count update and `.endm') -- confirm against the complete source.
1089 mov [ebx + ecx*8], eax
1090 mov [ebx + ecx*8 + 4], edx
1093 .macro testprologue n
// NOTE(review): the frame setup of this test-harness macro is elided;
// only the store of the iteration count and the local-layout comment
// survive in this view -- confirm against the complete source.
1103 mov [esp + 104], eax
1105 // esp + 0 = v expanded
1106 // esp + 32 = y expanded
1107 // esp + 64 = ? expanded
1108 // esp + 96 = cycles
1109 // esp + 104 = count
1121 .macro testldcarry c
// Load the three expanded carry registers from the buffer given by the
// C argument (presumably moved into ECX by an elided line -- confirm).
1123 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1124 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1125 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
1128 .macro testexpand v=nil, y=nil
// NOTE(review): the loads of the packed V and Y operands and the
// `.ifnes' nil-guards around each half are elided from this view --
// confirm against the complete source.
1133 expand xmm7, xmm0, xmm1
1134 movdqa [esp + 0], xmm0 // V expanded -> locals
1135 movdqa [esp + 16], xmm1
1140 expand xmm7, xmm2, xmm3
1141 movdqa [esp + 32], xmm2 // Y expanded -> locals
1142 movdqa [esp + 48], xmm3
1146 .macro testtop u=nil, x=nil, mode=nil
// NOTE(review): the timing-loop body is elided: the `.ifeqs' branch
// contents, the loop label, and the definition of `\cyv' used below
// are not visible in this view -- confirm against the complete source.
1153 .ifeqs "\mode", "mont"
1160 .ifeqs "\mode", "mont"
1168 cystore esp + 96, \cyv, esp + 104
1172 .macro testcarryout c
// Store the three carry registers back to the buffer given by the C
// argument (presumably moved into ECX by an elided line -- confirm).
1174 movdqu [ecx + 0], xmm4
1175 movdqu [ecx + 16], xmm5
1176 movdqu [ecx + 32], xmm6
// NOTE(review): the FUNC labels for these test drivers (presumably
// test_dmul4, test_dmla4, test_mul4, test_mla4, test_mmul4,
// test_mmla4, test_mont4 or similar), the `call' into each primitive,
// the result stores, and the epilogues are elided from this view --
// only the harness-macro invocations survive.  Confirm against the
// complete source before changing.
1180 testprologue [ebp + 44]
1181 testldcarry [ebp + 24]
1182 testexpand [ebp + 36], [ebp + 40]
1184 testtop [ebp + 28], [ebp + 32]
1187 testcarryout [ebp + 24]
1192 testprologue [ebp + 44]
1193 testldcarry [ebp + 24]
1194 testexpand [ebp + 36], [ebp + 40]
1196 testtop [ebp + 28], [ebp + 32]
1199 testcarryout [ebp + 24]
1204 testprologue [ebp + 36]
1205 testldcarry [ebp + 24]
1206 testexpand nil, [ebp + 32]
1208 testtop nil, [ebp + 28]
1211 testcarryout [ebp + 24]
1216 testprologue [ebp + 36]
1217 testldcarry [ebp + 24]
1218 testexpand nil, [ebp + 32]
1220 testtop nil, [ebp + 28]
1223 testcarryout [ebp + 24]
1228 testprologue [ebp + 48]
1229 testexpand [ebp + 40], [ebp + 44]
1231 testtop [ebp + 32], [ebp + 36], mont
1235 movdqa xmm0, [esp + 64] // expanded Y, low half
1236 movdqa xmm1, [esp + 80] // ... and high half
1238 movdqu [edi + 16], xmm1
1239 testcarryout [ebp + 24]
1244 testprologue [ebp + 48]
1245 testexpand [ebp + 40], [ebp + 44]
1247 testtop [ebp + 32], [ebp + 36], mont
1251 movdqa xmm0, [esp + 64] // expanded Y, low half
1252 movdqa xmm1, [esp + 80] // ... and high half
1254 movdqu [edi + 16], xmm1
1255 testcarryout [ebp + 24]
1260 testprologue [ebp + 40]
1261 testexpand nil, [ebp + 36]
1263 testtop nil, [ebp + 32], mont
1267 movdqa xmm0, [esp + 64] // expanded Y, low half
1268 movdqa xmm1, [esp + 80] // ... and high half
1270 movdqu [edi + 16], xmm1
1271 testcarryout [ebp + 24]
1277 ///----- That's all, folks --------------------------------------------------