1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
27 /// External definitions.
30 #include "asm-common.h"
32 ///--------------------------------------------------------------------------
38 ///--------------------------------------------------------------------------
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a
46 /// multiple-precision operand by a single precision factor, and adds the
47 /// result, appropriately shifted, to the result. A `finely integrated
48 /// operand scanning' implementation of Montgomery multiplication also adds
49 /// the product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
61 /// operands, as follows.
64 /// 0 v'_0 v'_1 v''_0 v''_1
65 /// 16 v'_2 v'_3 v''_2 v''_3
67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
68 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
71 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
72 /// many products together before we must deal with carrying; it also allows
73 /// for some calculations to be performed on the above expanded form.
75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
76 /// memory, typically in warm L1 cache.
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuluqd' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
91 /// zeroed becomes c3.
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
96 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
97 // Load a word r_i from R, multiply by the expanded operand [S], and
98 // leave the pieces of the product in registers D0, D1, D2, D3.
99 movd \d0, \r // (r_i, 0, 0, 0)
101 movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
104 movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
106 pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
108 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
112 movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
114 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
118 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
121 pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
124 pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
128 pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
130 pmuludq \d2, [\s + 16]
133 pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
136 .macro accum c0, c1=nil, c2=nil, c3=nil
137 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
138 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
139 // updating that register.
152 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
153 // Load a word r_i from R, multiply by the expanded operand [S],
154 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
155 // then C3 notionally contains zero, but needs clearing; in practice,
156 // we store the product directly rather than attempting to add. On
157 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
160 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
163 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
164 accum \c0, \c1, \c2, \c3
168 .macro propout d, c, cc=nil
169 // Calculate an output word from C, and store it in D; propagate
170 // carries out from C to CC in preparation for a rotation of the
171 // carry registers. On completion, XMM3 is clobbered. If CC is
172 // `nil', then the contribution which would have been added to it is
174 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
175 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
176 pslldq xmm3, 2 // (t b, 0)
177 paddq \c, xmm3 // (c' + t b, c'')
179 psrlq \c, 32 // floor(c/B)
181 paddq \cc, \c // propagate up
185 .macro endprop d, c, t
186 // On entry, C contains a carry register. On exit, the low 32 bits
187 // of the value represented in C are written to D, and the remaining
188 // bits are left at the bottom of T.
190 psllq \t, 16 // (?, c'' b)
191 pslldq \c, 8 // (0, c')
192 paddq \t, \c // (?, c' + c'' b)
193 psrldq \t, 8 // c' + c'' b
195 psrldq \t, 4 // floor((c' + c'' b)/B)
198 .macro expand z, a, b, c=nil, d=nil
199 // On entry, A and C hold packed 128-bit values, and Z is zero. On
200 // exit, A:B and C:D together hold the same values in expanded
201 // form. If C is `nil', then only expand A to A:B.
202 movdqa \b, \a // (a_0, a_1, a_2, a_3)
204 movdqa \d, \c // (c_0, c_1, c_2, c_3)
206 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
207 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
209 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
210 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
212 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
213 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
215 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
216 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
220 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
221 // On entry, C0, C1, C2, C3 are carry registers representing a value
222 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
223 // C3, T, and U are clobbered; and the high bits of Y are stored in
224 // HI, if this is not `nil'.
226 // The first step is to eliminate the `double-prime' pieces -- i.e.,
227 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
228 // them into the 32-bit-aligned pieces above and below. But before
229 // we can do that, we must gather them together.
232 punpcklqdq \t, \c2 // (y'_0, y'_2)
233 punpckhqdq \c0, \c2 // (y''_0, y''_2)
234 punpcklqdq \u, \c3 // (y'_1, y'_3)
235 punpckhqdq \c1, \c3 // (y''_1, y''_3)
237 // Now split the double-prime pieces. The high (up to) 48 bits will
238 // go up; the low 16 bits go down.
243 psrlq \c0, 16 // high parts of (y''_0, y''_2)
244 psrlq \c1, 16 // high parts of (y''_1, y''_3)
245 psrlq \c2, 32 // low parts of (y''_0, y''_2)
246 psrlq \c3, 32 // low parts of (y''_1, y''_3)
250 pslldq \c1, 8 // high part of (0, y''_1)
252 paddq \t, \c2 // propagate down
254 paddq \t, \c1 // and up: (y_0, y_2)
255 paddq \u, \c0 // (y_1, y_3)
257 psrldq \hi, 8 // high part of (y''_3, 0)
260 // Finally extract the answer. This complicated dance is better than
261 // storing to memory and loading, because the piecemeal stores
262 // inhibit store forwarding.
263 movdqa \c3, \t // (y_0, y_1)
264 movdqa \lo, \t // (y^*_0, ?, ?, ?)
265 psrldq \t, 8 // (y_2, 0)
266 psrlq \c3, 32 // (floor(y_0/B), ?)
267 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
268 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
269 psrldq \u, 8 // (y_3, 0)
270 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
271 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
272 punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
279 punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
281 psrlq \t, 32 // very high bits of y
283 punpcklqdq \hi, \u // carry up
285 punpckldq \lo, \c1 // y mod B^4
289 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
290 // hold the incoming carry registers c0, c1, and c2 representing a
293 // On exit, the carry registers, including XMM7, are updated to hold
294 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
295 // registers are preserved.
296 movd xmm0, [edi + 0] // (a_0, 0)
297 movd xmm1, [edi + 4] // (a_1, 0)
298 movd xmm2, [edi + 8] // (a_2, 0)
299 movd xmm7, [edi + 12] // (a_3, 0)
301 paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
302 paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
303 paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
306 ///--------------------------------------------------------------------------
307 /// Primitive multipliers and related utilities.
310 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
311 // form. Store the low 128 bits of the represented carry to [EDI] as
312 // a packed 128-bit value, and leave the remaining 16 bits in the low
313 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
316 propout [edi + 0], xmm4, xmm5
317 propout [edi + 4], xmm5, xmm6
318 propout [edi + 8], xmm6, nil
319 endprop [edi + 12], xmm6, xmm4
325 // On entry, EDI points to the destination buffer; EAX and EBX point
326 // to the packed operands U and X; ECX and EDX point to the expanded
327 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
328 // registers c0, c1, and c2; c3 is assumed to be zero.
330 // On exit, we write the low 128 bits of the sum C + U V + X Y to
331 // [EDI], and update the carry registers with the carry out. The
332 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
333 // general-purpose registers are preserved.
336 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
337 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
338 propout [edi + 0], xmm4, xmm5
340 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
341 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
342 propout [edi + 4], xmm5, xmm6
344 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
345 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
346 propout [edi + 8], xmm6, xmm7
348 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
349 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
350 propout [edi + 12], xmm7, xmm4
357 // On entry, EDI points to the destination buffer, which also
358 // contains an addend A to accumulate; EAX and EBX point to the
359 // packed operands U and X; ECX and EDX point to the expanded
360 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
361 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
364 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
365 // [EDI], and update the carry registers with the carry out. The
366 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
367 // general-purpose registers are preserved.
372 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
373 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
374 propout [edi + 0], xmm4, xmm5
376 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
377 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
378 propout [edi + 4], xmm5, xmm6
380 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
381 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
382 propout [edi + 8], xmm6, xmm7
384 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
385 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
386 propout [edi + 12], xmm7, xmm4
393 // On entry, EDI points to the destination buffer; EBX points to a
394 // packed operand X; and EDX points to an expanded operand Y.
396 // On exit, we write the low 128 bits of the product X Y to [EDI],
397 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
398 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
399 // general-purpose registers are preserved.
402 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
403 propout [edi + 0], xmm4, xmm5
405 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
406 propout [edi + 4], xmm5, xmm6
408 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
409 propout [edi + 8], xmm6, xmm7
411 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
412 propout [edi + 12], xmm7, xmm4
419 // On entry, EDI points to the destination buffer; EBX points to a
420 // packed operand X; EDX points to an expanded operand Y; and XMM4,
421 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
422 // representing a carry-in C; c3 is assumed to be zero.
424 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
425 // and update the carry registers with the carry out. The registers
426 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
427 // general-purpose registers are preserved.
430 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
431 propout [edi + 0], xmm4, xmm5
433 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
434 propout [edi + 4], xmm5, xmm6
436 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
437 propout [edi + 8], xmm6, xmm7
439 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
440 propout [edi + 12], xmm7, xmm4
447 // On entry, EDI points to the destination buffer, which also
448 // contains an addend A to accumulate; EBX points to a packed operand
449 // X; and EDX points to an expanded operand Y.
451 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
452 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
453 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
454 // general-purpose registers are preserved.
460 movd xmm7, [edi + 12]
462 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
463 propout [edi + 0], xmm4, xmm5
465 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
466 propout [edi + 4], xmm5, xmm6
468 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
469 propout [edi + 8], xmm6, xmm7
471 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
472 propout [edi + 12], xmm7, xmm4
479 // On entry, EDI points to the destination buffer, which also
480 // contains an addend A to accumulate; EBX points to a packed operand
481 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
482 // the incoming carry registers c0, c1, and c2, representing a
483 // carry-in C; c3 is assumed to be zero.
485 // On exit, we write the low 128 bits of the sum A + C + X Y to
486 // [EDI], and update the carry registers with the carry out. The
487 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
488 // general-purpose registers are preserved.
493 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
494 propout [edi + 0], xmm4, xmm5
496 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
497 propout [edi + 4], xmm5, xmm6
499 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
500 propout [edi + 8], xmm6, xmm7
502 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
503 propout [edi + 12], xmm7, xmm4
510 // On entry, EDI points to the destination buffer; EAX and EBX point
511 // to the packed operands U and N; ECX and ESI point to the expanded
512 // operands V and M; and EDX points to a place to store an expanded
513 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
514 // must be 12 modulo 16, as is usual for modern x86 ABIs.
516 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
517 // of the sum U V + N Y to [EDI], leaving the remaining carry in
518 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
519 // XMM7 are clobbered; the general-purpose registers are preserved.
520 stalloc 48 + 12 // space for the carries
523 // Calculate W = U V, and leave it in the destination. Stash the
524 // carry pieces for later.
525 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
526 propout [edi + 0], xmm4, xmm5
532 // On entry, EDI points to the destination buffer, which also
533 // contains an addend A to accumulate; EAX and EBX point to the
534 // packed operands U and N; ECX and ESI point to the expanded
535 // operands V and M; and EDX points to a place to store an expanded
536 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
537 // must be 12 modulo 16, as is usual for modern x86 ABIs.
539 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
540 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
541 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
542 // XMM3, and XMM7 are clobbered; the general-purpose registers are
544 stalloc 48 + 12 // space for the carries
550 movd xmm7, [edi + 12]
552 // Calculate W = U V, and leave it in the destination. Stash the
553 // carry pieces for later.
554 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
555 propout [edi + 0], xmm4, xmm5
557 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
558 propout [edi + 4], xmm5, xmm6
560 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
561 propout [edi + 8], xmm6, xmm7
563 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
564 propout [edi + 12], xmm7, xmm4
566 movdqa [esp + 0], xmm4
567 movdqa [esp + 16], xmm5
568 movdqa [esp + 32], xmm6
570 // Calculate Y = W M.
571 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
573 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
574 accum xmm5, xmm6, xmm7
576 mulcore [edi + 8], esi, xmm0, xmm1
579 mulcore [edi + 12], esi, xmm0
582 // That's lots of pieces. Now we have to assemble the answer.
583 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
587 expand xmm2, xmm4, xmm1
588 movdqa [edx + 0], xmm4
589 movdqa [edx + 16], xmm1
591 // Initialize the carry from the value for W we calculated earlier.
595 movd xmm7, [edi + 12]
597 // Finish the calculation by adding the Montgomery product.
598 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
599 propout [edi + 0], xmm4, xmm5
601 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
602 propout [edi + 4], xmm5, xmm6
604 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
605 propout [edi + 8], xmm6, xmm7
607 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
608 propout [edi + 12], xmm7, xmm4
610 // Add add on the carry we calculated earlier.
611 paddq xmm4, [esp + 0]
612 paddq xmm5, [esp + 16]
613 paddq xmm6, [esp + 32]
615 // And, with that, we're done.
622 // On entry, EDI points to the destination buffer holding a packed
623 // value W; EBX points to a packed operand N; ESI points to an
624 // expanded operand M; and EDX points to a place to store an expanded
625 // result Y (32 bytes, at a 16-byte boundary).
627 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
628 // of the sum W + N Y to [EDI], leaving the remaining carry in
629 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
630 // XMM7 are clobbered; the general-purpose registers are preserved.
633 // Calculate Y = W M.
634 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
636 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
637 accum xmm5, xmm6, xmm7
639 mulcore [edi + 8], esi, xmm0, xmm1
642 mulcore [edi + 12], esi, xmm0
645 // That's lots of pieces. Now we have to assemble the answer.
646 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
650 expand xmm2, xmm4, xmm1
651 movdqa [edx + 0], xmm4
652 movdqa [edx + 16], xmm1
654 // Initialize the carry from W.
658 movd xmm7, [edi + 12]
660 // Finish the calculation by adding the Montgomery product.
661 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
662 propout [edi + 0], xmm4, xmm5
664 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
665 propout [edi + 4], xmm5, xmm6
667 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
668 propout [edi + 8], xmm6, xmm7
670 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
671 propout [edi + 12], xmm7, xmm4
673 // And, with that, we're done.
678 ///--------------------------------------------------------------------------
679 /// Bulk multipliers.
681 FUNC(mpx_umul4_x86_sse2)
682 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
683 // const mpw *bv, const mpw *bvl);
685 // Build a stack frame. Arguments will be relative to EBP, as
694 // Locals are relative to ESP, as follows.
696 // esp + 0 expanded Y (32 bytes)
697 // esp + 32 (top of locals)
707 // Prepare for the first iteration.
708 mov esi, [ebp + 32] // -> bv[0]
710 movdqu xmm0, [esi] // bv[0]
711 mov edi, [ebp + 20] // -> dv[0]
712 mov ecx, edi // outer loop dv cursor
713 expand xmm7, xmm0, xmm1
714 mov ebx, [ebp + 24] // -> av[0]
715 mov eax, [ebp + 28] // -> av[m] = av limit
716 mov edx, esp // -> expanded Y = bv[0]
717 movdqa [esp + 0], xmm0 // bv[0] expanded low
718 movdqa [esp + 16], xmm1 // bv[0] expanded high
724 cmp ebx, eax // all done?
728 // Continue with the first iteration.
732 cmp ebx, eax // all done?
735 // Write out the leftover carry. There can be no tail here.
737 cmp esi, [ebp + 36] // more passes to do?
741 // Set up for the next pass.
742 1: movdqu xmm0, [esi] // bv[i]
743 mov edi, ecx // -> dv[i]
745 expand xmm7, xmm0, xmm1
746 mov ebx, [ebp + 24] // -> av[0]
747 movdqa [esp + 0], xmm0 // bv[i] expanded low
748 movdqa [esp + 16], xmm1 // bv[i] expanded high
754 cmp ebx, eax // done yet?
765 // Finish off this pass. There was no tail on the previous pass, and
766 // there can be none on this pass.
781 FUNC(mpxmont_mul4_x86_sse2)
782 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
783 // const mpw *nv, size_t n, const mpw *mi);
785 // Build a stack frame. Arguments will be relative to EBP, as
792 // ebp + 36 n (nonzero multiple of 4)
795 // Locals are relative to ESP, which 16-byte aligned, as follows.
797 // esp + 0 expanded V (32 bytes)
798 // esp + 32 expanded M (32 bytes)
799 // esp + 64 expanded Y (32 bytes)
800 // esp + 96 outer loop dv
801 // esp + 100 outer loop bv
802 // esp + 104 av limit (mostly in ESI)
803 // esp + 108 bv limit
804 // esp + 112 (top of locals)
814 // Establish the expanded operands.
816 mov ecx, [ebp + 28] // -> bv
817 mov edx, [ebp + 40] // -> mi
818 movdqu xmm0, [ecx] // bv[0]
819 movdqu xmm2, [edx] // mi
820 expand xmm7, xmm0, xmm1, xmm2, xmm3
821 movdqa [esp + 0], xmm0 // bv[0] expanded low
822 movdqa [esp + 16], xmm1 // bv[0] expanded high
823 movdqa [esp + 32], xmm2 // mi expanded low
824 movdqa [esp + 48], xmm3 // mi expanded high
826 // Set up the outer loop state and prepare for the first iteration.
827 mov edx, [ebp + 36] // n
828 mov eax, [ebp + 24] // -> U = av[0]
829 mov ebx, [ebp + 32] // -> X = nv[0]
830 mov edi, [ebp + 20] // -> Z = dv[0]
832 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
833 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
837 lea ecx, [esp + 0] // -> expanded V = bv[0]
838 lea esi, [esp + 32] // -> expanded M = mi
839 lea edx, [esp + 64] // -> space for Y
841 mov esi, [esp + 104] // recover av limit
845 cmp eax, esi // done already?
850 // Complete the first inner loop.
855 cmp eax, esi // done yet?
858 // Still have carries left to propagate.
860 movd [edi + 16], xmm4
863 // Embark on the next iteration. (There must be one. If n = 1, then
864 // we would have bailed above, to label 8. Similarly, the subsequent
865 // iterations can fall into the inner loop immediately.)
866 1: mov eax, [esp + 100] // -> bv[i - 1]
867 mov edi, [esp + 96] // -> Z = dv[i]
868 add eax, 16 // -> bv[i]
871 cmp eax, [esp + 108] // done yet?
873 movdqu xmm0, [eax] // bv[i]
874 mov ebx, [ebp + 32] // -> X = nv[0]
875 lea esi, [esp + 32] // -> expanded M = mi
876 mov eax, [ebp + 24] // -> U = av[0]
877 expand xmm7, xmm0, xmm1
878 movdqa [esp + 0], xmm0 // bv[i] expanded low
879 movdqa [esp + 16], xmm1 // bv[i] expanded high
881 mov esi, [esp + 104] // recover av limit
888 // Complete the next inner loop.
896 // Still have carries left to propagate, and they overlap the
897 // previous iteration's final tail, so read that in and add it.
901 movd [edi + 16], xmm4
906 // First iteration was short. Write out the carries and we're done.
907 // (This could be folded into the main loop structure, but that would
908 // penalize small numbers more.)
910 movd [edi + 16], xmm4
922 FUNC(mpxmont_redc4_x86_sse2)
923 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
924 // size_t n, const mpw *mi);
926 // Build a stack frame. Arguments will be relative to EBP, as
932 // ebp + 32 n (nonzero multiple of 4)
935 // Locals are relative to ESP, as follows.
937 // esp + 0 outer loop dv
938 // esp + 4 outer dv limit
939 // esp + 8 blocks-of-4 dv limit
940 // esp + 12 expanded M (32 bytes)
941 // esp + 44 expanded Y (32 bytes)
942 // esp + 76 (top of locals)
952 // Establish the expanded operands and the blocks-of-4 dv limit.
953 mov edi, [ebp + 20] // -> Z = dv[0]
955 mov eax, [ebp + 24] // -> dv[n] = dv limit
956 sub eax, edi // length of dv in bytes
957 mov edx, [ebp + 36] // -> mi
958 movdqu xmm0, [edx] // mi
959 and eax, ~15 // mask off the tail end
960 expand xmm7, xmm0, xmm1
961 add eax, edi // find limit
962 movdqa [esp + 12], xmm0 // mi expanded low
963 movdqa [esp + 28], xmm1 // mi expanded high
966 // Set up the outer loop state and prepare for the first iteration.
967 mov ecx, [ebp + 32] // n
968 mov ebx, [ebp + 28] // -> X = nv[0]
969 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
970 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
973 lea esi, [esp + 12] // -> expanded M = mi
974 lea edx, [esp + 44] // -> space for Y
978 cmp ebx, ecx // done already?
982 // Complete the first inner loop.
986 cmp ebx, ecx // done yet?
989 // Still have carries left to propagate.
991 mov esi, [esp + 8] // -> dv blocks limit
992 mov edx, [ebp + 24] // dv limit
1003 // Continue carry propagation until the end of the buffer.
1005 mov eax, 0 // preserves flags
1014 // Deal with the tail end.
1016 mov eax, 0 // preserves flags
1022 // All done for this iteration. Start the next. (This must have at
1023 // least one follow-on iteration, or we'd not have started this outer
1025 8: mov edi, [esp + 0] // -> dv[i - 1]
1026 mov ebx, [ebp + 28] // -> X = nv[0]
1027 lea edx, [esp + 44] // -> space for Y
1028 lea esi, [esp + 12] // -> expanded M = mi
1029 add edi, 16 // -> Z = dv[i]
1030 cmp edi, [esp + 4] // all done yet?
1048 ///--------------------------------------------------------------------------
1049 /// Testing and performance measurement.
1059 .macro cystore c, v, n
1067 mov [ebx + ecx*8], eax
1068 mov [ebx + ecx*8 + 4], edx
1071 .macro testprologue n
1081 mov [esp + 104], eax
1083 // esp + 0 = v expanded
1084 // esp + 32 = y expanded
1085 // esp + 64 = ? expanded
1086 // esp + 96 = cycles
1087 // esp + 104 = count
1099 .macro testldcarry c
1101 movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
1102 movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
1103 movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
1106 .macro testexpand v=nil, y=nil
1111 expand xmm7, xmm0, xmm1
1112 movdqa [esp + 0], xmm0
1113 movdqa [esp + 16], xmm1
1118 expand xmm7, xmm2, xmm3
1119 movdqa [esp + 32], xmm2
1120 movdqa [esp + 48], xmm3
1124 .macro testtop u=nil, x=nil, mode=nil
1131 .ifeqs "\mode", "mont"
1138 .ifeqs "\mode", "mont"
1146 cystore esp + 96, \cyv, esp + 104
1150 .macro testcarryout c
1152 movdqu [ecx + 0], xmm4
1153 movdqu [ecx + 16], xmm5
1154 movdqu [ecx + 32], xmm6
1158 testprologue [ebp + 44]
1159 testldcarry [ebp + 24]
1160 testexpand [ebp + 36], [ebp + 40]
1162 testtop [ebp + 28], [ebp + 32]
1165 testcarryout [ebp + 24]
1170 testprologue [ebp + 44]
1171 testldcarry [ebp + 24]
1172 testexpand [ebp + 36], [ebp + 40]
1174 testtop [ebp + 28], [ebp + 32]
1177 testcarryout [ebp + 24]
1182 testprologue [ebp + 36]
1183 testldcarry [ebp + 24]
1184 testexpand nil, [ebp + 32]
1186 testtop nil, [ebp + 28]
1189 testcarryout [ebp + 24]
1194 testprologue [ebp + 36]
1195 testldcarry [ebp + 24]
1196 testexpand nil, [ebp + 32]
1198 testtop nil, [ebp + 28]
1201 testcarryout [ebp + 24]
1206 testprologue [ebp + 48]
1207 testexpand [ebp + 40], [ebp + 44]
1209 testtop [ebp + 32], [ebp + 36], mont
1213 movdqa xmm0, [esp + 64]
1214 movdqa xmm1, [esp + 80]
1216 movdqu [edi + 16], xmm1
1217 testcarryout [ebp + 24]
1222 testprologue [ebp + 48]
1223 testexpand [ebp + 40], [ebp + 44]
1225 testtop [ebp + 32], [ebp + 36], mont
1229 movdqa xmm0, [esp + 64]
1230 movdqa xmm1, [esp + 80]
1232 movdqu [edi + 16], xmm1
1233 testcarryout [ebp + 24]
1238 testprologue [ebp + 40]
1239 testexpand nil, [ebp + 36]
1241 testtop nil, [ebp + 32], mont
1245 movdqa xmm0, [esp + 64]
1246 movdqa xmm1, [esp + 80]
1248 movdqu [edi + 16], xmm1
1249 testcarryout [ebp + 24]
1255 ///----- That's all, folks --------------------------------------------------