1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
27 /// External definitions.
30 #include "asm-common.h"
32 ///--------------------------------------------------------------------------
38 ///--------------------------------------------------------------------------
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a
46 /// multiple-precision operand by a single precision factor, and adds the
47 /// result, appropriately shifted, to the result. A `finely integrated
48 /// operand scanning' implementation of Montgomery multiplication also adds
49 /// the product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
61 /// operands, as follows.
64 /// 0 v'_0 v'_1 v''_0 v''_1
65 /// 16 v'_2 v'_3 v''_2 v''_3
67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
68 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
71 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
72 /// many products together before we must deal with carrying; it also allows
73 /// for some calculations to be performed on the above expanded form.
75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
76 /// memory, typically in warm L1 cache.
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
91 /// zeroed, becomes c3.
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
96 .macro mulcore r, s, d0, d1, d2, d3
97 // Load a word r_i from R, multiply by the expanded operand [S], and
98 // leave the pieces of the product in registers D0, D1, D2, D3.
	// NOTE(review): several of the instructions below are mutually
	// exclusive alternatives (e.g. the two writes to \d2, and the two
	// forms of the s'_2/s''_2 multiply), presumably selected by
	// conditional assembly on `nil' arguments whose guards are not
	// visible in this extract -- confirm against the full source.
99 movd \d0, \r // (r_i, 0, 0, 0)
101 movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
104 movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
106 pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
108 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
112 movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
114 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
118 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
121 pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
124 pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
128 pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
130 pmuludq \d2, [\s + 16] // (r_i s'_2, r_i s''_2)
133 pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
136 .macro accum c0, c1, c2, c3
149 .macro mulacc r, s, c0, c1, c2, c3, z3p
150 // Load a word r_i from R, multiply by the expanded operand [S],
151 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
152 // then C3 notionally contains zero, but needs clearing; in practice,
153 // we store the product directly rather than attempting to add. On
154 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
	// NOTE(review): the two mulcore/accum pairs below are the two
	// alternatives (Z3P = `t' writes straight into \c3; otherwise the
	// product lands in XMM3 and is accumulated) -- the conditional
	// guard selecting between them is not visible in this extract.
157 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
158 accum \c0, \c1, \c2, nil
160 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
161 accum \c0, \c1, \c2, \c3
165 .macro propout d, c, cc
166 // Calculate an output word from C, and store it in D; propagate
167 // carries out from C to CC in preparation for a rotation of the
168 // carry registers. On completion, XMM3 is clobbered. If CC is
169 // `nil', then the contribution which would have been added to it is
	// simply discarded.
171 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
172 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
173 pslldq xmm3, 2 // (t b, 0)
174 paddq \c, xmm3 // (c' + t b, c'')
	// NOTE(review): the store of the output word into \d (a movd)
	// is not visible in this extract -- confirm in the full source.
176 psrlq \c, 32 // floor(c/B)
	// NOTE(review): the final paddq is presumably guarded by a check
	// for \cc being `nil'; the guard is not visible here.
178 paddq \cc, \c // propagate up
182 .macro endprop d, c, t
183 // On entry, C contains a carry register. On exit, the low 32 bits
184 // of the value represented in C are written to D, and the remaining
185 // bits are left at the bottom of T.
	// NOTE(review): the initial copy of \c into \t, and the movd that
	// stores the low word to \d, are not visible in this extract.
187 psllq \t, 16 // (?, c'' b)
188 pslldq \c, 8 // (0, c')
189 paddq \t, \c // (?, c' + c'' b)
190 psrldq \t, 8 // c' + c'' b
192 psrldq \t, 4 // floor((c' + c'' b)/B)
195 .macro expand a, b, c, d, z
196 // On entry, A and C hold packed 128-bit values, and Z is zero. On
197 // exit, A:B and C:D together hold the same values in expanded
198 // form. If C is `nil', then only expand A to A:B.
	// NOTE(review): the instructions touching \c and \d below are
	// presumably guarded by a conditional on \c being `nil'; the
	// guards are not visible in this extract.
199 movdqa \b, \a // (a_0, a_1, a_2, a_3)
201 movdqa \d, \c // (c_0, c_1, c_2, c_3)
203 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
204 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
206 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
207 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
209 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
210 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
212 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
213 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
217 .macro squash lo, hi, c0, c1, c2, c3, t, u
218 // On entry, C0, C1, C2, C3 are carry registers representing a value
219 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
220 // C3, T, and U are clobbered; and the high bits of Y are stored in
221 // HI, if this is not `nil'.
223 // The first step is to eliminate the `double-prime' pieces -- i.e.,
224 // the ones offset by 16 bits from a 32-bit boundary -- by carrying
225 // them into the 32-bit-aligned pieces above and below. But before
226 // we can do that, we must gather them together.
	// NOTE(review): the initial copies of \c0/\c1 into \t and \u are
	// not visible in this extract -- confirm in the full source.
229 punpcklqdq \t, \c2 // (y'_0, y'_2)
230 punpckhqdq \c0, \c2 // (y''_0, y''_2)
231 punpcklqdq \u, \c3 // (y'_1, y'_3)
232 punpckhqdq \c1, \c3 // (y''_1, y''_3)
234 // Now split the double-prime pieces. The high (up to) 48 bits will
235 // go up; the low 16 bits go down.
	// NOTE(review): the copies feeding \c2/\c3, and the shifts that
	// reposition the carried pieces, are partly elided here.
240 psrlq \c0, 16 // high parts of (y''_0, y''_2)
241 psrlq \c1, 16 // high parts of (y''_1, y''_3)
242 psrlq \c2, 32 // low parts of (y''_0, y''_2)
243 psrlq \c3, 32 // low parts of (y''_1, y''_3)
247 pslldq \c1, 8 // high part of (0, y''_1)
249 paddq \t, \c2 // propagate down
251 paddq \t, \c1 // and up: (y_0, y_2)
252 paddq \u, \c0 // (y_1, y_3)
254 psrldq \hi, 8 // high part of (y''_3, 0)
257 // Finally extract the answer. This complicated dance is better than
258 // storing to memory and loading, because the piecemeal stores
259 // inhibit store forwarding.
260 movdqa \c3, \t // (y_0, y_2)
261 movdqa \lo, \t // (y^*_0, ?, ?, ?)
262 psrldq \t, 8 // (y_2, 0)
263 psrlq \c3, 32 // (floor(y_0/B), ?)
264 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
265 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
266 psrldq \u, 8 // (y_3, 0)
267 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
268 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
269 punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
270 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
271 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
276 punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
278 psrlq \t, 32 // very high bits of y
280 punpcklqdq \hi, \u // carry up
282 punpckldq \lo, \c1 // y mod B^4
	// Carry-add helper body (the macro header itself is not visible in
	// this extract; presumably this is the `carryadd' helper -- confirm
	// in the full source).
286 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
287 // hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
290 // On exit, the carry registers, including XMM7, are updated to hold
291 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
292 // registers are preserved.
293 movd xmm0, [edi + 0] // (a_0, 0)
294 movd xmm1, [edi + 4] // (a_1, 0)
295 movd xmm2, [edi + 8] // (a_2, 0)
296 movd xmm7, [edi + 12] // (a_3, 0)
298 paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
299 paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
	// NOTE(review): the comment below mentions an a_3 b contribution,
	// but the instructions folding a_3 into the carry are not visible
	// in this extract -- confirm in the full source.
300 paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
303 ///--------------------------------------------------------------------------
304 /// Primitive multipliers and related utilities.
	// Carry-propagation helper body (the surrounding function header
	// and return are not visible in this extract).
307 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
308 // form. Store the low 128 bits of the represented carry to [EDI] as
309 // a packed 128-bit value, and leave the remaining 16 bits in the low
310 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
313 propout [edi + 0], xmm4, xmm5
314 propout [edi + 4], xmm5, xmm6
315 propout [edi + 8], xmm6, nil
316 endprop [edi + 12], xmm6, xmm4
322 // On entry, EDI points to the destination buffer; EAX and EBX point
323 // to the packed operands U and X; ECX and EDX point to the expanded
324 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
325 // registers c0, c1, and c2; c3 is assumed to be zero.
327 // On exit, we write the low 128 bits of the sum C + U V + X Y to
328 // [EDI], and update the carry registers with the carry out. The
329 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
330 // general-purpose registers are preserved.
	// Each column: the first mulacc passes `t' because the incoming c3
	// register is notionally zero and must be overwritten, not added to;
	// the second accumulates on top, and propout emits the output word.
333 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
334 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
335 propout [edi + 0], xmm4, xmm5
337 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
338 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
339 propout [edi + 4], xmm5, xmm6
341 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
342 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
343 propout [edi + 8], xmm6, xmm7
345 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
346 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
347 propout [edi + 12], xmm7, xmm4
354 // On entry, EDI points to the destination buffer, which also
355 // contains an addend A to accumulate; EAX and EBX point to the
356 // packed operands U and X; ECX and EDX point to the expanded
357 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
358 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
361 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
362 // [EDI], and update the carry registers with the carry out. The
363 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
364 // general-purpose registers are preserved.
	// NOTE(review): the first column uses `nil' rather than `t', so
	// XMM7 (c3) must already hold live data here -- presumably the
	// addend A was folded into the carries by code elided from this
	// extract; confirm in the full source.
369 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
370 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
371 propout [edi + 0], xmm4, xmm5
373 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
374 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
375 propout [edi + 4], xmm5, xmm6
377 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
378 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
379 propout [edi + 8], xmm6, xmm7
381 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
382 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
383 propout [edi + 12], xmm7, xmm4
390 // On entry, EDI points to the destination buffer; EBX points to a
391 // packed operand X; and EDX points to an expanded operand Y.
393 // On exit, we write the low 128 bits of the product X Y to [EDI],
394 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
395 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
396 // general-purpose registers are preserved.
	// First column starts from nothing (mulcore writes the carries
	// outright); later columns pass `t' to clear the rotated-in c3.
399 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
400 propout [edi + 0], xmm4, xmm5
402 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
403 propout [edi + 4], xmm5, xmm6
405 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
406 propout [edi + 8], xmm6, xmm7
408 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
409 propout [edi + 12], xmm7, xmm4
416 // On entry, EDI points to the destination buffer; EBX points to a
417 // packed operand X; EDX points to an expanded operand Y; and XMM4,
418 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
419 // representing a carry-in C; c3 is assumed to be zero.
421 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
422 // and update the carry registers with the carry out. The registers
423 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
424 // general-purpose registers are preserved.
	// Every column passes `t': the rotated-in c3 register (XMM7 first,
	// then XMM4, XMM5, XMM6) is notionally zero at each step.
427 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
428 propout [edi + 0], xmm4, xmm5
430 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
431 propout [edi + 4], xmm5, xmm6
433 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
434 propout [edi + 8], xmm6, xmm7
436 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
437 propout [edi + 12], xmm7, xmm4
444 // On entry, EDI points to the destination buffer, which also
445 // contains an addend A to accumulate; EBX points to a packed operand
446 // X; and EDX points to an expanded operand Y.
448 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
449 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
450 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
451 // general-purpose registers are preserved.
	// NOTE(review): only the load of a_3 into XMM7 is visible here;
	// the loads of a_0..a_2 into XMM4..XMM6 appear to be elided from
	// this extract -- confirm in the full source.
457 movd xmm7, [edi + 12]
459 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
460 propout [edi + 0], xmm4, xmm5
462 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
463 propout [edi + 4], xmm5, xmm6
465 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
466 propout [edi + 8], xmm6, xmm7
468 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
469 propout [edi + 12], xmm7, xmm4
476 // On entry, EDI points to the destination buffer, which also
477 // contains an addend A to accumulate; EBX points to a packed operand
478 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
479 // the incoming carry registers c0, c1, and c2, representing a
480 // carry-in C; c3 is assumed to be zero.
482 // On exit, we write the low 128 bits of the sum A + C + X Y to
483 // [EDI], and update the carry registers with the carry out. The
484 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
485 // general-purpose registers are preserved.
	// NOTE(review): the first mulacc uses `nil', so XMM7 must already
	// hold live data -- presumably the addend A was folded into the
	// carries by code elided from this extract; confirm in the full
	// source.
490 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
491 propout [edi + 0], xmm4, xmm5
493 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
494 propout [edi + 4], xmm5, xmm6
496 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
497 propout [edi + 8], xmm6, xmm7
499 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
500 propout [edi + 12], xmm7, xmm4
507 // On entry, EDI points to the destination buffer; EAX and EBX point
508 // to the packed operands U and N; ECX and ESI point to the expanded
509 // operands V and M; and EDX points to a place to store an expanded
510 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
511 // must be 16-byte aligned. (This is not the usual convention, which
512 // requires alignment before the call.)
514 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
515 // of the sum U V + N Y to [EDI], leaving the remaining carry in
516 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
517 // XMM7 are clobbered; the general-purpose registers are preserved.
518 stalloc 48 // space for the carries
521 // Calculate W = U V, and leave it in the destination. Stash the
522 // carry pieces for later.
523 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
524 propout [edi + 0], xmm4, xmm5
	// NOTE(review): the remainder of this function (the rest of the
	// U V multiply and the transfer into the shared Montgomery path)
	// is not visible in this extract.
530 // On entry, EDI points to the destination buffer, which also
531 // contains an addend A to accumulate; EAX and EBX point
532 // to the packed operands U and N; ECX and ESI point to the expanded
533 // operands V and M; and EDX points to a place to store an expanded
534 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
535 // must be 16-byte aligned. (This is not the usual convention, which
536 // requires alignment before the call.)
538 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
539 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
540 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
541 // XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
543 stalloc 48 // space for the carries
	// NOTE(review): only the load of a_3 into XMM7 is visible below;
	// the loads of a_0..a_2 into the carry registers appear to be
	// elided from this extract -- confirm in the full source.
549 movd xmm7, [edi + 12]
550 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
551 propout [edi + 0], xmm4, xmm5
553 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
554 propout [edi + 4], xmm5, xmm6
556 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
557 propout [edi + 8], xmm6, xmm7
559 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
560 propout [edi + 12], xmm7, xmm4
	// Stash the carries from the W = A + U V pass for after the
	// Montgomery pass.
562 movdqa [esp + 0], xmm4
563 movdqa [esp + 16], xmm5
564 movdqa [esp + 32], xmm6
566 // Calculate Y = W M.
567 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
569 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
570 accum xmm5, xmm6, xmm7, nil
572 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
573 accum xmm6, xmm7, nil, nil
575 mulcore [edi + 12], esi, xmm0, nil, nil, nil
576 accum xmm7, nil, nil, nil
578 // That's lots of pieces. Now we have to assemble the answer.
579 squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
	// Expand Y ready for use as a multiplication operand.
583 expand xmm4, xmm1, nil, nil, xmm2
584 movdqa [edx + 0], xmm4
585 movdqa [edx + 16], xmm1
587 // Initialize the carry from the value for W we calculated earlier.
591 movd xmm7, [edi + 12]
593 // Finish the calculation by adding the Montgomery product.
594 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
595 propout [edi + 0], xmm4, xmm5
597 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
598 propout [edi + 4], xmm5, xmm6
600 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
601 propout [edi + 8], xmm6, xmm7
603 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
604 propout [edi + 12], xmm7, xmm4
606 // Add on the carry we calculated earlier.
607 paddq xmm4, [esp + 0]
608 paddq xmm5, [esp + 16]
609 paddq xmm6, [esp + 32]
611 // And, with that, we're done.
618 // On entry, EDI points to the destination buffer holding a packed
619 // value W; EBX points to a packed operand N; ESI points to an
620 // expanded operand M; and EDX points to a place to store an expanded
621 // result Y (32 bytes, at a 16-byte boundary).
623 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
624 // of the sum W + N Y to [EDI], leaving the remaining carry in
625 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
626 // XMM7 are clobbered; the general-purpose registers are preserved.
629 // Calculate Y = W M.
630 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
632 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
633 accum xmm5, xmm6, xmm7, nil
635 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
636 accum xmm6, xmm7, nil, nil
638 mulcore [edi + 12], esi, xmm0, nil, nil, nil
639 accum xmm7, nil, nil, nil
641 // That's lots of pieces. Now we have to assemble the answer.
642 squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
	// Expand Y ready for use as a multiplication operand.
646 expand xmm4, xmm1, nil, nil, xmm2
647 movdqa [edx + 0], xmm4
648 movdqa [edx + 16], xmm1
650 // Initialize the carry from W.
654 movd xmm7, [edi + 12]
656 // Finish the calculation by adding the Montgomery product.
657 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
658 propout [edi + 0], xmm4, xmm5
660 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
661 propout [edi + 4], xmm5, xmm6
663 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
664 propout [edi + 8], xmm6, xmm7
666 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
667 propout [edi + 12], xmm7, xmm4
669 // And, with that, we're done.
674 ///--------------------------------------------------------------------------
675 /// Bulk multipliers.
677 FUNC(mpx_umul4_x86_sse2)
678 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
679 // const mpw *bv, const mpw *bvl);
681 // Build a stack frame. Arguments will be relative to EBP, as
	// follows.
	// NOTE(review): the argument-offset table and the frame-building
	// instructions themselves are not visible in this extract; the
	// [ebp + N] loads below imply dv at +20, av at +24, avl at +28,
	// bv at +32, bvl at +36 -- confirm in the full source.
690 // Locals are relative to ESP, as follows.
692 // esp + 0 expanded Y (32 bytes)
693 // esp + 32 (top of locals)
703 // Prepare for the first iteration.
704 mov esi, [ebp + 32] // -> bv[0]
706 movdqu xmm0, [esi] // bv[0]
707 mov edi, [ebp + 20] // -> dv[0]
708 mov ecx, edi // outer loop dv cursor
709 expand xmm0, xmm1, nil, nil, xmm7
710 mov ebx, [ebp + 24] // -> av[0]
711 mov eax, [ebp + 28] // -> av[m] = av limit
712 mov edx, esp // -> expanded Y = bv[0]
713 movdqa [esp + 0], xmm0 // bv[0] expanded low
714 movdqa [esp + 16], xmm1 // bv[0] expanded high
720 cmp ebx, eax // all done?
724 // Continue with the first iteration.
728 cmp ebx, eax // all done?
731 // Write out the leftover carry. There can be no tail here.
733 cmp esi, [ebp + 36] // more passes to do?
737 // Set up for the next pass.
738 1: movdqu xmm0, [esi] // bv[i]
739 mov edi, ecx // -> dv[i]
741 expand xmm0, xmm1, nil, nil, xmm7
742 mov ebx, [ebp + 24] // -> av[0]
743 movdqa [esp + 0], xmm0 // bv[i] expanded low
744 movdqa [esp + 16], xmm1 // bv[i] expanded high
750 cmp ebx, eax // done yet?
761 // Finish off this pass. There was no tail on the previous pass, and
762 // there can be none on this pass.
777 FUNC(mpxmont_mul4_x86_sse2)
778 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
779 // const mpw *nv, size_t n, const mpw *mi);
781 // Build a stack frame. Arguments will be relative to EBP, as
	// follows.
	// NOTE(review): the full argument-offset table and the frame setup
	// are not visible in this extract; the [ebp + N] loads below imply
	// dv at +20, av at +24, bv at +28, nv at +32, n at +36, mi at +40.
788 // ebp + 36 n (nonzero multiple of 4)
791 // Locals are relative to ESP, which is 4 mod 16, as follows.
793 // esp + 0 outer loop dv
794 // esp + 4 outer loop bv
795 // esp + 8 av limit (mostly in ESI)
796 // esp + 12 expanded V (32 bytes)
797 // esp + 44 expanded M (32 bytes)
798 // esp + 76 expanded Y (32 bytes)
799 // esp + 108 bv limit
801 // esp + 124 (top of locals)
811 // Establish the expanded operands.
813 mov ecx, [ebp + 28] // -> bv
814 mov edx, [ebp + 40] // -> mi
815 movdqu xmm0, [ecx] // bv[0]
816 movdqu xmm2, [edx] // mi
817 expand xmm0, xmm1, xmm2, xmm3, xmm7
818 movdqa [esp + 12], xmm0 // bv[0] expanded low
819 movdqa [esp + 28], xmm1 // bv[0] expanded high
820 movdqa [esp + 44], xmm2 // mi expanded low
821 movdqa [esp + 60], xmm3 // mi expanded high
823 // Set up the outer loop state and prepare for the first iteration.
824 mov edx, [ebp + 36] // n
825 mov eax, [ebp + 24] // -> U = av[0]
826 mov ebx, [ebp + 32] // -> X = nv[0]
827 mov edi, [ebp + 20] // -> Z = dv[0]
829 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
830 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
	// NOTE(review): the stores of these limits into the locals are
	// not visible in this extract.
834 lea ecx, [esp + 12] // -> expanded V = bv[0]
835 lea esi, [esp + 44] // -> expanded M = mi
836 lea edx, [esp + 76] // -> space for Y
838 mov esi, [esp + 8] // recover av limit
842 cmp eax, esi // done already?
847 // Complete the first inner loop.
852 cmp eax, esi // done yet?
855 // Still have carries left to propagate.
857 movd [edi + 16], xmm4
860 // Embark on the next iteration. (There must be one. If n = 1, then
861 // we would have bailed above, to label 8. Similarly, the subsequent
862 // iterations can fall into the inner loop immediately.)
863 1: mov eax, [esp + 4] // -> bv[i - 1]
864 mov edi, [esp + 0] // -> Z = dv[i]
865 add eax, 16 // -> bv[i]
867 movdqu xmm0, [eax] // bv[i]
869 cmp eax, [esp + 108] // done yet?
871 mov ebx, [ebp + 32] // -> X = nv[0]
872 lea esi, [esp + 44] // -> expanded M = mi
873 mov eax, [ebp + 24] // -> U = av[0]
874 expand xmm0, xmm1, nil, nil, xmm7
875 movdqa [esp + 12], xmm0 // bv[i] expanded low
876 movdqa [esp + 28], xmm1 // bv[i] expanded high
878 mov esi, [esp + 8] // recover av limit
885 // Complete the next inner loop.
893 // Still have carries left to propagate, and they overlap the
894 // previous iteration's final tail, so read that in and add it.
898 movd [edi + 16], xmm4
903 // First iteration was short. Write out the carries and we're done.
904 // (This could be folded into the main loop structure, but that would
905 // penalize small numbers more.)
907 movd [edi + 16], xmm4
919 FUNC(mpxmont_redc4_x86_sse2)
920 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
921 // size_t n, const mpw *mi);
923 // Build a stack frame. Arguments will be relative to EBP, as
	// follows.
	// NOTE(review): the full argument-offset table and the frame setup
	// are not visible in this extract; the [ebp + N] loads below imply
	// dv at +20, dvl at +24, nv at +28, n at +32, mi at +36.
929 // ebp + 32 n (nonzero multiple of 4)
932 // Locals are relative to ESP, as follows.
934 // esp + 0 outer loop dv
935 // esp + 4 outer dv limit
936 // esp + 8 blocks-of-4 dv limit
937 // esp + 12 expanded M (32 bytes)
938 // esp + 44 expanded Y (32 bytes)
939 // esp + 76 (top of locals)
949 // Establish the expanded operands and the blocks-of-4 dv limit.
950 mov edi, [ebp + 20] // -> Z = dv[0]
952 mov eax, [ebp + 24] // -> dv[n] = dv limit
953 sub eax, edi // length of dv in bytes
954 mov edx, [ebp + 36] // -> mi
955 movdqu xmm0, [edx] // mi
956 and eax, ~15 // mask off the tail end
957 expand xmm0, xmm1, nil, nil, xmm7
958 add eax, edi // find limit
959 movdqa [esp + 12], xmm0 // mi expanded low
960 movdqa [esp + 28], xmm1 // mi expanded high
963 // Set up the outer loop state and prepare for the first iteration.
964 mov ecx, [ebp + 32] // n
965 mov ebx, [ebp + 28] // -> X = nv[0]
966 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
967 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
970 lea esi, [esp + 12] // -> expanded M = mi
971 lea edx, [esp + 44] // -> space for Y
975 cmp ebx, ecx // done already?
979 // Complete the first inner loop.
983 cmp ebx, ecx // done yet?
986 // Still have carries left to propagate.
988 mov esi, [esp + 8] // -> dv blocks limit
989 mov edx, [ebp + 24] // dv limit
1000 // Continue carry propagation until the end of the buffer.
	// MOV (unlike XOR) leaves the flags alone, so the carry chain in
	// the surrounding (elided) code survives the zeroing.
1002 mov eax, 0 // preserves flags
1011 // Deal with the tail end.
1013 mov eax, 0 // preserves flags
1019 // All done for this iteration. Start the next. (This must have at
1020 // least one follow-on iteration, or we'd not have started this outer
	// loop.)
1022 8: mov edi, [esp + 0] // -> dv[i - 1]
1023 mov ebx, [ebp + 28] // -> X = nv[0]
1024 lea edx, [esp + 44] // -> space for Y
1025 lea esi, [esp + 12] // -> expanded M = mi
1026 add edi, 16 // -> Z = dv[i]
1027 cmp edi, [esp + 4] // all done yet?
1045 ///--------------------------------------------------------------------------
1046 /// Testing and performance measurement.
1056 .macro cystore c, v, n
1064 mov [ebx + ecx*8], eax
1065 mov [ebx + ecx*8 + 4], edx
1079 // esp + 12 = v expanded
1080 // esp + 44 = y expanded
1081 // esp + 72 = ? expanded
1093 .macro testldcarry c
1095 movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
1096 movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
1097 movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
1100 .macro testexpand v, y
1105 expand xmm0, xmm1, nil, nil, xmm7
1106 movdqa [esp + 12], xmm0
1107 movdqa [esp + 28], xmm1
1112 expand xmm2, xmm3, nil, nil, xmm7
1113 movdqa [esp + 44], xmm2
1114 movdqa [esp + 60], xmm3
1118 .macro testtop u, x, mode
1125 .ifeqs "\mode", "mont"
1132 .ifeqs "\mode", "mont"
1139 .macro testtail cyv, n
1140 cystore esp + 0, \cyv, \n
1144 .macro testcarryout c
1146 movdqu [ecx + 0], xmm4
1147 movdqu [ecx + 16], xmm5
1148 movdqu [ecx + 32], xmm6
1153 testldcarry [ebp + 24]
1154 testexpand [ebp + 36], [ebp + 40]
1156 testtop [ebp + 28], [ebp + 32]
1158 testtail [ebp + 48], [ebp + 44]
1159 testcarryout [ebp + 24]
1165 testldcarry [ebp + 24]
1166 testexpand [ebp + 36], [ebp + 40]
1168 testtop [ebp + 28], [ebp + 32]
1170 testtail [ebp + 48], [ebp + 44]
1171 testcarryout [ebp + 24]
1177 testldcarry [ebp + 24]
1178 testexpand nil, [ebp + 32]
1180 testtop nil, [ebp + 28]
1182 testtail [ebp + 40], [ebp + 36]
1183 testcarryout [ebp + 24]
1189 testldcarry [ebp + 24]
1190 testexpand nil, [ebp + 32]
1192 testtop nil, [ebp + 28]
1194 testtail [ebp + 40], [ebp + 36]
1195 testcarryout [ebp + 24]
1201 testexpand [ebp + 40], [ebp + 44]
1203 testtop [ebp + 32], [ebp + 36], mont
1205 testtail [ebp + 52], [ebp + 48]
1207 movdqa xmm0, [esp + 76]
1208 movdqa xmm1, [esp + 92]
1210 movdqu [edi + 16], xmm1
1211 testcarryout [ebp + 24]
1217 testexpand [ebp + 40], [ebp + 44]
1219 testtop [ebp + 32], [ebp + 36], mont
1221 testtail [ebp + 52], [ebp + 48]
1223 movdqa xmm0, [esp + 76]
1224 movdqa xmm1, [esp + 92]
1226 movdqu [edi + 16], xmm1
1227 testcarryout [ebp + 24]
1233 testexpand nil, [ebp + 36]
1235 testtop nil, [ebp + 32], mont
1237 testtail [ebp + 44], [ebp + 40]
1239 movdqa xmm0, [esp + 76]
1240 movdqa xmm1, [esp + 92]
1242 movdqu [edi + 16], xmm1
1243 testcarryout [ebp + 24]
1249 ///----- That's all, folks --------------------------------------------------