1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
27 /// External definitions.
30 #include "asm-common.h"
32 ///--------------------------------------------------------------------------
38 ///--------------------------------------------------------------------------
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a
46 /// multiple-precision operand by a single precision factor, and adds the
47 /// result, appropriately shifted, to the result. A `finely integrated
48 /// operand scanning' implementation of Montgomery multiplication also adds
49 /// the product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
61 /// operands, as follows.
64 /// 0 v'_0 v'_1 v''_0 v''_1
65 /// 16 v'_2 v'_3 v''_2 v''_3
67 /// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
68 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
71 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
72 /// many products together before we must deal with carrying; it also allows
73 /// for some calculations to be performed on the above expanded form.
75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
76 /// memory, typically in warm L1 cache.
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuludqd' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0, (implicitly)
91 /// zeroed, becomes c3.
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
96 .macro mulcore r, s, d0, d1, d2, d3
97 // Load a word r_i from R, multiply by the expanded operand [S], and
98 // leave the pieces of the product in registers D0, D1, D2, D3.
// NOTE(review): this copy of the macro is incomplete.  The
// conditional-assembly directives (presumably `.ifnes "\dN", "nil"'
// guards selecting which pieces to compute) and the closing `.endm'
// are missing: the two `movdqa \d2, ...' lines below overwrite each
// other, and \d2 is multiplied twice, which only makes sense as
// alternative branches of a conditional, not straight-line code.
99 movd \d0, \r // (r_i, 0, 0, 0)
101 movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
104 movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
106 pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
108 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
112 movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
114 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
118 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
121 pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1)
124 pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3)
128 pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2)
130 pmuludqd \d2, [\s + 16]
133 pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0)
136 .macro accum c0, c1, c2, c3
// NOTE(review): the body of `accum' and its `.endm' are missing from
// this copy.  From its uses below (always immediately after `mulcore'
// leaving product pieces in XMM0--XMM2 and a fourth register), it
// presumably adds XMM0..XMM3 into carry registers C0..C3, skipping
// arguments given as `nil' -- TODO confirm against the full source.
149 .macro mulacc r, s, c0, c1, c2, c3, z3p
150 // Load a word r_i from R, multiply by the expanded operand [S],
151 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
152 // then C3 notionally contains zero, but needs clearing; in practice,
153 // we store the product directly rather than attempting to add. On
154 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
// [sentence truncated -- presumably "...if Z3P is not `t'."]
// NOTE(review): the two mulcore/accum pairs below can only be the two
// branches of a conditional on Z3P (`.ifeqs "\z3p", "t"' ... `.else'
// ... `.endif'); those directives and the closing `.endm' are missing.
157 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
158 accum \c0, \c1, \c2, nil
160 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
161 accum \c0, \c1, \c2, \c3
165 .macro propout d, c, cc
166 // Calculate an output word from C, and store it in D; propagate
167 // carries out from C to CC in preparation for a rotation of the
168 // carry registers. On completion, XMM3 is clobbered. If CC is
169 // `nil', then the contribution which would have been added to it is
// [sentence truncated -- presumably "...is discarded."]
// NOTE(review): nothing below actually stores to D, and the final
// `paddq \cc, \c' cannot be unconditional if CC may be `nil'; a
// `movd \d, \c' store, a `.ifnes "\cc", "nil"' guard, and the closing
// `.endm' appear to be missing from this copy.
171 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
172 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
173 pslldq xmm3, 2 // (t b, 0)
174 paddq \c, xmm3 // (c' + t b, c'')
176 psrlq \c, 32 // floor(c/B)
178 paddq \cc, \c // propagate up
182 .macro endprop d, c, t
183 // On entry, C contains a carry register. On exit, the low 32 bits
184 // of the value represented in C are written to D, and the remaining
185 // bits are left at the bottom of T.
// NOTE(review): T is shifted below before anything initializes it
// from C, and nothing here writes to D; an initial copy (e.g.
// `movdqa \t, \c'), a `movd \d, \t' store, and the closing `.endm'
// appear to be missing from this copy.
187 psllq \t, 16 // (?, c'' b)
188 pslldq \c, 8 // (0, c')
189 paddq \t, \c // (?, c' + c'' b)
190 psrldq \t, 8 // c' + c'' b
192 psrldq \t, 4 // floor((c' + c'' b)/B)
195 .macro expand a, b, c, d, z
196 // On entry, A and C hold packed 128-bit values, and Z is zero. On
197 // exit, A:B and C:D together hold the same values in expanded
198 // form. If C is `nil', then only expand A to A:B.
// NOTE(review): the conditional-assembly guards implied by the `nil'
// case (the C/D instructions must be skipped when C is `nil' -- and
// callers below do pass `nil') and the closing `.endm' are missing
// from this copy.
199 movdqa \b, \a // (a_0, a_1, a_2, a_3)
201 movdqa \d, \c // (c_0, c_1, c_2, c_3)
203 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
204 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
206 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
207 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
209 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
210 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
212 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
213 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
217 .macro squash c0, c1, c2, c3, h, t, u
218 // On entry, C0, C1, C2, C3 are carry registers representing a value
219 // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
220 // C3, T, and U are clobbered; and the high bits of Y are stored in
221 // H, if this is not `nil'.
223 // The first step is to eliminate the `double-prime' pieces -- i.e.,
224 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
225 // them into the 32-bit-aligned pieces above and below. But before
226 // we can do that, we must gather them together.
// NOTE(review): T and U are used below before anything initializes
// them (presumably copies of C0 and C1), and the `nil'-H conditionals
// and closing `.endm' are absent; this copy of the macro has lost
// lines.
229 punpcklqdq \t, \c2 // (y'_0, y'_2)
230 punpckhqdq \c0, \c2 // (y''_0, y''_2)
231 punpcklqdq \u, \c3 // (y'_1, y'_3)
232 punpckhqdq \c1, \c3 // (y''_1, y''_3)
234 // Now split the double-prime pieces. The high (up to) 48 bits will
235 // go up; the low 16 bits go down.
// NOTE(review): C2 and C3 are shifted below as if they held copies of
// the double-prime pieces, but the copies/shifts that would have set
// them up appear to be missing here.
240 psrlq \c0, 16 // high parts of (y''_0, y''_2)
241 psrlq \c1, 16 // high parts of (y''_1, y''_3)
242 psrlq \c2, 32 // low parts of (y''_0, y''_2)
243 psrlq \c3, 32 // low parts of (y''_1, y''_3)
247 pslldq \c1, 8 // high part of (0, y''_1)
249 paddq \t, \c2 // propagate down
251 paddq \t, \c1 // and up: (y_0, y_2)
252 paddq \u, \c0 // (y_1, y_3)
254 psrldq \h, 8 // high part of (y''_3, 0)
257 // Finally extract the answer. This complicated dance is better than
258 // storing to memory and loading, because the piecemeal stores
259 // inhibit store forwarding.
260 movdqa \c3, \t // (y_0, y_1)
261 movdqa \c0, \t // (y^*_0, ?, ?, ?)
262 psrldq \t, 8 // (y_2, 0)
263 psrlq \c3, 32 // (floor(y_0/B), ?)
264 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
265 pslldq \c0, 12 // (0, 0, 0, y^*_0)
266 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
267 psrldq \u, 8 // (y_3, 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
270 pslldq \c1, 12 // (0, 0, 0, y^*_1)
271 psrldq \c0, 12 // (y^*_0, 0, 0, 0)
272 movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
275 pslldq \c2, 12 // (0, 0, 0, y^*_2)
276 psrldq \c1, 8 // (0, y^*_1, 0, 0)
277 psrldq \c2, 4 // (0, 0, y^*_2, 0)
282 pslldq \c3, 12 // (0, 0, 0, y^*_3)
283 por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
284 por \c2, \c3 // (0, 0, y^*_2, y^*_3)
285 por \c0, \c2 // y mod B^4
287 psrlq \t, 32 // very high bits of y
289 punpcklqdq \h, \u // carry up
// NOTE(review): this is the body of a carry-addition macro or routine
// whose opening line (`.macro ...' or a label) is missing from this
// copy.
294 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
295 // hold the incoming carry registers c0, c1, and c2 representing a
// [line truncated -- presumably "...representing a carry-in C."]
298 // On exit, the carry registers, including XMM7, are updated to hold
299 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
300 // registers are preserved.
301 movd xmm0, [edi + 0] // (a_0, 0)
302 movd xmm1, [edi + 4] // (a_1, 0)
303 movd xmm2, [edi + 8] // (a_2, 0)
304 movd xmm7, [edi + 12] // (a_3, 0)
305 paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
306 paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
307 paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
// NOTE(review): the comment above accounts for a_3 b, but nothing in
// view folds XMM7 (holding a_3) into the carry; those instructions
// appear to be missing.
310 ///--------------------------------------------------------------------------
311 /// Primitive multipliers and related utilities.
// NOTE(review): throughout this section, the routine entry/exit
// markers (labels or INTFUNC/ENDFUNC, prologue directives, and `ret'
// instructions) are missing from this copy; the boundaries between
// the individual routines below are inferred from their contract
// comments.  The code lines themselves are consistent with the
// mulcore/mulacc/propout/endprop macros defined above.

// Routine: propagate and flush a carry.
315 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
316 // form. Store the low 128 bits of the represented carry to [EDI] as
317 // a packed 128-bit value, and leave the remaining 16 bits in the low
318 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
319 propout [edi + 0], xmm4, xmm5
320 propout [edi + 4], xmm5, xmm6
321 propout [edi + 8], xmm6, nil
322 endprop [edi + 12], xmm6, xmm4

// Routine: double multiply-accumulate, carry-in.
327 // On entry, EDI points to the destination buffer; EAX and EBX point
328 // to the packed operands U and X; ECX and EDX point to the expanded
329 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
330 // registers c0, c1, and c2; c3 is assumed to be zero.
332 // On exit, we write the low 128 bits of the sum C + U V + X Y to
333 // [EDI], and update the carry registers with the carry out. The
334 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
335 // general-purpose registers are preserved.
336 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
337 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
338 propout [edi + 0], xmm4, xmm5
340 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
341 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
342 propout [edi + 4], xmm5, xmm6
344 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
345 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
346 propout [edi + 8], xmm6, xmm7
348 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
349 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
350 propout [edi + 12], xmm7, xmm4

// Routine: double multiply-accumulate with addend, carry-in.
356 // On entry, EDI points to the destination buffer, which also
357 // contains an addend A to accumulate; EAX and EBX point to the
358 // packed operands U and X; ECX and EDX point to the expanded
359 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
360 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
// [line truncated -- presumably "...assumed to be zero."]
// NOTE(review): the instructions that fold the addend A into the
// carry registers (cf. the carry-add fragment earlier) appear to be
// missing before the first mulacc.
363 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
364 // [EDI], and update the carry registers with the carry out. The
365 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
366 // general-purpose registers are preserved.
369 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
370 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
371 propout [edi + 0], xmm4, xmm5
373 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
374 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
375 propout [edi + 4], xmm5, xmm6
377 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
378 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
379 propout [edi + 8], xmm6, xmm7
381 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
382 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
383 propout [edi + 12], xmm7, xmm4

// Routine: single multiply, zero carry-in.
389 // On entry, EDI points to the destination buffer; EBX points to a
390 // packed operand X; and EDX points to an expanded operand Y.
392 // On exit, we write the low 128 bits of the product X Y to [EDI],
393 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
394 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
395 // general-purpose registers are preserved.
396 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
397 propout [edi + 0], xmm4, xmm5
399 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
400 propout [edi + 4], xmm5, xmm6
402 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
403 propout [edi + 8], xmm6, xmm7
405 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
406 propout [edi + 12], xmm7, xmm4

// Routine: single multiply, carry-in.
412 // On entry, EDI points to the destination buffer; EBX points to a
413 // packed operand X; EDX points to an expanded operand Y; and XMM4,
414 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
415 // representing a carry-in C; c3 is assumed to be zero.
417 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
418 // and update the carry registers with the carry out. The registers
419 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
420 // general-purpose registers are preserved.
421 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
422 propout [edi + 0], xmm4, xmm5
424 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
425 propout [edi + 4], xmm5, xmm6
427 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
428 propout [edi + 8], xmm6, xmm7
430 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
431 propout [edi + 12], xmm7, xmm4

// Routine: single multiply with addend, zero carry-in.
437 // On entry, EDI points to the destination buffer, which also
438 // contains an addend A to accumulate; EBX points to a packed operand
439 // X; and EDX points to an expanded operand Y.
441 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
442 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
443 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
444 // general-purpose registers are preserved.
// NOTE(review): only the load of a_3 into XMM7 survives here; the
// loads of a_0..a_2 into XMM4..XMM6 (the carry registers consumed
// below) appear to be missing from this copy.
448 movd xmm7, [edi + 12]
450 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
451 propout [edi + 0], xmm4, xmm5
453 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
454 propout [edi + 4], xmm5, xmm6
456 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
457 propout [edi + 8], xmm6, xmm7
459 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
460 propout [edi + 12], xmm7, xmm4

// Routine: single multiply with addend, carry-in.
466 // On entry, EDI points to the destination buffer, which also
467 // contains an addend A to accumulate; EBX points to a packed operand
468 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
469 // the incoming carry registers c0, c1, and c2, representing a
470 // carry-in C; c3 is assumed to be zero.
472 // On exit, we write the low 128 bits of the sum A + C + X Y to
473 // [EDI], and update the carry registers with the carry out. The
474 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
475 // general-purpose registers are preserved.
// NOTE(review): the step that adds the addend A into the carry
// registers (cf. the carry-add fragment earlier) appears to be
// missing before the first mulacc.
478 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
479 propout [edi + 0], xmm4, xmm5
481 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
482 propout [edi + 4], xmm5, xmm6
484 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
485 propout [edi + 8], xmm6, xmm7
487 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
488 propout [edi + 12], xmm7, xmm4
// NOTE(review): the two Montgomery multiply-step routines below have
// lost their entry/exit markers (labels, prologue, `ret'); the first
// routine's body is truncated after its initial multiply and
// presumably continued into the shared tail at local label `5:'.
494 // On entry, EDI points to the destination buffer; EAX and EBX point
495 // to the packed operands U and N; ECX and ESI point to the expanded
496 // operands V and M; and EDX points to a place to store an expanded
497 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
498 // must be 16-byte aligned. (This is not the usual convention, which
499 // requires alignment before the call.)
501 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
502 // of the sum U V + N Y to [EDI], leaving the remaining carry in
503 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
504 // XMM7 are clobbered; the general-purpose registers are preserved.
505 sub esp, 64 // space for the carries
507 // Calculate W = U V, and leave it in the destination. Stash the
508 // carry pieces for later.
509 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
510 propout [edi + 0], xmm4, xmm5
// [rest of this routine missing; it presumably joins label `5:' below]

515 // On entry, EDI points to the destination buffer, which also
516 // contains an addend A to accumulate; EAX and EBX point
517 // to the packed operands U and N; ECX and ESI point to the expanded
518 // operands V and M; and EDX points to a place to store an expanded
519 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
520 // must be 16-byte aligned. (This is not the usual convention, which
521 // requires alignment before the call.)
523 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
524 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
525 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
526 // XMM3, and XMM7 are clobbered; the general-purpose registers are
// [line truncated -- presumably "...are preserved."]
528 sub esp, 64 // space for the carries
// NOTE(review): only the load of a_3 into XMM7 survives; the loads of
// a_0..a_2 into XMM4..XMM6 appear to be missing here.
532 movd xmm7, [edi + 12]
533 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
534 propout [edi + 0], xmm4, xmm5
536 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
537 propout [edi + 4], xmm5, xmm6
539 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
540 propout [edi + 8], xmm6, xmm7
542 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
543 propout [edi + 12], xmm7, xmm4
// Stash the carry pieces of W on the stack for later.
545 movdqa [esp + 0], xmm4
546 movdqa [esp + 16], xmm5
547 movdqa [esp + 32], xmm6
549 // Calculate Y = W M.
550 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
552 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
553 accum xmm5, xmm6, xmm7, nil
555 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
556 accum xmm6, xmm7, nil, nil
558 mulcore [edi + 12], esi, xmm0, nil, nil, nil
559 accum xmm7, nil, nil, nil
561 // That's lots of pieces. Now we have to assemble the answer.
562 squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
// Expand Y back into the two-register operand form and store it.
566 expand xmm4, xmm1, nil, nil, xmm2
567 movdqa [edx + 0], xmm4
568 movdqa [edx + 16], xmm1
570 // Initialize the carry from the value for W we calculated earlier.
// NOTE(review): only the load of w_3 into XMM7 survives; the loads of
// the other W pieces into XMM4..XMM6 appear to be missing here.
574 movd xmm7, [edi + 12]
576 // Finish the calculation by adding the Montgomery product.
577 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
578 propout [edi + 0], xmm4, xmm5
580 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
581 propout [edi + 4], xmm5, xmm6
583 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
584 propout [edi + 8], xmm6, xmm7
586 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
587 propout [edi + 12], xmm7, xmm4
589 // Add on the carry we calculated earlier.
590 paddq xmm4, [esp + 0]
591 paddq xmm5, [esp + 16]
592 paddq xmm6, [esp + 32]
594 // And, with that, we're done.
// NOTE(review): another Montgomery step routine whose entry/exit
// markers (label, prologue, `ret') are missing from this copy.
600 // On entry, EDI points to the destination buffer holding a packed
601 // value A; EBX points to a packed operand N; ESI points to an
602 // expanded operand M; and EDX points to a place to store an expanded
603 // result Y (32 bytes, at a 16-byte boundary).
605 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
606 // of the sum W + N Y to [EDI], leaving the remaining carry in
607 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
608 // XMM7 are clobbered; the general-purpose registers are preserved.
610 // Calculate Y = W M.
611 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
613 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
614 accum xmm5, xmm6, xmm7, nil
616 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
617 accum xmm6, xmm7, nil, nil
619 mulcore [edi + 12], esi, xmm0, nil, nil, nil
620 accum xmm7, nil, nil, nil
622 // That's lots of pieces. Now we have to assemble the answer.
623 squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
// Expand Y back into the two-register operand form and store it.
627 expand xmm4, xmm1, nil, nil, xmm2
628 movdqa [edx + 0], xmm4
629 movdqa [edx + 16], xmm1
631 // Initialize the carry from W.
// NOTE(review): only the load of w_3 into XMM7 survives; the loads of
// the other W pieces into XMM4..XMM6 appear to be missing here.
635 movd xmm7, [edi + 12]
637 // Finish the calculation by adding the Montgomery product.
638 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
639 propout [edi + 0], xmm4, xmm5
641 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
642 propout [edi + 4], xmm5, xmm6
644 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
645 propout [edi + 8], xmm6, xmm7
647 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
648 propout [edi + 12], xmm7, xmm4
650 // And, with that, we're done.
653 ///--------------------------------------------------------------------------
654 /// Bulk multipliers.
656 FUNC(mpx_umul4_x86_sse2)
657 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
658 // const mpw *bv, const mpw *bvl);
// Plain (schoolbook) multiplication: dv[] = av[] * bv[], operands
// scanned in blocks of four 32-bit words using the SSE2 primitives
// above.
660 // Build a stack frame. Arguments will be relative to EBP, as
// [the EBP argument-offset table is missing from this copy; the code
// below reads arguments at ebp + 20 .. ebp + 36]
669 // Locals are relative to ESP, as follows.
671 // esp + 0 expanded Y (32 bytes)
672 // esp + 32 (top of locals)
// NOTE(review): this function body is incomplete here: the prologue,
// the conditional jumps that should follow each `cmp', the calls to
// the primitive multipliers, the loop-advance instructions, and the
// ENDFUNC marker are all missing.  As shown it is not assemblable
// control flow; treat it as a skeleton only.
681 // Prepare for the first iteration.
682 mov esi, [ebp + 32] // -> bv[0]
684 movdqu xmm0, [esi] // bv[0]
685 mov edi, [ebp + 20] // -> dv[0]
686 mov ecx, edi // outer loop dv cursor
687 expand xmm0, xmm1, nil, nil, xmm7
688 mov ebx, [ebp + 24] // -> av[0]
689 mov eax, [ebp + 28] // -> av[m] = av limit
690 mov edx, esp // -> expanded Y = bv[0]
691 movdqa [esp + 0], xmm0 // bv[0] expanded low
692 movdqa [esp + 16], xmm1 // bv[0] expanded high
698 cmp ebx, eax // all done?
702 // Continue with the first iteration.
706 cmp ebx, eax // all done?
709 // Write out the leftover carry. There can be no tail here.
711 cmp esi, [ebp + 36] // more passes to do?
715 // Set up for the next pass.
716 1: movdqu xmm0, [esi] // bv[i]
717 mov edi, ecx // -> dv[i]
719 expand xmm0, xmm1, nil, nil, xmm7
720 mov ebx, [ebp + 24] // -> av[0]
721 movdqa [esp + 0], xmm0 // bv[i] expanded low
722 movdqa [esp + 16], xmm1 // bv[i] expanded high
728 cmp ebx, eax // done yet?
739 // Finish off this pass. There was no tail on the previous pass, and
740 // there can be none on this pass.
755 FUNC(mpxmont_mul4_x86_sse2)
756 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
757 // const mpw *nv, size_t n, const mpw *mi);
// Montgomery multiplication: coarsely-integrated operand scanning
// over four-word blocks, using the primitives above.
// NOTE(review): this function body is incomplete in this copy: the
// prologue, the conditional jumps after each `cmp', the calls to the
// multiply/Montgomery primitives, several stores, labels `8:' (which
// the comment below references) and others, and ENDFUNC are missing.
759 // Build a stack frame. Arguments will be relative to EBP, as
// [the EBP argument-offset table is mostly missing; only the `n'
// entry survives below, and the code reads ebp + 20 .. ebp + 40]
766 // ebp + 36 n (nonzero multiple of 4)
769 // Locals are relative to ESP, which is 4 mod 16, as follows.
771 // esp + 0 outer loop dv
772 // esp + 4 outer loop bv
773 // esp + 8 av limit (mostly in ESI)
774 // esp + 12 expanded V (32 bytes)
775 // esp + 44 expanded M (32 bytes)
776 // esp + 76 expanded Y (32 bytes)
777 // esp + 108 bv limit
779 // esp + 124 (top of locals)
788 // Establish the expanded operands.
790 mov ecx, [ebp + 28] // -> bv
791 mov edx, [ebp + 40] // -> mi
792 movdqu xmm0, [ecx] // bv[0]
793 movdqu xmm2, [edx] // mi
794 expand xmm0, xmm1, xmm2, xmm3, xmm7
795 movdqa [esp + 12], xmm0 // bv[0] expanded low
796 movdqa [esp + 28], xmm1 // bv[0] expanded high
797 movdqa [esp + 44], xmm2 // mi expanded low
798 movdqa [esp + 60], xmm3 // mi expanded high
800 // Set up the outer loop state and prepare for the first iteration.
801 mov edx, [ebp + 36] // n
802 mov eax, [ebp + 24] // -> U = av[0]
803 mov ebx, [ebp + 32] // -> X = nv[0]
804 mov edi, [ebp + 20] // -> Z = dv[0]
806 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
807 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
// [stores of the loop state into the ESP locals are missing here]
811 lea ecx, [esp + 12] // -> expanded V = bv[0]
812 lea esi, [esp + 44] // -> expanded M = mi
813 lea edx, [esp + 76] // -> space for Y
// [the first multiply/Montgomery step is missing here]
815 mov esi, [esp + 8] // recover av limit
819 cmp eax, esi // done already?
824 // Complete the first inner loop.
829 cmp eax, esi // done yet?
832 // Still have carries left to propagate.
834 movd [edi + 16], xmm4
837 // Embark on the next iteration. (There must be one. If n = 1, then
838 // we would have bailed above, to label 8. Similarly, the subsequent
839 // iterations can fall into the inner loop immediately.)
840 1: mov eax, [esp + 4] // -> bv[i - 1]
841 mov edi, [esp + 0] // -> Z = dv[i]
842 add eax, 16 // -> bv[i]
844 movdqu xmm0, [eax] // bv[i]
846 cmp eax, [esp + 108] // done yet?
848 mov ebx, [ebp + 32] // -> X = nv[0]
849 lea esi, [esp + 44] // -> expanded M = mi
850 mov eax, [ebp + 24] // -> U = av[0]
851 expand xmm0, xmm1, nil, nil, xmm7
852 movdqa [esp + 12], xmm0 // bv[i] expanded low
853 movdqa [esp + 28], xmm1 // bv[i] expanded high
855 mov esi, [esp + 8] // recover av limit
862 // Complete the next inner loop.
870 // Still have carries left to propagate, and they overlap the
871 // previous iteration's final tail, so read that in and add it.
875 movd [edi + 16], xmm4
880 // First iteration was short. Write out the carries and we're done.
881 // (This could be folded into the main loop structure, but that would
882 // penalize small numbers more.)
884 movd [edi + 16], xmm4
896 FUNC(mpxmont_redc4_x86_sse2)
897 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
898 // size_t n, const mpw *mi);
// Montgomery reduction of dv[] modulo nv[], four words at a time.
// NOTE(review): this function body is incomplete in this copy: the
// prologue, the conditional jumps after each `cmp', the calls to the
// reduction primitives, the carry-propagation loop bodies that should
// follow the `mov eax, 0' lines, and ENDFUNC are missing.
900 // Build a stack frame. Arguments will be relative to EBP, as
// [the EBP argument-offset table is mostly missing; only the `n'
// entry survives below, and the code reads ebp + 20 .. ebp + 36]
906 // ebp + 32 n (nonzero multiple of 4)
909 // Locals are relative to ESP, as follows.
911 // esp + 0 outer loop dv
912 // esp + 4 outer dv limit
913 // esp + 8 blocks-of-4 dv limit
914 // esp + 12 expanded M (32 bytes)
915 // esp + 44 expanded Y (32 bytes)
916 // esp + 76 (top of locals)
925 // Establish the expanded operands and the blocks-of-4 dv limit.
926 mov edi, [ebp + 20] // -> Z = dv[0]
928 mov eax, [ebp + 24] // -> dv[n] = dv limit
929 sub eax, edi // length of dv in bytes
930 mov edx, [ebp + 36] // -> mi
931 movdqu xmm0, [edx] // mi
932 and eax, ~15 // mask off the tail end
933 expand xmm0, xmm1, nil, nil, xmm7
934 add eax, edi // find limit
935 movdqa [esp + 12], xmm0 // mi expanded low
936 movdqa [esp + 28], xmm1 // mi expanded high
939 // Set up the outer loop state and prepare for the first iteration.
940 mov ecx, [ebp + 32] // n
941 mov ebx, [ebp + 28] // -> X = nv[0]
942 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
943 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
// [stores of the loop state into the ESP locals are missing here]
946 lea esi, [esp + 12] // -> expanded M = mi
947 lea edx, [esp + 44] // -> space for Y
// [the first reduction step is missing here]
951 cmp ebx, ecx // done already?
955 // Complete the first inner loop.
959 cmp ebx, ecx // done yet?
962 // Still have carries left to propagate.
964 mov esi, [esp + 8] // -> dv blocks limit
965 mov edx, [ebp + 24] // dv limit
976 // Continue carry propagation until the end of the buffer.
978 mov eax, 0 // preserves flags
987 // Deal with the tail end.
989 mov eax, 0 // preserves flags
995 // All done for this iteration. Start the next. (This must have at
996 // least one follow-on iteration, or we'd not have started this outer
// [line truncated -- presumably "...this outer loop.)"]
998 8: mov edi, [esp + 0] // -> dv[i - 1]
999 mov ebx, [ebp + 28] // -> X = nv[0]
1000 lea edx, [esp + 44] // -> space for Y
1001 lea esi, [esp + 12] // -> expanded M = mi
1002 add edi, 16 // -> Z = dv[i]
1003 cmp edi, [esp + 4] // all done yet?
1021 ///--------------------------------------------------------------------------
1022 /// Testing and performance measurement.
// NOTE(review): the testing/benchmark harness below is heavily
// truncated in this copy: macro bodies, `.endm' closers, the
// `.ifeqs'/`.endif' pairs, and the FUNC/ENDFUNC wrappers naming the
// individual test entry points are all missing.  The surviving lines
// are kept as-is for reference against the full source.
1032 .macro cystore c, v, n
// Store a 64-bit cycle count (EDX:EAX) into the cycle vector.
1040 mov [ebx + ecx*8], eax
1041 mov [ebx + ecx*8 + 4], edx
// Fragment of a locals-layout comment for the test functions:
1054 // esp + 12 = v expanded
1055 // esp + 44 = y expanded
1056 // esp + 72 = ? expanded
1068 .macro testldcarry c
// Load the three externally-supplied carry registers from [ECX].
1070 movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
1071 movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
1072 movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
1075 .macro testexpand v, y
// Expand the packed test operands V and Y into the stack slots used
// by the primitives (cf. the locals layout fragment above).
1080 expand xmm0, xmm1, nil, nil, xmm7
1081 movdqa [esp + 12], xmm0
1082 movdqa [esp + 28], xmm1
1087 expand xmm2, xmm3, nil, nil, xmm7
1088 movdqa [esp + 44], xmm2
1089 movdqa [esp + 60], xmm3
1093 .macro testtop u, x, mode
// Set up registers for one timed invocation; `mont' mode selects the
// Montgomery variants.
1100 .ifeqs "\mode", "mont"
1107 .ifeqs "\mode", "mont"
1114 .macro testtail cyv, n
1115 cystore esp + 0, \cyv, \n
1119 .macro testcarryout c
// Write the carry registers back to [ECX] for the caller to inspect.
1121 movdqu [ecx + 0], xmm4
1122 movdqu [ecx + 16], xmm5
1123 movdqu [ecx + 32], xmm6
// Test driver bodies (their FUNC names are missing from this copy;
// each follows the same load-carries / expand / run / store pattern):
1129 testldcarry [ebp + 24]
1130 testexpand [ebp + 36], [ebp + 40]
1132 testtop [ebp + 28], [ebp + 32]
1134 testtail [ebp + 48], [ebp + 44]
1135 testcarryout [ebp + 24]
1141 testldcarry [ebp + 24]
1142 testexpand [ebp + 36], [ebp + 40]
1144 testtop [ebp + 28], [ebp + 32]
1146 testtail [ebp + 48], [ebp + 44]
1147 testcarryout [ebp + 24]
1153 testldcarry [ebp + 24]
1154 testexpand nil, [ebp + 32]
1156 testtop nil, [ebp + 28]
1158 testtail [ebp + 40], [ebp + 36]
1159 testcarryout [ebp + 24]
1165 testldcarry [ebp + 24]
1166 testexpand nil, [ebp + 32]
1168 testtop nil, [ebp + 28]
1170 testtail [ebp + 40], [ebp + 36]
1171 testcarryout [ebp + 24]
1177 testexpand [ebp + 40], [ebp + 44]
1179 testtop [ebp + 32], [ebp + 36], mont
1181 testtail [ebp + 52], [ebp + 48]
// Montgomery tests additionally return the expanded Y result.
1183 movdqa xmm0, [esp + 76]
1184 movdqa xmm1, [esp + 92]
1186 movdqu [edi + 16], xmm1
1187 testcarryout [ebp + 24]
1193 testexpand [ebp + 40], [ebp + 44]
1195 testtop [ebp + 32], [ebp + 36], mont
1197 testtail [ebp + 52], [ebp + 48]
1199 movdqa xmm0, [esp + 76]
1200 movdqa xmm1, [esp + 92]
1202 movdqu [edi + 16], xmm1
1203 testcarryout [ebp + 24]
1209 testexpand nil, [ebp + 36]
1211 testtop nil, [ebp + 32], mont
1213 testtail [ebp + 44], [ebp + 40]
1215 movdqa xmm0, [esp + 76]
1216 movdqa xmm1, [esp + 92]
1218 movdqu [edi + 16], xmm1
1219 testcarryout [ebp + 24]
1224 ///----- That's all, folks --------------------------------------------------