1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
27 /// External definitions.
30 #include "asm-common.h"
32 ///--------------------------------------------------------------------------
38 ///--------------------------------------------------------------------------
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a
46 /// multiple-precision operand by a single precision factor, and adds the
47 /// result, appropriately shifted, to the result. A `finely integrated
48 /// operand scanning' implementation of Montgomery multiplication also adds
49 /// the product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
61 /// operands, as follows.
64 /// 0 v'_0 v'_1 v''_0 v''_1
65 /// 16 v'_2 v'_3 v''_2 v''_3
67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
68 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
71 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
72 /// many products together before we must deal with carrying; it also allows
73 /// for some calculations to be performed on the above expanded form.
75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
76 /// memory, typically in warm L1 cache.
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
91 /// zeroed, becomes c3.
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
96 .macro mulcore r, s, d0, d1, d2, d3
97 // Load a word r_i from R, multiply by the expanded operand [S], and
98 // leave the pieces of the product in registers D0, D1, D2, D3.
// Each Di ends up holding a pair of 48-bit products (r_i s'_j, r_i s''_j)
// in 64-bit lanes, ready to be added into the carry registers.
// NOTE(review): the alternative `movdqa \d2, ...' lines and the memory-operand
// multiplies below look like branches of conditional assembly (`nil' argument
// handling) whose directives are not visible in this view -- confirm upstream.
99 movd \d0, \r // (r_i, 0, 0, 0)
101 movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
104 movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
106 pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
108 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
112 movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
114 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
118 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
// FIX(review): the SSE2 even-lane 32x32->64 multiply is `pmuludq'; the
// original `pmuludqd' is not a recognized mnemonic and will not assemble.
121 pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
124 pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
128 pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
130 pmuludq \d2, [\s + 16]
133 pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
// Add the product pieces in XMM0--XMM3 into the carry registers C0--C3;
// call sites below pass `nil' for positions to be skipped.
// NOTE(review): the macro body and its `.endm' are not visible in this view.
136 .macro accum c0, c1, c2, c3
149 .macro mulacc r, s, c0, c1, c2, c3, z3p
150 // Load a word r_i from R, multiply by the expanded operand [S],
151 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
152 // then C3 notionally contains zero, but needs clearing; in practice,
153 // we store the product directly rather than attempting to add. On
154 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
// NOTE(review): the two sequences below are the Z3P = `t' and Z3P /= `t'
// alternatives; the conditional directives selecting between them are not
// visible in this view -- confirm against the upstream source.
157 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
158 accum \c0, \c1, \c2, nil
160 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
161 accum \c0, \c1, \c2, \c3
165 .macro propout d, c, cc
166 // Calculate an output word from C, and store it in D; propagate
167 // carries out from C to CC in preparation for a rotation of the
168 // carry registers. On completion, XMM3 is clobbered. If CC is
169 // `nil', then the contribution which would have been added to it is
// (sentence truncated above -- presumably `...is discarded'; confirm.)
171 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
172 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
173 pslldq xmm3, 2 // (t b, 0)
174 paddq \c, xmm3 // (c' + t b, c'')
// NOTE(review): the store of the output word z = d mod B to D is not
// visible in this view.
176 psrlq \c, 32 // floor(c/B)
178 paddq \cc, \c // propagate up
182 .macro endprop d, c, t
183 // On entry, C contains a carry register. On exit, the low 32 bits
184 // of the value represented in C are written to D, and the remaining
185 // bits are left at the bottom of T.
187 psllq \t, 16 // (?, c'' b)
188 pslldq \c, 8 // (0, c')
189 paddq \t, \c // (?, c' + c'' b)
190 psrldq \t, 8 // c' + c'' b
// NOTE(review): the store of the low word to D is not visible in this view.
192 psrldq \t, 4 // floor((c' + c'' b)/B)
195 .macro expand a, b, c, d, z
196 // On entry, A and C hold packed 128-bit values, and Z is zero. On
197 // exit, A:B and C:D together hold the same values in expanded
198 // form. If C is `nil', then only expand A to A:B.
// Unpack the 16-bit halves against zero, then shuffle so that each
// register holds (x'_i, x'_j, x''_i, x''_j) as described in the header.
199 movdqa \b, \a // (a_0, a_1, a_2, a_3)
201 movdqa \d, \c // (c_0, c_1, c_2, c_3)
203 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
204 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
206 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
207 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
209 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
210 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
212 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
213 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
217 .macro squash c0, c1, c2, c3, h, t, u
218 // On entry, C0, C1, C2, C3 are carry registers representing a value
219 // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
220 // C3, T, and U are clobbered; and the high bits of Y are stored in
221 // H, if this is not `nil'.
223 // The first step is to eliminate the `double-prime' pieces -- i.e.,
224 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
225 // them into the 32-bit-aligned pieces above and below. But before
226 // we can do that, we must gather them together.
// NOTE(review): the copies initializing T and U before these unpacks are
// not visible in this view.
229 punpcklqdq \t, \c2 // (y'_0, y'_2)
230 punpckhqdq \c0, \c2 // (y''_0, y''_2)
231 punpcklqdq \u, \c3 // (y'_1, y'_3)
232 punpckhqdq \c1, \c3 // (y''_1, y''_3)
234 // Now split the double-prime pieces. The high (up to) 48 bits will
235 // go up; the low 16 bits go down.
240 psrlq \c0, 16 // high parts of (y''_0, y''_2)
241 psrlq \c1, 16 // high parts of (y''_1, y''_3)
242 psrlq \c2, 32 // low parts of (y''_0, y''_2)
243 psrlq \c3, 32 // low parts of (y''_1, y''_3)
247 pslldq \c1, 8 // high part of (0, y''_1)
249 paddq \t, \c2 // propagate down
251 paddq \t, \c1 // and up: (y_0, y_2)
252 paddq \u, \c0 // (y_1, y_3)
254 psrldq \h, 8 // high part of (y''_3, 0)
257 // Finally extract the answer. This complicated dance is better than
258 // storing to memory and loading, because the piecemeal stores
259 // inhibit store forwarding.
260 movdqa \c3, \t // (y_0, y_1)
261 movdqa \c0, \t // (y^*_0, ?, ?, ?)
262 psrldq \t, 8 // (y_2, 0)
263 psrlq \c3, 32 // (floor(y_0/B), ?)
264 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
265 pslldq \c0, 12 // (0, 0, 0, y^*_0)
266 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
267 psrldq \u, 8 // (y_3, 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
270 pslldq \c1, 12 // (0, 0, 0, y^*_1)
271 psrldq \c0, 12 // (y^*_0, 0, 0, 0)
272 movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
275 pslldq \c2, 12 // (0, 0, 0, y^*_2)
276 psrldq \c1, 8 // (0, y^*_1, 0, 0)
277 psrldq \c2, 4 // (0, 0, y^*_2, 0)
282 pslldq \c3, 12 // (0, 0, 0, y^*_3)
283 por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
284 por \c2, \c3 // (0, 0, y^*_2, y^*_3)
285 por \c0, \c2 // y mod B^4
287 psrlq \t, 32 // very high bits of y
289 punpcklqdq \h, \u // carry up
// Add a packed 128-bit addend at [EDI] into the expanded carry registers.
// NOTE(review): the `.macro' header line (upstream calls this `carryadd')
// is not visible in this view, nor is the shift/add that folds a_3 into
// the high half of XMM6.
294 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
295 // hold the incoming carry registers c0, c1, and c2 representing a
298 // On exit, the carry registers, including XMM7, are updated to hold
299 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
300 // registers are preserved.
301 movd xmm0, [edi + 0] // (a_0, 0)
302 movd xmm1, [edi + 4] // (a_1, 0)
303 movd xmm2, [edi + 8] // (a_2, 0)
304 movd xmm7, [edi + 12] // (a_3, 0)
306 paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
307 paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
308 paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
311 ///--------------------------------------------------------------------------
312 /// Primitive multipliers and related utilities.
// Flush a 144-bit carry to memory (upstream `carryprop'; the INTFUNC
// label is not visible in this view).
315 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
316 // form. Store the low 128 bits of the represented carry to [EDI] as
317 // a packed 128-bit value, and leave the remaining 16 bits in the low
318 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
321 propout [edi + 0], xmm4, xmm5
322 propout [edi + 4], xmm5, xmm6
323 propout [edi + 8], xmm6, nil
324 endprop [edi + 12], xmm6, xmm4
// Dual 4-word multiply step: C + U V + X Y (upstream `dmul4'; the
// INTFUNC label is not visible in this view).  Note the carry registers
// rotate one position per output word: xmm4 -> xmm5 -> xmm6 -> xmm7.
330 // On entry, EDI points to the destination buffer; EAX and EBX point
331 // to the packed operands U and X; ECX and EDX point to the expanded
332 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
333 // registers c0, c1, and c2; c3 is assumed to be zero.
335 // On exit, we write the low 128 bits of the sum C + U V + X Y to
336 // [EDI], and update the carry registers with the carry out. The
337 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
338 // general-purpose registers are preserved.
341 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
342 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
343 propout [edi + 0], xmm4, xmm5
345 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
346 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
347 propout [edi + 4], xmm5, xmm6
349 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
350 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
351 propout [edi + 8], xmm6, xmm7
353 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
354 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
355 propout [edi + 12], xmm7, xmm4
// Dual 4-word multiply-accumulate step: A + C + U V + X Y (upstream
// `dmla4'; INTFUNC label not visible here).  The first mulacc passes
// `nil' for Z3P because XMM7 already holds live data -- presumably the
// addend word loaded by a `carryadd' invocation not visible in this view.
362 // On entry, EDI points to the destination buffer, which also
363 // contains an addend A to accumulate; EAX and EBX point to the
364 // packed operands U and X; ECX and EDX point to the expanded
365 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
366 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
369 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
370 // [EDI], and update the carry registers with the carry out. The
371 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
372 // general-purpose registers are preserved.
377 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
378 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
379 propout [edi + 0], xmm4, xmm5
381 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
382 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
383 propout [edi + 4], xmm5, xmm6
385 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
386 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
387 propout [edi + 8], xmm6, xmm7
389 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
390 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
391 propout [edi + 12], xmm7, xmm4
// 4-word multiply with zero carry-in: X Y (upstream `mul4zc'; INTFUNC
// label not visible here).
398 // On entry, EDI points to the destination buffer; EBX points to a
399 // packed operand X; and EDX points to an expanded operand Y.
401 // On exit, we write the low 128 bits of the product X Y to [EDI],
402 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
403 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
404 // general-purpose registers are preserved.
407 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
408 propout [edi + 0], xmm4, xmm5
410 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
411 propout [edi + 4], xmm5, xmm6
413 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
414 propout [edi + 8], xmm6, xmm7
416 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
417 propout [edi + 12], xmm7, xmm4
// 4-word multiply with carry-in: C + X Y (upstream `mul4'; INTFUNC
// label not visible here).
424 // On entry, EDI points to the destination buffer; EBX points to a
425 // packed operand X; EDX points to an expanded operand Y; and XMM4,
426 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
427 // representing a carry-in C; c3 is assumed to be zero.
429 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
430 // and update the carry registers with the carry out. The registers
431 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
432 // general-purpose registers are preserved.
435 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
436 propout [edi + 0], xmm4, xmm5
438 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
439 propout [edi + 4], xmm5, xmm6
441 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
442 propout [edi + 8], xmm6, xmm7
444 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
445 propout [edi + 12], xmm7, xmm4
// 4-word multiply-accumulate, zero carry-in: A + X Y (upstream `mla4zc';
// INTFUNC label not visible here).  The movd at line 465 preloads the
// addend word a_3 into XMM7, which is why the first mulacc passes `nil'
// for Z3P; the movd loads of a_0..a_2 into XMM4--XMM6 are not visible in
// this view.
465 movd xmm7, [edi + 12]
456 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
457 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
458 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
459 // general-purpose registers are preserved.
467 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
468 propout [edi + 0], xmm4, xmm5
470 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
471 propout [edi + 4], xmm5, xmm6
473 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
474 propout [edi + 8], xmm6, xmm7
476 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
477 propout [edi + 12], xmm7, xmm4
// 4-word multiply-accumulate with carry-in: A + C + X Y (upstream `mla4';
// INTFUNC label not visible here).  The addend fold-in (presumably a
// `carryadd' invocation) is not visible in this view.
484 // On entry, EDI points to the destination buffer, which also
485 // contains an addend A to accumulate; EBX points to a packed operand
486 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
487 // the incoming carry registers c0, c1, and c2, representing a
488 // carry-in C; c3 is assumed to be zero.
490 // On exit, we write the low 128 bits of the sum A + C + X Y to
491 // [EDI], and update the carry registers with the carry out. The
492 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
493 // general-purpose registers are preserved.
498 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
499 propout [edi + 0], xmm4, xmm5
501 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
502 propout [edi + 4], xmm5, xmm6
504 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
505 propout [edi + 8], xmm6, xmm7
507 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
508 propout [edi + 12], xmm7, xmm4
// Montgomery multiplication step (upstream `mmul4'; INTFUNC label not
// visible here).  Only the opening of the routine is visible in this
// view; the remainder presumably continues like the accumulating variant
// below -- confirm against upstream.
515 // On entry, EDI points to the destination buffer; EAX and EBX point
516 // to the packed operands U and N; ECX and ESI point to the expanded
517 // operands V and M; and EDX points to a place to store an expanded
518 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
519 // must be 16-byte aligned. (This is not the usual convention, which
520 // requires alignment before the call.)
522 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
523 // of the sum U V + N Y to [EDI], leaving the remaining carry in
524 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
525 // XMM7 are clobbered; the general-purpose registers are preserved.
526 stalloc 48 // space for the carries
529 // Calculate W = U V, and leave it in the destination. Stash the
530 // carry pieces for later.
531 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
532 propout [edi + 0], xmm4, xmm5
// Montgomery multiply-accumulate step (upstream `mmla4'; INTFUNC label
// not visible here): compute W = A + U V, derive Y = W M mod B, then add
// N Y so the low 128 bits of the result are a multiple of B^4.
538 // On entry, EDI points to the destination buffer, which also
539 // contains an addend A to accumulate; EAX and EBX point
540 // to the packed operands U and N; ECX and ESI point to the expanded
541 // operands V and M; and EDX points to a place to store an expanded
542 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
543 // must be 16-byte aligned. (This is not the usual convention, which
544 // requires alignment before the call.)
546 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
547 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
548 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
549 // XMM3, and XMM7 are clobbered; the general-purpose registers are
551 stalloc 48 // space for the carries
// Preload addend word a_3 (the loads of a_0..a_2 are not visible here).
557 movd xmm7, [edi + 12]
558 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
559 propout [edi + 0], xmm4, xmm5
561 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
562 propout [edi + 4], xmm5, xmm6
564 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
565 propout [edi + 8], xmm6, xmm7
567 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
568 propout [edi + 12], xmm7, xmm4
// Stash the W-product carries on the stack for the final addition.
570 movdqa [esp + 0], xmm4
571 movdqa [esp + 16], xmm5
572 movdqa [esp + 32], xmm6
574 // Calculate Y = W M.
575 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
577 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
578 accum xmm5, xmm6, xmm7, nil
580 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
581 accum xmm6, xmm7, nil, nil
583 mulcore [edi + 12], esi, xmm0, nil, nil, nil
584 accum xmm7, nil, nil, nil
586 // That's lots of pieces. Now we have to assemble the answer.
587 squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
// Expand Y and write it out for the caller (and for the N Y pass below).
591 expand xmm4, xmm1, nil, nil, xmm2
592 movdqa [edx + 0], xmm4
593 movdqa [edx + 16], xmm1
595 // Initialize the carry from the value for W we calculated earlier.
599 movd xmm7, [edi + 12]
601 // Finish the calculation by adding the Montgomery product.
602 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
603 propout [edi + 0], xmm4, xmm5
605 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
606 propout [edi + 4], xmm5, xmm6
608 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
609 propout [edi + 8], xmm6, xmm7
611 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
612 propout [edi + 12], xmm7, xmm4
614 // Add on the carry we calculated earlier.
615 paddq xmm4, [esp + 0]
616 paddq xmm5, [esp + 16]
617 paddq xmm6, [esp + 32]
619 // And, with that, we're done.
// Montgomery reduction step (upstream `mont4'; INTFUNC label not visible
// here): derive Y = W M mod B from the value W already in the
// destination, then add N Y so the low 128 bits become a multiple of B^4.
626 // On entry, EDI points to the destination buffer holding a packed
627 // value W; EBX points to a packed operand N; ESI points to an
628 // expanded operand M; and EDX points to a place to store an expanded
629 // result Y (32 bytes, at a 16-byte boundary).
631 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
632 // of the sum W + N Y to [EDI], leaving the remaining carry in
633 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
634 // XMM7 are clobbered; the general-purpose registers are preserved.
637 // Calculate Y = W M.
638 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
640 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
641 accum xmm5, xmm6, xmm7, nil
643 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
644 accum xmm6, xmm7, nil, nil
646 mulcore [edi + 12], esi, xmm0, nil, nil, nil
647 accum xmm7, nil, nil, nil
649 // That's lots of pieces. Now we have to assemble the answer.
650 squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
// Expand Y and write it out for the caller (and for the N Y pass below).
654 expand xmm4, xmm1, nil, nil, xmm2
655 movdqa [edx + 0], xmm4
656 movdqa [edx + 16], xmm1
658 // Initialize the carry from W.
662 movd xmm7, [edi + 12]
664 // Finish the calculation by adding the Montgomery product.
665 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
666 propout [edi + 0], xmm4, xmm5
668 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
669 propout [edi + 4], xmm5, xmm6
671 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
672 propout [edi + 8], xmm6, xmm7
674 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
675 propout [edi + 12], xmm7, xmm4
677 // And, with that, we're done.
682 ///--------------------------------------------------------------------------
683 /// Bulk multipliers.
685 FUNC(mpx_umul4_x86_sse2)
686 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
687 // const mpw *bv, const mpw *bvl);
// Operand-scanning multiply: for each 4-word chunk of bv, expand it on
// the stack and run one pass over av into dv.  Much of the loop
// structure (labels, jumps, prologue/epilogue) is not visible in this
// view; only the straight-line setup and cursor management appear.
689 // Build a stack frame. Arguments will be relative to EBP, as
698 // Locals are relative to ESP, as follows.
700 // esp + 0 expanded Y (32 bytes)
701 // esp + 32 (top of locals)
711 // Prepare for the first iteration.
712 mov esi, [ebp + 32] // -> bv[0]
714 movdqu xmm0, [esi] // bv[0]
715 mov edi, [ebp + 20] // -> dv[0]
716 mov ecx, edi // outer loop dv cursor
717 expand xmm0, xmm1, nil, nil, xmm7
718 mov ebx, [ebp + 24] // -> av[0]
719 mov eax, [ebp + 28] // -> av[m] = av limit
720 mov edx, esp // -> expanded Y = bv[0]
721 movdqa [esp + 0], xmm0 // bv[0] expanded low
722 movdqa [esp + 16], xmm1 // bv[0] expanded high
728 cmp ebx, eax // all done?
732 // Continue with the first iteration.
736 cmp ebx, eax // all done?
739 // Write out the leftover carry. There can be no tail here.
741 cmp esi, [ebp + 36] // more passes to do?
745 // Set up for the next pass.
746 1: movdqu xmm0, [esi] // bv[i]
747 mov edi, ecx // -> dv[i]
749 expand xmm0, xmm1, nil, nil, xmm7
750 mov ebx, [ebp + 24] // -> av[0]
751 movdqa [esp + 0], xmm0 // bv[i] expanded low
752 movdqa [esp + 16], xmm1 // bv[i] expanded high
758 cmp ebx, eax // done yet?
769 // Finish off this pass. There was no tail on the previous pass, and
770 // there can be none on this pass.
785 FUNC(mpxmont_mul4_x86_sse2)
786 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
787 // const mpw *nv, size_t n, const mpw *mi);
// Coarsely-integrated operand-scanning Montgomery multiplication: each
// outer pass consumes one 4-word chunk of bv and interleaves the main
// multiply with the Montgomery pass via the mmul4/mmla4 primitives.
// Loop labels, jumps, and the prologue/epilogue are not visible in this
// view.
789 // Build a stack frame. Arguments will be relative to EBP, as
796 // ebp + 36 n (nonzero multiple of 4)
799 // Locals are relative to ESP, which is 4 mod 16, as follows.
801 // esp + 0 outer loop dv
802 // esp + 4 outer loop bv
803 // esp + 8 av limit (mostly in ESI)
804 // esp + 12 expanded V (32 bytes)
805 // esp + 44 expanded M (32 bytes)
806 // esp + 76 expanded Y (32 bytes)
807 // esp + 108 bv limit
809 // esp + 124 (top of locals)
819 // Establish the expanded operands.
821 mov ecx, [ebp + 28] // -> bv
822 mov edx, [ebp + 40] // -> mi
823 movdqu xmm0, [ecx] // bv[0]
824 movdqu xmm2, [edx] // mi
825 expand xmm0, xmm1, xmm2, xmm3, xmm7
826 movdqa [esp + 12], xmm0 // bv[0] expanded low
827 movdqa [esp + 28], xmm1 // bv[0] expanded high
828 movdqa [esp + 44], xmm2 // mi expanded low
829 movdqa [esp + 60], xmm3 // mi expanded high
831 // Set up the outer loop state and prepare for the first iteration.
832 mov edx, [ebp + 36] // n
833 mov eax, [ebp + 24] // -> U = av[0]
834 mov ebx, [ebp + 32] // -> X = nv[0]
835 mov edi, [ebp + 20] // -> Z = dv[0]
// NOTE(review): `4*edx' with EDX = n yields av + 4 n bytes = av[n];
// the `bv[n/4]'/`av[n/4]' comments count 4-word chunks, not words.
837 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
838 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
842 lea ecx, [esp + 12] // -> expanded V = bv[0]
843 lea esi, [esp + 44] // -> expanded M = mi
844 lea edx, [esp + 76] // -> space for Y
846 mov esi, [esp + 8] // recover av limit
850 cmp eax, esi // done already?
855 // Complete the first inner loop.
860 cmp eax, esi // done yet?
863 // Still have carries left to propagate.
865 movd [edi + 16], xmm4
868 // Embark on the next iteration. (There must be one. If n = 1, then
869 // we would have bailed above, to label 8. Similarly, the subsequent
870 // iterations can fall into the inner loop immediately.)
871 1: mov eax, [esp + 4] // -> bv[i - 1]
872 mov edi, [esp + 0] // -> Z = dv[i]
873 add eax, 16 // -> bv[i]
875 movdqu xmm0, [eax] // bv[i]
877 cmp eax, [esp + 108] // done yet?
879 mov ebx, [ebp + 32] // -> X = nv[0]
880 lea esi, [esp + 44] // -> expanded M = mi
881 mov eax, [ebp + 24] // -> U = av[0]
882 expand xmm0, xmm1, nil, nil, xmm7
883 movdqa [esp + 12], xmm0 // bv[i] expanded low
884 movdqa [esp + 28], xmm1 // bv[i] expanded high
886 mov esi, [esp + 8] // recover av limit
893 // Complete the next inner loop.
901 // Still have carries left to propagate, and they overlap the
902 // previous iteration's final tail, so read that in and add it.
906 movd [edi + 16], xmm4
911 // First iteration was short. Write out the carries and we're done.
912 // (This could be folded into the main loop structure, but that would
913 // penalize small numbers more.)
915 movd [edi + 16], xmm4
927 FUNC(mpxmont_redc4_x86_sse2)
928 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
929 // size_t n, const mpw *mi);
// Montgomery reduction of the double-length value in dv: each outer pass
// reduces one 4-word chunk via the mont4 primitive, then propagates the
// carry through the rest of the buffer.  Loop labels, jumps, and the
// prologue/epilogue are not visible in this view.
931 // Build a stack frame. Arguments will be relative to EBP, as
937 // ebp + 32 n (nonzero multiple of 4)
940 // Locals are relative to ESP, as follows.
942 // esp + 0 outer loop dv
943 // esp + 4 outer dv limit
944 // esp + 8 blocks-of-4 dv limit
945 // esp + 12 expanded M (32 bytes)
946 // esp + 44 expanded Y (32 bytes)
947 // esp + 76 (top of locals)
957 // Establish the expanded operands and the blocks-of-4 dv limit.
958 mov edi, [ebp + 20] // -> Z = dv[0]
960 mov eax, [ebp + 24] // -> dv[n] = dv limit
961 sub eax, edi // length of dv in bytes
962 mov edx, [ebp + 36] // -> mi
963 movdqu xmm0, [edx] // mi
964 and eax, ~15 // mask off the tail end
965 expand xmm0, xmm1, nil, nil, xmm7
966 add eax, edi // find limit
967 movdqa [esp + 12], xmm0 // mi expanded low
968 movdqa [esp + 28], xmm1 // mi expanded high
971 // Set up the outer loop state and prepare for the first iteration.
972 mov ecx, [ebp + 32] // n
973 mov ebx, [ebp + 28] // -> X = nv[0]
// NOTE(review): `4*ecx' with ECX = n yields base + 4 n bytes; the
// `[n/4]' comments count 4-word chunks, not words.
974 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
975 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
978 lea esi, [esp + 12] // -> expanded M = mi
979 lea edx, [esp + 44] // -> space for Y
983 cmp ebx, ecx // done already?
987 // Complete the first inner loop.
991 cmp ebx, ecx // done yet?
994 // Still have carries left to propagate.
996 mov esi, [esp + 8] // -> dv blocks limit
997 mov edx, [ebp + 24] // dv limit
1008 // Continue carry propagation until the end of the buffer.
1010 mov eax, 0 // preserves flags
1019 // Deal with the tail end.
1021 mov eax, 0 // preserves flags
1027 // All done for this iteration. Start the next. (This must have at
1028 // least one follow-on iteration, or we'd not have started this outer
1030 8: mov edi, [esp + 0] // -> dv[i - 1]
1031 mov ebx, [ebp + 28] // -> X = nv[0]
1032 lea edx, [esp + 44] // -> space for Y
1033 lea esi, [esp + 12] // -> expanded M = mi
1034 add edi, 16 // -> Z = dv[i]
1035 cmp edi, [esp + 4] // all done yet?
1053 ///--------------------------------------------------------------------------
1054 /// Testing and performance measurement.
1064 .macro cystore c, v, n
// Store a cycle count into the array at [EBX] indexed by ECX.
// NOTE(review): the surrounding RDTSC/loop-control lines and `.endm' are
// not visible in this view -- confirm against upstream.
1072 mov [ebx + ecx*8], eax
1073 mov [ebx + ecx*8 + 4], edx
1087 // esp + 12 = v expanded
1088 // esp + 44 = y expanded
1089 // esp + 72 = ? expanded
1101 .macro testldcarry c
// Load the three expanded carry registers from the buffer at [ECX].
1103 movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
1104 movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
1105 movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
1108 .macro testexpand v, y
// Expand the packed test operands V and Y into the stack slots at
// esp + 12 and esp + 44.  (The loads of the packed values, and the
// conditional handling of `nil' arguments, are not visible in this view.)
1113 expand xmm0, xmm1, nil, nil, xmm7
1114 movdqa [esp + 12], xmm0
1115 movdqa [esp + 28], xmm1
1120 expand xmm2, xmm3, nil, nil, xmm7
1121 movdqa [esp + 44], xmm2
1122 movdqa [esp + 60], xmm3
1126 .macro testtop u, x, mode
// Set up the register conventions for a primitive-under-test and open
// the timing loop; extra setup is assembled only when MODE is `mont'.
// NOTE(review): the bodies of the `.ifeqs' branches and the matching
// `.endif'/`.endm' are not visible in this view.
1133 .ifeqs "\mode", "mont"
1140 .ifeqs "\mode", "mont"
1147 .macro testtail cyv, n
// Close the timing loop and store the cycle counts via `cystore'.
1148 cystore esp + 0, \cyv, \n
1152 .macro testcarryout c
// Write the three expanded carry registers back to the buffer at [ECX].
1154 movdqu [ecx + 0], xmm4
1155 movdqu [ecx + 16], xmm5
1156 movdqu [ecx + 32], xmm6
// Test harness drivers: each sequence below loads carries and expanded
// operands from the caller's argument block, invokes one primitive under
// the timing macros, and writes the carries back.  The FUNC/ENDFUNC
// labels naming each driver (and the calls to the primitives themselves)
// are not visible in this view -- confirm names against upstream.
1161 testldcarry [ebp + 24]
1162 testexpand [ebp + 36], [ebp + 40]
1164 testtop [ebp + 28], [ebp + 32]
1166 testtail [ebp + 48], [ebp + 44]
1167 testcarryout [ebp + 24]
1173 testldcarry [ebp + 24]
1174 testexpand [ebp + 36], [ebp + 40]
1176 testtop [ebp + 28], [ebp + 32]
1178 testtail [ebp + 48], [ebp + 44]
1179 testcarryout [ebp + 24]
1185 testldcarry [ebp + 24]
1186 testexpand nil, [ebp + 32]
1188 testtop nil, [ebp + 28]
1190 testtail [ebp + 40], [ebp + 36]
1191 testcarryout [ebp + 24]
1197 testldcarry [ebp + 24]
1198 testexpand nil, [ebp + 32]
1200 testtop nil, [ebp + 28]
1202 testtail [ebp + 40], [ebp + 36]
1203 testcarryout [ebp + 24]
// Montgomery-mode drivers additionally write the expanded Y result
// (from esp + 76) out through EDI.
1209 testexpand [ebp + 40], [ebp + 44]
1211 testtop [ebp + 32], [ebp + 36], mont
1213 testtail [ebp + 52], [ebp + 48]
1215 movdqa xmm0, [esp + 76]
1216 movdqa xmm1, [esp + 92]
1218 movdqu [edi + 16], xmm1
1219 testcarryout [ebp + 24]
1225 testexpand [ebp + 40], [ebp + 44]
1227 testtop [ebp + 32], [ebp + 36], mont
1229 testtail [ebp + 52], [ebp + 48]
1231 movdqa xmm0, [esp + 76]
1232 movdqa xmm1, [esp + 92]
1234 movdqu [edi + 16], xmm1
1235 testcarryout [ebp + 24]
1241 testexpand nil, [ebp + 36]
1243 testtop nil, [ebp + 32], mont
1245 testtail [ebp + 44], [ebp + 40]
1247 movdqa xmm0, [esp + 76]
1248 movdqa xmm1, [esp + 92]
1250 movdqu [edi + 16], xmm1
1251 testcarryout [ebp + 24]
1257 ///----- That's all, folks --------------------------------------------------