1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
30 #include "asm-common.h"
36 ///--------------------------------------------------------------------------
39 /// We define a number of primitive fixed-size multipliers from which we can
40 /// construct more general variable-length multipliers.
42 /// The basic trick is the same throughout. In an operand-scanning
43 /// multiplication, the inner multiplication loop multiplies a
44 /// multiple-precision operand by a single precision factor, and adds the
45 /// result, appropriately shifted, to the result. A `finely integrated
46 /// operand scanning' implementation of Montgomery multiplication also adds
47 /// the product of a single-precision `Montgomery factor' and the modulus,
48 /// calculated in the same pass. The more common `coarsely integrated
49 /// operand scanning' alternates main multiplication and Montgomery passes,
50 /// which requires additional carry propagation.
52 /// Throughout both plain-multiplication and Montgomery stages, then, one of
53 /// the factors remains constant throughout the operation, so we can afford
54 /// to take a little time to preprocess it. The transformation we perform is
55 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59 /// operands, as follows.
62 /// 0 v'_0 v'_1 v''_0 v''_1
63 /// 16 v'_2 v'_3 v''_2 v''_3
65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
66 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
70 /// many products together before we must deal with carrying; it also allows
71 /// for some calculations to be performed on the above expanded form.
73 /// On 32-bit x86, we are register starved: the expanded operands are kept in
74 /// memory, typically in warm L1 cache.
76 /// We maintain four `carry' registers accumulating intermediate results.
77 /// The registers' precise roles rotate during the computation; we name them
78 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
83 /// lanes of its vector) and an operand in the expanded form above produces a
84 /// result which can be added directly to the appropriate carry register.
85 /// Following a pass of four multiplications, we perform some limited carry
86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
89 /// zeroed and becomes c3.
91 ///--------------------------------------------------------------------------
92 /// Macro definitions.
94 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
95 // Load a word r_i from R, multiply by the expanded operand [S], and
96 // leave the pieces of the product in registers D0, D1, D2, D3.
// NOTE(review): the `.ifnes'/`.else'/`.endif' conditionals selecting the
// instruction sequence for the `nil' variants of D1--D3, and the closing
// `.endm', are elided from this view.  In particular, the two writes to
// \d2 below belong to different branches, not to one straight-line path.
97 movd \d0, \r // (r_i, 0; 0, 0)
99 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
102 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
104 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
106 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
110 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
112 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
116 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
119 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
122 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
126 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
128 pmuludq \d2, [\s + 16] // (r_i s'_2; r_i s''_2)
131 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
134 .macro accum c0, c1=nil, c2=nil, c3=nil
135 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
136 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
137 // updating that register.
// NOTE(review): the macro body (presumably `paddq' additions guarded by
// `.ifnes' tests on C1--C3) and the closing `.endm' are elided from this
// view.
150 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
151 // Load a word r_i from R, multiply by the expanded operand [S],
152 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
153 // then C3 notionally contains zero, but needs clearing; in practice,
154 // we store the product directly rather than attempting to add. On
155 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
// is not `t'.
// NOTE(review): the `.ifeqs "\z3p", "t"'/`.else'/`.endif' structure and
// the closing `.endm' are elided from this view.  The first `mulcore'
// below is the Z3P = `t' branch (its matching `accum \c0, \c1, \c2' is
// also elided); the second `mulcore'/`accum' pair is the other branch.
158 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
161 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
162 accum \c0, \c1, \c2, \c3
166 .macro propout d, c, cc=nil
167 // Calculate an output word from C, and store it in D; propagate
168 // carries out from C to CC in preparation for a rotation of the
169 // carry registers. On completion, XMM3 is clobbered. If CC is
170 // `nil', then the contribution which would have been added to it is
// simply discarded.
172 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
173 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
174 pslldq xmm3, 2 // (t b; 0)
175 paddq \c, xmm3 // (c' + t b; c'')
// NOTE(review): the store of the output word to \d (presumably a `movd')
// is elided from this view between the two lines here.
177 psrlq \c, 32 // floor(c/B)
// NOTE(review): in the full source this addition is presumably guarded by
// an `.ifnes "\cc", "nil"' (elided), per the CC description above; the
// closing `.endm' is also elided.
179 paddq \cc, \c // propagate up
183 .macro endprop d, c, t
184 // On entry, C contains a carry register. On exit, the low 32 bits
185 // of the value represented in C are written to D, and the remaining
186 // bits are left at the bottom of T.
// NOTE(review): an initialization of T from C (presumably `movdqa \t, \c')
// is elided from this view before the shift below.
188 psllq \t, 16 // (?; c'' b)
189 pslldq \c, 8 // (0; c')
190 paddq \t, \c // (?; c' + c'' b)
191 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
// NOTE(review): the store of the low word to \d (presumably a `movd') and
// the closing `.endm' are elided from this view.
193 psrldq \t, 4 // (floor(c/B); 0)
196 .macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
// Split each 32-bit lane into 16-bit halves (interleaving with the zero
// register Z), then shuffle so the primed halves land in the low 64 bits
// and the double-primed halves in the high 64 bits -- the layout the
// `pmuludq'-based multipliers above expect.
// NOTE(review): the `.ifnes "\c", "nil"' guards around the C/D lines and
// the closing `.endm' are elided from this view.
200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
218 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
225 // the ones offset by 16 bits from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
// NOTE(review): copies of C0 and C1 into T and U (presumably `movdqa')
// are elided from this view; the `punpcklqdq's below read T and U as the
// low-half sources.
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
// NOTE(review): the setup of C2/C3 as copies of the double-prime vectors
// (and a preliminary shift isolating their low 16 bits) is elided from
// this view.
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
// NOTE(review): further lane-alignment shifts and `paddq's are elided
// around the lines below; only part of the propagation is visible.
248 pslldq \c1, 8 // high part of (0; y''_1)
250 paddq \t, \c2 // propagate down
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
// NOTE(review): presumably guarded by `.ifnes "\hi", "nil"' (elided), and
// HI is initialized from the top carry before this shift.
255 psrldq \hi, 8 // high part of (y''_3; 0)
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
271 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
272 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
// NOTE(review): intermediate lines (including the final carry extraction
// feeding T and HI) are elided from this view; the closing `.endm' is
// also elided.
277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
279 psrlq \t, 32 // very high bits of y
281 punpcklqdq \hi, \u // carry up
283 punpckldq \lo, \c1 // y mod B^4
// NOTE(review): the `.macro' line introducing this block (a carry-add
// helper, per the description below) is elided from this view.
287 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
288 // hold the incoming carry registers c0, c1, and c2 representing a
// carry-in C.
291 // On exit, the carry registers, including XMM7, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
294 movd xmm0, [edi + 0] // (a_0; 0)
295 movd xmm1, [edi + 4] // (a_1; 0)
296 movd xmm2, [edi + 8] // (a_2; 0)
297 movd xmm7, [edi + 12] // (a_3; 0)
299 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
300 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
// NOTE(review): the shift merging a_3 (in XMM7) into the high half as
// a_3 b -- implied by the comment below -- and the closing `.endm' are
// elided from this view.
301 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
304 ///--------------------------------------------------------------------------
305 /// Primitive multipliers and related utilities.
// NOTE(review): the internal-function label for this carry-propagation
// helper (and its prologue/`ret') are elided from this view.
308 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
309 // form. Store the low 128 bits of the represented carry to [EDI] as
310 // a packed 128-bit value, and leave the remaining 16 bits in the low
311 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
// Four `propout's chain the carry up through the words; `endprop' leaves
// the residual high bits in XMM4 as promised above.
314 propout [edi + 0], xmm4, xmm5
315 propout [edi + 4], xmm5, xmm6
316 propout [edi + 8], xmm6, nil
317 endprop [edi + 12], xmm6, xmm4
// NOTE(review): the internal-function label for this dual
// multiply-accumulate step (and its prologue/`ret') are elided from this
// view.
322 // On entry, EDI points to the destination buffer; EAX and EBX point
323 // to the packed operands U and X; ECX and EDX point to the expanded
324 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
325 // registers c0, c1, and c2; c3 is assumed to be zero.
327 // On exit, we write the low 128 bits of the sum C + U V + X Y to
328 // [EDI], and update the carry registers with the carry out. The
329 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
330 // general-purpose registers are preserved.
// Each word: accumulate u_i V and x_i Y into the carries, emit one output
// word, and rotate the carry-register roles (note the cycling register
// order across the four rounds).
333 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
334 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
335 propout [edi + 0], xmm4, xmm5
337 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
338 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
339 propout [edi + 4], xmm5, xmm6
341 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
342 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
343 propout [edi + 8], xmm6, xmm7
345 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
346 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
347 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this dual
// multiply-accumulate-with-addend step (and its prologue/`ret') are
// elided from this view.
353 // On entry, EDI points to the destination buffer, which also
354 // contains an addend A to accumulate; EAX and EBX point to the
355 // packed operands U and X; ECX and EDX point to the expanded
356 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
357 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
// to be zero.
360 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
361 // [EDI], and update the carry registers with the carry out. The
362 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
363 // general-purpose registers are preserved.
// NOTE(review): the step loading the addend A into the carries (so that
// the first `mulacc' can accumulate into XMM7 rather than overwrite it
// with `t') is elided from this view.
368 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
369 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
370 propout [edi + 0], xmm4, xmm5
372 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
373 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
374 propout [edi + 4], xmm5, xmm6
376 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
377 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
378 propout [edi + 8], xmm6, xmm7
380 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
381 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
382 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this single-operand
// multiply with zero carry-in (and its prologue/`ret') are elided from
// this view.
388 // On entry, EDI points to the destination buffer; EBX points to a
389 // packed operand X; and EDX points to an expanded operand Y.
391 // On exit, we write the low 128 bits of the product X Y to [EDI],
392 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
393 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
394 // general-purpose registers are preserved.
// First word uses `mulcore' to establish fresh carries; subsequent words
// accumulate with `t' marking the notionally-zero top carry.
397 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
398 propout [edi + 0], xmm4, xmm5
400 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
401 propout [edi + 4], xmm5, xmm6
403 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
404 propout [edi + 8], xmm6, xmm7
406 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
407 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this single-operand
// multiply with carry-in (and its prologue/`ret') are elided from this
// view.
413 // On entry, EDI points to the destination buffer; EBX points to a
414 // packed operand X; EDX points to an expanded operand Y; and XMM4,
415 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
416 // representing a carry-in C; c3 is assumed to be zero.
418 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
419 // and update the carry registers with the carry out. The registers
420 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
421 // general-purpose registers are preserved.
// All four rounds use `mulacc' with `t', since c3 (the rotating top
// carry) is zero on entry to each round.
424 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
425 propout [edi + 0], xmm4, xmm5
427 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
428 propout [edi + 4], xmm5, xmm6
430 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
431 propout [edi + 8], xmm6, xmm7
433 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
434 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this
// multiply-accumulate-addend with zero carry-in (and its
// prologue/`ret') are elided from this view.
440 // On entry, EDI points to the destination buffer, which also
441 // contains an addend A to accumulate; EBX points to a packed operand
442 // X; and EDX points to an expanded operand Y.
444 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
445 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
446 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
447 // general-purpose registers are preserved.
// NOTE(review): loads of A's lower words into XMM4--XMM6 (companions to
// the XMM7 load below) are elided from this view.
453 movd xmm7, [edi + 12]
455 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
456 propout [edi + 0], xmm4, xmm5
458 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
459 propout [edi + 4], xmm5, xmm6
461 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
462 propout [edi + 8], xmm6, xmm7
464 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
465 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this
// multiply-accumulate-addend with carry-in (and its prologue/`ret') are
// elided from this view.
471 // On entry, EDI points to the destination buffer, which also
472 // contains an addend A to accumulate; EBX points to a packed operand
473 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
474 // the incoming carry registers c0, c1, and c2, representing a
475 // carry-in C; c3 is assumed to be zero.
477 // On exit, we write the low 128 bits of the sum A + C + X Y to
478 // [EDI], and update the carry registers with the carry out. The
479 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
480 // general-purpose registers are preserved.
// NOTE(review): the step folding the addend A into the carries (which is
// why the first `mulacc' accumulates into XMM7 rather than using `t') is
// elided from this view.
485 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
486 propout [edi + 0], xmm4, xmm5
488 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
489 propout [edi + 4], xmm5, xmm6
491 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
492 propout [edi + 8], xmm6, xmm7
494 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
495 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this Montgomery
// multiplication step is elided from this view, as is most of its body
// (it presumably joins the shared tail of the following block after the
// first round).
501 // On entry, EDI points to the destination buffer; EAX and EBX point
502 // to the packed operands U and N; ECX and ESI point to the expanded
503 // operands V and M; and EDX points to a place to store an expanded
504 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
505 // must be 12 modulo 16, as is usual for modern x86 ABIs.
507 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
508 // of the sum U V + N Y to [EDI], leaving the remaining carry in
509 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
510 // XMM7 are clobbered; the general-purpose registers are preserved.
511 stalloc 48 + 12 // space for the carries
514 // Calculate W = U V, and leave it in the destination. Stash the
515 // carry pieces for later.
516 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
517 propout [edi + 0], xmm4, xmm5
// NOTE(review): the internal-function label for this Montgomery
// multiply-accumulate step (and its prologue/`ret' and stack free) are
// elided from this view.
522 // On entry, EDI points to the destination buffer, which also
523 // contains an addend A to accumulate; EAX and EBX point to the
524 // packed operands U and N; ECX and ESI point to the expanded
525 // operands V and M; and EDX points to a place to store an expanded
526 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
527 // must be 12 modulo 16, as is usual for modern x86 ABIs.
529 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
530 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
531 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
532 // XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
534 stalloc 48 + 12 // space for the carries
// NOTE(review): loads of A's lower words into XMM4--XMM6 (companions to
// the XMM7 load below) are elided from this view.
540 movd xmm7, [edi + 12]
542 // Calculate W = U V, and leave it in the destination. Stash the
543 // carry pieces for later.
544 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
545 propout [edi + 0], xmm4, xmm5
// Shared tail: the `5:' label marks code common to this step and the
// non-accumulating variant above.
547 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
548 propout [edi + 4], xmm5, xmm6
550 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
551 propout [edi + 8], xmm6, xmm7
553 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
554 propout [edi + 12], xmm7, xmm4
// Stash the carries from the W = U V pass; they are restored after the
// Montgomery pass below.
556 movdqa [SP + 0], xmm4
557 movdqa [SP + 16], xmm5
558 movdqa [SP + 32], xmm6
560 // Calculate Y = W M.
561 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
563 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
564 accum xmm5, xmm6, xmm7
// NOTE(review): the `accum' calls pairing with the next two `mulcore's
// are elided from this view.
566 mulcore [edi + 8], esi, xmm0, xmm1
569 mulcore [edi + 12], esi, xmm0
572 // That's lots of pieces. Now we have to assemble the answer.
573 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// NOTE(review): zeroing of XMM2 (the Z argument of `expand') is
// presumably elided from this view.
577 expand xmm2, xmm4, xmm1
578 movdqa [edx + 0], xmm4
579 movdqa [edx + 16], xmm1
581 // Initialize the carry from the value for W we calculated earlier.
// NOTE(review): loads of W's lower words into XMM4--XMM6 are elided.
585 movd xmm7, [edi + 12]
587 // Finish the calculation by adding the Montgomery product.
588 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
589 propout [edi + 0], xmm4, xmm5
591 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
592 propout [edi + 4], xmm5, xmm6
594 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
595 propout [edi + 8], xmm6, xmm7
597 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
598 propout [edi + 12], xmm7, xmm4
600 // Add on the carry we calculated earlier.
// NOTE(review): the matching `paddq xmm4, [SP + 0]' is elided from this
// view.
602 paddq xmm5, [SP + 16]
603 paddq xmm6, [SP + 32]
605 // And, with that, we're done.
// NOTE(review): the internal-function label for this Montgomery
// reduction step (and its prologue/`ret') are elided from this view.
611 // On entry, EDI points to the destination buffer holding a packed
612 // value W; EBX points to a packed operand N; ESI points to an
613 // expanded operand M; and EDX points to a place to store an expanded
614 // result Y (32 bytes, at a 16-byte boundary).
616 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
617 // of the sum W + N Y to [EDI], leaving the remaining carry in
618 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
619 // XMM7 are clobbered; the general-purpose registers are preserved.
622 // Calculate Y = W M.
623 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
625 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
626 accum xmm5, xmm6, xmm7
// NOTE(review): the `accum' calls pairing with the next two `mulcore's
// are elided from this view.
628 mulcore [edi + 8], esi, xmm0, xmm1
631 mulcore [edi + 12], esi, xmm0
634 // That's lots of pieces. Now we have to assemble the answer.
635 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// NOTE(review): zeroing of XMM2 (the Z argument of `expand') is
// presumably elided from this view.
639 expand xmm2, xmm4, xmm1
640 movdqa [edx + 0], xmm4
641 movdqa [edx + 16], xmm1
643 // Initialize the carry from W.
// NOTE(review): loads of W's lower words into XMM4--XMM6 are elided.
647 movd xmm7, [edi + 12]
649 // Finish the calculation by adding the Montgomery product.
650 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
651 propout [edi + 0], xmm4, xmm5
653 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
654 propout [edi + 4], xmm5, xmm6
656 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
657 propout [edi + 8], xmm6, xmm7
659 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
660 propout [edi + 12], xmm7, xmm4
662 // And, with that, we're done.
666 ///--------------------------------------------------------------------------
667 /// Bulk multipliers.
// NOTE(review): large portions of these functions -- the AVX entry's
// body, the stack-frame setup, the inner-loop calls to the primitive
// multipliers, the branches, and the epilogues -- are elided from this
// view.  Only the loop-setup scaffolding is visible.
669 FUNC(mpx_umul4_x86_avx)
673 // and drop through...
677 FUNC(mpx_umul4_x86_sse2)
678 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
679 // const mpw *bv, const mpw *bvl);
681 // Build a stack frame. Arguments will be relative to BP, as
690 // Locals are relative to SP, as follows.
692 // SP + 0 expanded Y (32 bytes)
693 // SP + 32 (top of locals)
703 // Prepare for the first iteration.
704 mov esi, [BP + 32] // -> bv[0]
706 movdqu xmm0, [esi] // bv[0]
707 mov edi, [BP + 20] // -> dv[0]
708 mov ecx, edi // outer loop dv cursor
709 expand xmm7, xmm0, xmm1
710 mov ebx, [BP + 24] // -> av[0]
711 mov eax, [BP + 28] // -> av[m] = av limit
712 mov edx, SP // -> expanded Y = bv[0]
713 movdqa [SP + 0], xmm0 // bv[0] expanded low
714 movdqa [SP + 16], xmm1 // bv[0] expanded high
720 cmp ebx, eax // all done?
724 // Continue with the first iteration.
728 cmp ebx, eax // all done?
731 // Write out the leftover carry. There can be no tail here.
733 cmp esi, [BP + 36] // more passes to do?
737 // Set up for the next pass.
738 1: movdqu xmm0, [esi] // bv[i]
739 mov edi, ecx // -> dv[i]
741 expand xmm7, xmm0, xmm1
742 mov ebx, [BP + 24] // -> av[0]
743 movdqa [SP + 0], xmm0 // bv[i] expanded low
744 movdqa [SP + 16], xmm1 // bv[i] expanded high
750 cmp ebx, eax // done yet?
761 // Finish off this pass. There was no tail on the previous pass, and
762 // there can be none on this pass.
// NOTE(review): substantial portions of these functions -- the AVX
// entry's body, frame setup, argument-offset table, inner-loop calls,
// conditional branches, and epilogues -- are elided from this view.
776 FUNC(mpxmont_mul4_x86_avx)
780 // and drop through...
784 FUNC(mpxmont_mul4_x86_sse2)
785 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
786 // const mpw *nv, size_t n, const mpw *mi);
788 // Build a stack frame. Arguments will be relative to BP, as
795 // BP + 36 n (nonzero multiple of 4)
798 // Locals are relative to SP, which is 16-byte aligned, as follows.
800 // SP + 0 expanded V (32 bytes)
801 // SP + 32 expanded M (32 bytes)
802 // SP + 64 expanded Y (32 bytes)
803 // SP + 96 outer loop dv
804 // SP + 100 outer loop bv
805 // SP + 104 av limit (mostly in ESI)
807 // SP + 112 (top of locals)
817 // Establish the expanded operands.
819 mov ecx, [BP + 28] // -> bv
820 mov edx, [BP + 40] // -> mi
821 movdqu xmm0, [ecx] // bv[0]
822 movdqu xmm2, [edx] // mi
823 expand xmm7, xmm0, xmm1, xmm2, xmm3
824 movdqa [SP + 0], xmm0 // bv[0] expanded low
825 movdqa [SP + 16], xmm1 // bv[0] expanded high
826 movdqa [SP + 32], xmm2 // mi expanded low
827 movdqa [SP + 48], xmm3 // mi expanded high
829 // Set up the outer loop state and prepare for the first iteration.
830 mov edx, [BP + 36] // n
831 mov eax, [BP + 24] // -> U = av[0]
832 mov ebx, [BP + 32] // -> X = nv[0]
833 mov edi, [BP + 20] // -> Z = dv[0]
835 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
836 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
840 lea ecx, [SP + 0] // -> expanded V = bv[0]
841 lea esi, [SP + 32] // -> expanded M = mi
842 lea edx, [SP + 64] // -> space for Y
844 mov esi, [SP + 104] // recover av limit
848 cmp eax, esi // done already?
853 // Complete the first inner loop.
858 cmp eax, esi // done yet?
861 // Still have carries left to propagate.
863 movd [edi + 16], xmm4
866 // Embark on the next iteration. (There must be one. If n = 1, then
867 // we would have bailed above, to label 8. Similarly, the subsequent
868 // iterations can fall into the inner loop immediately.)
869 1: mov eax, [SP + 100] // -> bv[i - 1]
870 mov edi, [SP + 96] // -> Z = dv[i]
871 add eax, 16 // -> bv[i]
874 cmp eax, [SP + 108] // done yet?
876 movdqu xmm0, [eax] // bv[i]
877 mov ebx, [BP + 32] // -> X = nv[0]
878 lea esi, [SP + 32] // -> expanded M = mi
879 mov eax, [BP + 24] // -> U = av[0]
880 expand xmm7, xmm0, xmm1
881 movdqa [SP + 0], xmm0 // bv[i] expanded low
882 movdqa [SP + 16], xmm1 // bv[i] expanded high
884 mov esi, [SP + 104] // recover av limit
891 // Complete the next inner loop.
899 // Still have carries left to propagate, and they overlap the
900 // previous iteration's final tail, so read that in and add it.
904 movd [edi + 16], xmm4
909 // First iteration was short. Write out the carries and we're done.
910 // (This could be folded into the main loop structure, but that would
911 // penalize small numbers more.)
913 movd [edi + 16], xmm4
// NOTE(review): substantial portions of these functions -- the AVX
// entry's body, frame setup, inner-loop calls, carry-propagation loops,
// branches, and epilogues -- are elided from this view.
924 FUNC(mpxmont_redc4_x86_avx)
928 // and drop through...
932 FUNC(mpxmont_redc4_x86_sse2)
933 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
934 // size_t n, const mpw *mi);
936 // Build a stack frame. Arguments will be relative to BP, as
942 // BP + 32 n (nonzero multiple of 4)
945 // Locals are relative to SP, as follows.
947 // SP + 0 outer loop dv
948 // SP + 4 outer dv limit
949 // SP + 8 blocks-of-4 dv limit
950 // SP + 12 expanded M (32 bytes)
951 // SP + 44 expanded Y (32 bytes)
952 // SP + 76 (top of locals)
962 // Establish the expanded operands and the blocks-of-4 dv limit.
963 mov edi, [BP + 20] // -> Z = dv[0]
965 mov eax, [BP + 24] // -> dv[n] = dv limit
966 sub eax, edi // length of dv in bytes
967 mov edx, [BP + 36] // -> mi
968 movdqu xmm0, [edx] // mi
969 and eax, ~15 // mask off the tail end
970 expand xmm7, xmm0, xmm1
971 add eax, edi // find limit
972 movdqa [SP + 12], xmm0 // mi expanded low
973 movdqa [SP + 28], xmm1 // mi expanded high
976 // Set up the outer loop state and prepare for the first iteration.
977 mov ecx, [BP + 32] // n
978 mov ebx, [BP + 28] // -> X = nv[0]
979 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
980 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
983 lea esi, [SP + 12] // -> expanded M = mi
984 lea edx, [SP + 44] // -> space for Y
988 cmp ebx, ecx // done already?
992 // Complete the first inner loop.
996 cmp ebx, ecx // done yet?
999 // Still have carries left to propagate.
1001 mov esi, [SP + 8] // -> dv blocks limit
1002 mov edx, [BP + 24] // dv limit
1013 // Continue carry propagation until the end of the buffer.
1015 mov eax, 0 // preserves flags
1024 // Deal with the tail end.
1026 mov eax, 0 // preserves flags
1032 // All done for this iteration. Start the next. (This must have at
1033 // least one follow-on iteration, or we'd not have started this outer
1035 8: mov edi, [SP + 0] // -> dv[i - 1]
1036 mov ebx, [BP + 28] // -> X = nv[0]
1037 lea edx, [SP + 44] // -> space for Y
1038 lea esi, [SP + 12] // -> expanded M = mi
1039 add edi, 16 // -> Z = dv[i]
1040 cmp edi, [SP + 4] // all done yet?
1057 ///--------------------------------------------------------------------------
1058 /// Testing and performance measurement.
// NOTE(review): these test-harness macros are heavily elided: macro
// bodies, `.endm' lines, and the conditional structure inside `testtop'
// are mostly missing from this view.  Only fragments are visible.
1068 .macro cystore c, v, n
// Store a 64-bit cycle count (EDX:EAX) into the buffer at EBX, indexed
// by ECX.
1076 mov [ebx + ecx*8], eax
1077 mov [ebx + ecx*8 + 4], edx
1080 .macro testprologue n
// Local layout used by the test functions (per the comments below).
1092 // SP + 0 = v expanded
1093 // SP + 32 = y expanded
1094 // SP + 64 = ? expanded
1108 .macro testldcarry c
// Load the three incoming carry registers from the (unaligned) buffer C.
1110 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1111 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1112 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
1115 .macro testexpand v=nil, y=nil
// Expand the packed V and/or Y operands into the stack locals above.
// NOTE(review): the `.ifnes' guards and the loads feeding `expand' are
// elided from this view.
1120 expand xmm7, xmm0, xmm1
1121 movdqa [SP + 0], xmm0
1122 movdqa [SP + 16], xmm1
1127 expand xmm7, xmm2, xmm3
1128 movdqa [SP + 32], xmm2
1129 movdqa [SP + 48], xmm3
1133 .macro testtop u=nil, x=nil, mode=nil
// NOTE(review): `\cyv' below is presumably a parameter of an enclosing
// or elided macro argument list -- confirm against the full source.
1140 .ifeqs "\mode", "mont"
1147 .ifeqs "\mode", "mont"
1155 cystore SP + 96, \cyv, SP + 104
1159 .macro testcarryout c
// Store the outgoing carry registers back to the (unaligned) buffer C.
1161 movdqu [ecx + 0], xmm4
1162 movdqu [ecx + 16], xmm5
1163 movdqu [ecx + 32], xmm6
// NOTE(review): the FUNC labels naming each of these test drivers (one
// per primitive multiplier above), together with the calls under test,
// the epilogues, and several intervening lines, are elided from this
// view.  Each driver follows the same shape: set up locals and operands
// from the stacked arguments (relative to BP), run the timed operation,
// then write back the carries (and, for the Montgomery variants, the
// expanded Y result).
1167 testprologue [BP + 44]
1168 testldcarry [BP + 24]
1169 testexpand [BP + 36], [BP + 40]
1171 testtop [BP + 28], [BP + 32]
1174 testcarryout [BP + 24]
1179 testprologue [BP + 44]
1180 testldcarry [BP + 24]
1181 testexpand [BP + 36], [BP + 40]
1183 testtop [BP + 28], [BP + 32]
1186 testcarryout [BP + 24]
1191 testprologue [BP + 36]
1192 testldcarry [BP + 24]
1193 testexpand nil, [BP + 32]
1195 testtop nil, [BP + 28]
1198 testcarryout [BP + 24]
1203 testprologue [BP + 36]
1204 testldcarry [BP + 24]
1205 testexpand nil, [BP + 32]
1207 testtop nil, [BP + 28]
1210 testcarryout [BP + 24]
1215 testprologue [BP + 36]
1216 testldcarry [BP + 24]
1217 testexpand nil, [BP + 32]
1219 testtop nil, [BP + 28]
1222 testcarryout [BP + 24]
1227 testprologue [BP + 36]
1228 testldcarry [BP + 24]
1229 testexpand nil, [BP + 32]
1231 testtop nil, [BP + 28]
1234 testcarryout [BP + 24]
// Montgomery-mode drivers: also write out the expanded Y result from the
// stack locals.
1239 testprologue [BP + 48]
1240 testexpand [BP + 40], [BP + 44]
1242 testtop [BP + 32], [BP + 36], mont
1246 movdqa xmm0, [SP + 64]
1247 movdqa xmm1, [SP + 80]
1249 movdqu [edi + 16], xmm1
1250 testcarryout [BP + 24]
1255 testprologue [BP + 48]
1256 testexpand [BP + 40], [BP + 44]
1258 testtop [BP + 32], [BP + 36], mont
1262 movdqa xmm0, [SP + 64]
1263 movdqa xmm1, [SP + 80]
1265 movdqu [edi + 16], xmm1
1266 testcarryout [BP + 24]
1271 testprologue [BP + 40]
1272 testexpand nil, [BP + 36]
1274 testtop nil, [BP + 32], mont
1278 movdqa xmm0, [SP + 64]
1279 movdqa xmm1, [SP + 80]
1281 movdqu [edi + 16], xmm1
1282 testcarryout [BP + 24]
1288 ///----- That's all, folks --------------------------------------------------