math/mpx-mul4-*.S: Fix up some of the commentary.
math/mpx-mul4-x86-sse2.S
1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6
7 ///----- Licensing notice ---------------------------------------------------
8 ///
9 /// This file is part of Catacomb.
10 ///
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
15 ///
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
20 ///
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
25
26 ///--------------------------------------------------------------------------
27 /// Preliminaries.
28
29 #include "config.h"
30 #include "asm-common.h"
31
32 .arch pentium4
33
34 .text
35
36 ///--------------------------------------------------------------------------
37 /// Theory.
38 ///
39 /// We define a number of primitive fixed-size multipliers from which we can
40 /// construct more general variable-length multipliers.
41 ///
42 /// The basic trick is the same throughout.  In an operand-scanning
43 /// multiplication, the inner multiplication loop multiplies a multiple-
44 /// precision operand by a single-precision factor, and adds the product,
45 /// appropriately shifted, to the accumulator.  A `finely integrated operand
46 /// scanning' implementation of Montgomery multiplication also adds the
47 /// product of a single-precision `Montgomery factor' and the modulus,
48 /// calculated in the same pass. The more common `coarsely integrated
49 /// operand scanning' alternates main multiplication and Montgomery passes,
50 /// which requires additional carry propagation.
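///
/// As a point of reference, a single inner-loop pass might be rendered in
/// portable C (32-bit words, 64-bit intermediate products) roughly as
/// follows.  This is a hypothetical sketch for orientation only (the name
/// `mla_pass' is invented here); it is not what the code below does.
///
///	/* Add the product of the M-word operand U and the single word V
///	 * into Z, propagating the final carry; Z must be long enough. */
///	static void mla_pass(uint32_t *z, const uint32_t *u,
///			     size_t m, uint32_t v)
///	{
///		uint64_t c = 0;
///		for (size_t j = 0; j < m; j++) {
///			c += (uint64_t)u[j]*v + z[j]; /* fits in 64 bits */
///			z[j] = (uint32_t)c; c >>= 32;
///		}
///		for (size_t j = m; c; j++)	/* flush the final carry */
///			{ c += z[j]; z[j] = (uint32_t)c; c >>= 32; }
///	}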
51 ///
52 /// In both the plain-multiplication and Montgomery stages, then, one of
53 /// the factors remains constant throughout the operation, so we can afford
54 /// to take a little time to preprocess it. The transformation we perform is
55 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59 /// operands, as follows.
60 ///
61 /// Offset 0 4 8 12
62 /// 0 v'_0 v'_1 v''_0 v''_1
63 /// 16 v'_2 v'_3 v''_2 v''_3
64 ///
65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
66 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69 /// results in 64-bit fields.  The sixteen bits of headroom allow us to add
70 /// many products together before we must deal with carrying; it also allows
71 /// for some calculations to be performed on the above expanded form.
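///
/// In C terms, the transformation deals the 16-bit halves of each word out
/// into 32-bit cells like this (a hypothetical reference function, named
/// `expand_ref' for illustration; it is not part of the code below):
///
///	/* Split the packed operand V into the expanded operands LO and
///	 * HI, following the table above. */
///	static void expand_ref(uint32_t lo[4], uint32_t hi[4],
///			       const uint32_t v[4])
///	{
///		lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff; /* v'_0, v'_1 */
///		lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;  /* v''_0, v''_1 */
///		hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff; /* v'_2, v'_3 */
///		hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;  /* v''_2, v''_3 */
///	}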
72 ///
73 /// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate
74 /// results. The registers' precise roles rotate during the computation; we
75 /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two
76 /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
77 /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
78 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
79 /// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand (broadcast
80 /// across all lanes of its vector) and an operand in the expanded form above
81 /// produces a result which can be added directly to the appropriate carry
82 /// register. Following a pass of four multiplications, we perform some
83 /// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b;
84 /// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and
85 /// cycle the carry registers around, so that c1 becomes c0, and the old
86 /// (implicitly) zeroed c0 becomes c3.
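///
/// One step of this limited carry propagation looks like the following
/// hypothetical C (the name `propout_ref' is invented here), with each
/// carry register modelled as its two 64-bit halves; the headroom
/// described above is what keeps the 64-bit sums from overflowing.
///
///	/* Emit one output word from c_0 = c0lo + c0hi b, propagating
///	 * into c_1; cf. the `propout' macro below. */
///	static uint32_t propout_ref(uint64_t *c0lo, uint64_t *c0hi,
///				    uint64_t *c1lo, uint64_t *c1hi)
///	{
///		uint64_t t = *c0hi&0xffffffff;	/* t = c''_0 mod B */
///		uint64_t d = *c0lo + (t << 16);	/* d = c'_0 + t b */
///		*c1lo += d >> 32;		/* floor(d/B), up into c_1 */
///		*c1hi += *c0hi >> 32;		/* floor(c''_0/B), likewise */
///		return (uint32_t)d;		/* z = d mod B */
///	}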
87 ///
88 /// On 32-bit x86, we are register-starved: the expanded operands are kept in
89 /// memory, typically in warm L1 cache. The packed operands are read from
90 /// memory into working registers XMM0--XMM3 and processed immediately.
91 /// The following conventional argument names and locations are used
92 /// throughout.
93 ///
94 /// Arg Format Location Notes
95 ///
96 /// U packed [EAX]
97 /// X packed [EBX] In Montgomery multiplication, X = N
98 /// V expanded [ECX]
99 /// Y expanded [EDX] In Montgomery multiplication, Y = (A + U V) M
100 /// M expanded [ESI] -N^{-1} (mod B^4)
101 /// N Modulus, for Montgomery multiplication
102 /// A packed [EDI] Destination/accumulator
103 /// C carry XMM4--XMM7
104 ///
105 /// The calculation is some variant of
106 ///
107 /// A' + C' B^4 <- U V + X Y + A + C
108 ///
109 /// The low-level functions fit into a fairly traditional (finely-integrated)
110 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
111 /// (indexed by i).
112 ///
113 /// The variants are as follows.
114 ///
115 /// Function Variant Use i j
116 ///
117 /// mmul4 A = C = 0 Montgomery 0 0
118 /// dmul4 A = 0 Montgomery 0 +
119 /// mmla4 C = 0 Montgomery + 0
120 /// dmla4 exactly as shown Montgomery + +
121 /// mont4 U = V = C = 0 Montgomery any 0
122 ///
123 /// mul4zc U = V = A = C = 0 Plain 0 0
124 /// mul4 U = V = A = 0 Plain 0 +
125 /// mla4zc U = V = C = 0 Plain + 0
126 /// mla4 U = V = 0 Plain + +
127 ///
128 /// The `mmul4' and `mmla4' functions are also responsible for calculating
129 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
130 /// inner loop.
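///
/// For intuition, the Montgomery trick on a single word, with b = 2^32
/// standing in for B^4, looks like this in hypothetical C (the name
/// `mont_word' is invented here): choosing y = a m (mod b), where
/// m = -n^{-1} (mod b), forces the low word of a + n y to zero, so
/// dividing by b is exact.
///
///	static uint32_t mont_word(uint32_t a, uint32_t n, uint32_t m)
///	{
///		uint32_t y = a*m;		/* y = a m mod b */
///		uint64_t t = a + (uint64_t)n*y;	/* low word now zero */
///		return (uint32_t)(t >> 32);	/* exactly (a + n y)/b */
///	}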
131
132 ///--------------------------------------------------------------------------
133 /// Macro definitions.
134
135 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
136 // Load a word r_i from R, multiply by the expanded operand [S], and
137 // leave the pieces of the product in registers D0, D1, D2, D3.
138 movd \d0, \r // (r_i, 0; 0, 0)
139 .ifnes "\d1", "nil"
140 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
141 .endif
142 .ifnes "\d3", "nil"
143 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
144 .endif
145 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
146 .ifnes "\d1", "nil"
147 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
148 .endif
149 .ifnes "\d2", "nil"
150 .ifnes "\d3", "nil"
151 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
152 .else
153 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
154 .endif
155 .endif
156 .ifnes "\d3", "nil"
157 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
158 .endif
159 .ifnes "\d1", "nil"
160 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
161 .endif
162 .ifnes "\d3", "nil"
163 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
164 .endif
165 .ifnes "\d2", "nil"
166 .ifnes "\d3", "nil"
167 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
168 .else
169 pmuludq \d2, [\s + 16]
170 .endif
171 .endif
172 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
173 .endm
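
/// The `mulcore' macro above amounts to the following hypothetical C model
/// (names invented for illustration), with each destination register taken
/// as a pair of 64-bit lanes and S as the eight 32-bit cells of an
/// expanded operand.
///
///	typedef struct { uint64_t lo, hi; } pair64;	/* one XMM register */
///
///	static void mulcore_ref(pair64 d[4], uint32_t r,
///				const uint32_t s[8])
///	{
///		d[0].lo = (uint64_t)r*s[0]; d[0].hi = (uint64_t)r*s[2];
///		d[1].lo = (uint64_t)r*s[1]; d[1].hi = (uint64_t)r*s[3];
///		d[2].lo = (uint64_t)r*s[4]; d[2].hi = (uint64_t)r*s[6];
///		d[3].lo = (uint64_t)r*s[5]; d[3].hi = (uint64_t)r*s[7];
///	}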
174
175 .macro accum c0, c1=nil, c2=nil, c3=nil
176 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
177 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
178 // updating that register.
179 paddq \c0, xmm0
180 .ifnes "\c1", "nil"
181 paddq \c1, xmm1
182 .endif
183 .ifnes "\c2", "nil"
184 paddq \c2, xmm2
185 .endif
186 .ifnes "\c3", "nil"
187 paddq \c3, xmm3
188 .endif
189 .endm
190
191 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
192 // Load a word r_i from R, multiply by the expanded operand [S],
193 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
194 // then C3 notionally contains zero, but needs clearing; in practice,
195 // we store the product directly rather than attempting to add. On
196 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
197 // is not `t'.
198 .ifeqs "\z3p", "t"
199 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
200 accum \c0, \c1, \c2
201 .else
202 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
203 accum \c0, \c1, \c2, \c3
204 .endif
205 .endm
206
207 .macro propout d, c, cc=nil
208 // Calculate an output word from C, and store it in D; propagate
209 // carries out from C to CC in preparation for a rotation of the
210 // carry registers. On completion, XMM3 is clobbered. If CC is
211 // `nil', then the contribution which would have been added to it is
212 // left in C.
213 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
214 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
215 pslldq xmm3, 2 // (t b; 0)
216 paddq \c, xmm3 // (c' + t b; c'')
217 movd \d, \c
218 psrlq \c, 32 // floor(c/B)
219 .ifnes "\cc", "nil"
220 paddq \cc, \c // propagate up
221 .endif
222 .endm
223
224 .macro endprop d, c, t
225 // On entry, C contains a carry register. On exit, the low 32 bits
226 // of the value represented in C are written to D, and the remaining
227 // bits are left at the bottom of T.
228 movdqa \t, \c
229 psllq \t, 16 // (?; c'' b)
230 pslldq \c, 8 // (0; c')
231 paddq \t, \c // (?; c' + c'' b)
232 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
233 movd \d, \t
234 psrldq \t, 4 // (floor(c/B); 0)
235 .endm
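
/// A hypothetical C model of `endprop' (the name `endprop_ref' is invented
/// here): the carry, represented as c = clo + chi b, is small enough at
/// the points this macro is used that the shifted sum cannot overflow
/// 64 bits.
///
///	static uint32_t endprop_ref(uint64_t clo, uint64_t chi,
///				    uint64_t *t)
///	{
///		uint64_t c = clo + (chi << 16);	/* c' + c'' b */
///		*t = c >> 32;			/* floor(c/B) */
///		return (uint32_t)c;		/* c mod B */
///	}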
236
237 .macro expand z, a, b, c=nil, d=nil
238 // On entry, A and C hold packed 128-bit values, and Z is zero. On
239 // exit, A:B and C:D together hold the same values in expanded
240 // form. If C is `nil', then only expand A to A:B.
241 movdqa \b, \a // (a_0, a_1; a_2, a_3)
242 .ifnes "\c", "nil"
243 movdqa \d, \c // (c_0, c_1; c_2, c_3)
244 .endif
245 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
246 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
247 .ifnes "\c", "nil"
248 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
249 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
250 .endif
251 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
252 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
253 .ifnes "\c", "nil"
254 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
255 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
256 .endif
257 .endm
258
259 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
260 // On entry, C0, C1, C2, C3 are carry registers representing a value
261 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
262 // C3, T, and U are clobbered; and the high bits of Y are stored in
263 // HI, if this is not `nil'.
264
265 // The first step is to eliminate the `double-prime' pieces -- i.e.,
266 	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
267 // them into the 32-bit-aligned pieces above and below. But before
268 // we can do that, we must gather them together.
269 movdqa \t, \c0
270 movdqa \u, \c1
271 punpcklqdq \t, \c2 // (y'_0; y'_2)
272 punpckhqdq \c0, \c2 // (y''_0; y''_2)
273 punpcklqdq \u, \c3 // (y'_1; y'_3)
274 punpckhqdq \c1, \c3 // (y''_1; y''_3)
275
276 // Now split the double-prime pieces. The high (up to) 48 bits will
277 // go up; the low 16 bits go down.
278 movdqa \c2, \c0
279 movdqa \c3, \c1
280 psllq \c2, 48
281 psllq \c3, 48
282 psrlq \c0, 16 // high parts of (y''_0; y''_2)
283 psrlq \c1, 16 // high parts of (y''_1; y''_3)
284 psrlq \c2, 32 // low parts of (y''_0; y''_2)
285 psrlq \c3, 32 // low parts of (y''_1; y''_3)
286 .ifnes "\hi", "nil"
287 movdqa \hi, \c1
288 .endif
289 pslldq \c1, 8 // high part of (0; y''_1)
290
291 paddq \t, \c2 // propagate down
292 paddq \u, \c3
293 paddq \t, \c1 // and up: (y_0; y_2)
294 paddq \u, \c0 // (y_1; y_3)
295 .ifnes "\hi", "nil"
296 psrldq \hi, 8 // high part of (y''_3; 0)
297 .endif
298
299 	// Finally extract the answer.  This complicated dance is better than
300 	// storing to memory and loading it back again, because the piecemeal
301 	// stores would inhibit store-to-load forwarding.
302 movdqa \c3, \t // (y_0; ?)
303 movdqa \lo, \t // (y^*_0, ?; ?, ?)
304 psrldq \t, 8 // (y_2; 0)
305 psrlq \c3, 32 // (floor(y_0/B); ?)
306 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
307 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
308 psrldq \u, 8 // (y_3; 0)
309 	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
310 	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
311 	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
312 	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
313 	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
314 .ifnes "\hi", "nil"
315 movdqa \t, \c3
316 pxor \u, \u
317 .endif
318 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
319 .ifnes "\hi", "nil"
320 psrlq \t, 32 // very high bits of y
321 paddq \hi, \t
322 punpcklqdq \hi, \u // carry up
323 .endif
324 punpckldq \lo, \c1 // y mod B^4
325 .endm
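
/// A hypothetical C model of `squash' (names invented for illustration):
/// the four carry registers represent y = SUM_k (y'_k + y''_k b) B^k, and
/// the pieces are narrow enough (48 bits or so at most) that the 64-bit
/// sums below cannot overflow.
///
///	static void squash_ref(uint32_t z[4], uint64_t *hi,
///			       const uint64_t yp[4], const uint64_t ypp[4])
///	{
///		uint64_t t[5], c;
///		int k;
///
///		t[4] = 0;			/* collects the high bits */
///		for (k = 0; k < 4; k++)		/* low 16 bits go down */
///			t[k] = yp[k] + ((ypp[k]&0xffff) << 16);
///		for (k = 0; k < 4; k++)		/* high 48 bits go up */
///			t[k + 1] += ypp[k] >> 16;
///		for (c = 0, k = 0; k < 4; k++)	/* y mod B^4 */
///			{ c += t[k]; z[k] = (uint32_t)c; c >>= 32; }
///		*hi = c + t[4];			/* floor(y/B^4) */
///	}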
326
327 .macro carryadd
328 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
329 // hold the incoming carry registers c0, c1, and c2 representing a
330 // carry-in C.
331 //
332 // On exit, the carry registers, including XMM7, are updated to hold
333 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
334 // registers are preserved.
335 movd xmm0, [edi + 0] // (a_0; 0)
336 movd xmm1, [edi + 4] // (a_1; 0)
337 movd xmm2, [edi + 8] // (a_2; 0)
338 movd xmm7, [edi + 12] // (a_3; 0)
339
340 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
341 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
342 	paddq	xmm6, xmm2		// (c'_2 + a_2; c''_2)
343 .endm
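
/// A hypothetical C model of `carryadd' (the name `carryadd_ref' is
/// invented here), showing only the low (`prime') halves of the carry
/// registers; c3's low half is simply set to a_3, since c3 is zero on
/// entry.
///
///	static void carryadd_ref(uint64_t clo[4], const uint32_t a[4])
///	{
///		for (int k = 0; k < 4; k++) clo[k] += a[k];
///	}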
344
345 ///--------------------------------------------------------------------------
346 /// Primitive multipliers and related utilities.
347
348 INTFUNC(carryprop)
349 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
350 // form. Store the low 128 bits of the represented carry to [EDI] as
351 // a packed 128-bit value, and leave the remaining 16 bits in the low
352 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
353 endprologue
354
355 propout [edi + 0], xmm4, xmm5
356 propout [edi + 4], xmm5, xmm6
357 propout [edi + 8], xmm6, nil
358 endprop [edi + 12], xmm6, xmm4
359 ret
360 ENDFUNC
361
362 INTFUNC(dmul4)
363 // On entry, EDI points to the destination buffer; EAX and EBX point
364 // to the packed operands U and X; ECX and EDX point to the expanded
365 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
366 // registers c0, c1, and c2; c3 is assumed to be zero.
367 //
368 // On exit, we write the low 128 bits of the sum C + U V + X Y to
369 // [EDI], and update the carry registers with the carry out. The
370 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
371 // general-purpose registers are preserved.
372 endprologue
373
374 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
375 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
376 propout [edi + 0], xmm4, xmm5
377
378 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
379 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
380 propout [edi + 4], xmm5, xmm6
381
382 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
383 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
384 propout [edi + 8], xmm6, xmm7
385
386 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
387 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
388 propout [edi + 12], xmm7, xmm4
389
390 ret
391 ENDFUNC
392
393 INTFUNC(dmla4)
394 // On entry, EDI points to the destination buffer, which also
395 // contains an addend A to accumulate; EAX and EBX point to the
396 // packed operands U and X; ECX and EDX point to the expanded
397 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
398 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
399 // to be zero.
400 //
401 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
402 // [EDI], and update the carry registers with the carry out. The
403 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
404 // general-purpose registers are preserved.
405 endprologue
406
407 carryadd
408
409 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
410 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
411 propout [edi + 0], xmm4, xmm5
412
413 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
414 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
415 propout [edi + 4], xmm5, xmm6
416
417 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
418 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
419 propout [edi + 8], xmm6, xmm7
420
421 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
422 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
423 propout [edi + 12], xmm7, xmm4
424
425 ret
426 ENDFUNC
427
428 INTFUNC(mul4zc)
429 // On entry, EDI points to the destination buffer; EBX points to a
430 // packed operand X; and EDX points to an expanded operand Y.
431 //
432 // On exit, we write the low 128 bits of the product X Y to [EDI],
433 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
434 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
435 // general-purpose registers are preserved.
436 endprologue
437
438 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
439 propout [edi + 0], xmm4, xmm5
440
441 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
442 propout [edi + 4], xmm5, xmm6
443
444 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
445 propout [edi + 8], xmm6, xmm7
446
447 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
448 propout [edi + 12], xmm7, xmm4
449
450 ret
451 ENDFUNC
452
453 INTFUNC(mul4)
454 // On entry, EDI points to the destination buffer; EBX points to a
455 // packed operand X; EDX points to an expanded operand Y; and XMM4,
456 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
457 // representing a carry-in C; c3 is assumed to be zero.
458 //
459 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
460 // and update the carry registers with the carry out. The registers
461 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
462 // general-purpose registers are preserved.
463 endprologue
464
465 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
466 propout [edi + 0], xmm4, xmm5
467
468 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
469 propout [edi + 4], xmm5, xmm6
470
471 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
472 propout [edi + 8], xmm6, xmm7
473
474 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
475 propout [edi + 12], xmm7, xmm4
476
477 ret
478 ENDFUNC
479
480 INTFUNC(mla4zc)
481 // On entry, EDI points to the destination buffer, which also
482 // contains an addend A to accumulate; EBX points to a packed operand
483 // X; and EDX points to an expanded operand Y.
484 //
485 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
486 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
487 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
488 // general-purpose registers are preserved.
489 endprologue
490
491 movd xmm4, [edi + 0]
492 movd xmm5, [edi + 4]
493 movd xmm6, [edi + 8]
494 movd xmm7, [edi + 12]
495
496 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
497 propout [edi + 0], xmm4, xmm5
498
499 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
500 propout [edi + 4], xmm5, xmm6
501
502 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
503 propout [edi + 8], xmm6, xmm7
504
505 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
506 propout [edi + 12], xmm7, xmm4
507
508 ret
509 ENDFUNC
510
511 INTFUNC(mla4)
512 // On entry, EDI points to the destination buffer, which also
513 // contains an addend A to accumulate; EBX points to a packed operand
514 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
515 // the incoming carry registers c0, c1, and c2, representing a
516 // carry-in C; c3 is assumed to be zero.
517 //
518 // On exit, we write the low 128 bits of the sum A + C + X Y to
519 // [EDI], and update the carry registers with the carry out. The
520 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
521 // general-purpose registers are preserved.
522 endprologue
523
524 carryadd
525
526 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
527 propout [edi + 0], xmm4, xmm5
528
529 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
530 propout [edi + 4], xmm5, xmm6
531
532 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
533 propout [edi + 8], xmm6, xmm7
534
535 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
536 propout [edi + 12], xmm7, xmm4
537
538 ret
539 ENDFUNC
540
541 INTFUNC(mmul4)
542 // On entry, EDI points to the destination buffer; EAX and EBX point
543 // to the packed operands U and N; ECX and ESI point to the expanded
544 // operands V and M; and EDX points to a place to store an expanded
545 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
546 // must be 12 modulo 16, as is usual for modern x86 ABIs.
547 //
548 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
549 // of the sum U V + N Y to [EDI], leaving the remaining carry in
550 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
551 // XMM7 are clobbered; the general-purpose registers are preserved.
552 stalloc 48 + 12 // space for the carries
553 endprologue
554
555 // Calculate W = U V, and leave it in the destination. Stash the
556 // carry pieces for later.
557 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
558 propout [edi + 0], xmm4, xmm5
559 	jmp	5f			// jump into the common tail of `mmla4'
560 ENDFUNC
561
562 INTFUNC(mmla4)
563 // On entry, EDI points to the destination buffer, which also
564 // contains an addend A to accumulate; EAX and EBX point to the
565 // packed operands U and N; ECX and ESI point to the expanded
566 // operands V and M; and EDX points to a place to store an expanded
567 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
568 // must be 12 modulo 16, as is usual for modern x86 ABIs.
569 //
570 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
571 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
572 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
573 // XMM3, and XMM7 are clobbered; the general-purpose registers are
574 // preserved.
575 stalloc 48 + 12 // space for the carries
576 endprologue
577
578 movd xmm4, [edi + 0]
579 movd xmm5, [edi + 4]
580 movd xmm6, [edi + 8]
581 movd xmm7, [edi + 12]
582
583 	// Calculate W = A + U V, and leave it in the destination.  Stash
584 	// the carry pieces for later.
585 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
586 propout [edi + 0], xmm4, xmm5
587
588 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
589 propout [edi + 4], xmm5, xmm6
590
591 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
592 propout [edi + 8], xmm6, xmm7
593
594 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
595 propout [edi + 12], xmm7, xmm4
596
597 movdqa [SP + 0], xmm4
598 movdqa [SP + 16], xmm5
599 movdqa [SP + 32], xmm6
600
601 // Calculate Y = W M.
602 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
603
604 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
605 accum xmm5, xmm6, xmm7
606
607 mulcore [edi + 8], esi, xmm0, xmm1
608 accum xmm6, xmm7
609
610 mulcore [edi + 12], esi, xmm0
611 accum xmm7
612
613 // That's lots of pieces. Now we have to assemble the answer.
614 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
615
616 // Expand it.
617 pxor xmm2, xmm2
618 expand xmm2, xmm4, xmm1
619 movdqa [edx + 0], xmm4
620 movdqa [edx + 16], xmm1
621
622 // Initialize the carry from the value for W we calculated earlier.
623 movd xmm4, [edi + 0]
624 movd xmm5, [edi + 4]
625 movd xmm6, [edi + 8]
626 movd xmm7, [edi + 12]
627
628 // Finish the calculation by adding the Montgomery product.
629 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
630 propout [edi + 0], xmm4, xmm5
631
632 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
633 propout [edi + 4], xmm5, xmm6
634
635 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
636 propout [edi + 8], xmm6, xmm7
637
638 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
639 propout [edi + 12], xmm7, xmm4
640
641 	// Add on the carry we calculated earlier.
642 paddq xmm4, [SP + 0]
643 paddq xmm5, [SP + 16]
644 paddq xmm6, [SP + 32]
645
646 // And, with that, we're done.
647 stfree 48 + 12
648 ret
649 ENDFUNC
650
651 INTFUNC(mont4)
652 // On entry, EDI points to the destination buffer holding a packed
653 // value W; EBX points to a packed operand N; ESI points to an
654 // expanded operand M; and EDX points to a place to store an expanded
655 // result Y (32 bytes, at a 16-byte boundary).
656 //
657 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
658 // of the sum W + N Y to [EDI], leaving the remaining carry in
659 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
660 // XMM7 are clobbered; the general-purpose registers are preserved.
661 endprologue
662
663 // Calculate Y = W M.
664 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
665
666 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
667 accum xmm5, xmm6, xmm7
668
669 mulcore [edi + 8], esi, xmm0, xmm1
670 accum xmm6, xmm7
671
672 mulcore [edi + 12], esi, xmm0
673 accum xmm7
674
675 // That's lots of pieces. Now we have to assemble the answer.
676 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
677
678 // Expand it.
679 pxor xmm2, xmm2
680 expand xmm2, xmm4, xmm1
681 movdqa [edx + 0], xmm4
682 movdqa [edx + 16], xmm1
683
684 // Initialize the carry from W.
685 movd xmm4, [edi + 0]
686 movd xmm5, [edi + 4]
687 movd xmm6, [edi + 8]
688 movd xmm7, [edi + 12]
689
690 // Finish the calculation by adding the Montgomery product.
691 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
692 propout [edi + 0], xmm4, xmm5
693
694 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
695 propout [edi + 4], xmm5, xmm6
696
697 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
698 propout [edi + 8], xmm6, xmm7
699
700 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
701 propout [edi + 12], xmm7, xmm4
702
703 // And, with that, we're done.
704 ret
705 ENDFUNC
706
707 ///--------------------------------------------------------------------------
708 /// Bulk multipliers.
709
710 FUNC(mpx_umul4_x86_avx)
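	// The AVX code is identical to the SSE2 code apart from the
	// initial `vzeroupper', which avoids the penalty for mixing
	// 256-bit AVX and legacy SSE instructions on AVX-capable CPUs;
	// having issued that, we simply fall through to the SSE2 code.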
711 .arch .avx
712 vzeroupper
713 endprologue
714 // and drop through...
715 .arch pentium4
716 ENDFUNC
717
718 FUNC(mpx_umul4_x86_sse2)
719 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
720 // const mpw *bv, const mpw *bvl);
721
722 // Build a stack frame. Arguments will be relative to BP, as
723 // follows.
724 //
725 // BP + 20 dv
726 // BP + 24 av
727 // BP + 28 avl
728 // BP + 32 bv
729 // BP + 36 bvl
730 //
731 // Locals are relative to SP, as follows.
732 //
733 // SP + 0 expanded Y (32 bytes)
734 // SP + 32 (top of locals)
735 pushreg BP
736 pushreg ebx
737 pushreg esi
738 pushreg edi
739 setfp
740 stalloc 32
741 and SP, ~15
742 endprologue
743
744 // Prepare for the first iteration.
745 mov esi, [BP + 32] // -> bv[0]
746 pxor xmm7, xmm7
747 movdqu xmm0, [esi] // bv[0]
748 mov edi, [BP + 20] // -> dv[0]
749 mov ecx, edi // outer loop dv cursor
750 expand xmm7, xmm0, xmm1
751 mov ebx, [BP + 24] // -> av[0]
752 mov eax, [BP + 28] // -> av[m] = av limit
753 mov edx, SP // -> expanded Y = bv[0]
754 movdqa [SP + 0], xmm0 // bv[0] expanded low
755 movdqa [SP + 16], xmm1 // bv[0] expanded high
756 call mul4zc
757 add ebx, 16
758 add edi, 16
759 add ecx, 16
760 add esi, 16
761 cmp ebx, eax // all done?
762 jae 8f
763
764 .p2align 4
765 // Continue with the first iteration.
766 0: call mul4
767 add ebx, 16
768 add edi, 16
769 cmp ebx, eax // all done?
770 jb 0b
771
772 // Write out the leftover carry. There can be no tail here.
773 8: call carryprop
774 cmp esi, [BP + 36] // more passes to do?
775 jae 9f
776
777 .p2align 4
778 // Set up for the next pass.
779 1: movdqu xmm0, [esi] // bv[i]
780 mov edi, ecx // -> dv[i]
781 pxor xmm7, xmm7
782 expand xmm7, xmm0, xmm1
783 mov ebx, [BP + 24] // -> av[0]
784 movdqa [SP + 0], xmm0 // bv[i] expanded low
785 movdqa [SP + 16], xmm1 // bv[i] expanded high
786 call mla4zc
787 add edi, 16
788 add ebx, 16
789 add ecx, 16
790 add esi, 16
791 cmp ebx, eax // done yet?
792 jae 8f
793
794 .p2align 4
795 // Continue...
796 0: call mla4
797 add ebx, 16
798 add edi, 16
799 cmp ebx, eax
800 jb 0b
801
802 // Finish off this pass. There was no tail on the previous pass, and
803 // there can be none on this pass.
804 8: call carryprop
805 cmp esi, [BP + 36]
806 jb 1b
807
808 // All over.
809 9: dropfp
810 pop edi
811 pop esi
812 pop ebx
813 pop BP
814 ret
815 ENDFUNC
816
817 FUNC(mpxmont_mul4_x86_avx)
818 .arch .avx
819 vzeroupper
820 endprologue
821 // and drop through...
822 .arch pentium4
823 ENDFUNC
824
825 FUNC(mpxmont_mul4_x86_sse2)
826 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
827 // const mpw *nv, size_t n, const mpw *mi);
828
829 // Build a stack frame. Arguments will be relative to BP, as
830 // follows.
831 //
832 // BP + 20 dv
833 // BP + 24 av
834 // BP + 28 bv
835 // BP + 32 nv
836 // BP + 36 n (nonzero multiple of 4)
837 // BP + 40 mi
838 //
839 	// Locals are relative to SP, which is 16-byte aligned, as follows.
840 //
841 // SP + 0 expanded V (32 bytes)
842 // SP + 32 expanded M (32 bytes)
843 // SP + 64 expanded Y (32 bytes)
844 // SP + 96 outer loop dv
845 // SP + 100 outer loop bv
846 // SP + 104 av limit (mostly in ESI)
847 // SP + 108 bv limit
848 // SP + 112 (top of locals)
849 pushreg BP
850 pushreg ebx
851 pushreg esi
852 pushreg edi
853 setfp
854 stalloc 112
855 and SP, ~15
856 endprologue
857
858 // Establish the expanded operands.
859 pxor xmm7, xmm7
860 mov ecx, [BP + 28] // -> bv
861 mov edx, [BP + 40] // -> mi
862 movdqu xmm0, [ecx] // bv[0]
863 movdqu xmm2, [edx] // mi
864 expand xmm7, xmm0, xmm1, xmm2, xmm3
865 movdqa [SP + 0], xmm0 // bv[0] expanded low
866 movdqa [SP + 16], xmm1 // bv[0] expanded high
867 movdqa [SP + 32], xmm2 // mi expanded low
868 movdqa [SP + 48], xmm3 // mi expanded high
869
870 // Set up the outer loop state and prepare for the first iteration.
871 mov edx, [BP + 36] // n
872 mov eax, [BP + 24] // -> U = av[0]
873 mov ebx, [BP + 32] // -> X = nv[0]
874 mov edi, [BP + 20] // -> Z = dv[0]
875 mov [SP + 100], ecx
876 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
877 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
878 mov [SP + 96], edi
879 mov [SP + 104], edx
880 mov [SP + 108], ecx
881 lea ecx, [SP + 0] // -> expanded V = bv[0]
882 lea esi, [SP + 32] // -> expanded M = mi
883 lea edx, [SP + 64] // -> space for Y
884 call mmul4
885 mov esi, [SP + 104] // recover av limit
886 add edi, 16
887 add eax, 16
888 add ebx, 16
889 cmp eax, esi // done already?
890 jae 8f
891 mov [SP + 96], edi
892
893 .p2align 4
894 // Complete the first inner loop.
895 0: call dmul4
896 add edi, 16
897 add eax, 16
898 add ebx, 16
899 cmp eax, esi // done yet?
900 jb 0b
901
902 // Still have carries left to propagate.
903 call carryprop
904 movd [edi + 16], xmm4
905
906 .p2align 4
907 	// Embark on the next iteration.  (There must be one.  If n = 4, then
908 	// we would have bailed above, to label 8.  Similarly, the subsequent
909 // iterations can fall into the inner loop immediately.)
910 1: mov eax, [SP + 100] // -> bv[i - 1]
911 mov edi, [SP + 96] // -> Z = dv[i]
912 add eax, 16 // -> bv[i]
913 pxor xmm7, xmm7
914 mov [SP + 100], eax
915 cmp eax, [SP + 108] // done yet?
916 jae 9f
917 movdqu xmm0, [eax] // bv[i]
918 mov ebx, [BP + 32] // -> X = nv[0]
919 lea esi, [SP + 32] // -> expanded M = mi
920 mov eax, [BP + 24] // -> U = av[0]
921 expand xmm7, xmm0, xmm1
922 movdqa [SP + 0], xmm0 // bv[i] expanded low
923 movdqa [SP + 16], xmm1 // bv[i] expanded high
924 call mmla4
925 mov esi, [SP + 104] // recover av limit
926 add edi, 16
927 add eax, 16
928 add ebx, 16
929 mov [SP + 96], edi
930
931 .p2align 4
932 // Complete the next inner loop.
933 0: call dmla4
934 add edi, 16
935 add eax, 16
936 add ebx, 16
937 cmp eax, esi
938 jb 0b
939
940 // Still have carries left to propagate, and they overlap the
941 // previous iteration's final tail, so read that in and add it.
942 movd xmm0, [edi]
943 paddq xmm4, xmm0
944 call carryprop
945 movd [edi + 16], xmm4
946
947 // Back again.
948 jmp 1b
949
950 // First iteration was short. Write out the carries and we're done.
951 // (This could be folded into the main loop structure, but that would
952 // penalize small numbers more.)
953 8: call carryprop
954 movd [edi + 16], xmm4
955
956 // All done.
957 9: dropfp
958 popreg edi
959 popreg esi
960 popreg ebx
961 popreg BP
962 ret
963 ENDFUNC
964
965 FUNC(mpxmont_redc4_x86_avx)
966 .arch .avx
967 vzeroupper
968 endprologue
969 // and drop through...
970 .arch pentium4
971 ENDFUNC
972
973 FUNC(mpxmont_redc4_x86_sse2)
974 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
975 // size_t n, const mpw *mi);
976
977 // Build a stack frame. Arguments will be relative to BP, as
978 // follows.
979 //
980 // BP + 20 dv
981 // BP + 24 dvl
982 // BP + 28 nv
983 // BP + 32 n (nonzero multiple of 4)
984 // BP + 36 mi
985 //
986 // Locals are relative to SP, as follows.
987 //
988 // SP + 0 outer loop dv
989 // SP + 4 outer dv limit
990 // SP + 8 blocks-of-4 dv limit
991 // SP + 12 expanded M (32 bytes)
992 // SP + 44 expanded Y (32 bytes)
993 // SP + 76 (top of locals)
994 pushreg BP
995 pushreg ebx
996 pushreg esi
997 pushreg edi
998 setfp
999 and SP, ~15
1000 stalloc 76
1001 endprologue
1002
1003 // Establish the expanded operands and the blocks-of-4 dv limit.
1004 mov edi, [BP + 20] // -> Z = dv[0]
1005 pxor xmm7, xmm7
1006 mov eax, [BP + 24] // -> dv[n] = dv limit
1007 sub eax, edi // length of dv in bytes
1008 mov edx, [BP + 36] // -> mi
1009 movdqu xmm0, [edx] // mi
1010 and eax, ~15 // mask off the tail end
1011 expand xmm7, xmm0, xmm1
1012 add eax, edi // find limit
1013 movdqa [SP + 12], xmm0 // mi expanded low
1014 movdqa [SP + 28], xmm1 // mi expanded high
1015 mov [SP + 8], eax
1016
1017 // Set up the outer loop state and prepare for the first iteration.
1018 mov ecx, [BP + 32] // n
1019 mov ebx, [BP + 28] // -> X = nv[0]
1020 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
1021 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
1022 mov [SP + 0], edi
1023 mov [SP + 4], edx
1024 lea esi, [SP + 12] // -> expanded M = mi
1025 lea edx, [SP + 44] // -> space for Y
1026 call mont4
1027 add ebx, 16
1028 add edi, 16
1029 cmp ebx, ecx // done already?
1030 jae 8f
1031
1032 .p2align 4
1033 // Complete the first inner loop.
1034 5: call mla4
1035 add ebx, 16
1036 add edi, 16
1037 cmp ebx, ecx // done yet?
1038 jb 5b
1039
1040 // Still have carries left to propagate.
1041 8: carryadd
1042 mov esi, [SP + 8] // -> dv blocks limit
1043 mov edx, [BP + 24] // dv limit
1044 	psllq	xmm7, 16		// (a_3 b; 0)
1045 	pslldq	xmm7, 8			// (0; a_3 b)
1046 	paddq	xmm6, xmm7		// (c'_2 + a_2; c''_2 + a_3 b)
1047 call carryprop
1048 movd eax, xmm4
1049 add edi, 16
1050 cmp edi, esi
1051 jae 7f
1052
1053 .p2align 4
1054 // Continue carry propagation until the end of the buffer.
1055 0: add [edi], eax
1056 mov eax, 0 // preserves flags
1057 adcd [edi + 4], 0
1058 adcd [edi + 8], 0
1059 adcd [edi + 12], 0
1060 adc eax, 0
1061 add edi, 16
1062 cmp edi, esi
1063 jb 0b
1064
1065 // Deal with the tail end.
1066 7: add [edi], eax
1067 mov eax, 0 // preserves flags
1068 add edi, 4
1069 adc eax, 0
1070 cmp edi, edx
1071 jb 7b
1072
1073 	// All done for this iteration.  Start the next.  (Any follow-on
1074 	// pass implies n >= 8, and therefore has at least one inner-loop
1075 	// step, so it's safe to jump straight back in at 5b below.)
1076 8: mov edi, [SP + 0] // -> dv[i - 1]
1077 mov ebx, [BP + 28] // -> X = nv[0]
1078 lea edx, [SP + 44] // -> space for Y
1079 lea esi, [SP + 12] // -> expanded M = mi
1080 add edi, 16 // -> Z = dv[i]
1081 cmp edi, [SP + 4] // all done yet?
1082 jae 9f
1083 mov [SP + 0], edi
1084 call mont4
1085 add edi, 16
1086 add ebx, 16
1087 jmp 5b
1088
1089 // All over.
1090 9: dropfp
1091 popreg edi
1092 popreg esi
1093 popreg ebx
1094 popreg BP
1095 ret
1096 ENDFUNC
1097
1098 ///--------------------------------------------------------------------------
1099 /// Testing and performance measurement.
1100
1101 #ifdef TEST_MUL4
1102
1103 .macro cysetup c
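	// Start a cycle count: stash the 64-bit timestamp counter at [C].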
1104 rdtsc
1105 mov [\c], eax
1106 mov [\c + 4], edx
1107 .endm
1108
1109 .macro cystore c, v, n
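	// Finish a cycle count: subtract the starting timestamp at [C];
	// decrement the iteration counter at [N], setting ZF when it
	// reaches zero; and store the 64-bit difference in the buffer at
	// [V], indexed by the decremented counter.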
1110 rdtsc
1111 sub eax, [\c]
1112 sbb edx, [\c + 4]
1113 mov ebx, [\v]
1114 mov ecx, [\n]
1115 dec ecx
1116 mov [\n], ecx
1117 mov [ebx + ecx*8], eax
1118 mov [ebx + ecx*8 + 4], edx
1119 .endm
1120
1121 .macro testprologue n
1122 pushreg BP
1123 pushreg ebx
1124 pushreg esi
1125 pushreg edi
1126 setfp
1127 stalloc 3*32 + 4*4
1128 and SP, ~15
1129 endprologue
1130 mov eax, \n
1131 mov [SP + 104], eax
1132 // vars:
1133 // SP + 0 = v expanded
1134 // SP + 32 = y expanded
1135 // SP + 64 = ? expanded
1136 // SP + 96 = cycles
1137 // SP + 104 = count
1138 .endm
1139
1140 .macro testepilogue
1141 dropfp
1142 popreg edi
1143 popreg esi
1144 popreg ebx
1145 popreg BP
1146 ret
1147 .endm
1148
1149 .macro testldcarry c
1150 mov ecx, \c // -> c
1151 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1152 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1153 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
1154 .endm
1155
1156 .macro testexpand v=nil, y=nil
1157 pxor xmm7, xmm7
1158 .ifnes "\v", "nil"
1159 mov ecx, \v
1160 movdqu xmm0, [ecx]
1161 expand xmm7, xmm0, xmm1
1162 movdqa [SP + 0], xmm0
1163 movdqa [SP + 16], xmm1
1164 .endif
1165 .ifnes "\y", "nil"
1166 mov edx, \y
1167 movdqu xmm2, [edx]
1168 expand xmm7, xmm2, xmm3
1169 movdqa [SP + 32], xmm2
1170 movdqa [SP + 48], xmm3
1171 .endif
1172 .endm
1173
1174 .macro testtop u=nil, x=nil, mode=nil
1175 .p2align 4
1176 0:
1177 .ifnes "\u", "nil"
1178 lea ecx, [SP + 0]
1179 .endif
1180 mov ebx, \x
1181 .ifeqs "\mode", "mont"
1182 lea esi, [SP + 32]
1183 .endif
1184 cysetup SP + 96
1185 .ifnes "\u", "nil"
1186 mov eax, \u
1187 .endif
1188 .ifeqs "\mode", "mont"
1189 lea edx, [SP + 64]
1190 .else
1191 lea edx, [SP + 32]
1192 .endif
1193 .endm
1194
1195 .macro testtail cyv
1196 cystore SP + 96, \cyv, SP + 104
1197 jnz 0b
1198 .endm
1199
1200 .macro testcarryout c
1201 mov ecx, \c
1202 movdqu [ecx + 0], xmm4
1203 movdqu [ecx + 16], xmm5
1204 movdqu [ecx + 32], xmm6
1205 .endm
1206
1207 FUNC(test_dmul4)
1208 testprologue [BP + 44]
1209 testldcarry [BP + 24]
1210 testexpand [BP + 36], [BP + 40]
1211 mov edi, [BP + 20]
1212 testtop [BP + 28], [BP + 32]
1213 call dmul4
1214 testtail [BP + 48]
1215 testcarryout [BP + 24]
1216 testepilogue
1217 ENDFUNC
1218
1219 FUNC(test_dmla4)
1220 testprologue [BP + 44]
1221 testldcarry [BP + 24]
1222 testexpand [BP + 36], [BP + 40]
1223 mov edi, [BP + 20]
1224 testtop [BP + 28], [BP + 32]
1225 call dmla4
1226 testtail [BP + 48]
1227 testcarryout [BP + 24]
1228 testepilogue
1229 ENDFUNC
1230
1231 FUNC(test_mul4)
1232 testprologue [BP + 36]
1233 testldcarry [BP + 24]
1234 testexpand nil, [BP + 32]
1235 mov edi, [BP + 20]
1236 testtop nil, [BP + 28]
1237 call mul4
1238 testtail [BP + 40]
1239 testcarryout [BP + 24]
1240 testepilogue
1241 ENDFUNC
1242
1243 FUNC(test_mul4zc)
1244 testprologue [BP + 36]
1245 testldcarry [BP + 24]
1246 testexpand nil, [BP + 32]
1247 mov edi, [BP + 20]
1248 testtop nil, [BP + 28]
1249 call mul4zc
1250 testtail [BP + 40]
1251 testcarryout [BP + 24]
1252 testepilogue
1253 ENDFUNC
1254
1255 FUNC(test_mla4)
1256 testprologue [BP + 36]
1257 testldcarry [BP + 24]
1258 testexpand nil, [BP + 32]
1259 mov edi, [BP + 20]
1260 testtop nil, [BP + 28]
1261 call mla4
1262 testtail [BP + 40]
1263 testcarryout [BP + 24]
1264 testepilogue
1265 ENDFUNC
1266
1267 FUNC(test_mla4zc)
1268 testprologue [BP + 36]
1269 testldcarry [BP + 24]
1270 testexpand nil, [BP + 32]
1271 mov edi, [BP + 20]
1272 testtop nil, [BP + 28]
1273 call mla4zc
1274 testtail [BP + 40]
1275 testcarryout [BP + 24]
1276 testepilogue
1277 ENDFUNC
1278
1279 FUNC(test_mmul4)
1280 testprologue [BP + 48]
1281 testexpand [BP + 40], [BP + 44]
1282 mov edi, [BP + 20]
1283 testtop [BP + 32], [BP + 36], mont
1284 call mmul4
1285 testtail [BP + 52]
1286 mov edi, [BP + 28]
1287 movdqa xmm0, [SP + 64]
1288 movdqa xmm1, [SP + 80]
1289 movdqu [edi], xmm0
1290 movdqu [edi + 16], xmm1
1291 testcarryout [BP + 24]
1292 testepilogue
1293 ENDFUNC
1294
1295 FUNC(test_mmla4)
1296 testprologue [BP + 48]
1297 testexpand [BP + 40], [BP + 44]
1298 mov edi, [BP + 20]
1299 testtop [BP + 32], [BP + 36], mont
1300 call mmla4
1301 testtail [BP + 52]
1302 mov edi, [BP + 28]
1303 movdqa xmm0, [SP + 64]
1304 movdqa xmm1, [SP + 80]
1305 movdqu [edi], xmm0
1306 movdqu [edi + 16], xmm1
1307 testcarryout [BP + 24]
1308 testepilogue
1309 ENDFUNC
1310
1311 FUNC(test_mont4)
1312 testprologue [BP + 40]
1313 testexpand nil, [BP + 36]
1314 mov edi, [BP + 20]
1315 testtop nil, [BP + 32], mont
1316 call mont4
1317 testtail [BP + 44]
1318 mov edi, [BP + 28]
1319 movdqa xmm0, [SP + 64]
1320 movdqa xmm1, [SP + 80]
1321 movdqu [edi], xmm0
1322 movdqu [edi + 16], xmm1
1323 testcarryout [BP + 24]
1324 testepilogue
1325 ENDFUNC
1326
1327 #endif
1328
1329 ///----- That's all, folks --------------------------------------------------