math/mpx-mul4-x86-sse2.S: Use the correct vector-multiply instruction.
/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Prologue.

        .arch   pentium4
        .text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// result, appropriately shifted, to the accumulated result.  A `finely
/// integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// In both the plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///     Offset     0       4       8      12
///          0   v'_0    v'_1   v''_0   v''_1
///         16   v'_2    v'_3   v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
/// many products together before we must deal with carrying; it also allows
/// for some calculations to be performed on the above expanded form.
///
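/// To make the layout concrete, here is an equivalent C fragment (an
/// illustrative sketch only, not part of this file's build; uint32_t and
/// friends are from <stdint.h>, and v[0], ..., v[3] are the packed words):
///
///     void expand128(const uint32_t v[4], uint32_t lo[4], uint32_t hi[4])
///     {
///       lo[0] = v[0] & 0xffff; lo[1] = v[1] & 0xffff; /* v'_0, v'_1 */
///       lo[2] = v[0] >> 16;    lo[3] = v[1] >> 16;    /* v''_0, v''_1 */
///       hi[0] = v[2] & 0xffff; hi[1] = v[3] & 0xffff; /* v'_2, v'_3 */
///       hi[2] = v[2] >> 16;    hi[3] = v[3] >> 16;    /* v''_2, v''_3 */
///     }
///
/// The `expand' macro below performs this transformation with SSE2 unpacks
/// and shuffles.
///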
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
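///
/// In C terms, one such carry step is (a sketch only, with c0lo/c0hi and
/// c1lo/c1hi naming the 64-bit halves of c0 and c1):
///
///     uint64_t t = c0hi & 0xffffffff;  /* t = c''_0 mod B */
///     uint64_t d = c0lo + (t << 16);   /* d = c'_0 + t b */
///     *z++ = (uint32_t)d;              /* output z = d mod B */
///     c1lo += d >> 32;                 /* add floor(d/B) to c'_1 */
///     c1hi += c0hi >> 32;              /* add floor(c''_0/B) to c''_1 */
///
/// The `propout' macro below implements exactly this with SSE2 shuffles and
/// shifts.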

///--------------------------------------------------------------------------
/// Macro definitions.

.macro mulcore r, s, d0, d1, d2, d3
        // Load a word r_i from R, multiply by the expanded operand [S], and
        // leave the pieces of the product in registers D0, D1, D2, D3.
        movd    \d0, \r                 // (r_i, 0, 0, 0)
  .ifnes "\d1", "nil"
        movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
        movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
  .endif
        pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
  .ifnes "\d1", "nil"
        psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
        movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
    .else
        movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
        psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
        pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
        pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
        pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
    .else
        pmuludq \d2, [\s + 16]
    .endif
  .endif
        pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
.endm
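
// In scalar terms, mulcore computes eight 48-bit partial products using
// four `pmuludq's.  A C sketch of the result (illustrative only; s[] is
// the expanded operand, eight 32-bit cells laid out as in the Theory
// section above):
//
//      uint64_t d0[2], d1[2], d2[2], d3[2];
//      d0[0] = (uint64_t)r*s[0]; d0[1] = (uint64_t)r*s[2]; /* r s'_0, r s''_0 */
//      d1[0] = (uint64_t)r*s[1]; d1[1] = (uint64_t)r*s[3]; /* r s'_1, r s''_1 */
//      d2[0] = (uint64_t)r*s[4]; d2[1] = (uint64_t)r*s[6]; /* r s'_2, r s''_2 */
//      d3[0] = (uint64_t)r*s[5]; d3[1] = (uint64_t)r*s[7]; /* r s'_3, r s''_3 */
//
// Each product is at most 48 bits, leaving sixteen bits of headroom in its
// 64-bit lane.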

.macro accum c0, c1, c2, c3
        paddq   \c0, xmm0
  .ifnes "\c1", "nil"
        paddq   \c1, xmm1
  .endif
  .ifnes "\c2", "nil"
        paddq   \c2, xmm2
  .endif
  .ifnes "\c3", "nil"
        paddq   \c3, xmm3
  .endif
.endm

.macro mulacc r, s, c0, c1, c2, c3, z3p
        // Load a word r_i from R, multiply by the expanded operand [S],
        // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
        // then C3 notionally contains zero, but needs clearing; in practice,
        // we store the product directly rather than attempting to add.  On
        // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
        // is not `t'.
  .ifeqs "\z3p", "t"
        mulcore \r, \s, xmm0, xmm1, xmm2, \c3
        accum   \c0, \c1, \c2, nil
  .else
        mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
        accum   \c0, \c1, \c2, \c3
  .endif
.endm

.macro propout d, c, cc
        // Calculate an output word from C, and store it in D; propagate
        // carries out from C to CC in preparation for a rotation of the
        // carry registers.  On completion, XMM3 is clobbered.  If CC is
        // `nil', then the contribution which would have been added to it is
        // left in C.
        pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
        psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
        pslldq  xmm3, 2                 // (t b, 0)
        paddq   \c, xmm3                // (c' + t b, c'')
        movd    \d, \c
        psrlq   \c, 32                  // floor(c/B)
  .ifnes "\cc", "nil"
        paddq   \cc, \c                 // propagate up
  .endif
.endm

.macro endprop d, c, t
        // On entry, C contains a carry register.  On exit, the low 32 bits
        // of the value represented in C are written to D, and the remaining
        // bits are left at the bottom of T.
        movdqa  \t, \c
        psllq   \t, 16                  // (?, c'' b)
        pslldq  \c, 8                   // (0, c')
        paddq   \t, \c                  // (?, c' + c'' b)
        psrldq  \t, 8                   // c' + c'' b
        movd    \d, \t
        psrldq  \t, 4                   // floor((c' + c'' b)/B)
.endm
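
// Equivalently, in C (a sketch): if clo and chi are the two 64-bit halves
// of the carry register, so that it represents c = clo + (chi << 16), then
//
//      uint64_t c = clo + (chi << 16); /* c' + c'' b */
//      *d = (uint32_t)c;               /* low 32 bits out */
//      t = c >> 32;                    /* floor((c' + c'' b)/B) */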

.macro expand a, b, c, d, z
        // On entry, A and C hold packed 128-bit values, and Z is zero.  On
        // exit, A:B and C:D together hold the same values in expanded
        // form.  If C is `nil', then only expand A to A:B.
        movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
  .ifnes "\c", "nil"
        movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
  .endif
        punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
        punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
  .ifnes "\c", "nil"
        punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
        punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
  .endif
        pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
        pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
  .ifnes "\c", "nil"
        pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
        pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
  .endif
.endm

.macro squash lo, hi, c0, c1, c2, c3, t, u
        // On entry, C0, C1, C2, C3 are carry registers representing a value
        // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
        // C3, T, and U are clobbered; and the high bits of Y are stored in
        // HI, if this is not `nil'.

        // The first step is to eliminate the `double-prime' pieces -- i.e.,
        // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
        // them into the 32-bit-aligned pieces above and below.  But before
        // we can do that, we must gather them together.
        movdqa  \t, \c0
        movdqa  \u, \c1
        punpcklqdq \t, \c2              // (y'_0, y'_2)
        punpckhqdq \c0, \c2             // (y''_0, y''_2)
        punpcklqdq \u, \c3              // (y'_1, y'_3)
        punpckhqdq \c1, \c3             // (y''_1, y''_3)

        // Now split the double-prime pieces.  The high (up to) 48 bits will
        // go up; the low 16 bits go down.
        movdqa  \c2, \c0
        movdqa  \c3, \c1
        psllq   \c2, 48
        psllq   \c3, 48
        psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
        psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
        psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
        psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
  .ifnes "\hi", "nil"
        movdqa  \hi, \c1
  .endif
        pslldq  \c1, 8                  // high part of (0, y''_1)

        paddq   \t, \c2                 // propagate down
        paddq   \u, \c3
        paddq   \t, \c1                 // and up: (y_0, y_2)
        paddq   \u, \c0                 // (y_1, y_3)
  .ifnes "\hi", "nil"
        psrldq  \hi, 8                  // high part of (y''_3, 0)
  .endif

        // Finally extract the answer.  This complicated dance is better than
        // storing to memory and loading, because the piecemeal stores
        // inhibit store forwarding.
        movdqa  \c3, \t                 // (y_0, y_1)
        movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
        psrldq  \t, 8                   // (y_2, 0)
        psrlq   \c3, 32                 // (floor(y_0/B), ?)
        paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
        movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
        psrldq  \u, 8                   // (y_3, 0)
        psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2), ?)
        paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2), ?)
        punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
        psrlq   \c3, 32                 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
        paddq   \c3, \u                 // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
  .ifnes "\hi", "nil"
        movdqa  \t, \c3
        pxor    \u, \u
  .endif
        punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
  .ifnes "\hi", "nil"
        psrlq   \t, 32                  // very high bits of y
        paddq   \hi, \t
        punpcklqdq \hi, \u              // carry up
  .endif
        punpckldq \lo, \c1              // y mod B^4
.endm
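
// The arithmetic performed by squash is equivalent to the following C
// sketch (illustrative only), where (ylo[i], yhi[i]) = (y'_i, y''_i) are
// the 64-bit halves of the four carry registers:
//
//      uint64_t carry = 0;
//      for (int i = 0; i < 4; i++) {
//        uint64_t acc = carry + ylo[i] + ((yhi[i] & 0xffff) << 16);
//        z[i] = (uint32_t)acc;                 /* y^*_i */
//        carry = (acc >> 32) + (yhi[i] >> 16); /* propagate up */
//      }
//      /* on exit, carry holds the high bits of Y */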

.macro carryadd
        // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
        // hold the incoming carry registers c0, c1, and c2 representing a
        // carry-in C.
        //
        // On exit, the carry registers, including XMM7, are updated to hold
        // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
        // registers are preserved.
        movd    xmm0, [edi +  0]        // (a_0, 0)
        movd    xmm1, [edi +  4]        // (a_1, 0)
        movd    xmm2, [edi +  8]        // (a_2, 0)
        movd    xmm7, [edi + 12]        // (a_3, 0)

        paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
        paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
        paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
        // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
        // form.  Store the low 128 bits of the represented carry to [EDI] as
        // a packed 128-bit value, and leave the remaining 16 bits in the low
        // 32 bits of XMM4.  On exit, XMM3, XMM5, and XMM6 are clobbered.
        endprologue

        propout [edi +  0], xmm4, xmm5
        propout [edi +  4], xmm5, xmm6
        propout [edi +  8], xmm6, nil
        endprop [edi + 12], xmm6, xmm4
        ret

ENDFUNC

INTFUNC(dmul4)
        // On entry, EDI points to the destination buffer; EAX and EBX point
        // to the packed operands U and X; ECX and EDX point to the expanded
        // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
        // registers c0, c1, and c2; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + U V + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
        propout [edi +  4], xmm5, xmm6

        mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
        propout [edi +  8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(dmla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EAX and EBX point to the
        // packed operands U and X; ECX and EDX point to the expanded
        // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
        // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
        // to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        carryadd

        mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
        propout [edi +  4], xmm5, xmm6

        mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
        propout [edi +  8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mul4zc)
        // On entry, EDI points to the destination buffer; EBX points to a
        // packed operand X; and EDX points to an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the product X Y to [EDI],
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mul4)
        // On entry, EDI points to the destination buffer; EBX points to a
        // packed operand X; EDX points to an expanded operand Y; and XMM4,
        // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
        // and update the carry registers with the carry out.  The registers
        // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mla4zc)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EBX points to a packed operand
        // X; and EDX points to an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]

        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EBX points to a packed operand
        // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
        // the incoming carry registers c0, c1, and c2, representing a
        // carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        carryadd

        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

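// A note on the Montgomery primitives below.  For each four-word block,
// they find a factor y which makes the running total divisible by B^4.
// As a sketch of the underlying arithmetic (illustrative only; w is the
// four-word value at the destination, N is the modulus, and the expanded
// operand M is assumed to hold -N^{-1} mod B^4, as supplied through the
// external functions' `mi' argument):
//
//      y = w M mod B^4         so that w + N y == 0 (mod B^4)
//      w = w + N y             low four words vanish, leaving only carries
//
// mmul4 folds the block multiplication w = u v into the same pass, and
// mmla4 additionally accumulates an addend already at the destination;
// mmul4 shares its tail with mmla4 by jumping to label 5 below.
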
INTFUNC(mmul4)
        // On entry, EDI points to the destination buffer; EAX and EBX point
        // to the packed operands U and N; ECX and ESI point to the expanded
        // operands V and M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
        // must be 16-byte aligned.  (This is not the usual convention, which
        // requires alignment before the call.)
        //
        // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
        // of the sum U V + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
        stalloc 48                      // space for the carries
        endprologue

        // Calculate W = U V, and leave it in the destination.  Stash the
        // carry pieces for later.
        mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
        propout [edi + 0], xmm4, xmm5
        jmp     5f

ENDFUNC

INTFUNC(mmla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EAX and EBX point to the
        // packed operands U and N; ECX and ESI point to the expanded
        // operands V and M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
        // must be 16-byte aligned.  (This is not the usual convention, which
        // requires alignment before the call.)
        //
        // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
        // bits of the sum A + U V + N Y to [EDI], leaving the remaining
        // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
        // XMM3, and XMM7 are clobbered; the general-purpose registers are
        // preserved.
        stalloc 48                      // space for the carries
        endprologue

        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]
        mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        movdqa  [esp +  0], xmm4
        movdqa  [esp + 16], xmm5
        movdqa  [esp + 32], xmm6

        // Calculate Y = W M.
        mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7

        mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
        accum   xmm5, xmm6, xmm7, nil

        mulcore [edi +  8], esi, xmm0, xmm1, nil, nil
        accum   xmm6, xmm7, nil, nil

        mulcore [edi + 12], esi, xmm0, nil, nil, nil
        accum   xmm7, nil, nil, nil

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

        // Expand it.
        pxor    xmm2, xmm2
        expand  xmm4, xmm1, nil, nil, xmm2
        movdqa  [edx +  0], xmm4
        movdqa  [edx + 16], xmm1

        // Initialize the carry from the value for W we calculated earlier.
        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]

        // Finish the calculation by adding the Montgomery product.
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        // Add on the carry we calculated earlier.
        paddq   xmm4, [esp +  0]
        paddq   xmm5, [esp + 16]
        paddq   xmm6, [esp + 32]

        // And, with that, we're done.
        stfree  48
        ret

ENDFUNC

INTFUNC(mont4)
        // On entry, EDI points to the destination buffer holding a packed
        // value W; EBX points to a packed operand N; ESI points to an
        // expanded operand M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).
        //
        // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
        // of the sum W + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
        endprologue

        // Calculate Y = W M.
        mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7

        mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
        accum   xmm5, xmm6, xmm7, nil

        mulcore [edi +  8], esi, xmm0, xmm1, nil, nil
        accum   xmm6, xmm7, nil, nil

        mulcore [edi + 12], esi, xmm0, nil, nil, nil
        accum   xmm7, nil, nil, nil

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

        // Expand it.
        pxor    xmm2, xmm2
        expand  xmm4, xmm1, nil, nil, xmm2
        movdqa  [edx +  0], xmm4
        movdqa  [edx + 16], xmm1

        // Initialize the carry from W.
        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]

        // Finish the calculation by adding the Montgomery product.
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        // And, with that, we're done.
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

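// In portable C, mpx_umul4_x86_sse2 computes the same thing as the
// following operand-scanning sketch (illustrative only; mpw assumed to be
// 32 bits wide, m = bvl - bv and n = avl - av both multiples of four, and
// dv assumed zero-initialized here, whereas the code below instead writes
// its first pass directly):
//
//      for (i = 0; i < m; i++) {
//        uint64_t carry = 0;
//        for (j = 0; j < n; j++) {
//          carry += (uint64_t)av[j]*bv[i] + dv[i + j];
//          dv[i + j] = (uint32_t)carry; carry >>= 32;
//        }
//        dv[i + n] = (uint32_t)carry;
//      }
//
// except that bv is consumed four words at a time, with each block
// expanded as described in the Theory section.
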
FUNC(mpx_umul4_x86_sse2)
        // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        av
        //      ebp + 28        avl
        //      ebp + 32        bv
        //      ebp + 36        bvl
        //
        // Locals are relative to ESP, as follows.
        //
        //      esp +  0        expanded Y (32 bytes)
        //      esp + 32        (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 32
        endprologue

        // Prepare for the first iteration.
        mov     esi, [ebp + 32]         // -> bv[0]
        pxor    xmm7, xmm7
        movdqu  xmm0, [esi]             // bv[0]
        mov     edi, [ebp + 20]         // -> dv[0]
        mov     ecx, edi                // outer loop dv cursor
        expand  xmm0, xmm1, nil, nil, xmm7
        mov     ebx, [ebp + 24]         // -> av[0]
        mov     eax, [ebp + 28]         // -> av[m] = av limit
        mov     edx, esp                // -> expanded Y = bv[0]
        movdqa  [esp +  0], xmm0        // bv[0] expanded low
        movdqa  [esp + 16], xmm1        // bv[0] expanded high
        call    mul4zc
        add     ebx, 16
        add     edi, 16
        add     ecx, 16
        add     esi, 16
        cmp     ebx, eax                // all done?
        jae     8f

        .p2align 4
        // Continue with the first iteration.
0:      call    mul4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, eax                // all done?
        jb      0b

        // Write out the leftover carry.  There can be no tail here.
8:      call    carryprop
        cmp     esi, [ebp + 36]         // more passes to do?
        jae     9f

        .p2align 4
        // Set up for the next pass.
1:      movdqu  xmm0, [esi]             // bv[i]
        mov     edi, ecx                // -> dv[i]
        pxor    xmm7, xmm7
        expand  xmm0, xmm1, nil, nil, xmm7
        mov     ebx, [ebp + 24]         // -> av[0]
        movdqa  [esp +  0], xmm0        // bv[i] expanded low
        movdqa  [esp + 16], xmm1        // bv[i] expanded high
        call    mla4zc
        add     edi, 16
        add     ebx, 16
        add     ecx, 16
        add     esi, 16
        cmp     ebx, eax                // done yet?
        jae     8f

        .p2align 4
        // Continue...
0:      call    mla4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, eax
        jb      0b

        // Finish off this pass.  There was no tail on the previous pass, and
        // there can be none on this pass.
8:      call    carryprop
        cmp     esi, [ebp + 36]
        jb      1b

        // All over.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
        // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                            const mpw *nv, size_t n, const mpw *mi);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        av
        //      ebp + 28        bv
        //      ebp + 32        nv
        //      ebp + 36        n (nonzero multiple of 4)
        //      ebp + 40        mi
        //
        // Locals are relative to ESP, which is 4 mod 16, as follows.
        //
        //      esp +   0       outer loop dv
        //      esp +   4       outer loop bv
        //      esp +   8       av limit (mostly in ESI)
        //      esp +  12       expanded V (32 bytes)
        //      esp +  44       expanded M (32 bytes)
        //      esp +  76       expanded Y (32 bytes)
        //      esp + 108       bv limit
        //      esp + 112       (gap)
        //      esp + 124       (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 124
        endprologue

        // Establish the expanded operands.
        pxor    xmm7, xmm7
        mov     ecx, [ebp + 28]         // -> bv
        mov     edx, [ebp + 40]         // -> mi
        movdqu  xmm0, [ecx]             // bv[0]
        movdqu  xmm2, [edx]             // mi
        expand  xmm0, xmm1, xmm2, xmm3, xmm7
        movdqa  [esp + 12], xmm0        // bv[0] expanded low
        movdqa  [esp + 28], xmm1        // bv[0] expanded high
        movdqa  [esp + 44], xmm2        // mi expanded low
        movdqa  [esp + 60], xmm3        // mi expanded high

        // Set up the outer loop state and prepare for the first iteration.
        mov     edx, [ebp + 36]         // n
        mov     eax, [ebp + 24]         // -> U = av[0]
        mov     ebx, [ebp + 32]         // -> X = nv[0]
        mov     edi, [ebp + 20]         // -> Z = dv[0]
        mov     [esp + 4], ecx
        lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
        lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
        mov     [esp + 0], edi
        mov     [esp + 108], ecx
        mov     [esp + 8], edx
        lea     ecx, [esp + 12]         // -> expanded V = bv[0]
        lea     esi, [esp + 44]         // -> expanded M = mi
        lea     edx, [esp + 76]         // -> space for Y
        call    mmul4
        mov     esi, [esp + 8]          // recover av limit
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi                // done already?
        jae     8f
        mov     [esp + 0], edi

        .p2align 4
        // Complete the first inner loop.
0:      call    dmul4
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi                // done yet?
        jb      0b

        // Still have carries left to propagate.
        call    carryprop
        movd    [edi + 16], xmm4

        .p2align 4
        // Embark on the next iteration.  (There must be one.  If n = 1, then
        // we would have bailed above, to label 8.  Similarly, the subsequent
        // iterations can fall into the inner loop immediately.)
1:      mov     eax, [esp + 4]          // -> bv[i - 1]
        mov     edi, [esp + 0]          // -> Z = dv[i]
        add     eax, 16                 // -> bv[i]
        pxor    xmm7, xmm7
        movdqu  xmm0, [eax]             // bv[i]
        mov     [esp + 4], eax
        cmp     eax, [esp + 108]        // done yet?
        jae     9f
        mov     ebx, [ebp + 32]         // -> X = nv[0]
        lea     esi, [esp + 44]         // -> expanded M = mi
        mov     eax, [ebp + 24]         // -> U = av[0]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 12], xmm0        // bv[i] expanded low
        movdqa  [esp + 28], xmm1        // bv[i] expanded high
        call    mmla4
        mov     esi, [esp + 8]          // recover av limit
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        mov     [esp + 0], edi

        .p2align 4
        // Complete the next inner loop.
0:      call    dmla4
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi
        jb      0b

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that in and add it.
        movd    xmm0, [edi]
        paddq   xmm4, xmm0
        call    carryprop
        movd    [edi + 16], xmm4

        // Back again.
        jmp     1b

        // First iteration was short.  Write out the carries and we're done.
        // (This could be folded into the main loop structure, but that would
        // penalize small numbers more.)
8:      call    carryprop
        movd    [edi + 16], xmm4

        // All done.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
        // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        dvl
        //      ebp + 28        nv
        //      ebp + 32        n (nonzero multiple of 4)
        //      ebp + 36        mi
        //
        // Locals are relative to ESP, as follows.
        //
        //      esp +  0        outer loop dv
        //      esp +  4        outer dv limit
        //      esp +  8        blocks-of-4 dv limit
        //      esp + 12        expanded M (32 bytes)
        //      esp + 44        expanded Y (32 bytes)
        //      esp + 76        (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 76
        endprologue

        // Establish the expanded operands and the blocks-of-4 dv limit.
        mov     edi, [ebp + 20]         // -> Z = dv[0]
        pxor    xmm7, xmm7
        mov     eax, [ebp + 24]         // -> dv[n] = dv limit
        sub     eax, edi                // length of dv in bytes
        mov     edx, [ebp + 36]         // -> mi
        movdqu  xmm0, [edx]             // mi
        and     eax, ~15                // mask off the tail end
        expand  xmm0, xmm1, nil, nil, xmm7
        add     eax, edi                // find limit
        movdqa  [esp + 12], xmm0        // mi expanded low
        movdqa  [esp + 28], xmm1        // mi expanded high
        mov     [esp + 8], eax

        // Set up the outer loop state and prepare for the first iteration.
        mov     ecx, [ebp + 32]         // n
        mov     ebx, [ebp + 28]         // -> X = nv[0]
        lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
        lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
        mov     [esp + 0], edi
        mov     [esp + 4], edx
        lea     esi, [esp + 12]         // -> expanded M = mi
        lea     edx, [esp + 44]         // -> space for Y
        call    mont4
        add     edi, 16
        add     ebx, 16
        cmp     ebx, ecx                // done already?
        jae     8f

        .p2align 4
        // Complete the first inner loop.
5:      call    mla4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, ecx                // done yet?
        jb      5b

        // Still have carries left to propagate.
8:      carryadd
        mov     esi, [esp + 8]          // -> dv blocks limit
        mov     edx, [ebp + 24]         // dv limit
        psllq   xmm7, 16                // (a_3 b, 0)
        pslldq  xmm7, 8                 // (0, a_3 b)
        paddq   xmm6, xmm7              // (c'_2 + a_2, c''_2 + a_3 b)
        call    carryprop
        movd    eax, xmm4
        add     edi, 16
        cmp     edi, esi
        jae     7f

        .p2align 4
        // Continue carry propagation until the end of the buffer.
0:      add     [edi], eax
        mov     eax, 0                  // preserves flags
        adcd    [edi + 4], 0
        adcd    [edi + 8], 0
        adcd    [edi + 12], 0
        adc     eax, 0
        add     edi, 16
        cmp     edi, esi
        jb      0b

        // Deal with the tail end.
7:      add     [edi], eax
        mov     eax, 0                  // preserves flags
        add     edi, 4
        adc     eax, 0
        cmp     edi, edx
        jb      7b

        // All done for this iteration.  Start the next.  (This must have at
        // least one follow-on iteration, or we'd not have started this outer
        // loop.)
8:      mov     edi, [esp + 0]          // -> dv[i - 1]
        mov     ebx, [ebp + 28]         // -> X = nv[0]
        lea     edx, [esp + 44]         // -> space for Y
        lea     esi, [esp + 12]         // -> expanded M = mi
        add     edi, 16                 // -> Z = dv[i]
        cmp     edi, [esp + 4]          // all done yet?
        jae     9f
        mov     [esp + 0], edi
        call    mont4
        add     edi, 16
        add     ebx, 16
        jmp     5b

        // All over.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro cysetup c
        rdtsc
        mov     [\c], eax
        mov     [\c + 4], edx
.endm

.macro cystore c, v, n
        rdtsc
        sub     eax, [\c]
        sbb     edx, [\c + 4]
        mov     ebx, [\v]
        mov     ecx, [\n]
        dec     ecx
        mov     [\n], ecx
        mov     [ebx + ecx*8], eax
        mov     [ebx + ecx*8 + 4], edx
.endm

.macro testprologue
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 3*32 + 12
        endprologue
        // vars:
        //      esp +  0 = cycles
        //      esp + 12 = v expanded
        //      esp + 44 = y expanded
        //      esp + 76 = expanded output Y (mont tests)
.endm

.macro testepilogue
        dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret
.endm

.macro testldcarry c
        mov     ecx, \c                 // -> c
        movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
        movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
        movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
.endm

.macro testexpand v, y
        pxor    xmm7, xmm7
  .ifnes "\v", "nil"
        mov     ecx, \v
        movdqu  xmm0, [ecx]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 12], xmm0
        movdqa  [esp + 28], xmm1
  .endif
  .ifnes "\y", "nil"
        mov     edx, \y
        movdqu  xmm2, [edx]
        expand  xmm2, xmm3, nil, nil, xmm7
        movdqa  [esp + 44], xmm2
        movdqa  [esp + 60], xmm3
  .endif
.endm

.macro testtop u, x, mode
        .p2align 4
0:
  .ifnes "\u", "nil"
        lea     ecx, [esp + 12]
  .endif
        mov     ebx, \x
  .ifeqs "\mode", "mont"
        lea     esi, [esp + 44]
  .endif
        cysetup esp + 0
  .ifnes "\u", "nil"
        mov     eax, \u
  .endif
  .ifeqs "\mode", "mont"
        lea     edx, [esp + 76]
  .else
        lea     edx, [esp + 44]
  .endif
.endm

.macro testtail cyv, n
        cystore esp + 0, \cyv, \n
        jnz     0b
.endm

.macro testcarryout c
        mov     ecx, \c
        movdqu  [ecx +  0], xmm4
        movdqu  [ecx + 16], xmm5
        movdqu  [ecx + 32], xmm6
.endm

FUNC(test_dmul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmul4
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_dmla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmla4
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mul4
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mla4
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mmul4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmul4
        testtail [ebp + 52], [ebp + 48]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mmla4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmla4
        testtail [ebp + 52], [ebp + 48]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mont4)
        testprologue
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 32], mont
        call    mont4
        testtail [ebp + 44], [ebp + 40]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------