/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Prologue.

	.arch	pentium4
	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the accumulated result.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
///
/// In both the plain-multiplication and Montgomery stages, then, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	   0	   4	   8	  12
///	    0	v'_0	v'_1	v''_0	v''_1
///	   16	v'_2	v'_3	v''_2	v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0, (implicitly)
/// zeroed, becomes c3.

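/// As a concrete check on the layout above, here is a rough C model of the
/// expansion step.  It is illustrative only, and not part of the build; the
/// function name is ad hoc, and `uint32_t' stands in for the 32-bit cells.
///
///	#include <stdint.h>
///
///	/* Split each v_i into 16-bit halves and lay the pieces out as the
///	 * two 128-bit SSE operands tabulated above: z[0..3] is the operand
///	 * at offset 0, and z[4..7] the operand at offset 16.
///	 */
///	void expand4(uint32_t z[8], const uint32_t v[4])
///	{
///		for (int i = 0; i < 4; i++) {
///			z[4*(i/2) + i%2 + 0] = v[i]&0xffff;	/* v'_i */
///			z[4*(i/2) + i%2 + 2] = v[i] >> 16;	/* v''_i */
///		}
///	}
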
///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0, 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1, s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3, s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0, s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3, ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?, r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2, s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1, r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3, r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2, r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
.endm

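/// For reference, a rough C model of what `mulcore' computes (illustrative
/// only; the ad hoc `sselane' structure models one SSE register as its two
/// 64-bit lanes, and the S array uses the layout produced by `expand4' in
/// the sketch above).
///
///	typedef struct { uint64_t lo, hi; } sselane;
///
///	/* d_j = (r s'_j, r s''_j), for j = 0, 1, 2, 3; each product is at
///	 * most 48 bits wide, leaving 16 bits of headroom in each lane.
///	 */
///	void mulcore_model(sselane d[4], uint32_t r, const uint32_t s[8])
///	{
///		for (int j = 0; j < 4; j++) {
///			d[j].lo = (uint64_t)r*s[4*(j/2) + j%2 + 0];
///			d[j].hi = (uint64_t)r*s[4*(j/2) + j%2 + 2];
///		}
///	}
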
.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0, 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b, 0)
	paddq	\c, xmm3		// (c' + t b, c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm

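/// The carry step `propout' performs is the recipe from the end of the
/// `Theory' section.  A rough C model, illustrative only, using the
/// `sselane' type from the sketch above (when CC is `nil' the macro instead
/// leaves the carry-out in C):
///
///	/* Emit one output word from the carry c = (c', c''), and add the
///	 * carry-out into *cc; return the output word z.
///	 */
///	uint32_t propout_model(sselane *c, sselane *cc)
///	{
///		uint64_t t = c->hi&0xffffffff;	/* t = c'' mod B */
///		uint64_t d = c->lo + (t << 16);	/* d = c' + t b */
///		cc->lo += d >> 32;		/* floor(d/B) */
///		cc->hi += c->hi >> 32;		/* floor(c''/B) */
///		return ((uint32_t)d);		/* z = d mod B */
///	}
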
.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?, c'' b)
	pslldq	\c, 8			// (0, c')
	paddq	\t, \c			// (?, c' + c'' b)
	psrldq	\t, 8			// c' + c'' b
	movd	\d, \t
	psrldq	\t, 4			// floor((c' + c'' b)/B)
.endm

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1, a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1, c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0, a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2, a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0, c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2, c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
  .endif
.endm

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0, y'_2)
	punpckhqdq \c0, \c2		// (y''_0, y''_2)
	punpcklqdq \u, \c3		// (y'_1, y'_3)
	punpckhqdq \c1, \c3		// (y''_1, y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0, y''_2)
	psrlq	\c1, 16			// high parts of (y''_1, y''_3)
	psrlq	\c2, 32			// low parts of (y''_0, y''_2)
	psrlq	\c3, 32			// low parts of (y''_1, y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0, y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0, y_2)
	paddq	\u, \c0			// (y_1, y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3, 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0, y_1)
	movdqa	\lo, \t			// (y^*_0, ?, ?, ?)
	psrldq	\t, 8			// (y_2, 0)
	psrlq	\c3, 32			// (floor(y_0/B), ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B), ?)
	movdqa	\c1, \c3		// (y^*_1, ?, ?, ?)
	psrldq	\u, 8			// (y_3, 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2), ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2), ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2, ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3, ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm

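/// In arithmetic terms, `squash' just renormalizes the redundant carry
/// representation.  A rough C model (illustrative only; `unsigned __int128'
/// is a GCC/Clang extension, used here to sidestep overflow concerns):
///
///	/* Given carries c[0..3], with c[j] = (c'_j, c''_j) representing
///	 * c_j = c'_j + c''_j b, store y = SUM_j c_j B^j mod B^4 in
///	 * lo[0..3], and the bits above that in *hi.
///	 */
///	void squash_model(uint32_t lo[4], uint64_t *hi, const sselane c[4])
///	{
///		unsigned __int128 t = 0;
///
///		for (int j = 0; j < 4; j++) {
///			t += c[j].lo;				/* c'_j */
///			t += (unsigned __int128)c[j].hi << 16;	/* c''_j b */
///			lo[j] = (uint32_t)t;			/* y^*_j */
///			t >>= 32;				/* carry up */
///		}
///		*hi = (uint64_t)t;
///	}
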
.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, and XMM2 are clobbered.  The other registers
	// are preserved.
	movd	xmm0, [edi +  0]	// (a_0, 0)
	movd	xmm1, [edi +  4]	// (a_1, 0)
	movd	xmm2, [edi +  8]	// (a_2, 0)
	movd	xmm7, [edi + 12]	// (a_3, 0)

	paddq	xmm4, xmm0		// (c'_0 + a_0, c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1, c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2, c''_2)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5, and XMM6 are clobbered.
	endprologue

	propout	[edi +  0], xmm4, xmm5
	propout	[edi +  4], xmm5, xmm6
	propout	[edi +  8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4
	ret

ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulcore	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 16-byte aligned.  (This is not the usual convention, which
	// requires alignment before the call.)
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	stalloc	48			// space for the carries
	endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	jmp	5f

ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 16-byte aligned.  (This is not the usual convention, which
	// requires alignment before the call.)
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48			// space for the carries
	endprologue

	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Calculate W = A + U V, and leave it in the destination.  Stash
	// the carry pieces for later.
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

5:	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[esp +  0], xmm4
	movdqa	[esp + 16], xmm5
	movdqa	[esp + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi +  8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [esp +  0]
	paddq	xmm5, [esp + 16]
	paddq	xmm6, [esp + 32]

	// And, with that, we're done.
	stfree	48
	ret

ENDFUNC

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	endprologue

	// Calculate Y = W M.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi +  8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret

ENDFUNC

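/// The arithmetic of the Montgomery step implemented by `mont4' (and folded
/// into `mmul4' and `mmla4') can be modelled in C as follows.  This is a
/// rough sketch, not part of the build: it works a word at a time rather
/// than in SSE lanes, uses the GCC/Clang `unsigned __int128' extension, and
/// assumes -- as the callers arrange -- that m = -1/n mod B^4.
///
///	/* One four-word Montgomery pass: compute y = w m mod B^4, and
///	 * then w + n y, whose low four words come out zero; return the
///	 * high words, i.e., the carry, in hi.
///	 */
///	void mont4_model(uint32_t y[4], uint32_t hi[5],
///			 const uint32_t w[4], const uint32_t n[4],
///			 const uint32_t m[4])
///	{
///		uint32_t t[9] = { w[0], w[1], w[2], w[3], 0, 0, 0, 0, 0 };
///		unsigned __int128 a = 0;
///		int i, j;
///
///		/* y = w m mod B^4, by columns */
///		for (i = 0; i < 4; i++) {
///			for (j = 0; j <= i; j++)
///				a += (uint64_t)w[j]*m[i - j];
///			y[i] = (uint32_t)a;  a >>= 32;
///		}
///
///		/* t = w + n y == 0 (mod B^4), by schoolbook */
///		for (i = 0; i < 4; i++) {
///			uint64_t cy = 0, p;
///			for (j = 0; j < 4; j++) {
///				p = (uint64_t)n[i]*y[j] + t[i + j] + cy;
///				t[i + j] = (uint32_t)p;  cy = p >> 32;
///			}
///			for (j = i + 4; cy; j++) {
///				p = (uint64_t)t[j] + cy;
///				t[j] = (uint32_t)p;  cy = p >> 32;
///			}
///		}
///
///		/* t[0..3] is now zero; all that matters is the carry */
///		for (i = 0; i < 5; i++) hi[i] = t[4 + i];
///	}
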
///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	avl
	//	ebp + 32	bv
	//	ebp + 36	bvl
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	expanded Y (32 bytes)
	//	esp + 32	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 32
	endprologue

	// Prepare for the first iteration.
	mov	esi, [ebp + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [ebp + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	mov	eax, [ebp + 28]		// -> av[m] = av limit
	mov	edx, esp		// -> expanded Y = bv[0]
	movdqa	[esp + 0], xmm0		// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [ebp + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [ebp + 36]
	jb	1b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret

ENDFUNC

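/// For orientation, here is a word-at-a-time C model of the operation
/// `mpx_umul4_x86_sse2' performs (illustrative only: the real code works in
/// 4-word blocks using the expanded representation, and overwrites dv on
/// its first pass where this sketch assumes dv[0], ..., dv[m - 1] start
/// out zero).
///
///	void umul_model(uint32_t *dv,
///			const uint32_t *av, const uint32_t *avl,
///			const uint32_t *bv, const uint32_t *bvl)
///	{
///		size_t m = avl - av, n = bvl - bv;
///
///		for (size_t i = 0; i < n; i++) {
///			uint64_t cy = 0;
///			for (size_t k = 0; k < m; k++) {
///				uint64_t p = (uint64_t)bv[i]*av[k] +
///					dv[i + k] + cy;
///				dv[i + k] = (uint32_t)p;  cy = p >> 32;
///			}
///			dv[i + m] = (uint32_t)cy;
///		}
///	}
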
FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	bv
	//	ebp + 32	nv
	//	ebp + 36	n (nonzero multiple of 4)
	//	ebp + 40	mi
	//
	// Locals are relative to ESP, which is 4 mod 16, as follows.
	//
	//	esp +   0	outer loop dv
	//	esp +   4	outer loop bv
	//	esp +   8	av limit (mostly in ESI)
	//	esp +  12	expanded V (32 bytes)
	//	esp +  44	expanded M (32 bytes)
	//	esp +  76	expanded Y (32 bytes)
	//	esp + 108	bv limit
	//	esp + 112	(gap)
	//	esp + 124	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 124
	endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [ebp + 28]		// -> bv
	mov	edx, [ebp + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[esp + 12], xmm0	// bv[0] expanded low
	movdqa	[esp + 28], xmm1	// bv[0] expanded high
	movdqa	[esp + 44], xmm2	// mi expanded low
	movdqa	[esp + 60], xmm3	// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [ebp + 36]		// n
	mov	eax, [ebp + 24]		// -> U = av[0]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	mov	[esp + 4], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[esp + 0], edi
	mov	[esp + 108], ecx
	mov	[esp + 8], edx
	lea	ecx, [esp + 12]		// -> expanded V = bv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	lea	edx, [esp + 76]		// -> space for Y
	call	mmul4
	mov	esi, [esp + 8]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[esp + 0], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 4,
	// then we would have bailed above, to label 8.  Similarly, the
	// subsequent iterations can fall into the inner loop immediately.)
1:	mov	eax, [esp + 4]		// -> bv[i - 1]
	mov	edi, [esp + 0]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	movdqu	xmm0, [eax]		// bv[i]
	mov	[esp + 4], eax
	cmp	eax, [esp + 108]	// done yet?
	jae	9f
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	mov	eax, [ebp + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 12], xmm0	// bv[i] expanded low
	movdqa	[esp + 28], xmm1	// bv[i] expanded high
	call	mmla4
	mov	esi, [esp + 8]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[esp + 0], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret

ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	dvl
	//	ebp + 28	nv
	//	ebp + 32	n (nonzero multiple of 4)
	//	ebp + 36	mi
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	outer loop dv
	//	esp +  4	outer dv limit
	//	esp +  8	blocks-of-4 dv limit
	//	esp + 12	expanded M (32 bytes)
	//	esp + 44	expanded Y (32 bytes)
	//	esp + 76	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 76
	endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	mov	[esp + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [ebp + 32]		// n
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[esp + 0], edi
	mov	[esp + 4], edx
	lea	esi, [esp + 12]		// -> expanded M = mi
	lea	edx, [esp + 44]		// -> space for Y
	call	mont4
	add	edi, 16
	add	ebx, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [esp + 8]		// -> dv blocks limit
	mov	edx, [ebp + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [esp + 0]		// -> dv[i - 1]
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [esp + 44]		// -> space for Y
	lea	esi, [esp + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [esp + 4]		// all done yet?
	jae	9f
	mov	[esp + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm

.macro	testprologue
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 3*32 + 12
	endprologue
	// vars:
	//	esp +  0 = cycles
	//	esp + 12 = v expanded
	//	esp + 44 = y expanded
	//	esp + 76 = ? expanded
.endm

.macro	testepilogue
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
.endm

.macro	testldcarry	c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx +  0]	// (c'_0, c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1, c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2, c''_2)
.endm

.macro	testexpand	v=nil, y=nil
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 12], xmm0
	movdqa	[esp + 28], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[esp + 44], xmm2
	movdqa	[esp + 60], xmm3
  .endif
.endm

.macro	testtop	u=nil, x=nil, mode=nil
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [esp + 12]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [esp + 44]
  .endif
	cysetup	esp + 0
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [esp + 76]
  .else
	lea	edx, [esp + 44]
  .endif
.endm

.macro	testtail	cyv, n
	cystore	esp + 0, \cyv, \n
	jnz	0b
.endm

.macro	testcarryout	c
	mov	ecx, \c
	movdqu	[ecx +  0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

FUNC(test_dmul4)
	testprologue
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmul4
	testtail [ebp + 48], [ebp + 44]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmla4
	testtail [ebp + 48], [ebp + 44]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mul4
	testtail [ebp + 40], [ebp + 36]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mla4
	testtail [ebp + 40], [ebp + 36]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmul4
	testtail [ebp + 52], [ebp + 48]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmla4
	testtail [ebp + 52], [ebp + 48]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue
	testexpand nil, [ebp + 36]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 32], mont
	call	mont4
	testtail [ebp + 44], [ebp + 40]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------