math/mpx-mul4-x86-sse2.S: Use `SHUF' instead of hardwired constants.
1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6
7 ///----- Licensing notice ---------------------------------------------------
8 ///
9 /// This file is part of Catacomb.
10 ///
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
15 ///
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
20 ///
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
25
26 ///--------------------------------------------------------------------------
27 /// External definitions.
28
29 #include "config.h"
30 #include "asm-common.h"
31
32 ///--------------------------------------------------------------------------
33 /// Prologue.
34
35 .arch pentium4
36 .text
37
38 ///--------------------------------------------------------------------------
39 /// Theory.
40 ///
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
43 ///
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a
46 /// multiple-precision operand by a single-precision factor, and adds the
47 /// product, appropriately shifted, to the accumulated result. A `finely integrated
48 /// operand scanning' implementation of Montgomery multiplication also adds
49 /// the product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
53 ///
54 /// In both the plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
61 /// operands, as follows.
62 ///
63 /// Offset 0 4 8 12
64 /// 0 v'_0 v'_1 v''_0 v''_1
65 /// 16 v'_2 v'_3 v''_2 v''_3
66 ///
67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
68 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
71 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
72 /// many products together before we must deal with carrying; it also allows
73 /// for some calculations to be performed on the above expanded form.
74 ///
75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
76 /// memory, typically in warm L1 cache.
77 ///
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0, (implicitly)
91 /// zeroed, becomes c3.
92
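/// To make the layout concrete, here is a minimal C sketch of the
/// expansion and of a single multiply step (an editorial illustration
/// only: it is not part of the build, and the names are invented here).
///
///	#include <stdint.h>
///
///	/* Split the 128-bit operand v[0], ..., v[3] into the two
///	 * expanded vectors described above, each carrying four 16-bit
///	 * pieces in 32-bit lanes. */
///	static void expand128(const uint32_t v[4],
///	                      uint32_t lo[4], uint32_t hi[4])
///	{
///		lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff; /* v'_0, v'_1 */
///		lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;  /* v''_0, v''_1 */
///		hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff; /* v'_2, v'_3 */
///		hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;  /* v''_2, v''_3 */
///	}
///
///	/* One `pmuludq'-style step: scale the two pieces sitting in
///	 * the even lanes e[0] and e[2] by a full 32-bit word r, giving
///	 * two products of at most 48 bits in 64-bit fields. */
///	static void mulstep(uint32_t r, const uint32_t e[4], uint64_t p[2])
///	{
///		p[0] = (uint64_t)r*e[0];	/* r v'_i */
///		p[1] = (uint64_t)r*e[2];	/* r v''_i */
///	}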
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
95
96 .macro mulcore r, s, d0, d1, d2, d3
97 // Load a word r_i from R, multiply by the expanded operand [S], and
98 // leave the pieces of the product in registers D0, D1, D2, D3.
99 movd \d0, \r // (r_i, 0, 0, 0)
100 .ifnes "\d1", "nil"
101 movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
102 .endif
103 .ifnes "\d3", "nil"
104 movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
105 .endif
106 pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
107 .ifnes "\d1", "nil"
108 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
109 .endif
110 .ifnes "\d2", "nil"
111 .ifnes "\d3", "nil"
112 movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
113 .else
114 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
115 .endif
116 .endif
117 .ifnes "\d3", "nil"
118 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
119 .endif
120 .ifnes "\d1", "nil"
121 pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
122 .endif
123 .ifnes "\d3", "nil"
124 pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
125 .endif
126 .ifnes "\d2", "nil"
127 .ifnes "\d3", "nil"
128 pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
129 .else
130 pmuludq \d2, [\s + 16] // (r_i s'_2, r_i s''_2)
131 .endif
132 .endif
133 pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
134 .endm
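
// As an editorial aside, the effect of `mulcore' with all four
// destinations can be written with SSE2 intrinsics roughly as follows.
// This is a sketch only, not part of the build; it assumes that
// SHUF(d, c, b, a) agrees with _MM_SHUFFLE(d, c, b, a), as the
// surrounding comments suggest, and the names are invented here.
//
//	#include <stdint.h>
//	#include <emmintrin.h>
//
//	static void mulcore4(uint32_t r, const __m128i s[2],
//	                     __m128i *d0, __m128i *d1,
//	                     __m128i *d2, __m128i *d3)
//	{
//		__m128i rr =
//			_mm_shuffle_epi32(_mm_cvtsi32_si128((int32_t)r),
//					  _MM_SHUFFLE(3, 0, 3, 0));
//		*d1 = _mm_mul_epu32(rr, _mm_srli_si128(s[0], 4));
//						/* (r s'_1, r s''_1) */
//		*d3 = _mm_mul_epu32(rr, _mm_srli_si128(s[1], 4));
//						/* (r s'_3, r s''_3) */
//		*d2 = _mm_mul_epu32(rr, s[1]);	/* (r s'_2, r s''_2) */
//		*d0 = _mm_mul_epu32(rr, s[0]);	/* (r s'_0, r s''_0) */
//	}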
135
136 .macro accum c0, c1, c2, c3
137 paddq \c0, xmm0
138 .ifnes "\c1", "nil"
139 paddq \c1, xmm1
140 .endif
141 .ifnes "\c2", "nil"
142 paddq \c2, xmm2
143 .endif
144 .ifnes "\c3", "nil"
145 paddq \c3, xmm3
146 .endif
147 .endm
148
149 .macro mulacc r, s, c0, c1, c2, c3, z3p
150 // Load a word r_i from R, multiply by the expanded operand [S],
151 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
152 // then C3 notionally contains zero, but needs clearing; in practice,
153 // we store the product directly rather than attempting to add. On
154 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
155 // is not `t'.
156 .ifeqs "\z3p", "t"
157 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
158 accum \c0, \c1, \c2, nil
159 .else
160 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
161 accum \c0, \c1, \c2, \c3
162 .endif
163 .endm
164
165 .macro propout d, c, cc
166 // Calculate an output word from C, and store it in D; propagate
167 // carries out from C to CC in preparation for a rotation of the
168 // carry registers. On completion, XMM3 is clobbered. If CC is
169 // `nil', then the contribution which would have been added to it is
170 // left in C.
171 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
172 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
173 pslldq xmm3, 2 // (t b, 0)
174 paddq \c, xmm3 // (c' + t b, c'')
175 movd \d, \c
176 psrlq \c, 32 // floor(c/B)
177 .ifnes "\cc", "nil"
178 paddq \cc, \c // propagate up
179 .endif
180 .endm
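
// In scalar terms, `propout' implements exactly the carry step
// described in the theory section above; the following C model is an
// editorial sketch only (names invented here).
//
//	#include <stdint.h>
//
//	/* A carry register holds halves (c[0], c[1]) = (c', c''),
//	 * representing c' + c'' 2^16.  Emit the next output word, and
//	 * carry the rest into the register above, if there is one;
//	 * otherwise the leftover contribution stays in c, as with a
//	 * `nil' CC. */
//	static uint32_t propout_model(uint64_t c[2], uint64_t cc[2])
//	{
//		uint64_t t = c[1] & 0xffffffff;	/* t = c'' mod B */
//		uint32_t z;
//		c[0] += t << 16;		/* d = c' + t b */
//		z = (uint32_t)c[0];		/* z = d mod B */
//		c[0] >>= 32; c[1] >>= 32;	/* floor(d/B), floor(c''/B) */
//		if (cc) { cc[0] += c[0]; cc[1] += c[1]; }
//		return z;
//	}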
181
182 .macro endprop d, c, t
183 // On entry, C contains a carry register. On exit, the low 32 bits
184 // of the value represented in C are written to D, and the remaining
185 // bits are left at the bottom of T.
186 movdqa \t, \c
187 psllq \t, 16 // (?, c'' b)
188 pslldq \c, 8 // (0, c')
189 paddq \t, \c // (?, c' + c'' b)
190 psrldq \t, 8 // c' + c'' b
191 movd \d, \t
192 psrldq \t, 4 // floor((c' + c'' b)/B)
193 .endm
194
195 .macro expand a, b, c, d, z
196 // On entry, A and C hold packed 128-bit values, and Z is zero. On
197 // exit, A:B and C:D together hold the same values in expanded
198 // form. If C is `nil', then only expand A to A:B.
199 movdqa \b, \a // (a_0, a_1, a_2, a_3)
200 .ifnes "\c", "nil"
201 movdqa \d, \c // (c_0, c_1, c_2, c_3)
202 .endif
203 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
204 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
205 .ifnes "\c", "nil"
206 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
207 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
208 .endif
209 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
210 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
211 .ifnes "\c", "nil"
212 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
213 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
214 .endif
215 .endm
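
// The single-operand case of `expand' corresponds to the following
// intrinsics sketch (editorial illustration only, again assuming
// SHUF(d, c, b, a) = _MM_SHUFFLE(d, c, b, a)).
//
//	#include <emmintrin.h>
//
//	static void expand_model(__m128i a, __m128i *lo, __m128i *hi)
//	{
//		__m128i z = _mm_setzero_si128();
//		__m128i l = _mm_unpacklo_epi16(a, z);
//					/* (a'_0, a''_0, a'_1, a''_1) */
//		__m128i h = _mm_unpackhi_epi16(a, z);
//					/* (a'_2, a''_2, a'_3, a''_3) */
//		*lo = _mm_shuffle_epi32(l, _MM_SHUFFLE(3, 1, 2, 0));
//					/* (a'_0, a'_1, a''_0, a''_1) */
//		*hi = _mm_shuffle_epi32(h, _MM_SHUFFLE(3, 1, 2, 0));
//					/* (a'_2, a'_3, a''_2, a''_3) */
//	}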
216
217 .macro squash c0, c1, c2, c3, h, t, u
218 // On entry, C0, C1, C2, C3 are carry registers representing a value
219 // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
220 // C3, T, and U are clobbered; and the high bits of Y are stored in
221 // H, if this is not `nil'.
222
223 // The first step is to eliminate the `double-prime' pieces -- i.e.,
224 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
225 // them into the 32-bit-aligned pieces above and below. But before
226 // we can do that, we must gather them together.
227 movdqa \t, \c0
228 movdqa \u, \c1
229 punpcklqdq \t, \c2 // (y'_0, y'_2)
230 punpckhqdq \c0, \c2 // (y''_0, y''_2)
231 punpcklqdq \u, \c3 // (y'_1, y'_3)
232 punpckhqdq \c1, \c3 // (y''_1, y''_3)
233
234 // Now split the double-prime pieces. The high (up to) 48 bits will
235 // go up; the low 16 bits go down.
236 movdqa \c2, \c0
237 movdqa \c3, \c1
238 psllq \c2, 48
239 psllq \c3, 48
240 psrlq \c0, 16 // high parts of (y''_0, y''_2)
241 psrlq \c1, 16 // high parts of (y''_1, y''_3)
242 psrlq \c2, 32 // low parts of (y''_0, y''_2)
243 psrlq \c3, 32 // low parts of (y''_1, y''_3)
244 .ifnes "\h", "nil"
245 movdqa \h, \c1
246 .endif
247 pslldq \c1, 8 // high part of (0, y''_1)
248
249 paddq \t, \c2 // propagate down
250 paddq \u, \c3
251 paddq \t, \c1 // and up: (y_0, y_2)
252 paddq \u, \c0 // (y_1, y_3)
253 .ifnes "\h", "nil"
254 psrldq \h, 8 // high part of (y''_3, 0)
255 .endif
256
257 // Finally extract the answer. This complicated dance is better than
258 // storing to memory and loading, because the piecemeal stores
259 // inhibit store forwarding.
260 movdqa \c3, \t // (y_0, y_1)
261 movdqa \c0, \t // (y^*_0, ?, ?, ?)
262 psrldq \t, 8 // (y_2, 0)
263 psrlq \c3, 32 // (floor(y_0/B), ?)
264 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
265 pslldq \c0, 12 // (0, 0, 0, y^*_0)
266 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
267 psrldq \u, 8 // (y_3, 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
270 pslldq \c1, 12 // (0, 0, 0, y^*_1)
271 psrldq \c0, 12 // (y^*_0, 0, 0, 0)
272 movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
275 pslldq \c2, 12 // (0, 0, 0, y^*_2)
276 psrldq \c1, 8 // (0, y^*_1, 0, 0)
277 psrldq \c2, 4 // (0, 0, y^*_2, 0)
278 .ifnes "\h", "nil"
279 movdqu \t, \c3
280 pxor \u, \u
281 .endif
282 pslldq \c3, 12 // (0, 0, 0, y^*_3)
283 por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
284 por \c2, \c3 // (0, 0, y^*_2, y^*_3)
285 por \c0, \c2 // y mod B^4
286 .ifnes "\h", "nil"
287 psrlq \t, 32 // very high bits of y
288 paddq \h, \t
289 punpcklqdq \h, \u // carry up
290 .endif
291 .endm
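
// At the level of values, `squash' performs the following computation;
// the scalar C model below is an editorial sketch only.
//
//	#include <stdint.h>
//
//	/* The carry registers hold pairs (ylo[i], yhi[i]) representing
//	 * Y = SUM_i (ylo[i] + yhi[i] 2^16) 2^(32 i).  Pack Y mod 2^128
//	 * into z, and return the high bits (the H output). */
//	static uint64_t squash_model(const uint64_t ylo[4],
//	                             const uint64_t yhi[4], uint32_t z[4])
//	{
//		uint64_t c = 0;
//		for (int i = 0; i < 4; i++) {
//			/* The low 16 bits of y''_i rejoin y'_i ... */
//			c += ylo[i] + ((yhi[i] & 0xffff) << 16);
//			z[i] = (uint32_t)c;
//			/* ... and the high bits carry up a word. */
//			c = (c >> 32) + (yhi[i] >> 16);
//		}
//		return c;
//	}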
292
293 .macro carryadd
294 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
295 // hold the incoming carry registers c0, c1, and c2 representing a
296 // carry-in C.
297 //
298 // On exit, the carry registers, including XMM7, are updated to hold
299 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
300 // registers are preserved.
301 movd xmm0, [edi + 0] // (a_0, 0)
302 movd xmm1, [edi + 4] // (a_1, 0)
303 movd xmm2, [edi + 8] // (a_2, 0)
304 movd xmm7, [edi + 12] // (a_3, 0)
305 paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
306 paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
307 paddq xmm6, xmm2 // (c'_2 + a_2, c''_2)
308 .endm
309
310 ///--------------------------------------------------------------------------
311 /// Primitive multipliers and related utilities.
312
313 .p2align 4
314 carryprop:
315 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
316 // form. Store the low 128 bits of the represented carry to [EDI] as
317 // a packed 128-bit value, and leave the remaining 16 bits in the low
318 // 32 bits of XMM4. On exit, XMM3, XMM5, and XMM6 are clobbered.
319 propout [edi + 0], xmm4, xmm5
320 propout [edi + 4], xmm5, xmm6
321 propout [edi + 8], xmm6, nil
322 endprop [edi + 12], xmm6, xmm4
323 ret
324
325 .p2align 4
326 dmul4:
327 // On entry, EDI points to the destination buffer; EAX and EBX point
328 // to the packed operands U and X; ECX and EDX point to the expanded
329 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
330 // registers c0, c1, and c2; c3 is assumed to be zero.
331 //
332 // On exit, we write the low 128 bits of the sum C + U V + X Y to
333 // [EDI], and update the carry registers with the carry out. The
334 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
335 // general-purpose registers are preserved.
336 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
337 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
338 propout [edi + 0], xmm4, xmm5
339
340 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
341 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
342 propout [edi + 4], xmm5, xmm6
343
344 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
345 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
346 propout [edi + 8], xmm6, xmm7
347
348 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
349 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
350 propout [edi + 12], xmm7, xmm4
351
352 ret
353
354 .p2align 4
355 dmla4:
356 // On entry, EDI points to the destination buffer, which also
357 // contains an addend A to accumulate; EAX and EBX point to the
358 // packed operands U and X; ECX and EDX point to the expanded
359 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
360 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
361 // to be zero.
362 //
363 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
364 // [EDI], and update the carry registers with the carry out. The
365 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
366 // general-purpose registers are preserved.
367 carryadd
368
369 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
370 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
371 propout [edi + 0], xmm4, xmm5
372
373 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
374 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
375 propout [edi + 4], xmm5, xmm6
376
377 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
378 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
379 propout [edi + 8], xmm6, xmm7
380
381 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
382 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
383 propout [edi + 12], xmm7, xmm4
384
385 ret
386
387 .p2align 4
388 mul4zc:
389 // On entry, EDI points to the destination buffer; EBX points to a
390 // packed operand X; and EDX points to an expanded operand Y.
391 //
392 // On exit, we write the low 128 bits of the product X Y to [EDI],
393 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
394 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
395 // general-purpose registers are preserved.
396 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
397 propout [edi + 0], xmm4, xmm5
398
399 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
400 propout [edi + 4], xmm5, xmm6
401
402 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
403 propout [edi + 8], xmm6, xmm7
404
405 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
406 propout [edi + 12], xmm7, xmm4
407
408 ret
409
410 .p2align 4
411 mul4:
412 // On entry, EDI points to the destination buffer; EBX points to a
413 // packed operand X; EDX points to an expanded operand Y; and XMM4,
414 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
415 // representing a carry-in C; c3 is assumed to be zero.
416 //
417 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
418 // and update the carry registers with the carry out. The registers
419 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
420 // general-purpose registers are preserved.
421 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
422 propout [edi + 0], xmm4, xmm5
423
424 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
425 propout [edi + 4], xmm5, xmm6
426
427 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
428 propout [edi + 8], xmm6, xmm7
429
430 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
431 propout [edi + 12], xmm7, xmm4
432
433 ret
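
// A plain scalar reference for `mul4' may help in checking the
// dataflow; this is an editorial sketch only.  (`dmul4' and `dmla4'
// above interleave two such products, whose slightly wider carry out
// the redundant carry registers absorb; the `mla' variants add the
// addend A first.  This model covers the single-product case exactly.)
//
//	#include <stdint.h>
//
//	/* z = low 128 bits of C + X Y; the carry out replaces the
//	 * carry in c, and both fit in four words. */
//	static void mul4_model(uint32_t z[4], const uint32_t x[4],
//	                       const uint32_t y[4], uint32_t c[4])
//	{
//		uint32_t acc[8];
//		int i, j;
//		for (i = 0; i < 4; i++) { acc[i] = c[i]; acc[i + 4] = 0; }
//		for (i = 0; i < 4; i++) {
//			uint64_t carry = 0;
//			for (j = 0; j < 4; j++) {
//				uint64_t t = (uint64_t)x[i]*y[j] +
//					acc[i + j] + carry;
//				acc[i + j] = (uint32_t)t;
//				carry = t >> 32;
//			}
//			acc[i + 4] = (uint32_t)carry;
//		}
//		for (i = 0; i < 4; i++)
//			{ z[i] = acc[i]; c[i] = acc[i + 4]; }
//	}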
434
435 .p2align 4
436 mla4zc:
437 // On entry, EDI points to the destination buffer, which also
438 // contains an addend A to accumulate; EBX points to a packed operand
439 // X; and EDX points to an expanded operand Y.
440 //
441 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
442 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
443 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
444 // general-purpose registers are preserved.
445 movd xmm4, [edi + 0]
446 movd xmm5, [edi + 4]
447 movd xmm6, [edi + 8]
448 movd xmm7, [edi + 12]
449
450 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
451 propout [edi + 0], xmm4, xmm5
452
453 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
454 propout [edi + 4], xmm5, xmm6
455
456 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
457 propout [edi + 8], xmm6, xmm7
458
459 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
460 propout [edi + 12], xmm7, xmm4
461
462 ret
463
464 .p2align 4
465 mla4:
466 // On entry, EDI points to the destination buffer, which also
467 // contains an addend A to accumulate; EBX points to a packed operand
468 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
469 // the incoming carry registers c0, c1, and c2, representing a
470 // carry-in C; c3 is assumed to be zero.
471 //
472 // On exit, we write the low 128 bits of the sum A + C + X Y to
473 // [EDI], and update the carry registers with the carry out. The
474 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
475 // general-purpose registers are preserved.
476 carryadd
477
478 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
479 propout [edi + 0], xmm4, xmm5
480
481 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
482 propout [edi + 4], xmm5, xmm6
483
484 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
485 propout [edi + 8], xmm6, xmm7
486
487 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
488 propout [edi + 12], xmm7, xmm4
489
490 ret
491
492 .p2align 4
493 mmul4:
494 // On entry, EDI points to the destination buffer; EAX and EBX point
495 // to the packed operands U and N; ECX and ESI point to the expanded
496 // operands V and M; and EDX points to a place to store an expanded
497 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
498 // must be 16-byte aligned. (This is not the usual convention, which
499 // requires alignment before the call.)
500 //
501 // On exit, we write Y = U V M mod B^4 to [EDX], and the low 128 bits
502 // of the sum U V + N Y to [EDI], leaving the remaining carry in
503 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
504 // XMM7 are clobbered; the general-purpose registers are preserved.
505 sub esp, 64 // space for the carries
506
507 // Calculate W = U V, and leave it in the destination. Stash the
508 // carry pieces for later.
509 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
510 propout [edi + 0], xmm4, xmm5
511 jmp 5f
512
513 .p2align 4
514 mmla4:
515 // On entry, EDI points to the destination buffer, which also
516 // contains an addend A to accumulate; EAX and EBX point
517 // to the packed operands U and N; ECX and ESI point to the expanded
518 // operands V and M; and EDX points to a place to store an expanded
519 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
520 // must be 16-byte aligned. (This is not the usual convention, which
521 // requires alignment before the call.)
522 //
523 // On exit, we write Y = (A + U V) M mod B^4 to [EDX], and the low 128
524 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
525 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
526 // XMM3, and XMM7 are clobbered; the general-purpose registers are
527 // preserved.
528 sub esp, 64 // space for the carries
529 movd xmm4, [edi + 0]
530 movd xmm5, [edi + 4]
531 movd xmm6, [edi + 8]
532 movd xmm7, [edi + 12]
533 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
534 propout [edi + 0], xmm4, xmm5
535
536 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
537 propout [edi + 4], xmm5, xmm6
538
539 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
540 propout [edi + 8], xmm6, xmm7
541
542 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
543 propout [edi + 12], xmm7, xmm4
544
545 movdqa [esp + 0], xmm4
546 movdqa [esp + 16], xmm5
547 movdqa [esp + 32], xmm6
548
549 // Calculate Y = W M.
550 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
551
552 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
553 accum xmm5, xmm6, xmm7, nil
554
555 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
556 accum xmm6, xmm7, nil, nil
557
558 mulcore [edi + 12], esi, xmm0, nil, nil, nil
559 accum xmm7, nil, nil, nil
560
561 // That's lots of pieces. Now we have to assemble the answer.
562 squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
563
564 // Expand it.
565 pxor xmm2, xmm2
566 expand xmm4, xmm1, nil, nil, xmm2
567 movdqa [edx + 0], xmm4
568 movdqa [edx + 16], xmm1
569
570 // Initialize the carry from the value for W we calculated earlier.
571 movd xmm4, [edi + 0]
572 movd xmm5, [edi + 4]
573 movd xmm6, [edi + 8]
574 movd xmm7, [edi + 12]
575
576 // Finish the calculation by adding the Montgomery product.
577 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
578 propout [edi + 0], xmm4, xmm5
579
580 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
581 propout [edi + 4], xmm5, xmm6
582
583 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
584 propout [edi + 8], xmm6, xmm7
585
586 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
587 propout [edi + 12], xmm7, xmm4
588
589 // Finally, add on the carry we calculated earlier.
590 paddq xmm4, [esp + 0]
591 paddq xmm5, [esp + 16]
592 paddq xmm6, [esp + 32]
593
594 // And, with that, we're done.
595 add esp, 64
596 ret
597
598 .p2align 4
599 mont4:
600 // On entry, EDI points to the destination buffer holding a packed
601 // value A; EBX points to a packed operand N; ESI points to an
602 // expanded operand M; and EDX points to a place to store an expanded
603 // result Y (32 bytes, at a 16-byte boundary).
604 //
605 // On exit, we write Y = W M mod B^4 to [EDX], and the low 128 bits
606 // of the sum W + N Y to [EDI], leaving the remaining carry in
607 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
608 // XMM7 are clobbered; the general-purpose registers are preserved.
609
610 // Calculate Y = W M.
611 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
612
613 mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
614 accum xmm5, xmm6, xmm7, nil
615
616 mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
617 accum xmm6, xmm7, nil, nil
618
619 mulcore [edi + 12], esi, xmm0, nil, nil, nil
620 accum xmm7, nil, nil, nil
621
622 // That's lots of pieces. Now we have to assemble the answer.
623 squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
624
625 // Expand it.
626 pxor xmm2, xmm2
627 expand xmm4, xmm1, nil, nil, xmm2
628 movdqa [edx + 0], xmm4
629 movdqa [edx + 16], xmm1
630
631 // Initialize the carry from W.
632 movd xmm4, [edi + 0]
633 movd xmm5, [edi + 4]
634 movd xmm6, [edi + 8]
635 movd xmm7, [edi + 12]
636
637 // Finish the calculation by adding the Montgomery product.
638 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
639 propout [edi + 0], xmm4, xmm5
640
641 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
642 propout [edi + 4], xmm5, xmm6
643
644 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
645 propout [edi + 8], xmm6, xmm7
646
647 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
648 propout [edi + 12], xmm7, xmm4
649
650 // And, with that, we're done.
651 ret
652
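// For reference, the Montgomery step computed by `mont4' above (and by
// the middle of `mmul4'/`mmla4') has the following scalar shape -- an
// editorial sketch, writing m for the value carried by the expanded
// operand M, i.e., the usual Montgomery factor -N^{-1} mod B^4.
//
//	#include <stdint.h>
//
//	/* y = w m mod B^4: only the low four words of the product are
//	 * needed, mirroring the progressively narrower `mulcore' calls
//	 * above. */
//	static void mont4_model(const uint32_t w[4], const uint32_t m[4],
//	                        uint32_t y[4])
//	{
//		uint32_t acc[4] = { 0, 0, 0, 0 };
//		for (int i = 0; i < 4; i++) {
//			uint64_t carry = 0;
//			for (int j = 0; j < 4 - i; j++) {
//				uint64_t t = (uint64_t)w[i]*m[j] +
//					acc[i + j] + carry;
//				acc[i + j] = (uint32_t)t;
//				carry = t >> 32;
//			}
//		}
//		for (int i = 0; i < 4; i++) y[i] = acc[i];
//	}
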
653 ///--------------------------------------------------------------------------
654 /// Bulk multipliers.
655
656 FUNC(mpx_umul4_x86_sse2)
657 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
658 // const mpw *bv, const mpw *bvl);
659
660 // Build a stack frame. Arguments will be relative to EBP, as
661 // follows.
662 //
663 // ebp + 20 dv
664 // ebp + 24 av
665 // ebp + 28 avl
666 // ebp + 32 bv
667 // ebp + 36 bvl
668 //
669 // Locals are relative to ESP, as follows.
670 //
671 // esp + 0 expanded Y (32 bytes)
672 // esp + 32 (top of locals)
673 push ebp
674 push ebx
675 push esi
676 push edi
677 mov ebp, esp
678 and esp, ~15
679 sub esp, 32
680
681 // Prepare for the first iteration.
682 mov esi, [ebp + 32] // -> bv[0]
683 pxor xmm7, xmm7
684 movdqu xmm0, [esi] // bv[0]
685 mov edi, [ebp + 20] // -> dv[0]
686 mov ecx, edi // outer loop dv cursor
687 expand xmm0, xmm1, nil, nil, xmm7
688 mov ebx, [ebp + 24] // -> av[0]
689 mov eax, [ebp + 28] // -> av[m] = av limit
690 mov edx, esp // -> expanded Y = bv[0]
691 movdqa [esp + 0], xmm0 // bv[0] expanded low
692 movdqa [esp + 16], xmm1 // bv[0] expanded high
693 call mul4zc
694 add ebx, 16
695 add edi, 16
696 add ecx, 16
697 add esi, 16
698 cmp ebx, eax // all done?
699 jae 8f
700
701 .p2align 4
702 // Continue with the first iteration.
703 0: call mul4
704 add ebx, 16
705 add edi, 16
706 cmp ebx, eax // all done?
707 jb 0b
708
709 // Write out the leftover carry. There can be no tail here.
710 8: call carryprop
711 cmp esi, [ebp + 36] // more passes to do?
712 jae 9f
713
714 .p2align 4
715 // Set up for the next pass.
716 1: movdqu xmm0, [esi] // bv[i]
717 mov edi, ecx // -> dv[i]
718 pxor xmm7, xmm7
719 expand xmm0, xmm1, nil, nil, xmm7
720 mov ebx, [ebp + 24] // -> av[0]
721 movdqa [esp + 0], xmm0 // bv[i] expanded low
722 movdqa [esp + 16], xmm1 // bv[i] expanded high
723 call mla4zc
724 add edi, 16
725 add ebx, 16
726 add ecx, 16
727 add esi, 16
728 cmp ebx, eax // done yet?
729 jae 8f
730
731 .p2align 4
732 // Continue...
733 0: call mla4
734 add ebx, 16
735 add edi, 16
736 cmp ebx, eax
737 jb 0b
738
739 // Finish off this pass. There was no tail on the previous pass, and
740 // there can be none on this pass.
741 8: call carryprop
742 cmp esi, [ebp + 36]
743 jb 1b
744
745 // All over.
746 9: mov esp, ebp
747 pop edi
748 pop esi
749 pop ebx
750 pop ebp
751 ret
752
753 ENDFUNC
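
// The overall effect of this function is ordinary schoolbook
// multiplication.  The following value-level C model is an editorial
// sketch: it scans one word of bv at a time where the code above does
// four, and assumes both lengths are multiples of four, as the code
// requires.
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	static void umul_model(uint32_t *dv,
//	                       const uint32_t *av, size_t na,
//	                       const uint32_t *bv, size_t nb)
//	{
//		size_t i, j;
//		for (i = 0; i < na + nb; i++) dv[i] = 0;
//		for (j = 0; j < nb; j++) {
//			uint64_t carry = 0;
//			for (i = 0; i < na; i++) {
//				uint64_t t = (uint64_t)av[i]*bv[j] +
//					dv[i + j] + carry;
//				dv[i + j] = (uint32_t)t;
//				carry = t >> 32;
//			}
//			dv[na + j] = (uint32_t)carry;
//		}
//	}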
754
755 FUNC(mpxmont_mul4_x86_sse2)
756 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
757 // const mpw *nv, size_t n, const mpw *mi);
758
759 // Build a stack frame. Arguments will be relative to EBP, as
760 // follows.
761 //
762 // ebp + 20 dv
763 // ebp + 24 av
764 // ebp + 28 bv
765 // ebp + 32 nv
766 // ebp + 36 n (nonzero multiple of 4)
767 // ebp + 40 mi
768 //
769 // Locals are relative to ESP, which is 4 mod 16, as follows.
770 //
771 // esp + 0 outer loop dv
772 // esp + 4 outer loop bv
773 // esp + 8 av limit (mostly in ESI)
774 // esp + 12 expanded V (32 bytes)
775 // esp + 44 expanded M (32 bytes)
776 // esp + 76 expanded Y (32 bytes)
777 // esp + 108 bv limit
778 // esp + 112 (gap)
779 // esp + 124 (top of locals)
780 push ebp
781 push ebx
782 push esi
783 push edi
784 mov ebp, esp
785 and esp, ~15
786 sub esp, 124
787
788 // Establish the expanded operands.
789 pxor xmm7, xmm7
790 mov ecx, [ebp + 28] // -> bv
791 mov edx, [ebp + 40] // -> mi
792 movdqu xmm0, [ecx] // bv[0]
793 movdqu xmm2, [edx] // mi
794 expand xmm0, xmm1, xmm2, xmm3, xmm7
795 movdqa [esp + 12], xmm0 // bv[0] expanded low
796 movdqa [esp + 28], xmm1 // bv[0] expanded high
797 movdqa [esp + 44], xmm2 // mi expanded low
798 movdqa [esp + 60], xmm3 // mi expanded high
799
800 // Set up the outer loop state and prepare for the first iteration.
801 mov edx, [ebp + 36] // n
802 mov eax, [ebp + 24] // -> U = av[0]
803 mov ebx, [ebp + 32] // -> X = nv[0]
804 mov edi, [ebp + 20] // -> Z = dv[0]
805 mov [esp + 4], ecx
806 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
807 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
808 mov [esp + 0], edi
809 mov [esp + 108], ecx
810 mov [esp + 8], edx
811 lea ecx, [esp + 12] // -> expanded V = bv[0]
812 lea esi, [esp + 44] // -> expanded M = mi
813 lea edx, [esp + 76] // -> space for Y
814 call mmul4
815 mov esi, [esp + 8] // recover av limit
816 add edi, 16
817 add eax, 16
818 add ebx, 16
819 cmp eax, esi // done already?
820 jae 8f
821 mov [esp + 0], edi
822
823 .p2align 4
824 // Complete the first inner loop.
825 0: call dmul4
826 add edi, 16
827 add eax, 16
828 add ebx, 16
829 cmp eax, esi // done yet?
830 jb 0b
831
832 // Still have carries left to propagate.
833 call carryprop
834 movd [edi + 16], xmm4
835
836 .p2align 4
837 // Embark on the next iteration. (There must be one. If n = 4, then
838 // we would have bailed above, to label 8. Similarly, the subsequent
839 // iterations can fall into the inner loop immediately.)
840 1: mov eax, [esp + 4] // -> bv[i - 1]
841 mov edi, [esp + 0] // -> Z = dv[i]
842 add eax, 16 // -> bv[i]
843 mov [esp + 4], eax
844 cmp eax, [esp + 108] // done yet?
845 jae 9f
846 pxor xmm7, xmm7
847 movdqu xmm0, [eax] // bv[i] (loaded only after the bounds check)
848 mov ebx, [ebp + 32] // -> X = nv[0]
849 lea esi, [esp + 44] // -> expanded M = mi
850 mov eax, [ebp + 24] // -> U = av[0]
851 expand xmm0, xmm1, nil, nil, xmm7
852 movdqa [esp + 12], xmm0 // bv[i] expanded low
853 movdqa [esp + 28], xmm1 // bv[i] expanded high
854 call mmla4
855 mov esi, [esp + 8] // recover av limit
856 add edi, 16
857 add eax, 16
858 add ebx, 16
859 mov [esp + 0], edi
860
861 .p2align 4
862 // Complete the next inner loop.
863 0: call dmla4
864 add edi, 16
865 add eax, 16
866 add ebx, 16
867 cmp eax, esi
868 jb 0b
869
870 // Still have carries left to propagate, and they overlap the
871 // previous iteration's final tail, so read that in and add it.
872 movd xmm0, [edi]
873 paddq xmm4, xmm0
874 call carryprop
875 movd [edi + 16], xmm4
876
877 // Back again.
878 jmp 1b
879
880 // First iteration was short. Write out the carries and we're done.
881 // (This could be folded into the main loop structure, but that would
882 // penalize small numbers more.)
883 8: call carryprop
884 movd [edi + 16], xmm4
885
886 // All done.
887 9: mov esp, ebp
888 pop edi
889 pop esi
890 pop ebx
891 pop ebp
892 ret
893
894 ENDFUNC
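
// Again at the level of values, this is standard Montgomery
// multiplication.  The sketch below is an editorial illustration only:
// it works one word at a time with m0 = -nv[0]^{-1} mod B, where the
// code above does the same four words at a time using the four-word
// expanded mi.
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	/* zz (n + 2 words) := av bv B^{-n} mod nv, up to a possible
//	 * final subtraction of nv, which is the caller's problem. */
//	static void mont_mul_model(uint32_t *zz, const uint32_t *av,
//	                           const uint32_t *bv, const uint32_t *nv,
//	                           size_t n, uint32_t m0)
//	{
//		size_t i, j;
//		for (i = 0; i < n + 2; i++) zz[i] = 0;
//		for (i = 0; i < n; i++) {
//			uint64_t c, t;
//			uint32_t y;
//			c = 0;			/* zz += av bv[i] */
//			for (j = 0; j < n; j++) {
//				t = (uint64_t)av[j]*bv[i] + zz[j] + c;
//				zz[j] = (uint32_t)t; c = t >> 32;
//			}
//			t = (uint64_t)zz[n] + c;
//			zz[n] = (uint32_t)t;
//			zz[n + 1] += (uint32_t)(t >> 32);
//			y = (uint32_t)((uint64_t)zz[0]*m0);
//			c = 0;			/* kill the low word */
//			for (j = 0; j < n; j++) {
//				t = (uint64_t)nv[j]*y + zz[j] + c;
//				zz[j] = (uint32_t)t; c = t >> 32;
//			}
//			t = (uint64_t)zz[n] + c;
//			zz[n] = (uint32_t)t;
//			zz[n + 1] += (uint32_t)(t >> 32);
//			for (j = 0; j < n + 1; j++)	/* divide by B */
//				zz[j] = zz[j + 1];
//			zz[n + 1] = 0;
//		}
//	}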
895
896 FUNC(mpxmont_redc4_x86_sse2)
897 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
898 // size_t n, const mpw *mi);
899
900 // Build a stack frame. Arguments will be relative to EBP, as
901 // follows.
902 //
903 // ebp + 20 dv
904 // ebp + 24 dvl
905 // ebp + 28 nv
906 // ebp + 32 n (nonzero multiple of 4)
907 // ebp + 36 mi
908 //
909 // Locals are relative to ESP, as follows.
910 //
911 // esp + 0 outer loop dv
912 // esp + 4 outer dv limit
913 // esp + 8 blocks-of-4 dv limit
914 // esp + 12 expanded M (32 bytes)
915 // esp + 44 expanded Y (32 bytes)
916 // esp + 76 (top of locals)
917 push ebp
918 push ebx
919 push esi
920 push edi
921 mov ebp, esp
922 and esp, ~15
923 sub esp, 76
924
925 // Establish the expanded operands and the blocks-of-4 dv limit.
926 mov edi, [ebp + 20] // -> Z = dv[0]
927 pxor xmm7, xmm7
928 mov eax, [ebp + 24] // -> dv[n] = dv limit
929 sub eax, edi // length of dv in bytes
930 mov edx, [ebp + 36] // -> mi
931 movdqu xmm0, [edx] // mi
932 and eax, ~15 // mask off the tail end
933 expand xmm0, xmm1, nil, nil, xmm7
934 add eax, edi // find limit
935 movdqa [esp + 12], xmm0 // mi expanded low
936 movdqa [esp + 28], xmm1 // mi expanded high
937 mov [esp + 8], eax
938
939 // Set up the outer loop state and prepare for the first iteration.
940 mov ecx, [ebp + 32] // n
941 mov ebx, [ebp + 28] // -> X = nv[0]
942 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
943 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
944 mov [esp + 0], edi
945 mov [esp + 4], edx
946 lea esi, [esp + 12] // -> expanded M = mi
947 lea edx, [esp + 44] // -> space for Y
948 call mont4
949 add edi, 16
950 add ebx, 16
951 cmp ebx, ecx // done already?
952 jae 8f
953
954 .p2align 4
955 // Complete the first inner loop.
956 5: call mla4
957 add ebx, 16
958 add edi, 16
959 cmp ebx, ecx // done yet?
960 jb 5b
961
962 // Still have carries left to propagate.
963 8: carryadd
964 mov esi, [esp + 8] // -> dv blocks limit
965 mov edx, [ebp + 24] // dv limit
966 psllq xmm7, 16 // (a_3 b, 0)
967 pslldq xmm7, 8 // (0, a_3 b)
968 paddq xmm6, xmm7 // (c'_2 + a_2, c''_2 + a_3 b)
969 call carryprop
970 movd eax, xmm4
971 add edi, 16
972 cmp edi, esi
973 jae 7f
974
975 .p2align 4
976 // Continue carry propagation until the end of the buffer.
977 0: add [edi], eax
978 mov eax, 0 // preserves flags
979 adcd [edi + 4], 0
980 adcd [edi + 8], 0
981 adcd [edi + 12], 0
982 adc eax, 0
983 add edi, 16
984 cmp edi, esi
985 jb 0b
986
987 // Deal with the tail end.
988 7: add [edi], eax
989 mov eax, 0 // preserves flags
990 add edi, 4
991 adc eax, 0
992 cmp edi, edx
993 jb 7b
994
995 // All done for this iteration. Start the next. (This must have at
996 // least one follow-on iteration, or we'd not have started this outer
997 // loop.)
998 8: mov edi, [esp + 0] // -> dv[i - 1]
999 mov ebx, [ebp + 28] // -> X = nv[0]
1000 lea edx, [esp + 44] // -> space for Y
1001 lea esi, [esp + 12] // -> expanded M = mi
1002 add edi, 16 // -> Z = dv[i]
1003 cmp edi, [esp + 4] // all done yet?
1004 jae 9f
1005 mov [esp + 0], edi
1006 call mont4
1007 add edi, 16
1008 add ebx, 16
1009 jmp 5b
1010
1011 // All over.
1012 9: mov esp, ebp
1013 pop edi
1014 pop esi
1015 pop ebx
1016 pop ebp
1017 ret
1018
1019 ENDFUNC
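
// And the corresponding value-level model of the reduction, again one
// word at a time with m0 = -nv[0]^{-1} mod B rather than the four-word
// blocks used above (editorial sketch only).
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	/* Divide dv (ndv words) by B^n modulo nv: make the bottom n
//	 * words vanish, then shift down by n words, leaving the top n
//	 * words of the buffer stale.  Any final subtraction of nv is
//	 * the caller's problem. */
//	static void mont_redc_model(uint32_t *dv, size_t ndv,
//	                            const uint32_t *nv, size_t n,
//	                            uint32_t m0)
//	{
//		size_t i, j;
//		for (i = 0; i < n; i++) {
//			uint32_t y = (uint32_t)((uint64_t)dv[i]*m0);
//			uint64_t c = 0, t;
//			for (j = 0; j < n; j++) {
//				t = (uint64_t)nv[j]*y + dv[i + j] + c;
//				dv[i + j] = (uint32_t)t; c = t >> 32;
//			}
//			/* Ripple the carry out to the end of dv, as the
//			 * `adcd' loops above do. */
//			for (j = i + n; c && j < ndv; j++) {
//				t = dv[j] + c;
//				dv[j] = (uint32_t)t; c = t >> 32;
//			}
//		}
//		for (i = 0; i + n < ndv; i++) dv[i] = dv[i + n];
//	}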
1020
1021 ///--------------------------------------------------------------------------
1022 /// Testing and performance measurement.
1023
1024 #ifdef TEST_MUL4
1025
1026 .macro cysetup c
1027 rdtsc
1028 mov [\c], eax
1029 mov [\c + 4], edx
1030 .endm
1031
1032 .macro cystore c, v, n
1033 rdtsc
1034 sub eax, [\c]
1035 sbb edx, [\c + 4]
1036 mov ebx, [\v]
1037 mov ecx, [\n]
1038 dec ecx
1039 mov [\n], ecx
1040 mov [ebx + ecx*8], eax
1041 mov [ebx + ecx*8 + 4], edx
1042 .endm
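
// In C, the pattern implemented by `cysetup' and `cystore' looks like
// this (editorial sketch; __rdtsc() is the GCC/Clang intrinsic from
// <x86intrin.h>).
//
//	#include <stddef.h>
//	#include <stdint.h>
//	#include <x86intrin.h>
//
//	static uint64_t cy_base;
//
//	static void cysetup(void) { cy_base = __rdtsc(); }
//
//	/* Store the cycle count since cysetup() at v[--*n]. */
//	static void cystore(uint64_t *v, size_t *n)
//		{ v[--*n] = __rdtsc() - cy_base; }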
1043
1044 .macro testprologue
1045 push ebp
1046 push ebx
1047 push esi
1048 push edi
1049 mov ebp, esp
1050 and esp, ~15
1051 sub esp, 3*32 + 12
1052 // vars:
1053 // esp + 0 = cycles
1054 // esp + 12 = v expanded
1055 // esp + 44 = y expanded
1056 // esp + 76 = ? expanded
1057 .endm
1058
1059 .macro testepilogue
1060 mov esp, ebp
1061 pop edi
1062 pop esi
1063 pop ebx
1064 pop ebp
1065 ret
1066 .endm
1067
1068 .macro testldcarry c
1069 mov ecx, \c // -> c
1070 movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
1071 movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
1072 movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
1073 .endm
1074
1075 .macro testexpand v, y
1076 pxor xmm7, xmm7
1077 .ifnes "\v", "nil"
1078 mov ecx, \v
1079 movdqu xmm0, [ecx]
1080 expand xmm0, xmm1, nil, nil, xmm7
1081 movdqa [esp + 12], xmm0
1082 movdqa [esp + 28], xmm1
1083 .endif
1084 .ifnes "\y", "nil"
1085 mov edx, \y
1086 movdqu xmm2, [edx]
1087 expand xmm2, xmm3, nil, nil, xmm7
1088 movdqa [esp + 44], xmm2
1089 movdqa [esp + 60], xmm3
1090 .endif
1091 .endm
1092
1093 .macro testtop u, x, mode
1094 .p2align 4
1095 0:
1096 .ifnes "\u", "nil"
1097 lea ecx, [esp + 12]
1098 .endif
1099 mov ebx, \x
1100 .ifeqs "\mode", "mont"
1101 lea esi, [esp + 44]
1102 .endif
1103 cysetup esp + 0
1104 .ifnes "\u", "nil"
1105 mov eax, \u
1106 .endif
1107 .ifeqs "\mode", "mont"
1108 lea edx, [esp + 76]
1109 .else
1110 lea edx, [esp + 44]
1111 .endif
1112 .endm
1113
1114 .macro testtail cyv, n
1115 cystore esp + 0, \cyv, \n
1116 jnz 0b
1117 .endm
1118
1119 .macro testcarryout c
1120 mov ecx, \c
1121 movdqu [ecx + 0], xmm4
1122 movdqu [ecx + 16], xmm5
1123 movdqu [ecx + 32], xmm6
1124 .endm
1125
1126 .globl test_dmul4
1127 test_dmul4:
1128 testprologue
1129 testldcarry [ebp + 24]
1130 testexpand [ebp + 36], [ebp + 40]
1131 mov edi, [ebp + 20]
1132 testtop [ebp + 28], [ebp + 32]
1133 call dmul4
1134 testtail [ebp + 48], [ebp + 44]
1135 testcarryout [ebp + 24]
1136 testepilogue
1137
1138 .globl test_dmla4
1139 test_dmla4:
1140 testprologue
1141 testldcarry [ebp + 24]
1142 testexpand [ebp + 36], [ebp + 40]
1143 mov edi, [ebp + 20]
1144 testtop [ebp + 28], [ebp + 32]
1145 call dmla4
1146 testtail [ebp + 48], [ebp + 44]
1147 testcarryout [ebp + 24]
1148 testepilogue
1149
1150 .globl test_mul4
1151 test_mul4:
1152 testprologue
1153 testldcarry [ebp + 24]
1154 testexpand nil, [ebp + 32]
1155 mov edi, [ebp + 20]
1156 testtop nil, [ebp + 28]
1157 call mul4
1158 testtail [ebp + 40], [ebp + 36]
1159 testcarryout [ebp + 24]
1160 testepilogue
1161
1162 .globl test_mla4
1163 test_mla4:
1164 testprologue
1165 testldcarry [ebp + 24]
1166 testexpand nil, [ebp + 32]
1167 mov edi, [ebp + 20]
1168 testtop nil, [ebp + 28]
1169 call mla4
1170 testtail [ebp + 40], [ebp + 36]
1171 testcarryout [ebp + 24]
1172 testepilogue
1173
1174 .globl test_mmul4
1175 test_mmul4:
1176 testprologue
1177 testexpand [ebp + 40], [ebp + 44]
1178 mov edi, [ebp + 20]
1179 testtop [ebp + 32], [ebp + 36], mont
1180 call mmul4
1181 testtail [ebp + 52], [ebp + 48]
1182 mov edi, [ebp + 28]
1183 movdqa xmm0, [esp + 76]
1184 movdqa xmm1, [esp + 92]
1185 movdqu [edi], xmm0
1186 movdqu [edi + 16], xmm1
1187 testcarryout [ebp + 24]
1188 testepilogue
1189
1190 .globl test_mmla4
1191 test_mmla4:
1192 testprologue
1193 testexpand [ebp + 40], [ebp + 44]
1194 mov edi, [ebp + 20]
1195 testtop [ebp + 32], [ebp + 36], mont
1196 call mmla4
1197 testtail [ebp + 52], [ebp + 48]
1198 mov edi, [ebp + 28]
1199 movdqa xmm0, [esp + 76]
1200 movdqa xmm1, [esp + 92]
1201 movdqu [edi], xmm0
1202 movdqu [edi + 16], xmm1
1203 testcarryout [ebp + 24]
1204 testepilogue
1205
1206 .globl test_mont4
1207 test_mont4:
1208 testprologue
1209 testexpand nil, [ebp + 36]
1210 mov edi, [ebp + 20]
1211 testtop nil, [ebp + 32], mont
1212 call mont4
1213 testtail [ebp + 44], [ebp + 40]
1214 mov edi, [ebp + 28]
1215 movdqa xmm0, [esp + 76]
1216 movdqa xmm1, [esp + 92]
1217 movdqu [edi], xmm0
1218 movdqu [edi + 16], xmm1
1219 testcarryout [ebp + 24]
1220 testepilogue
1221
1222 #endif
1223
1224 ///----- That's all, folks --------------------------------------------------