/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Prologue.

        .arch   pentium4
        .text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// result, appropriately shifted, to the accumulated result.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
///
/// In both the plain-multiplication and Montgomery stages, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///     Offset     0       4       8      12
///          0  v'_0    v'_1    v''_0   v''_1
///         16  v'_2    v'_3    v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
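///
/// As a concrete (non-normative) illustration, the expansion above could be
/// written in C roughly as follows; the function name is invented purely
/// for exposition.
///
///     /* Split a packed 128-bit factor v[0..3] into the two expanded
///      * SSE operands lo[0..3] and hi[0..3] described above. */
///     void expand128(const uint32_t v[4], uint32_t lo[4], uint32_t hi[4])
///     {
///             lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff; /* v'_0, v'_1 */
///             lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;  /* v''_0, v''_1 */
///             hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff; /* v'_2, v'_3 */
///             hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;  /* v''_2, v''_3 */
///     }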

///--------------------------------------------------------------------------
/// Macro definitions.

.macro mulcore r, s, d0, d1, d2, d3
        // Load a word r_i from R, multiply by the expanded operand [S], and
        // leave the pieces of the product in registers D0, D1, D2, D3.
        movd    \d0, \r                 // (r_i, 0, 0, 0)
.ifnes "\d1", "nil"
        movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
.endif
.ifnes "\d3", "nil"
        movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
.endif
        pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
.ifnes "\d1", "nil"
        psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
        movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
.else
        movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
.endif
.endif
.ifnes "\d3", "nil"
        psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
        pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
.endif
.ifnes "\d3", "nil"
        pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
        pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
.else
        pmuludq \d2, [\s + 16]
.endif
.endif
        pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
.endm
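
// In scalar terms (exposition only): with r = r_i and s'_j, s''_j as in
// the expanded layout above, mulcore leaves
//
//      D0 = (r s'_0; r s''_0)          D1 = (r s'_1; r s''_1)
//      D2 = (r s'_2; r s''_2)          D3 = (r s'_3; r s''_3)
//
// as (low; high) 64-bit halves; interpreting each D_j as d'_j + d''_j b,
// we have D0 + D1 B + D2 B^2 + D3 B^3 = r_i s.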

.macro accum c0, c1, c2, c3
        paddq   \c0, xmm0
.ifnes "\c1", "nil"
        paddq   \c1, xmm1
.endif
.ifnes "\c2", "nil"
        paddq   \c2, xmm2
.endif
.ifnes "\c3", "nil"
        paddq   \c3, xmm3
.endif
.endm

.macro mulacc r, s, c0, c1, c2, c3, z3p
        // Load a word r_i from R, multiply by the expanded operand [S],
        // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
        // then C3 notionally contains zero, but needs clearing; in practice,
        // we store the product directly rather than attempting to add.  On
        // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
        // is not `t'.
.ifeqs "\z3p", "t"
        mulcore \r, \s, xmm0, xmm1, xmm2, \c3
        accum   \c0, \c1, \c2, nil
.else
        mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
        accum   \c0, \c1, \c2, \c3
.endif
.endm

.macro propout d, c, cc
        // Calculate an output word from C, and store it in D; propagate
        // carries out from C to CC in preparation for a rotation of the
        // carry registers.  On completion, XMM3 is clobbered.  If CC is
        // `nil', then the contribution which would have been added to it is
        // left in C.
        pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
        psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
        pslldq  xmm3, 2                 // (t b, 0)
        paddq   \c, xmm3                // (c' + t b, c'')
        movd    \d, \c
        psrlq   \c, 32                  // floor(c/B)
.ifnes "\cc", "nil"
        paddq   \cc, \c                 // propagate up
.endif
.endm
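
// As a scalar sketch (exposition only): writing c = c' + c'' b and
// cc = cc' + cc'' b, propout computes
//
//      t = c'' mod B;  d = c' + t b;
//      D = d mod B;    cc' += floor(d/B);  cc'' += floor(c''/B);
//
// which is exactly the limited carry-propagation step described under
// `Theory' above.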

.macro endprop d, c, t
        // On entry, C contains a carry register.  On exit, the low 32 bits
        // of the value represented in C are written to D, and the remaining
        // bits are left at the bottom of T.
        movdqa  \t, \c
        psllq   \t, 16                  // (?, c'' b)
        pslldq  \c, 8                   // (0, c')
        paddq   \t, \c                  // (?, c' + c'' b)
        psrldq  \t, 8                   // c' + c'' b
        movd    \d, \t
        psrldq  \t, 4                   // floor((c' + c'' b)/B)
.endm

.macro expand a, b, c, d, z
        // On entry, A and C hold packed 128-bit values, and Z is zero.  On
        // exit, A:B and C:D together hold the same values in expanded
        // form.  If C is `nil', then only expand A to A:B.
        movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
.ifnes "\c", "nil"
        movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
.endif
        punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
        punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
.ifnes "\c", "nil"
        punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
        punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
.endif
        pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
        pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
.ifnes "\c", "nil"
        pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
        pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
.endif
.endm

.macro squash c0, c1, c2, c3, h, t, u
        // On entry, C0, C1, C2, C3 are carry registers representing a value
        // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
        // C3, T, and U are clobbered; and the high bits of Y are stored in
        // H, if this is not `nil'.

        // The first step is to eliminate the `double-prime' pieces -- i.e.,
        // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
        // them into the 32-bit-aligned pieces above and below.  But before
        // we can do that, we must gather them together.
        movdqa  \t, \c0
        movdqa  \u, \c1
        punpcklqdq \t, \c2              // (y'_0, y'_2)
        punpckhqdq \c0, \c2             // (y''_0, y''_2)
        punpcklqdq \u, \c3              // (y'_1, y'_3)
        punpckhqdq \c1, \c3             // (y''_1, y''_3)

        // Now split the double-prime pieces.  The high (up to) 48 bits will
        // go up; the low 16 bits go down.
        movdqa  \c2, \c0
        movdqa  \c3, \c1
        psllq   \c2, 48
        psllq   \c3, 48
        psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
        psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
        psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
        psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
.ifnes "\h", "nil"
        movdqa  \h, \c1
.endif
        pslldq  \c1, 8                  // high part of (0, y''_1)

        paddq   \t, \c2                 // propagate down
        paddq   \u, \c3
        paddq   \t, \c1                 // and up: (y_0, y_2)
        paddq   \u, \c0                 // (y_1, y_3)
.ifnes "\h", "nil"
        psrldq  \h, 8                   // high part of (y''_3, 0)
.endif

        // Finally extract the answer.  This complicated dance is better than
        // storing to memory and loading, because the piecemeal stores
        // inhibit store forwarding.
        movdqa  \c3, \t                 // (y_0, y_1)
        movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
        psrldq  \t, 8                   // (y_2, 0)
        psrlq   \c3, 32                 // (floor(y_0/B), ?)
        paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
        pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
        movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
        psrldq  \u, 8                   // (y_3, 0)
        psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2), ?)
        paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2), ?)
        pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
        psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
        movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
        psrlq   \c3, 32                 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
        paddq   \c3, \u                 // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
        pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
        psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
        psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
.ifnes "\h", "nil"
        movdqa  \t, \c3
        pxor    \u, \u
.endif
        pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
        por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
        por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
        por     \c0, \c2                // y mod B^4
.ifnes "\h", "nil"
        psrlq   \t, 32                  // very high bits of y
        paddq   \h, \t
        punpcklqdq \h, \u               // carry up
.endif
.endm
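
// Scalar sketch of the final extraction (exposition only): with
// y = y_0 + y_1 B + y_2 B^2 + y_3 B^3 as gathered above,
//
//      y^*_0 = y_0 mod B;              c = floor(y_0/B);
//      y^*_1 = (y_1 + c) mod B;        c = floor((y_1 + c)/B);
//      y^*_2 = (y_2 + c) mod B;        c = floor((y_2 + c)/B);
//      y^*_3 = (y_3 + c) mod B;        h = floor((y_3 + c)/B);
//
// so C0 ends up holding (y^*_0, y^*_1, y^*_2, y^*_3) = y mod B^4, and the
// leftover h is carried up into H.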

.macro carryadd
        // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
        // hold the incoming carry registers c0, c1, and c2 representing a
        // carry-in C.
        //
        // On exit, the carry registers, including XMM7, are updated to hold
        // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
        // registers are preserved.
        movd    xmm0, [edi +  0]        // (a_0, 0)
        movd    xmm1, [edi +  4]        // (a_1, 0)
        movd    xmm2, [edi +  8]        // (a_2, 0)
        movd    xmm7, [edi + 12]        // (a_3, 0)

        paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
        paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
        paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
        // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
        // form.  Store the low 128 bits of the represented carry to [EDI] as
        // a packed 128-bit value, and leave the remaining 16 bits in the low
        // 32 bits of XMM4.  On exit, XMM3, XMM5, and XMM6 are clobbered.
        endprologue

        propout [edi +  0], xmm4, xmm5
        propout [edi +  4], xmm5, xmm6
        propout [edi +  8], xmm6, nil
        endprop [edi + 12], xmm6, xmm4
        ret

ENDFUNC

INTFUNC(dmul4)
        // On entry, EDI points to the destination buffer; EAX and EBX point
        // to the packed operands U and X; ECX and EDX point to the expanded
        // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
        // registers c0, c1, and c2; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + U V + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
        propout [edi +  4], xmm5, xmm6

        mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
        propout [edi +  8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(dmla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EAX and EBX point to the
        // packed operands U and X; ECX and EDX point to the expanded
        // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
        // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
        // to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        carryadd

        mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
        propout [edi +  4], xmm5, xmm6

        mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
        propout [edi +  8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mul4zc)
        // On entry, EDI points to the destination buffer; EBX points to a
        // packed operand X; and EDX points to an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the product X Y to [EDI],
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mul4)
        // On entry, EDI points to the destination buffer; EBX points to a
        // packed operand X; EDX points to an expanded operand Y; and XMM4,
        // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
        // and update the carry registers with the carry out.  The registers
        // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mla4zc)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EBX points to a packed operand
        // X; and EDX points to an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]

        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EBX points to a packed operand
        // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
        // the incoming carry registers c0, c1, and c2, representing a
        // carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
        endprologue

        carryadd

        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mmul4)
        // On entry, EDI points to the destination buffer; EAX and EBX point
        // to the packed operands U and N; ECX and ESI point to the expanded
        // operands V and M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
        // must be 16-byte aligned.  (This is not the usual convention, which
        // requires alignment before the call.)
        //
        // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
        // of the sum U V + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
        stalloc 48                      // space for the carries
        endprologue

        // Calculate W = U V, and leave it in the destination.  Stash the
        // carry pieces for later.
        mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
        propout [edi +  0], xmm4, xmm5
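        // The rest of the computation is the same as for mmla4 below, minus
        // the addend, so we just join its code at label 5.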
        jmp     5f

ENDFUNC

INTFUNC(mmla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EAX and EBX point to the
        // packed operands U and N; ECX and ESI point to the expanded
        // operands V and M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
        // must be 16-byte aligned.  (This is not the usual convention, which
        // requires alignment before the call.)
        //
        // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
        // bits of the sum A + U V + N Y to [EDI], leaving the remaining
        // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
        // XMM3, and XMM7 are clobbered; the general-purpose registers are
        // preserved.
        stalloc 48                      // space for the carries
        endprologue

        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]
        mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        movdqa  [esp +  0], xmm4
        movdqa  [esp + 16], xmm5
        movdqa  [esp + 32], xmm6

        // Calculate Y = W M.
        mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7

        mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
        accum   xmm5, xmm6, xmm7, nil

        mulcore [edi +  8], esi, xmm0, xmm1, nil, nil
        accum   xmm6, xmm7, nil, nil

        mulcore [edi + 12], esi, xmm0, nil, nil, nil
        accum   xmm7, nil, nil, nil

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1

        // Expand it.
        pxor    xmm2, xmm2
        expand  xmm4, xmm1, nil, nil, xmm2
        movdqa  [edx +  0], xmm4
        movdqa  [edx + 16], xmm1

        // Initialize the carry from the value for W we calculated earlier.
        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]

        // Finish the calculation by adding the Montgomery product.
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        // Add on the carry we calculated earlier.
        paddq   xmm4, [esp +  0]
        paddq   xmm5, [esp + 16]
        paddq   xmm6, [esp + 32]

        // And, with that, we're done.
        stfree  48
        ret

ENDFUNC

INTFUNC(mont4)
        // On entry, EDI points to the destination buffer holding a packed
        // value W; EBX points to a packed operand N; ESI points to an
        // expanded operand M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).
        //
        // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
        // of the sum W + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
        endprologue

        // Calculate Y = W M.
        mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7

        mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
        accum   xmm5, xmm6, xmm7, nil

        mulcore [edi +  8], esi, xmm0, xmm1, nil, nil
        accum   xmm6, xmm7, nil, nil

        mulcore [edi + 12], esi, xmm0, nil, nil, nil
        accum   xmm7, nil, nil, nil

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1

        // Expand it.
        pxor    xmm2, xmm2
        expand  xmm4, xmm1, nil, nil, xmm2
        movdqa  [edx +  0], xmm4
        movdqa  [edx + 16], xmm1

        // Initialize the carry from W.
        movd    xmm4, [edi +  0]
        movd    xmm5, [edi +  4]
        movd    xmm6, [edi +  8]
        movd    xmm7, [edi + 12]

        // Finish the calculation by adding the Montgomery product.
        mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi +  0], xmm4, xmm5

        mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi +  4], xmm5, xmm6

        mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi +  8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        // And, with that, we're done.
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_sse2)
        // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        av
        //      ebp + 28        avl
        //      ebp + 32        bv
        //      ebp + 36        bvl
        //
        // Locals are relative to ESP, as follows.
        //
        //      esp +  0        expanded Y (32 bytes)
        //      esp + 32        (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 32
        endprologue
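
        // In outline, the function works as follows (a sketch for
        // orientation only; the details live in the code below).
        //
        //      for (each 4-word block b of bv) {
        //         expand b onto the stack;
        //         multiply av by b a block at a time, writing fresh output
        //            on the first pass (mul4zc/mul4) and accumulating into
        //            dv on later passes (mla4zc/mla4);
        //         flush the leftover carry with carryprop;
        //         step the dv cursor on by one block;
        //      }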

        // Prepare for the first iteration.
        mov     esi, [ebp + 32]         // -> bv[0]
        pxor    xmm7, xmm7
        movdqu  xmm0, [esi]             // bv[0]
        mov     edi, [ebp + 20]         // -> dv[0]
        mov     ecx, edi                // outer loop dv cursor
        expand  xmm0, xmm1, nil, nil, xmm7
        mov     ebx, [ebp + 24]         // -> av[0]
        mov     eax, [ebp + 28]         // -> av[m] = av limit
        mov     edx, esp                // -> expanded Y = bv[0]
        movdqa  [esp +  0], xmm0        // bv[0] expanded low
        movdqa  [esp + 16], xmm1        // bv[0] expanded high
        call    mul4zc
        add     ebx, 16
        add     edi, 16
        add     ecx, 16
        add     esi, 16
        cmp     ebx, eax                // all done?
        jae     8f

        .p2align 4
        // Continue with the first iteration.
0:      call    mul4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, eax                // all done?
        jb      0b

        // Write out the leftover carry.  There can be no tail here.
8:      call    carryprop
        cmp     esi, [ebp + 36]         // more passes to do?
        jae     9f

        .p2align 4
        // Set up for the next pass.
1:      movdqu  xmm0, [esi]             // bv[i]
        mov     edi, ecx                // -> dv[i]
        pxor    xmm7, xmm7
        expand  xmm0, xmm1, nil, nil, xmm7
        mov     ebx, [ebp + 24]         // -> av[0]
        movdqa  [esp +  0], xmm0        // bv[i] expanded low
        movdqa  [esp + 16], xmm1        // bv[i] expanded high
        call    mla4zc
        add     edi, 16
        add     ebx, 16
        add     ecx, 16
        add     esi, 16
        cmp     ebx, eax                // done yet?
        jae     8f

        .p2align 4
        // Continue...
0:      call    mla4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, eax
        jb      0b

        // Finish off this pass.  There was no tail on the previous pass, and
        // there can be none on this pass.
8:      call    carryprop
        cmp     esi, [ebp + 36]
        jb      1b

        // All over.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
        // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                            const mpw *nv, size_t n, const mpw *mi);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        av
        //      ebp + 28        bv
        //      ebp + 32        nv
        //      ebp + 36        n (nonzero multiple of 4)
        //      ebp + 40        mi
        //
        // Locals are relative to ESP, which is 4 mod 16, as follows.
        //
        //      esp +   0       outer loop dv
        //      esp +   4       outer loop bv
        //      esp +   8       av limit (mostly in ESI)
        //      esp +  12       expanded V (32 bytes)
        //      esp +  44       expanded M (32 bytes)
        //      esp +  76       expanded Y (32 bytes)
        //      esp + 108       bv limit
        //      esp + 112       (gap)
        //      esp + 124       (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 124
        endprologue
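
        // In outline, each outer-loop pass works as follows (a sketch for
        // orientation only; the details live in the code below).
        //
        //      expand the next block b of bv;
        //      mmul4/mmla4: compute the Montgomery factor y from dv, av,
        //         b, and mi, and start the pass with av[0..3] b + nv[0..3] y;
        //      dmul4/dmla4: continue over the rest of av and nv, two
        //         products per block, accumulating into dv;
        //      carryprop: flush the remaining carry into dv's tail word.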

        // Establish the expanded operands.
        pxor    xmm7, xmm7
        mov     ecx, [ebp + 28]         // -> bv
        mov     edx, [ebp + 40]         // -> mi
        movdqu  xmm0, [ecx]             // bv[0]
        movdqu  xmm2, [edx]             // mi
        expand  xmm0, xmm1, xmm2, xmm3, xmm7
        movdqa  [esp + 12], xmm0        // bv[0] expanded low
        movdqa  [esp + 28], xmm1        // bv[0] expanded high
        movdqa  [esp + 44], xmm2        // mi expanded low
        movdqa  [esp + 60], xmm3        // mi expanded high

        // Set up the outer loop state and prepare for the first iteration.
        mov     edx, [ebp + 36]         // n
        mov     eax, [ebp + 24]         // -> U = av[0]
        mov     ebx, [ebp + 32]         // -> X = nv[0]
        mov     edi, [ebp + 20]         // -> Z = dv[0]
        mov     [esp + 4], ecx
        lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
        lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
        mov     [esp + 0], edi
        mov     [esp + 108], ecx
        mov     [esp + 8], edx
        lea     ecx, [esp + 12]         // -> expanded V = bv[0]
        lea     esi, [esp + 44]         // -> expanded M = mi
        lea     edx, [esp + 76]         // -> space for Y
        call    mmul4
        mov     esi, [esp + 8]          // recover av limit
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi                // done already?
        jae     8f
        mov     [esp + 0], edi

        .p2align 4
        // Complete the first inner loop.
0:      call    dmul4
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi                // done yet?
        jb      0b

        // Still have carries left to propagate.
        call    carryprop
        movd    [edi + 16], xmm4

        .p2align 4
        // Embark on the next iteration.  (There must be one.  If n = 4, then
        // we would have bailed above, to label 8.  Similarly, the subsequent
        // iterations can fall into the inner loop immediately.)
1:      mov     eax, [esp + 4]          // -> bv[i - 1]
        mov     edi, [esp + 0]          // -> Z = dv[i]
        add     eax, 16                 // -> bv[i]
        pxor    xmm7, xmm7
        movdqu  xmm0, [eax]             // bv[i]
        mov     [esp + 4], eax
        cmp     eax, [esp + 108]        // done yet?
        jae     9f
        mov     ebx, [ebp + 32]         // -> X = nv[0]
        lea     esi, [esp + 44]         // -> expanded M = mi
        mov     eax, [ebp + 24]         // -> U = av[0]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 12], xmm0        // bv[i] expanded low
        movdqa  [esp + 28], xmm1        // bv[i] expanded high
        call    mmla4
        mov     esi, [esp + 8]          // recover av limit
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        mov     [esp + 0], edi

        .p2align 4
        // Complete the next inner loop.
0:      call    dmla4
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi
        jb      0b

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that in and add it.
        movd    xmm0, [edi]
        paddq   xmm4, xmm0
        call    carryprop
        movd    [edi + 16], xmm4

        // Back again.
        jmp     1b

        // First iteration was short.  Write out the carries and we're done.
        // (This could be folded into the main loop structure, but that would
        // penalize small numbers more.)
8:      call    carryprop
        movd    [edi + 16], xmm4

        // All done.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
        // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        dvl
        //      ebp + 28        nv
        //      ebp + 32        n (nonzero multiple of 4)
        //      ebp + 36        mi
        //
        // Locals are relative to ESP, as follows.
        //
        //      esp +  0        outer loop dv
        //      esp +  4        outer dv limit
        //      esp +  8        blocks-of-4 dv limit
        //      esp + 12        expanded M (32 bytes)
        //      esp + 44        expanded Y (32 bytes)
        //      esp + 76        (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 76
        endprologue
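
        // In outline, each outer-loop pass works as follows (a sketch for
        // orientation only; the details live in the code below).
        //
        //      mont4: compute y = dv[i..i+3] mi mod B^4, and start adding
        //         nv y into dv at position i;
        //      mla4: continue over the remaining blocks of nv;
        //      ripple the resulting carry up to the end of the dv buffer.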

        // Establish the expanded operands and the blocks-of-4 dv limit.
        mov     edi, [ebp + 20]         // -> Z = dv[0]
        pxor    xmm7, xmm7
        mov     eax, [ebp + 24]         // -> dv[n] = dv limit
        sub     eax, edi                // length of dv in bytes
        mov     edx, [ebp + 36]         // -> mi
        movdqu  xmm0, [edx]             // mi
        and     eax, ~15                // mask off the tail end
        expand  xmm0, xmm1, nil, nil, xmm7
        add     eax, edi                // find limit
        movdqa  [esp + 12], xmm0        // mi expanded low
        movdqa  [esp + 28], xmm1        // mi expanded high
        mov     [esp + 8], eax

        // Set up the outer loop state and prepare for the first iteration.
        mov     ecx, [ebp + 32]         // n
        mov     ebx, [ebp + 28]         // -> X = nv[0]
        lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
        lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
        mov     [esp + 0], edi
        mov     [esp + 4], edx
        lea     esi, [esp + 12]         // -> expanded M = mi
        lea     edx, [esp + 44]         // -> space for Y
        call    mont4
        add     edi, 16
        add     ebx, 16
        cmp     ebx, ecx                // done already?
        jae     8f

        .p2align 4
        // Complete the first inner loop.
5:      call    mla4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, ecx                // done yet?
        jb      5b

        // Still have carries left to propagate.
8:      carryadd
        mov     esi, [esp + 8]          // -> dv blocks limit
        mov     edx, [ebp + 24]         // dv limit
        psllq   xmm7, 16
        pslldq  xmm7, 8
        paddq   xmm6, xmm7
        call    carryprop
        movd    eax, xmm4
        add     edi, 16
        cmp     edi, esi
        jae     7f

        .p2align 4
        // Continue carry propagation until the end of the buffer.
0:      add     [edi], eax
        mov     eax, 0                  // preserves flags
        adcd    [edi + 4], 0
        adcd    [edi + 8], 0
        adcd    [edi + 12], 0
        adc     eax, 0
        add     edi, 16
        cmp     edi, esi
        jb      0b

        // Deal with the tail end.
7:      add     [edi], eax
        mov     eax, 0                  // preserves flags
        add     edi, 4
        adc     eax, 0
        cmp     edi, edx
        jb      7b

        // All done for this iteration.  Start the next.  (This must have at
        // least one follow-on iteration, or we'd not have started this outer
        // loop.)
8:      mov     edi, [esp + 0]          // -> dv[i - 1]
        mov     ebx, [ebp + 28]         // -> X = nv[0]
        lea     edx, [esp + 44]         // -> space for Y
        lea     esi, [esp + 12]         // -> expanded M = mi
        add     edi, 16                 // -> Z = dv[i]
        cmp     edi, [esp + 4]          // all done yet?
        jae     9f
        mov     [esp + 0], edi
        call    mont4
        add     edi, 16
        add     ebx, 16
        jmp     5b

        // All over.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC
///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro cysetup c
        rdtsc
        mov     [\c], eax
        mov     [\c + 4], edx
.endm

.macro cystore c, v, n
        rdtsc
        sub     eax, [\c]
        sbb     edx, [\c + 4]
        mov     ebx, [\v]
        mov     ecx, [\n]
        dec     ecx
        mov     [\n], ecx
        mov     [ebx + ecx*8], eax
        mov     [ebx + ecx*8 + 4], edx
.endm
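
// Between them, cysetup and cystore time one iteration of a benchmark loop:
// cysetup records the timestamp counter at [C], and cystore subtracts that
// from the current TSC, stores the 64-bit difference in the next free slot
// of the cycle array V, and counts N down towards zero (testtail's jnz
// below relies on the flags from the dec here).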

.macro testprologue
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 3*32 + 12
        endprologue
        // vars:
        //      esp +  0 = cycles
        //      esp + 12 = v expanded
        //      esp + 44 = y expanded
        //      esp + 76 = ? expanded
.endm

.macro testepilogue
        dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret
.endm

.macro testldcarry c
        mov     ecx, \c                 // -> c
        movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
        movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
        movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
.endm

.macro testexpand v, y
        pxor    xmm7, xmm7
.ifnes "\v", "nil"
        mov     ecx, \v
        movdqu  xmm0, [ecx]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 12], xmm0
        movdqa  [esp + 28], xmm1
.endif
.ifnes "\y", "nil"
        mov     edx, \y
        movdqu  xmm2, [edx]
        expand  xmm2, xmm3, nil, nil, xmm7
        movdqa  [esp + 44], xmm2
        movdqa  [esp + 60], xmm3
.endif
.endm

.macro testtop u, x, mode
        .p2align 4
0:
.ifnes "\u", "nil"
        lea     ecx, [esp + 12]
.endif
        mov     ebx, \x
.ifeqs "\mode", "mont"
        lea     esi, [esp + 44]
.endif
        cysetup esp + 0
.ifnes "\u", "nil"
        mov     eax, \u
.endif
.ifeqs "\mode", "mont"
        lea     edx, [esp + 76]
.else
        lea     edx, [esp + 44]
.endif
.endm

.macro testtail cyv, n
        cystore esp + 0, \cyv, \n
        jnz     0b
.endm

.macro testcarryout c
        mov     ecx, \c
        movdqu  [ecx +  0], xmm4
        movdqu  [ecx + 16], xmm5
        movdqu  [ecx + 32], xmm6
.endm

FUNC(test_dmul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmul4
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_dmla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmla4
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mul4
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mla4
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mmul4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmul4
        testtail [ebp + 52], [ebp + 48]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mmla4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmla4
        testtail [ebp + 52], [ebp + 48]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mont4)
        testprologue
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 32], mont
        call    mont4
        testtail [ebp + 44], [ebp + 40]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------