math/mpx-mul4-x86-sse2.S: Fix operand name in commentary.
/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Prologue.

        .arch   pentium4
        .text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the accumulated result.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///     Offset     0       4       8      12
///          0   v'_0    v'_1   v''_0   v''_1
///         16   v'_2    v'_3   v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
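///
/// For example (a purely illustrative value): if v_0 = 0x89ab4567, then
/// v'_0 = 0x4567 and v''_0 = 0x89ab, since v_0 = v'_0 + v''_0 b; so the
/// first expanded operand holds 0x00004567 in its lowest 32-bit cell and
/// 0x000089ab in the cell at offset 8.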
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0, now (implicitly)
/// zeroed, becomes c3.
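///
/// To illustrate the output step with concrete (if arbitrary) numbers:
/// suppose c'_0 = 0x123456789 and c''_0 = 0x987654321.  Then t = c''_0 mod B
/// = 0x87654321, and d = c'_0 + t b = 0x876666666789; so we output z = d mod
/// B = 0x66666789, and add (floor(d/B), floor(c''_0/B)) = (0x8766, 0x9) into
/// the two halves of c1.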

///--------------------------------------------------------------------------
/// Macro definitions.

.macro  mulcore r, s, d0, d1, d2, d3
        // Load a word r_i from R, multiply by the expanded operand [S], and
        // leave the pieces of the product in registers D0, D1, D2, D3.
        movd    \d0, \r                 // (r_i, 0, 0, 0)
  .ifnes "\d1", "nil"
        movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
        movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
  .endif
        pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
  .ifnes "\d1", "nil"
        psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
        movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
    .else
        movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
        psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
        pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
        pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
        pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
    .else
        pmuludq \d2, [\s + 16]
    .endif
  .endif
        pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
.endm
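        // A useful check on the decomposition: each of D0, ..., D3 holds a
        // pair (r_i s'_j, r_i s''_j) in its two 64-bit halves, and
        //
        //      r_i s = (r_i s'_0 + r_i s''_0 b)
        //            + (r_i s'_1 + r_i s''_1 b) B
        //            + (r_i s'_2 + r_i s''_2 b) B^2
        //            + (r_i s'_3 + r_i s''_3 b) B^3,
        //
        // which is exactly the shape of value the carry registers represent;
        // so `accum' below can add each D_j directly into the corresponding
        // carry register c_j, with no shuffling required.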

.macro  accum   c0, c1, c2, c3
        paddq   \c0, xmm0
  .ifnes "\c1", "nil"
        paddq   \c1, xmm1
  .endif
  .ifnes "\c2", "nil"
        paddq   \c2, xmm2
  .endif
  .ifnes "\c3", "nil"
        paddq   \c3, xmm3
  .endif
.endm

.macro  mulacc  r, s, c0, c1, c2, c3, z3p
        // Load a word r_i from R, multiply by the expanded operand [S],
        // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
        // then C3 notionally contains zero, but needs clearing; in practice,
        // we store the product directly rather than attempting to add.  On
        // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
        // is not `t'.
  .ifeqs "\z3p", "t"
        mulcore \r, \s, xmm0, xmm1, xmm2, \c3
        accum   \c0, \c1, \c2, nil
  .else
        mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
        accum   \c0, \c1, \c2, \c3
  .endif
.endm

.macro  propout d, c, cc
        // Calculate an output word from C, and store it in D; propagate
        // carries out from C to CC in preparation for a rotation of the
        // carry registers.  On completion, XMM3 is clobbered.  If CC is
        // `nil', then the contribution which would have been added to it is
        // left in C.
        pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
        psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
        pslldq  xmm3, 2                 // (t b, 0)
        paddq   \c, xmm3                // (c' + t b, c'')
        movd    \d, \c
        psrlq   \c, 32                  // floor(c/B)
  .ifnes "\cc", "nil"
        paddq   \cc, \c                 // propagate up
  .endif
.endm

.macro  endprop d, c, t
        // On entry, C contains a carry register.  On exit, the low 32 bits
        // of the value represented in C are written to D, and the remaining
        // bits are left at the bottom of T.
        movdqa  \t, \c
        psllq   \t, 16                  // (?, c'' b)
        pslldq  \c, 8                   // (0, c')
        paddq   \t, \c                  // (?, c' + c'' b)
        psrldq  \t, 8                   // c' + c'' b
        movd    \d, \t
        psrldq  \t, 4                   // floor((c' + c'' b)/B)
.endm

.macro  expand  a, b, c, d, z
        // On entry, A and C hold packed 128-bit values, and Z is zero.  On
        // exit, A:B and C:D together hold the same values in expanded
        // form.  If C is `nil', then only expand A to A:B.
        movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
  .ifnes "\c", "nil"
        movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
  .endif
        punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
        punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
  .ifnes "\c", "nil"
        punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
        punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
  .endif
        pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
        pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
  .ifnes "\c", "nil"
        pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
        pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
  .endif
.endm
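        // For example (illustrative values only): if A holds the packed
        // words (0x89ab4567, 0xcdef0123, ...), then after the punpcklwd step
        // A's 32-bit cells are (0x4567, 0x89ab, 0x0123, 0xcdef), and the
        // final pshufd rearranges them to (0x4567, 0x0123, 0x89ab, 0xcdef),
        // i.e., (a'_0, a'_1, a''_0, a''_1), matching the layout shown in the
        // Theory section above.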

.macro  squash  c0, c1, c2, c3, h, t, u
        // On entry, C0, C1, C2, C3 are carry registers representing a value
        // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
        // C3, T, and U are clobbered; and the high bits of Y are stored in
        // H, if this is not `nil'.

        // The first step is to eliminate the `double-prime' pieces -- i.e.,
        // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
        // them into the 32-bit-aligned pieces above and below.  But before
        // we can do that, we must gather them together.
        movdqa  \t, \c0
        movdqa  \u, \c1
        punpcklqdq \t, \c2              // (y'_0, y'_2)
        punpckhqdq \c0, \c2             // (y''_0, y''_2)
        punpcklqdq \u, \c3              // (y'_1, y'_3)
        punpckhqdq \c1, \c3             // (y''_1, y''_3)

        // Now split the double-prime pieces.  The high (up to) 48 bits will
        // go up; the low 16 bits go down.
        movdqa  \c2, \c0
        movdqa  \c3, \c1
        psllq   \c2, 48
        psllq   \c3, 48
        psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
        psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
        psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
        psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
  .ifnes "\h", "nil"
        movdqa  \h, \c1
  .endif
        pslldq  \c1, 8                  // high part of (0, y''_1)

        paddq   \t, \c2                 // propagate down
        paddq   \u, \c3
        paddq   \t, \c1                 // and up: (y_0, y_2)
        paddq   \u, \c0                 // (y_1, y_3)
  .ifnes "\h", "nil"
        psrldq  \h, 8                   // high part of (y''_3, 0)
  .endif

        // Finally extract the answer.  This complicated dance is better than
        // storing to memory and loading, because the piecemeal stores
        // inhibit store forwarding.
        movdqa  \c3, \t                 // (y_0, y_1)
        movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
        psrldq  \t, 8                   // (y_2, 0)
        psrlq   \c3, 32                 // (floor(y_0/B), ?)
        paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
        pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
        movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
        psrldq  \u, 8                   // (y_3, 0)
        psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2), ?)
        paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2), ?)
        pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
        psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
        movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
        psrlq   \c3, 32                 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
        paddq   \c3, \u                 // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
        pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
        psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
        psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
  .ifnes "\h", "nil"
        movdqa  \t, \c3
        pxor    \u, \u
  .endif
        pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
        por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
        por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
        por     \c0, \c2                // y mod B^4
  .ifnes "\h", "nil"
        psrlq   \t, 32                  // very high bits of y
        paddq   \h, \t
        punpcklqdq \h, \u               // carry up
  .endif
.endm
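        // The identity behind the elimination step above: for each i,
        //
        //      y''_i b = (y''_i mod b) b + floor(y''_i/b) B,
        //
        // so the low sixteen bits of y''_i keep their weight b B^i alongside
        // y'_i, while the remaining (up to 48) bits carry up into the next
        // 32-bit-aligned piece, with weight B^{i+1}.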

.macro  carryadd
        // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
        // hold the incoming carry registers c0, c1, and c2 representing a
        // carry-in C.
        //
        // On exit, the carry registers, including XMM7, are updated to hold
        // C + A; XMM0, XMM1, and XMM2 are clobbered.  The other registers
        // are preserved.
        movd    xmm0, [edi + 0]         // (a_0, 0)
        movd    xmm1, [edi + 4]         // (a_1, 0)
        movd    xmm2, [edi + 8]         // (a_2, 0)
        movd    xmm7, [edi + 12]        // (a_3, 0)
        paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
        paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
        paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
        // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
        // form.  Store the low 128 bits of the represented carry to [EDI] as
        // a packed 128-bit value, and leave the remaining 16 bits in the low
        // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
  endprologue

        propout [edi + 0], xmm4, xmm5
        propout [edi + 4], xmm5, xmm6
        propout [edi + 8], xmm6, nil
        endprop [edi + 12], xmm6, xmm4
        ret

ENDFUNC

INTFUNC(dmul4)
        // On entry, EDI points to the destination buffer; EAX and EBX point
        // to the packed operands U and X; ECX and EDX point to the expanded
        // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
        // registers c0, c1, and c2; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + U V + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
  endprologue

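        // Note the rotation of the carry registers from stanza to stanza
        // below: propout empties the old c0, which re-enters the next stanza
        // as the notionally-zero c3 (hence the `t' flag on the first mulacc
        // of each stanza), while the old c1, c2, c3 take over as the new c0,
        // c1, c2.  The same pattern recurs throughout the primitives below.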
        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

        mulacc  [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
        propout [edi + 4], xmm5, xmm6

        mulacc  [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
        propout [edi + 8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(dmla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EAX and EBX point to the
        // packed operands U and X; ECX and EDX point to the expanded
        // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
        // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
        // to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
  endprologue

        carryadd

        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

        mulacc  [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
        propout [edi + 4], xmm5, xmm6

        mulacc  [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
        propout [edi + 8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mul4zc)
        // On entry, EDI points to the destination buffer; EBX points to a
        // packed operand X; and EDX points to an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the product X Y to [EDI],
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
  endprologue

        mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
        propout [edi + 0], xmm4, xmm5

        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mul4)
        // On entry, EDI points to the destination buffer; EBX points to a
        // packed operand X; EDX points to an expanded operand Y; and XMM4,
        // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
        // and update the carry registers with the carry out.  The registers
        // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
  endprologue

        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
        propout [edi + 0], xmm4, xmm5

        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mla4zc)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EBX points to a packed operand
        // X; and EDX points to an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
  endprologue

        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
        movd    xmm7, [edi + 12]

        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EBX points to a packed operand
        // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
        // the incoming carry registers c0, c1, and c2, representing a
        // carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + X Y to
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
  endprologue

        carryadd

        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        ret

ENDFUNC

INTFUNC(mmul4)
        // On entry, EDI points to the destination buffer; EAX and EBX point
        // to the packed operands U and N; ECX and ESI point to the expanded
        // operands V and M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
        // must be 16-byte aligned.  (This is not the usual convention, which
        // requires alignment before the call.)
        //
        // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
        // of the sum U V + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
        stalloc 48                      // space for the carries
  endprologue

        // Calculate W = U V, and leave it in the destination.  Stash the
        // carry pieces for later.
        mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
        propout [edi + 0], xmm4, xmm5
        jmp     5f
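        // The rest of the work is shared with mmla4 below, whose label 5 we
        // jump into: from that point on, the code makes no further reference
        // to the addend A, so the two paths coincide.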

ENDFUNC

INTFUNC(mmla4)
        // On entry, EDI points to the destination buffer, which also
        // contains an addend A to accumulate; EAX and EBX point
        // to the packed operands U and N; ECX and ESI point to the expanded
        // operands V and M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
        // must be 16-byte aligned.  (This is not the usual convention, which
        // requires alignment before the call.)
        //
        // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
        // bits of the sum A + U V + N Y to [EDI], leaving the remaining
        // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
        // XMM3, and XMM7 are clobbered; the general-purpose registers are
        // preserved.
        stalloc 48                      // space for the carries
  endprologue

        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
        movd    xmm7, [edi + 12]
        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

5:      mulacc  [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        movdqa  [esp + 0], xmm4
        movdqa  [esp + 16], xmm5
        movdqa  [esp + 32], xmm6

        // Calculate Y = W M.
        mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7

        mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
        accum   xmm5, xmm6, xmm7, nil

        mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
        accum   xmm6, xmm7, nil, nil

        mulcore [edi + 12], esi, xmm0, nil, nil, nil
        accum   xmm7, nil, nil, nil

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1

        // Expand it.
        pxor    xmm2, xmm2
        expand  xmm4, xmm1, nil, nil, xmm2
        movdqa  [edx + 0], xmm4
        movdqa  [edx + 16], xmm1

        // Initialize the carry from the value for W we calculated earlier.
        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
        movd    xmm7, [edi + 12]

        // Finish the calculation by adding the Montgomery product.
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        // Add on the carry we calculated earlier.
        paddq   xmm4, [esp + 0]
        paddq   xmm5, [esp + 16]
        paddq   xmm6, [esp + 32]

        // And, with that, we're done.
        stfree  48
        ret

ENDFUNC
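        // A remark on why this works, assuming the usual Montgomery setup in
        // which the expanded factor M (the `mi' argument of the bulk
        // functions below) satisfies N M == -1 modulo the block base 2^128:
        // setting Y = (A + U V) M makes
        //
        //      A + U V + N Y == (A + U V)(1 + N M) == 0 (mod 2^128),
        //
        // so the low 128 bits written back to [EDI] come out as zero, and
        // the interesting part of the result lies entirely in the carry
        // registers handed back to the caller.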

INTFUNC(mont4)
        // On entry, EDI points to the destination buffer holding a packed
        // value W; EBX points to a packed operand N; ESI points to an
        // expanded operand M; and EDX points to a place to store an expanded
        // result Y (32 bytes, at a 16-byte boundary).
        //
        // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
        // of the sum W + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
  endprologue

        // Calculate Y = W M.
        mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7

        mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
        accum   xmm5, xmm6, xmm7, nil

        mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
        accum   xmm6, xmm7, nil, nil

        mulcore [edi + 12], esi, xmm0, nil, nil, nil
        accum   xmm7, nil, nil, nil

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1

        // Expand it.
        pxor    xmm2, xmm2
        expand  xmm4, xmm1, nil, nil, xmm2
        movdqa  [edx + 0], xmm4
        movdqa  [edx + 16], xmm1

        // Initialize the carry from W.
        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
        movd    xmm7, [edi + 12]

        // Finish the calculation by adding the Montgomery product.
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5

        mulacc  [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
        propout [edi + 4], xmm5, xmm6

        mulacc  [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
        propout [edi + 8], xmm6, xmm7

        mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
        propout [edi + 12], xmm7, xmm4

        // And, with that, we're done.
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.
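///
/// In the comments below, operands are indexed in blocks of four 32-bit
/// words, matching the primitives above: `-> bv[i]' means the ith four-word
/// block of bv, so the limit `bv[n/4]' lies n words past bv[0].  Each outer
/// pass expands one block of the multiplier, and the inner loop walks a
/// four-word window along the other operand, finishing each pass with
/// carryprop.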

FUNC(mpx_umul4_x86_sse2)
        // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        av
        //      ebp + 28        avl
        //      ebp + 32        bv
        //      ebp + 36        bvl
        //
        // Locals are relative to ESP, as follows.
        //
        //      esp +  0        expanded Y (32 bytes)
        //      esp + 32        (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 32
  endprologue

        // Prepare for the first iteration.
        mov     esi, [ebp + 32]         // -> bv[0]
        pxor    xmm7, xmm7
        movdqu  xmm0, [esi]             // bv[0]
        mov     edi, [ebp + 20]         // -> dv[0]
        mov     ecx, edi                // outer loop dv cursor
        expand  xmm0, xmm1, nil, nil, xmm7
        mov     ebx, [ebp + 24]         // -> av[0]
        mov     eax, [ebp + 28]         // -> av[m] = av limit
        mov     edx, esp                // -> expanded Y = bv[0]
        movdqa  [esp + 0], xmm0         // bv[0] expanded low
        movdqa  [esp + 16], xmm1        // bv[0] expanded high
        call    mul4zc
        add     ebx, 16
        add     edi, 16
        add     ecx, 16
        add     esi, 16
        cmp     ebx, eax                // all done?
        jae     8f

        .p2align 4
        // Continue with the first iteration.
0:      call    mul4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, eax                // all done?
        jb      0b

        // Write out the leftover carry.  There can be no tail here.
8:      call    carryprop
        cmp     esi, [ebp + 36]         // more passes to do?
        jae     9f

        .p2align 4
        // Set up for the next pass.
1:      movdqu  xmm0, [esi]             // bv[i]
        mov     edi, ecx                // -> dv[i]
        pxor    xmm7, xmm7
        expand  xmm0, xmm1, nil, nil, xmm7
        mov     ebx, [ebp + 24]         // -> av[0]
        movdqa  [esp + 0], xmm0         // bv[i] expanded low
        movdqa  [esp + 16], xmm1        // bv[i] expanded high
        call    mla4zc
        add     edi, 16
        add     ebx, 16
        add     ecx, 16
        add     esi, 16
        cmp     ebx, eax                // done yet?
        jae     8f

        .p2align 4
        // Continue...
0:      call    mla4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, eax
        jb      0b

        // Finish off this pass.  There was no tail on the previous pass, and
        // there can be none on this pass.
8:      call    carryprop
        cmp     esi, [ebp + 36]
        jb      1b

        // All over.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
        // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                            const mpw *nv, size_t n, const mpw *mi);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        av
        //      ebp + 28        bv
        //      ebp + 32        nv
        //      ebp + 36        n (nonzero multiple of 4)
        //      ebp + 40        mi
        //
        // Locals are relative to ESP, which is 4 mod 16, as follows.
        //
        //      esp +   0       outer loop dv
        //      esp +   4       outer loop bv
        //      esp +   8       av limit (mostly in ESI)
        //      esp +  12       expanded V (32 bytes)
        //      esp +  44       expanded M (32 bytes)
        //      esp +  76       expanded Y (32 bytes)
        //      esp + 108       bv limit
        //      esp + 112       (gap)
        //      esp + 124       (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 124
  endprologue

        // Establish the expanded operands.
        pxor    xmm7, xmm7
        mov     ecx, [ebp + 28]         // -> bv
        mov     edx, [ebp + 40]         // -> mi
        movdqu  xmm0, [ecx]             // bv[0]
        movdqu  xmm2, [edx]             // mi
        expand  xmm0, xmm1, xmm2, xmm3, xmm7
        movdqa  [esp + 12], xmm0        // bv[0] expanded low
        movdqa  [esp + 28], xmm1        // bv[0] expanded high
        movdqa  [esp + 44], xmm2        // mi expanded low
        movdqa  [esp + 60], xmm3        // mi expanded high

        // Set up the outer loop state and prepare for the first iteration.
        mov     edx, [ebp + 36]         // n
        mov     eax, [ebp + 24]         // -> U = av[0]
        mov     ebx, [ebp + 32]         // -> X = nv[0]
        mov     edi, [ebp + 20]         // -> Z = dv[0]
        mov     [esp + 4], ecx
        lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
        lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
        mov     [esp + 0], edi
        mov     [esp + 108], ecx
        mov     [esp + 8], edx
        lea     ecx, [esp + 12]         // -> expanded V = bv[0]
        lea     esi, [esp + 44]         // -> expanded M = mi
        lea     edx, [esp + 76]         // -> space for Y
        call    mmul4
        mov     esi, [esp + 8]          // recover av limit
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi                // done already?
        jae     8f
        mov     [esp + 0], edi

        .p2align 4
        // Complete the first inner loop.
0:      call    dmul4
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi                // done yet?
        jb      0b

        // Still have carries left to propagate.
        call    carryprop
        movd    [edi + 16], xmm4

        .p2align 4
        // Embark on the next iteration.  (There must be one: if the numbers
        // were only a single block long, we would have bailed above, to
        // label 8.  Similarly, the subsequent iterations can fall into the
        // inner loop immediately.)
1:      mov     eax, [esp + 4]          // -> bv[i - 1]
        mov     edi, [esp + 0]          // -> Z = dv[i]
        add     eax, 16                 // -> bv[i]
        pxor    xmm7, xmm7
        movdqu  xmm0, [eax]             // bv[i]
        mov     [esp + 4], eax
        cmp     eax, [esp + 108]        // done yet?
        jae     9f
        mov     ebx, [ebp + 32]         // -> X = nv[0]
        lea     esi, [esp + 44]         // -> expanded M = mi
        mov     eax, [ebp + 24]         // -> U = av[0]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 12], xmm0        // bv[i] expanded low
        movdqa  [esp + 28], xmm1        // bv[i] expanded high
        call    mmla4
        mov     esi, [esp + 8]          // recover av limit
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        mov     [esp + 0], edi

        .p2align 4
        // Complete the next inner loop.
0:      call    dmla4
        add     edi, 16
        add     eax, 16
        add     ebx, 16
        cmp     eax, esi
        jb      0b

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that in and add it.
        movd    xmm0, [edi]
        paddq   xmm4, xmm0
        call    carryprop
        movd    [edi + 16], xmm4

        // Back again.
        jmp     1b

        // First iteration was short.  Write out the carries and we're done.
        // (This could be folded into the main loop structure, but that would
        // penalize small numbers more.)
8:      call    carryprop
        movd    [edi + 16], xmm4

        // All done.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
        // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);

        // Build a stack frame.  Arguments will be relative to EBP, as
        // follows.
        //
        //      ebp + 20        dv
        //      ebp + 24        dvl
        //      ebp + 28        nv
        //      ebp + 32        n (nonzero multiple of 4)
        //      ebp + 36        mi
        //
        // Locals are relative to ESP, as follows.
        //
        //      esp +  0        outer loop dv
        //      esp +  4        outer dv limit
        //      esp +  8        blocks-of-4 dv limit
        //      esp + 12        expanded M (32 bytes)
        //      esp + 44        expanded Y (32 bytes)
        //      esp + 76        (top of locals)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 76
  endprologue

        // Establish the expanded operands and the blocks-of-4 dv limit.
        mov     edi, [ebp + 20]         // -> Z = dv[0]
        pxor    xmm7, xmm7
        mov     eax, [ebp + 24]         // -> dv[n] = dv limit
        sub     eax, edi                // length of dv in bytes
        mov     edx, [ebp + 36]         // -> mi
        movdqu  xmm0, [edx]             // mi
        and     eax, ~15                // mask off the tail end
        expand  xmm0, xmm1, nil, nil, xmm7
        add     eax, edi                // find limit
        movdqa  [esp + 12], xmm0        // mi expanded low
        movdqa  [esp + 28], xmm1        // mi expanded high
        mov     [esp + 8], eax

        // Set up the outer loop state and prepare for the first iteration.
        mov     ecx, [ebp + 32]         // n
        mov     ebx, [ebp + 28]         // -> X = nv[0]
        lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
        lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
        mov     [esp + 0], edi
        mov     [esp + 4], edx
        lea     esi, [esp + 12]         // -> expanded M = mi
        lea     edx, [esp + 44]         // -> space for Y
        call    mont4
        add     edi, 16
        add     ebx, 16
        cmp     ebx, ecx                // done already?
        jae     8f

        .p2align 4
        // Complete the first inner loop.
5:      call    mla4
        add     ebx, 16
        add     edi, 16
        cmp     ebx, ecx                // done yet?
        jb      5b

        // Still have carries left to propagate.
8:      carryadd
        mov     esi, [esp + 8]          // -> dv blocks limit
        mov     edx, [ebp + 24]         // dv limit
        psllq   xmm7, 16
        pslldq  xmm7, 8
        paddq   xmm6, xmm7
        call    carryprop
        movd    eax, xmm4
        add     edi, 16
        cmp     edi, esi
        jae     7f

        .p2align 4
        // Continue carry propagation until the end of the buffer.
0:      add     [edi], eax
        mov     eax, 0                  // preserves flags
        adcd    [edi + 4], 0
        adcd    [edi + 8], 0
        adcd    [edi + 12], 0
        adc     eax, 0
        add     edi, 16
        cmp     edi, esi
        jb      0b

        // Deal with the tail end.  Note that we must capture the carry out
        // of the addition before stepping the pointer, since `add edi, 4'
        // would clobber the flags.
7:      add     [edi], eax
        mov     eax, 0                  // preserves flags
        adc     eax, 0
        add     edi, 4
        cmp     edi, edx
        jb      7b

        // All done for this iteration.  Start the next.  (This must have at
        // least one follow-on iteration, or we'd not have started this outer
        // loop.)
8:      mov     edi, [esp + 0]          // -> dv[i - 1]
        mov     ebx, [ebp + 28]         // -> X = nv[0]
        lea     edx, [esp + 44]         // -> space for Y
        lea     esi, [esp + 12]         // -> expanded M = mi
        add     edi, 16                 // -> Z = dv[i]
        cmp     edi, [esp + 4]          // all done yet?
        jae     9f
        mov     [esp + 0], edi
        call    mont4
        add     edi, 16
        add     ebx, 16
        jmp     5b

        // All over.
9:      dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro  cysetup c
        rdtsc
        mov     [\c], eax
        mov     [\c + 4], edx
.endm

.macro  cystore c, v, n
        rdtsc
        sub     eax, [\c]
        sbb     edx, [\c + 4]
        mov     ebx, [\v]
        mov     ecx, [\n]
        dec     ecx
        mov     [\n], ecx
        mov     [ebx + ecx*8], eax
        mov     [ebx + ecx*8 + 4], edx
.endm
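        // These two macros bracket each call to a primitive under test:
        // cysetup records a 64-bit RDTSC timestamp at [C], and cystore
        // subtracts that timestamp from a fresh reading, decrements the
        // iteration counter at [N], and files the 64-bit cycle count in the
        // next slot of the result vector at [V].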

.macro  testprologue
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, ~15
        sub     esp, 3*32 + 12
  endprologue
        // vars:
        //      esp +  0 = cycles
        //      esp + 12 = v expanded
        //      esp + 44 = y expanded
        //      esp + 76 = ? expanded
.endm

.macro  testepilogue
        dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret
.endm

.macro  testldcarry c
        mov     ecx, \c                 // -> c
        movdqu  xmm4, [ecx + 0]         // (c'_0, c''_0)
        movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
        movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
.endm

.macro  testexpand v, y
        pxor    xmm7, xmm7
  .ifnes "\v", "nil"
        mov     ecx, \v
        movdqu  xmm0, [ecx]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 12], xmm0
        movdqa  [esp + 28], xmm1
  .endif
  .ifnes "\y", "nil"
        mov     edx, \y
        movdqu  xmm2, [edx]
        expand  xmm2, xmm3, nil, nil, xmm7
        movdqa  [esp + 44], xmm2
        movdqa  [esp + 60], xmm3
  .endif
.endm

.macro  testtop u, x, mode
        .p2align 4
0:
  .ifnes "\u", "nil"
        lea     ecx, [esp + 12]
  .endif
        mov     ebx, \x
  .ifeqs "\mode", "mont"
        lea     esi, [esp + 44]
  .endif
        cysetup esp + 0
  .ifnes "\u", "nil"
        mov     eax, \u
  .endif
  .ifeqs "\mode", "mont"
        lea     edx, [esp + 76]
  .else
        lea     edx, [esp + 44]
  .endif
.endm

.macro  testtail cyv, n
        cystore esp + 0, \cyv, \n
        jnz     0b
.endm

.macro  testcarryout c
        mov     ecx, \c
        movdqu  [ecx + 0], xmm4
        movdqu  [ecx + 16], xmm5
        movdqu  [ecx + 32], xmm6
.endm

FUNC(test_dmul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmul4
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_dmla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmla4
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mul4
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mla4
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mmul4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmul4
        testtail [ebp + 52], [ebp + 48]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mmla4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmla4
        testtail [ebp + 52], [ebp + 48]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

FUNC(test_mont4)
        testprologue
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 32], mont
        call    mont4
        testtail [ebp + 44], [ebp + 40]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------