/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	pentium4

	.text
///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// result, appropriately shifted, to the accumulated result.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	  0	  4	  8	 12
///	     0	 v'_0	 v'_1	v''_0	v''_1
///	    16	 v'_2	 v'_3	v''_2	v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0, (implicitly)
/// zeroed, becomes c3.
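///
/// For concreteness, take v = 0x00010002_00030004_00050006_00070008, writing
/// the most significant word first, so that v_3 = 0x00010002 and
/// v_0 = 0x00070008.  The two expanded operands then hold the 32-bit cells
///
///	[0x0008, 0x0006, 0x0007, 0x0005]	// v'_0, v'_1, v''_0, v''_1
///	[0x0004, 0x0002, 0x0003, 0x0001]	// v'_2, v'_3, v''_2, v''_3
///
/// (a worked instance of the layout above, not anything the code depends
/// on).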

///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0; 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3; ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?; r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2; r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0; r_i s''_0)
.endm
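
// For exposition, the same computation as a C sketch (illustrative only:
// `mulcore_ref' and its flat array arguments are inventions of this note;
// the macro above leaves its results in XMM registers instead):
//
//	#include <stdint.h>
//
//	void mulcore_ref(uint32_t r, const uint32_t s[8], uint64_t d[4][2])
//	{
//		/* s[] = { s'_0, s'_1, s''_0, s''_1,
//			   s'_2, s'_3, s''_2, s''_3 } as laid out above */
//		static const int lo[4] = { 0, 1, 4, 5 },
//				 hi[4] = { 2, 3, 6, 7 };
//		for (int j = 0; j < 4; j++) {
//			d[j][0] = (uint64_t)r*s[lo[j]];	/* r_i s'_j */
//			d[j][1] = (uint64_t)r*s[hi[j]];	/* r_i s''_j */
//		}
//	}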

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm
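
// For exposition: if C represents c = c' + c'' b, then `propout' computes
// t = c'' mod B and d = c' + t b, stores the output word z = d mod B, and
// carries (floor(d/B); floor(c''/B)) up into CC -- precisely the limited
// carry-propagation step described in the theory section above.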

.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?; c'' b)
	pslldq	\c, 8			// (0; c')
	paddq	\t, \c			// (?; c' + c'' b)
	psrldq	\t, 8			// (c' + c'' b; 0) = (c; 0)
	movd	\d, \t
	psrldq	\t, 4			// (floor(c/B); 0)
.endm

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1; c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0; a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm
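
// For exposition, the effect of `expand' on a single operand, as a C sketch
// (illustrative only: `expand_ref' is an invention of this note; a[] is the
// packed input, lo[]/hi[] the two expanded outputs of 32-bit cells):
//
//	#include <stdint.h>
//
//	void expand_ref(const uint32_t a[4], uint32_t lo[4], uint32_t hi[4])
//	{
//		for (int i = 0; i < 2; i++) {
//			lo[i] = a[i] & 0xffff;		/* a'_i */
//			lo[i + 2] = a[i] >> 16;		/* a''_i */
//			hi[i] = a[i + 2] & 0xffff;	/* a'_{i+2} */
//			hi[i + 2] = a[i + 2] >> 16;	/* a''_{i+2} */
//		}
//	}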

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0; y'_2)
	punpckhqdq \c0, \c2		// (y''_0; y''_2)
	punpcklqdq \u, \c3		// (y'_1; y'_3)
	punpckhqdq \c1, \c3		// (y''_1; y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
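	// (For exposition: each double-prime piece contributes y''_i b =
	// (y''_i mod 2^16) b + floor(y''_i/2^16) B; the first term is added
	// down into the 32-bit-aligned piece below, and the second up into
	// the piece above, which carries weight B relative to it.)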
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0; y''_2)
	psrlq	\c1, 16			// high parts of (y''_1; y''_3)
	psrlq	\c2, 32			// low parts of (y''_0; y''_2)
	psrlq	\c3, 32			// low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0; y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0; y_2)
	paddq	\u, \c0			// (y_1; y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3; 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0; ?)
	movdqa	\lo, \t			// (y^*_0, ?; ?, ?)
	psrldq	\t, 8			// (y_2; 0)
	psrlq	\c3, 32			// (floor(y_0/B); ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B); ?)
	movdqa	\c1, \c3		// (y^*_1, ?; ?, ?)
	psrldq	\u, 8			// (y_3; 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm

.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, and XMM2 are clobbered.  The other registers
	// are preserved.
	movd	xmm0, [edi +  0]	// (a_0; 0)
	movd	xmm1, [edi +  4]	// (a_1; 0)
	movd	xmm2, [edi +  8]	// (a_2; 0)
	movd	xmm7, [edi + 12]	// (a_3; 0)

	paddq	xmm4, xmm0		// (c'_0 + a_0; c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1; c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2; c''_2)
.endm
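
// (Note, for exposition: the fourth addend word a_3 simply becomes the new
// carry register c3 in XMM7.  The carry-propagation step of mpxmont_redc4
// below instead shifts a_3 b into the high half of c2 after invoking this
// macro.)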

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5, and XMM6 are clobbered.
	endprologue

	propout	[edi +  0], xmm4, xmm5
	propout	[edi +  4], xmm5, xmm6
	propout	[edi +  8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4
	ret
ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulcore	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	jmp	5f
ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Calculate W = A + U V, and leave it in the destination.  Stash
	// the carry pieces for later.
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

5:	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[SP +  0], xmm4
	movdqa	[SP + 16], xmm5
	movdqa	[SP + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [SP +  0]
	paddq	xmm5, [SP + 16]
	paddq	xmm6, [SP + 32]

	// And, with that, we're done.
	stfree	48 + 12
	ret
ENDFUNC

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	endprologue

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

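	// For exposition, the overall effect in C terms (a sketch only,
	// using Catacomb's mpw/mpd word types and the MPW()/MPW_BITS
	// helpers; it ignores the 4-word blocking and the expanded
	// representation that the code below uses):
	//
	//	void ref(mpw *dv, const mpw *av, const mpw *avl,
	//		 const mpw *bv, const mpw *bvl)
	//	{
	//		size_t m = avl - av, n = bvl - bv;
	//		mpd c;			/* double-width accumulator */
	//
	//		for (size_t j = 0; j < n; j++) {
	//			c = 0;
	//			for (size_t i = 0; i < m; i++) {
	//				c += (mpd)av[i]*bv[j] +
	//					(j ? dv[i + j] : 0);
	//				dv[i + j] = MPW(c); c >>= MPW_BITS;
	//			}
	//			dv[j + m] = MPW(c);
	//		}
	//	}
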
	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	av
	//	BP + 28	avl
	//	BP + 32	bv
	//	BP + 36	bvl
	//
	// Locals are relative to SP, as follows.
	//
	//	SP +  0	expanded Y (32 bytes)
	//	SP + 32	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	stalloc	32
	and	SP, ~15
	endprologue

	// Prepare for the first iteration.
	mov	esi, [BP + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [BP + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [BP + 24]		// -> av[0]
	mov	eax, [BP + 28]		// -> av[m] = av limit
	mov	edx, SP			// -> expanded Y = bv[0]
	movdqa	[SP +  0], xmm0		// bv[0] expanded low
	movdqa	[SP + 16], xmm1		// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [BP + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [BP + 24]		// -> av[0]
	movdqa	[SP +  0], xmm0		// bv[i] expanded low
	movdqa	[SP + 16], xmm1		// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [BP + 36]
	jb	1b

	// All over.
9:	dropfp
	pop	edi
	pop	esi
	pop	ebx
	pop	BP
	ret
ENDFUNC

FUNC(mpxmont_mul4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

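	// For exposition: mi is expected to hold -nv^{-1} mod B^4, as set
	// up by Catacomb's Montgomery machinery, and R = B^n.  Each outer
	// pass chooses a 4-word y so that the bottom four words of the
	// running sum av bv + y nv vanish; after n/4 passes, dv holds a
	// value congruent to av bv R^{-1} (mod nv).
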
	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	av
	//	BP + 28	bv
	//	BP + 32	nv
	//	BP + 36	n (nonzero multiple of 4)
	//	BP + 40	mi
	//
	// Locals are relative to SP, which is 16-byte aligned, as follows.
	//
	//	SP +   0	expanded V (32 bytes)
	//	SP +  32	expanded M (32 bytes)
	//	SP +  64	expanded Y (32 bytes)
	//	SP +  96	outer loop dv
	//	SP + 100	outer loop bv
	//	SP + 104	av limit (mostly in ESI)
	//	SP + 108	bv limit
	//	SP + 112	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	stalloc	112
	and	SP, ~15
	endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [BP + 28]		// -> bv
	mov	edx, [BP + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[SP +  0], xmm0		// bv[0] expanded low
	movdqa	[SP + 16], xmm1		// bv[0] expanded high
	movdqa	[SP + 32], xmm2		// mi expanded low
	movdqa	[SP + 48], xmm3		// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [BP + 36]		// n
	mov	eax, [BP + 24]		// -> U = av[0]
	mov	ebx, [BP + 32]		// -> X = nv[0]
	mov	edi, [BP + 20]		// -> Z = dv[0]
	mov	[SP + 100], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[SP + 96], edi
	mov	[SP + 104], edx
	mov	[SP + 108], ecx
	lea	ecx, [SP + 0]		// -> expanded V = bv[0]
	lea	esi, [SP + 32]		// -> expanded M = mi
	lea	edx, [SP + 64]		// -> space for Y
	call	mmul4
	mov	esi, [SP + 104]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[SP + 96], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 4, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [SP + 100]		// -> bv[i - 1]
	mov	edi, [SP + 96]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	mov	[SP + 100], eax
	cmp	eax, [SP + 108]		// done yet?
	jae	9f
	movdqu	xmm0, [eax]		// bv[i]
	mov	ebx, [BP + 32]		// -> X = nv[0]
	lea	esi, [SP + 32]		// -> expanded M = mi
	mov	eax, [BP + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[SP +  0], xmm0		// bv[i] expanded low
	movdqa	[SP + 16], xmm1		// bv[i] expanded high
	call	mmla4
	mov	esi, [SP + 104]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[SP + 96], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
ENDFUNC

FUNC(mpxmont_redc4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);

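	// For exposition: this is Montgomery reduction.  With mi again
	// expected to hold -nv^{-1} mod B^4, each outer iteration chooses a
	// 4-word y making dv + y nv divisible by B^4; after n/4 iterations
	// the bottom of the buffer holds a value congruent to the original
	// dv times R^{-1} (mod nv), where R = B^n.
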
	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	dvl
	//	BP + 28	nv
	//	BP + 32	n (nonzero multiple of 4)
	//	BP + 36	mi
	//
	// Locals are relative to SP, as follows.
	//
	//	SP +  0	outer loop dv
	//	SP +  4	outer dv limit
	//	SP +  8	blocks-of-4 dv limit
	//	SP + 12	expanded M (32 bytes)
	//	SP + 44	expanded Y (32 bytes)
	//	SP + 76	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	SP, ~15
	stalloc	76
	endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [BP + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [BP + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [BP + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[SP + 12], xmm0		// mi expanded low
	movdqa	[SP + 28], xmm1		// mi expanded high
	mov	[SP + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [BP + 32]		// n
	mov	ebx, [BP + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[SP + 0], edi
	mov	[SP + 4], edx
	lea	esi, [SP + 12]		// -> expanded M = mi
	lea	edx, [SP + 44]		// -> space for Y
	call	mont4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [SP + 8]		// -> dv blocks limit
	mov	edx, [BP + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [SP + 0]		// -> dv[i - 1]
	mov	ebx, [BP + 28]		// -> X = nv[0]
	lea	edx, [SP + 44]		// -> space for Y
	lea	esi, [SP + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [SP + 4]		// all done yet?
	jae	9f
	mov	[SP + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm
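
// For exposition: `cysetup' records the 64-bit timestamp counter, and
// `cystore' reads it again, computes the cycle delta, and stores it in the
// next free slot of the caller-supplied vector [V], counting [N] down
// towards zero.  (The Z flag from the `dec' survives the stores, so
// `testtail' can branch on it.)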

.macro	testprologue n
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	stalloc	3*32 + 4*4
	and	SP, ~15
	endprologue
	mov	eax, \n
	mov	[SP + 104], eax
	// vars:
	//	SP +   0 = v expanded
	//	SP +  32 = y expanded
	//	SP +  64 = expanded result Y (for the Montgomery ops)
	//	SP +  96 = cycles
	//	SP + 104 = count
.endm

.macro	testepilogue
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
.endm

.macro	testldcarry c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx +  0]	// (c'_0; c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1; c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2; c''_2)
.endm

.macro	testexpand v=nil, y=nil
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[SP +  0], xmm0
	movdqa	[SP + 16], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[SP + 32], xmm2
	movdqa	[SP + 48], xmm3
  .endif
.endm

.macro	testtop	u=nil, x=nil, mode=nil
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [SP + 0]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [SP + 32]
  .endif
	cysetup	SP + 96
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [SP + 64]
  .else
	lea	edx, [SP + 32]
  .endif
.endm

.macro	testtail cyv
	cystore	SP + 96, \cyv, SP + 104
	jnz	0b
.endm

.macro	testcarryout c
	mov	ecx, \c
	movdqu	[ecx +  0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

FUNC(test_dmul4)
	testprologue [BP + 44]
	testldcarry [BP + 24]
	testexpand [BP + 36], [BP + 40]
	mov	edi, [BP + 20]
	testtop	[BP + 28], [BP + 32]
	call	dmul4
	testtail [BP + 48]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue [BP + 44]
	testldcarry [BP + 24]
	testexpand [BP + 36], [BP + 40]
	mov	edi, [BP + 20]
	testtop	[BP + 28], [BP + 32]
	call	dmla4
	testtail [BP + 48]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mul4
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4zc)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mul4zc
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mla4
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4zc)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mla4zc
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue [BP + 48]
	testexpand [BP + 40], [BP + 44]
	mov	edi, [BP + 20]
	testtop	[BP + 32], [BP + 36], mont
	call	mmul4
	testtail [BP + 52]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue [BP + 48]
	testexpand [BP + 40], [BP + 44]
	mov	edi, [BP + 20]
	testtop	[BP + 32], [BP + 36], mont
	call	mmla4
	testtail [BP + 52]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue [BP + 40]
	testexpand nil, [BP + 36]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 32], mont
	call	mont4
	testtail [BP + 44]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------