/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	pentium4

	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the result so far.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
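///
/// (For orientation: the identity behind the Montgomery passes is that, if
/// the Montgomery factor m satisfies m == -n^{-1} modulo the word base,
/// then y = w m mod base makes w + n y exactly divisible by the base, so
/// the lowest word of the running total can be dropped and the rest
/// shifted down.  This is standard Montgomery arithmetic, noted here for
/// readers new to it; the code below simply takes m as given.)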
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	   0	   4	   8	  12
///	     0	v'_0	v'_1	v''_0	v''_1
///	    16	v'_2	v'_3	v''_2	v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
/// many products together before we must deal with carrying; it also allows
/// for some calculations to be performed on the above expanded form.
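///
/// (As a purely illustrative reference model -- C-style pseudocode; the
/// function name and types here are ours, not part of Catacomb's API --
/// the expansion of a 128-bit operand v looks like this:
///
///	void expand128(const uint32_t v[4],
///		       uint32_t lo[4], uint32_t hi[4])
///	{
///	  lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff;	/* v'_0, v'_1 */
///	  lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;	/* v''_0, v''_1 */
///	  hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff;	/* v'_2, v'_3 */
///	  hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;	/* v''_2, v''_3 */
///	}
///
/// The `expand' macro below does this job with SSE2 shuffles.)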
///
/// On 32-bit x86, we are register-starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
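///
/// (In scalar terms, one such output step looks roughly like this -- an
/// illustrative sketch only; the `propout' macro below is the real
/// sequence:
///
///	t = c0_hi % B;			/* t = c''_0 mod B */
///	d = c0_lo + (t << 16);		/* d = c'_0 + t b */
///	z = d % B;			/* the output word */
///	c1_lo += d / B;			/* carry up into c1 */
///	c1_hi += c0_hi / B;
///
/// after which the registers rotate: c1 -> c0, c2 -> c1, c3 -> c2.)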

///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0; 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3; ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?; r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2; r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0; r_i s''_0)
.endm

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm

.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?; c'' b)
	pslldq	\c, 8			// (0; c')
	paddq	\t, \c			// (?; c' + c'' b)
	psrldq	\t, 8			// (c' + c'' b; 0) = (c; 0)
	movd	\d, \t
	psrldq	\t, 4			// (floor(c/B); 0)
.endm

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1; c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0; a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0; y'_2)
	punpckhqdq \c0, \c2		// (y''_0; y''_2)
	punpcklqdq \u, \c3		// (y'_1; y'_3)
	punpckhqdq \c1, \c3		// (y''_1; y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0; y''_2)
	psrlq	\c1, 16			// high parts of (y''_1; y''_3)
	psrlq	\c2, 32			// low parts of (y''_0; y''_2)
	psrlq	\c3, 32			// low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0; y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0; y_2)
	paddq	\u, \c0			// (y_1; y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3; 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0; ?)
	movdqa	\lo, \t			// (y^*_0, ?; ?, ?)
	psrldq	\t, 8			// (y_2; 0)
	psrlq	\c3, 32			// (floor(y_0/B); ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B); ?)
	movdqa	\c1, \c3		// (y^*_1, ?; ?, ?)
	psrldq	\u, 8			// (y_3; 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm

.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
	// registers are preserved.
	movd	xmm0, [edi + 0]		// (a_0; 0)
	movd	xmm1, [edi + 4]		// (a_1; 0)
	movd	xmm2, [edi + 8]		// (a_2; 0)
	movd	xmm7, [edi + 12]	// (a_3; 0)

	paddq	xmm4, xmm0		// (c'_0 + a_0; c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1; c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2; c''_2)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
  endprologue

	propout	[edi + 0], xmm4, xmm5
	propout	[edi + 4], xmm5, xmm6
	propout	[edi + 8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4
	ret

ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	carryadd

	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	mulcore	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	carryadd

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	stalloc	48 + 12			// space for the carries
  endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
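	// The remaining passes are identical to mmla4's below, so jump into
	// its code at label 5.  (GAS numeric local labels are visible across
	// the ENDFUNC boundary.)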
	jmp	5f

ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48 + 12			// space for the carries
  endprologue

	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Calculate W = A + U V, and leave it in the destination.  Stash
	// the carry pieces for later.
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

5:	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[SP + 0], xmm4
	movdqa	[SP + 16], xmm5
	movdqa	[SP + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [SP + 0]
	paddq	xmm5, [SP + 16]
	paddq	xmm6, [SP + 32]

	// And, with that, we're done.
	stfree	48 + 12
	ret

ENDFUNC

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
  endprologue

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_avx)
	.arch	.avx
	vzeroupper
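	// (The vzeroupper above clears the upper halves of the YMM
	// registers, so the SSE2 code we fall through to doesn't pay the
	// AVX-to-SSE transition penalty.)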
  endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);
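	//
	// For orientation, an illustrative reference model -- C-style
	// pseudocode, not Catacomb's actual C code; the names here are
	// ours, assuming 32-bit words as this kernel does.  With m =
	// avl - av and n = bvl - bv, the function computes the full
	// product into dv[0], ..., dv[m + n - 1] by operand scanning,
	// the code below handling four words of bv per outer pass:
	//
	//	for (i = 0; i < n; i++) {
	//	  uint64_t c = 0;
	//	  for (j = 0; j < m; j++) {
	//	    c += (uint64_t)av[j]*bv[i] + (i ? dv[i + j] : 0);
	//	    dv[i + j] = (uint32_t)c; c >>= 32;
	//	  }
	//	  dv[i + m] = (uint32_t)c;
	//	}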

	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	av
	//	BP + 28	avl
	//	BP + 32	bv
	//	BP + 36	bvl
	//
	// Locals are relative to SP, as follows.
	//
	//	SP +  0	expanded Y (32 bytes)
	//	SP + 32	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	SP, ~15
	sub	SP, 32
  endprologue

	// Prepare for the first iteration.
	mov	esi, [BP + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [BP + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [BP + 24]		// -> av[0]
	mov	eax, [BP + 28]		// -> av[m] = av limit
	mov	edx, SP			// -> expanded Y = bv[0]
	movdqa	[SP + 0], xmm0		// bv[0] expanded low
	movdqa	[SP + 16], xmm1		// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [BP + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [BP + 24]		// -> av[0]
	movdqa	[SP + 0], xmm0		// bv[i] expanded low
	movdqa	[SP + 16], xmm1		// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [BP + 36]
	jb	1b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret

ENDFUNC

FUNC(mpxmont_mul4_x86_avx)
	.arch	.avx
	vzeroupper
  endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);
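	//
	// For orientation, an illustrative sketch -- not Catacomb's actual
	// C code, and our reading of the kernel, with B = 2^32: given
	// mi == -nv^{-1} (mod B^4), this computes the Montgomery product
	// dv = av bv / B^n (mod nv), four words of bv per outer pass:
	//
	//	for (i = 0; i < n; i += 4) {
	//	  d += av * bv[i .. i + 3];	/* multiplication pass */
	//	  y = d mi mod B^4;		/* Montgomery factor */
	//	  d += nv * y;			/* now d == 0 (mod B^4) */
	//	  d /= B^4;			/* shift down a block */
	//	}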

	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	av
	//	BP + 28	bv
	//	BP + 32	nv
	//	BP + 36	n (nonzero multiple of 4)
	//	BP + 40	mi
	//
	// Locals are relative to SP, which is 16-byte aligned, as follows.
	//
	//	SP +   0	expanded V (32 bytes)
	//	SP +  32	expanded M (32 bytes)
	//	SP +  64	expanded Y (32 bytes)
	//	SP +  96	outer loop dv
	//	SP + 100	outer loop bv
	//	SP + 104	av limit (mostly in ESI)
	//	SP + 108	bv limit
	//	SP + 112	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	SP, ~15
	sub	SP, 112
  endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [BP + 28]		// -> bv
	mov	edx, [BP + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[SP + 0], xmm0		// bv[0] expanded low
	movdqa	[SP + 16], xmm1		// bv[0] expanded high
	movdqa	[SP + 32], xmm2		// mi expanded low
	movdqa	[SP + 48], xmm3		// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [BP + 36]		// n
	mov	eax, [BP + 24]		// -> U = av[0]
	mov	ebx, [BP + 32]		// -> X = nv[0]
	mov	edi, [BP + 20]		// -> Z = dv[0]
	mov	[SP + 100], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[SP + 96], edi
	mov	[SP + 104], edx
	mov	[SP + 108], ecx
	lea	ecx, [SP + 0]		// -> expanded V = bv[0]
	lea	esi, [SP + 32]		// -> expanded M = mi
	lea	edx, [SP + 64]		// -> space for Y
	call	mmul4
	mov	esi, [SP + 104]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[SP + 96], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [SP + 100]		// -> bv[i - 1]
	mov	edi, [SP + 96]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	mov	[SP + 100], eax
	cmp	eax, [SP + 108]		// done yet?
	jae	9f
	movdqu	xmm0, [eax]		// bv[i]
	mov	ebx, [BP + 32]		// -> X = nv[0]
	lea	esi, [SP + 32]		// -> expanded M = mi
	mov	eax, [BP + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[SP + 0], xmm0		// bv[i] expanded low
	movdqa	[SP + 16], xmm1		// bv[i] expanded high
	call	mmla4
	mov	esi, [SP + 104]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[SP + 96], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret

ENDFUNC

FUNC(mpxmont_redc4_x86_avx)
	.arch	.avx
	vzeroupper
  endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);
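	//
	// For orientation, an illustrative sketch of standard Montgomery
	// reduction -- not Catacomb's actual C code, and our reading of
	// the loop below, with B = 2^32: given mi == -nv^{-1} (mod B^4),
	// each outer pass computes y = dv[i .. i + 3] mi mod B^4 and adds
	// nv y at position i, clearing the four low words under the
	// cursor; the carries are then rippled out as far as dvl.  Once n
	// words have been cleared, the value in dv has effectively been
	// multiplied by B^{-n} (mod nv).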

	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	dvl
	//	BP + 28	nv
	//	BP + 32	n (nonzero multiple of 4)
	//	BP + 36	mi
	//
	// Locals are relative to SP, as follows.
	//
	//	SP +  0	outer loop dv
	//	SP +  4	outer dv limit
	//	SP +  8	blocks-of-4 dv limit
	//	SP + 12	expanded M (32 bytes)
	//	SP + 44	expanded Y (32 bytes)
	//	SP + 76	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	SP, ~15
	sub	SP, 76
  endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [BP + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [BP + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [BP + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[SP + 12], xmm0		// mi expanded low
	movdqa	[SP + 28], xmm1		// mi expanded high
	mov	[SP + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [BP + 32]		// n
	mov	ebx, [BP + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[SP + 0], edi
	mov	[SP + 4], edx
	lea	esi, [SP + 12]		// -> expanded M = mi
	lea	edx, [SP + 44]		// -> space for Y
	call	mont4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [SP + 8]		// -> dv blocks limit
	mov	edx, [BP + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [SP + 0]		// -> dv[i - 1]
	mov	ebx, [BP + 28]		// -> X = nv[0]
	lea	edx, [SP + 44]		// -> space for Y
	lea	esi, [SP + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [SP + 4]		// all done yet?
	jae	9f
	mov	[SP + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
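	// Start a timing interval: stash the 64-bit timestamp counter
	// (EDX:EAX from `rdtsc') in the eight-byte buffer at C.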
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
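	// Finish a timing interval: compute the 64-bit cycle count since
	// the matching `cysetup' at C, decrement the iteration counter
	// stored at N, and write the measurement to the corresponding slot
	// (filling from the top downwards) of the vector whose address is
	// stored at V.  The `mov's leave the `dec's zero flag intact for
	// `testtail' below.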
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm

.macro	testprologue	n
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	SP, ~15
	sub	SP, 3*32 + 4*4
  endprologue
	mov	eax, \n
	mov	[SP + 104], eax
	// vars:
	//	SP +   0 = v expanded
	//	SP +  32 = y expanded
	//	SP +  64 = ? expanded
	//	SP +  96 = cycles
	//	SP + 104 = count
.endm

.macro	testepilogue
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
.endm

.macro	testldcarry	c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx + 0]		// (c'_0; c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1; c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2; c''_2)
.endm

.macro	testexpand	v=nil, y=nil
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[SP + 0], xmm0
	movdqa	[SP + 16], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[SP + 32], xmm2
	movdqa	[SP + 48], xmm3
  .endif
.endm

.macro	testtop	u=nil, x=nil, mode=nil
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [SP + 0]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [SP + 32]
  .endif
	cysetup	SP + 96
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [SP + 64]
  .else
	lea	edx, [SP + 32]
  .endif
.endm

.macro	testtail	cyv
	cystore	SP + 96, \cyv, SP + 104
	jnz	0b
.endm

.macro	testcarryout	c
	mov	ecx, \c
	movdqu	[ecx + 0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

FUNC(test_dmul4)
	testprologue [BP + 44]
	testldcarry [BP + 24]
	testexpand [BP + 36], [BP + 40]
	mov	edi, [BP + 20]
	testtop	[BP + 28], [BP + 32]
	call	dmul4
	testtail [BP + 48]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue [BP + 44]
	testldcarry [BP + 24]
	testexpand [BP + 36], [BP + 40]
	mov	edi, [BP + 20]
	testtop	[BP + 28], [BP + 32]
	call	dmla4
	testtail [BP + 48]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mul4
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4zc)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mul4zc
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mla4
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4zc)
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mla4zc
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue [BP + 48]
	testexpand [BP + 40], [BP + 44]
	mov	edi, [BP + 20]
	testtop	[BP + 32], [BP + 36], mont
	call	mmul4
	testtail [BP + 52]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue [BP + 48]
	testexpand [BP + 40], [BP + 44]
	mov	edi, [BP + 20]
	testtop	[BP + 32], [BP + 36], mont
	call	mmla4
	testtail [BP + 52]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue [BP + 40]
	testexpand nil, [BP + 36]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 32], mont
	call	mont4
	testtail [BP + 44]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------