math/mpx-mul4-x86-sse2.S: Use default arguments for macros.
/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Prologue.

	.arch	pentium4
	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the result so far.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	   0	   4	   8	  12
///	     0	 v'_0	 v'_1	v''_0	v''_1
///	    16	 v'_2	 v'_3	v''_2	v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.

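/// As a concrete (if unassembled) illustration of the same trick, here is a
/// hedged C sketch using SSE2 intrinsics; it sits under `#if 0' so that it
/// never reaches the assembler, and the function names are invented for the
/// illustration.

#if 0
#include <stdint.h>
#include <emmintrin.h>

/* Expand the four 32-bit words of V into the layout tabulated above:
 * LO = (v'_0, v'_1, v''_0, v''_1) and HI = (v'_2, v'_3, v''_2, v''_3),
 * one 16-bit piece per 32-bit lane.  (`_MM_SHUFFLE' takes its arguments
 * in the same order as the `SHUF' macro used below.)
 */
static void expand_sketch(const uint32_t v[4], __m128i *lo, __m128i *hi)
{
	__m128i a = _mm_loadu_si128((const __m128i *)v);
	__m128i z = _mm_setzero_si128();
	__m128i l = _mm_unpacklo_epi16(a, z); /* (v'_0, v''_0, v'_1, v''_1) */
	__m128i h = _mm_unpackhi_epi16(a, z); /* (v'_2, v''_2, v'_3, v''_3) */
	*lo = _mm_shuffle_epi32(l, _MM_SHUFFLE(3, 1, 2, 0));
	*hi = _mm_shuffle_epi32(h, _MM_SHUFFLE(3, 1, 2, 0));
}

/* Multiply a word R by the two pieces in lanes 0 and 2 of LO: PMULUDQ
 * reads only the even lanes, so this yields (r v'_0, r v''_0) -- two
 * 48-bit products in 64-bit fields, ready to add to a carry register.
 */
static __m128i mulpiece_sketch(uint32_t r, __m128i lo)
{
	__m128i rr = _mm_shuffle_epi32(_mm_cvtsi32_si128((int32_t)r),
				       _MM_SHUFFLE(3, 0, 3, 0));
	return _mm_mul_epu32(rr, lo);
}
#endif
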
///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0, 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1, s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3, s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0, s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3, ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?, r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2, s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1, r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3, r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2, r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
.endm
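
/// For comparison, a hedged intrinsics rendering of what `mulcore' computes
/// in its full four-destination form (a sketch: the name `mulcore_sketch'
/// is invented, and the in-memory expanded operand is passed as the two
/// preloaded vectors SLO and SHI):

#if 0
#include <stdint.h>
#include <emmintrin.h>

static void mulcore_sketch(uint32_t r, __m128i slo, __m128i shi,
			   __m128i d[4])
{
	__m128i rr = _mm_shuffle_epi32(_mm_cvtsi32_si128((int32_t)r),
				       _MM_SHUFFLE(3, 0, 3, 0));
	d[0] = _mm_mul_epu32(rr, slo);	/* (r s'_0, r s''_0) */
	d[1] = _mm_mul_epu32(rr, _mm_srli_si128(slo, 4)); /* (r s'_1, r s''_1) */
	d[2] = _mm_mul_epu32(rr, shi);	/* (r s'_2, r s''_2) */
	d[3] = _mm_mul_epu32(rr, _mm_srli_si128(shi, 4)); /* (r s'_3, r s''_3) */
}
#endif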

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0, 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b, 0)
	paddq	\c, xmm3		// (c' + t b, c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm
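
/// A scalar model of `propout' may help: each carry register is a pair of
/// 64-bit halves (c', c'') standing for c' + c'' b.  This hedged C sketch
/// (invented name) mirrors the shuffle-and-shift dance above:

#if 0
#include <stdint.h>

static uint32_t propout_model(uint64_t c[2], uint64_t cc[2] /* or NULL */)
{
	uint64_t t = c[1] & 0xffffffff;	/* t = c'' mod B */
	uint64_t d = c[0] + (t << 16);	/* d = c' + t b */
	uint32_t z = (uint32_t)d;	/* output word z = d mod B */
	c[0] = d >> 32;			/* floor(d/B)... */
	c[1] >>= 32;			/* ... and floor(c''/B) */
	if (cc) { cc[0] += c[0]; cc[1] += c[1]; } /* propagate up */
	return z;
}
#endif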

.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?, c'' b)
	pslldq	\c, 8			// (0, c')
	paddq	\t, \c			// (?, c' + c'' b)
	psrldq	\t, 8			// c' + c'' b
	movd	\d, \t
	psrldq	\t, 4			// floor((c' + c'' b)/B)
.endm

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1, a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1, c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0, a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2, a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0, c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2, c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
  .endif
.endm

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0, y'_2)
	punpckhqdq \c0, \c2		// (y''_0, y''_2)
	punpcklqdq \u, \c3		// (y'_1, y'_3)
	punpckhqdq \c1, \c3		// (y''_1, y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0, y''_2)
	psrlq	\c1, 16			// high parts of (y''_1, y''_3)
	psrlq	\c2, 32			// low parts of (y''_0, y''_2)
	psrlq	\c3, 32			// low parts of (y''_1, y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0, y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0, y_2)
	paddq	\u, \c0			// (y_1, y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3, 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0, y_1)
	movdqa	\lo, \t			// (y^*_0, ?, ?, ?)
	psrldq	\t, 8			// (y_2, 0)
	psrlq	\c3, 32			// (floor(y_0/B), ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B), ?)
	movdqa	\c1, \c3		// (y^*_1, ?, ?, ?)
	psrldq	\u, 8			// (y_3, 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2), ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2), ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2, ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3, ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm
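
/// Since Y here is spread across eight 64-bit pieces, a scalar model may
/// again be useful.  This hedged sketch (invented name; `unsigned __int128'
/// is a GCC/Clang extension) recovers Y and splits it at the 128-bit
/// boundary, which is all `squash' does:

#if 0
#include <stdint.h>
typedef unsigned __int128 u128;

static void squash_model(const uint64_t c[4][2],	/* (y'_i, y''_i) */
			 uint32_t lo[4], uint32_t hi[4] /* or NULL */)
{
	u128 acc = 0;
	unsigned i;
	for (i = 0; i < 4; i++) {  /* Y = sum_i (y'_i + y''_i b) B^i */
		acc += c[i][0] + ((u128)c[i][1] << 16);
		lo[i] = (uint32_t)acc;	/* Y mod B^4, a word at a time */
		acc >>= 32;
	}
	if (hi)				/* the high bits of Y */
		for (i = 0; i < 4; i++) {
			hi[i] = (uint32_t)acc;
			acc >>= 32;
		}
}
#endif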

.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
	// registers are preserved.
	movd	xmm0, [edi +  0]	// (a_0, 0)
	movd	xmm1, [edi +  4]	// (a_1, 0)
	movd	xmm2, [edi +  8]	// (a_2, 0)
	movd	xmm7, [edi + 12]	// (a_3, 0)

	paddq	xmm4, xmm0		// (c'_0 + a_0, c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1, c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2, c''_2 + a_3 b)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
  endprologue

	propout	[edi +  0], xmm4, xmm5
	propout	[edi +  4], xmm5, xmm6
	propout	[edi +  8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4
	ret

ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	carryadd

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	mulcore	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC
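
/// The primitive multipliers above differ only in their addend and carry
/// plumbing, so one scalar reference may clarify them all.  This hedged
/// sketch (invented name) models `mul4': emit the low 128 bits of c + x y
/// and keep the rest as the new carry, which in this context always fits
/// in 128 bits:

#if 0
#include <stdint.h>
typedef unsigned __int128 u128;

static void mul4_model(uint32_t z[4], u128 *c,
		       const uint32_t x[4], const uint32_t y[4])
{
	u128 acc = *c, col[7] = { 0 };
	unsigned i, j;
	for (i = 0; i < 4; i++)		/* schoolbook column sums */
		for (j = 0; j < 4; j++)
			col[i + j] += (u128)x[i]*y[j];
	for (i = 0; i < 4; i++) {	/* z = (c + x y) mod B^4 */
		acc += col[i];
		z[i] = (uint32_t)acc;
		acc >>= 32;
	}
	*c = acc + col[4] + (col[5] << 32) + (col[6] << 64); /* carry out */
}
#endif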

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
  endprologue

	carryadd

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 16-byte aligned.  (This is not the usual convention, which
	// requires alignment before the call.)
	//
	// On exit, we write Y = U V M mod B^4 to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	stalloc	48			// space for the carries
  endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5
	jmp	5f

ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 16-byte aligned.  (This is not the usual convention, which
	// requires alignment before the call.)
	//
	// On exit, we write Y = (A + U V) M mod B^4 to [EDX], and the low
	// 128 bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48			// space for the carries
  endprologue

	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]
	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

5:	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[esp +  0], xmm4
	movdqa	[esp + 16], xmm5
	movdqa	[esp + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi +  8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [esp +  0]
	paddq	xmm5, [esp + 16]
	paddq	xmm6, [esp + 32]

	// And, with that, we're done.
	stfree	48
	ret

ENDFUNC

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B^4 to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
  endprologue

	// Calculate Y = W M.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi +  8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret

ENDFUNC
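
/// A scalar model of the Montgomery step may make the shape of `mmul4',
/// `mmla4', and `mont4' clearer.  This hedged sketch (invented name)
/// follows `mont4': when m = -1/n mod B^4, the four words it writes back
/// are all zero, and the interesting outputs are y and the carry:

#if 0
#include <stdint.h>
typedef unsigned __int128 u128;

static u128 mont4_model(uint32_t z[4], uint32_t y[4], const uint32_t w[4],
			const uint32_t n[4], const uint32_t m[4])
{
	u128 acc = 0, col[7] = { 0 };
	unsigned i, j, k;
	for (k = 0; k < 4; k++) {	/* y = w m mod B^4 */
		for (i = 0; i <= k; i++)
			acc += (u128)w[i]*m[k - i];
		y[k] = (uint32_t)acc;
		acc >>= 32;
	}
	for (i = 0; i < 4; i++)		/* column sums for n y */
		for (j = 0; j < 4; j++)
			col[i + j] += (u128)n[i]*y[j];
	for (acc = 0, k = 0; k < 4; k++) { /* low words of w + n y */
		acc += col[k] + w[k];
		z[k] = (uint32_t)acc;	/* zero when m = -1/n mod B^4 */
		acc >>= 32;
	}
	return acc + col[4] + (col[5] << 32) + (col[6] << 64); /* carry */
}
#endif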

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	avl
	//	ebp + 32	bv
	//	ebp + 36	bvl
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	expanded Y (32 bytes)
	//	esp + 32	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 32
  endprologue

	// Prepare for the first iteration.
	mov	esi, [ebp + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [ebp + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	mov	eax, [ebp + 28]		// -> av[m] = av limit
	mov	edx, esp		// -> expanded Y = bv[0]
	movdqa	[esp +  0], xmm0	// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [ebp + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	movdqa	[esp +  0], xmm0	// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [ebp + 36]
	jb	1b

	// All over.
9:	dropfp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

ENDFUNC
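
/// For testing against something simple, a portable sketch of the same
/// computation (invented name; `mpw' assumed 32 bits wide, and none of the
/// blocking or alignment conventions of the routine above):

#if 0
#include <stddef.h>
#include <stdint.h>

static void umul_model(uint32_t *dv,
		       const uint32_t *av, const uint32_t *avl,
		       const uint32_t *bv, const uint32_t *bvl)
{
	size_t m = avl - av, n = bvl - bv, i, j;
	uint64_t c;
	for (i = 0; i < m + n; i++) dv[i] = 0;
	for (j = 0; j < n; j++) {	/* one pass per word of bv */
		c = 0;
		for (i = 0; i < m; i++) {  /* multiply and add along av */
			c += (uint64_t)av[i]*bv[j] + dv[i + j];
			dv[i + j] = (uint32_t)c;
			c >>= 32;
		}
		dv[m + j] = (uint32_t)c;  /* leftover carry; no tail */
	}
}
#endif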

FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	bv
	//	ebp + 32	nv
	//	ebp + 36	n (nonzero multiple of 4)
	//	ebp + 40	mi
	//
	// Locals are relative to ESP, which is 4 mod 16, as follows.
	//
	//	esp +   0	outer loop dv
	//	esp +   4	outer loop bv
	//	esp +   8	av limit (mostly in ESI)
	//	esp +  12	expanded V (32 bytes)
	//	esp +  44	expanded M (32 bytes)
	//	esp +  76	expanded Y (32 bytes)
	//	esp + 108	bv limit
	//	esp + 112	(gap)
	//	esp + 124	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 124
  endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [ebp + 28]		// -> bv
	mov	edx, [ebp + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[esp + 12], xmm0	// bv[0] expanded low
	movdqa	[esp + 28], xmm1	// bv[0] expanded high
	movdqa	[esp + 44], xmm2	// mi expanded low
	movdqa	[esp + 60], xmm3	// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [ebp + 36]		// n
	mov	eax, [ebp + 24]		// -> U = av[0]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	mov	[esp + 4], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[esp + 0], edi
	mov	[esp + 108], ecx
	mov	[esp + 8], edx
	lea	ecx, [esp + 12]		// -> expanded V = bv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	lea	edx, [esp + 76]		// -> space for Y
	call	mmul4
	mov	esi, [esp + 8]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[esp + 0], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [esp + 4]		// -> bv[i - 1]
	mov	edi, [esp + 0]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	movdqu	xmm0, [eax]		// bv[i]
	mov	[esp + 4], eax
	cmp	eax, [esp + 108]	// done yet?
	jae	9f
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	mov	eax, [ebp + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 12], xmm0	// bv[i] expanded low
	movdqa	[esp + 28], xmm1	// bv[i] expanded high
	call	mmla4
	mov	esi, [esp + 8]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[esp + 0], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret

ENDFUNC
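
/// A portable model of the overall effect may be useful (a hedged sketch:
/// word-at-a-time rather than in four-word blocks, with m0 = -1/nv[0] mod B
/// standing in for the four-word factor expected above -- the final value
/// is the same either way; dv needs room for 2 n + 1 words):

#if 0
#include <stddef.h>
#include <stdint.h>

static void mont_mul_model(uint32_t *dv, const uint32_t *av,
			   const uint32_t *bv, const uint32_t *nv,
			   size_t n, uint32_t m0)
{
	size_t i, j;
	uint64_t c;
	uint32_t y;
	for (i = 0; i < 2*n + 1; i++) dv[i] = 0;
	for (i = 0; i < n; i++) {
		c = 0;			/* dv += av bv[i] B^i */
		for (j = 0; j < n; j++) {
			c += dv[i + j] + (uint64_t)bv[i]*av[j];
			dv[i + j] = (uint32_t)c;
			c >>= 32;
		}
		for (j = i + n; c; j++)
			{ c += dv[j]; dv[j] = (uint32_t)c; c >>= 32; }
		y = (uint32_t)((uint64_t)dv[i]*m0); /* Montgomery factor */
		c = 0;			/* dv += nv y B^i: clears dv[i] */
		for (j = 0; j < n; j++) {
			c += dv[i + j] + (uint64_t)y*nv[j];
			dv[i + j] = (uint32_t)c;
			c >>= 32;
		}
		for (j = i + n; c; j++)
			{ c += dv[j]; dv[j] = (uint32_t)c; c >>= 32; }
	}
	/* result in dv[n], ..., dv[2 n]; a final subtraction of N by the
	 * caller may still be wanted */
}
#endif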

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	dvl
	//	ebp + 28	nv
	//	ebp + 32	n (nonzero multiple of 4)
	//	ebp + 36	mi
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	outer loop dv
	//	esp +  4	outer dv limit
	//	esp +  8	blocks-of-4 dv limit
	//	esp + 12	expanded M (32 bytes)
	//	esp + 44	expanded Y (32 bytes)
	//	esp + 76	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 76
  endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	mov	[esp + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [ebp + 32]		// n
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[esp + 0], edi
	mov	[esp + 4], edx
	lea	esi, [esp + 12]		// -> expanded M = mi
	lea	edx, [esp + 44]		// -> space for Y
	call	mont4
	add	edi, 16
	add	ebx, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [esp + 8]		// -> dv blocks limit
	mov	edx, [ebp + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [esp + 0]		// -> dv[i - 1]
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [esp + 44]		// -> space for Y
	lea	esi, [esp + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [esp + 4]		// all done yet?
	jae	9f
	mov	[esp + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret

ENDFUNC
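
/// And a matching portable model of the reduction (a hedged sketch: again
/// word-at-a-time with m0 = -1/nv[0] mod B; on exit the reduced value
/// occupies dv[n] up to dvl, and dv[0], ..., dv[n - 1] are zero):

#if 0
#include <stddef.h>
#include <stdint.h>

static void mont_redc_model(uint32_t *dv, uint32_t *dvl,
			    const uint32_t *nv, size_t n, uint32_t m0)
{
	size_t i, j, len = dvl - dv;
	uint64_t c;
	for (i = 0; i < n; i++) {
		uint32_t y = (uint32_t)((uint64_t)dv[i]*m0);
		c = 0;			/* dv += nv y B^i: clears dv[i] */
		for (j = 0; j < n; j++) {
			c += dv[i + j] + (uint64_t)y*nv[j];
			dv[i + j] = (uint32_t)c;
			c >>= 32;
		}
		for (j = i + n; c && j < len; j++)  /* ripple into tail */
			{ c += dv[j]; dv[j] = (uint32_t)c; c >>= 32; }
	}
}
#endif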

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm

.macro	testprologue
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 3*32 + 12
  endprologue
	// vars:
	//	esp +  0 = cycles
	//	esp + 12 = v expanded
	//	esp + 44 = y expanded
	//	esp + 76 = ? expanded
.endm

.macro	testepilogue
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
.endm

.macro	testldcarry	c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx +  0]	// (c'_0, c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1, c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2, c''_2)
.endm

.macro	testexpand	v=nil, y=nil
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 12], xmm0
	movdqa	[esp + 28], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[esp + 44], xmm2
	movdqa	[esp + 60], xmm3
  .endif
.endm

.macro	testtop	u=nil, x=nil, mode=nil
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [esp + 12]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [esp + 44]
  .endif
	cysetup	esp + 0
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [esp + 76]
  .else
	lea	edx, [esp + 44]
  .endif
.endm

.macro	testtail	cyv, n
	cystore	esp + 0, \cyv, \n
	jnz	0b
.endm

.macro	testcarryout	c
	mov	ecx, \c
	movdqu	[ecx +  0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

FUNC(test_dmul4)
	testprologue
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmul4
	testtail [ebp + 48], [ebp + 44]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmla4
	testtail [ebp + 48], [ebp + 44]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mul4
	testtail [ebp + 40], [ebp + 36]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mla4
	testtail [ebp + 40], [ebp + 36]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmul4
	testtail [ebp + 52], [ebp + 48]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmla4
	testtail [ebp + 52], [ebp + 48]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue
	testexpand nil, [ebp + 36]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 32], mont
	call	mont4
	testtail [ebp + 44], [ebp + 40]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------