x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
1/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2///
3/// Large SIMD-based multiplications
4///
5/// (c) 2016 Straylight/Edgeware
6
7///----- Licensing notice ---------------------------------------------------
8///
9/// This file is part of Catacomb.
10///
11/// Catacomb is free software; you can redistribute it and/or modify
12/// it under the terms of the GNU Library General Public License as
13/// published by the Free Software Foundation; either version 2 of the
14/// License, or (at your option) any later version.
15///
16/// Catacomb is distributed in the hope that it will be useful,
17/// but WITHOUT ANY WARRANTY; without even the implied warranty of
18/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19/// GNU Library General Public License for more details.
20///
21/// You should have received a copy of the GNU Library General Public
22/// License along with Catacomb; if not, write to the Free
23/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24/// MA 02111-1307, USA.
25
26///--------------------------------------------------------------------------
df07f2c0 27/// Preliminaries.
444083ae
MW
28
29#include "config.h"
30#include "asm-common.h"
31
444083ae 32 .arch pentium4
df07f2c0 33
444083ae
MW
34 .text
35
36///--------------------------------------------------------------------------
37/// Theory.
38///
39/// We define a number of primitive fixed-size multipliers from which we can
40/// construct more general variable-length multipliers.
41///
42/// The basic trick is the same throughout. In an operand-scanning
43/// multiplication, the inner multiplication loop multiplies a
44/// multiple-precision operand by a single precision factor, and adds the
45/// result, appropriately shifted, to the result. A `finely integrated
46/// operand scanning' implementation of Montgomery multiplication also adds
47/// the product of a single-precision `Montgomery factor' and the modulus,
48/// calculated in the same pass. The more common `coarsely integrated
49/// operand scanning' alternates main multiplication and Montgomery passes,
50/// which requires additional carry propagation.
51///
52/// In both the plain-multiplication and Montgomery stages, then, one of
53/// the factors remains constant throughout the operation, so we can afford
54/// to take a little time to preprocess it. The transformation we perform is
55/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59/// operands, as follows.
60///
61/// Offset 0 4 8 12
62/// 0 v'_0 v'_1 v''_0 v''_1
63/// 16 v'_2 v'_3 v''_2 v''_3
64///
2aaa07f8 65/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
444083ae
MW
66/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
70/// many products together before we must deal with carrying; it also allows
71/// for some calculations to be performed on the above expanded form.
72///
73/// On 32-bit x86, we are register starved: the expanded operands are kept in
74/// memory, typically in warm L1 cache.
75///
76/// We maintain four `carry' registers accumulating intermediate results.
77/// The registers' precise roles rotate during the computation; we name them
78/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
2aaa07f8 82/// `pmuludq' instruction acting on a scalar operand (broadcast across all
444083ae
MW
83/// lanes of its vector) and an operand in the expanded form above produces a
84/// result which can be added directly to the appropriate carry register.
85/// Following a pass of four multiplications, we perform some limited carry
86/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88/// registers around, so that c1 becomes c0, and the old c0, (implicitly)
89/// zeroed, becomes c3.
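///
/// For concreteness: with b = 2^16, a word v_0 = 0x89abcdef splits as
/// v'_0 = 0xcdef and v''_0 = 0x89ab, since 0xcdef + 0x89ab b = 0x89abcdef.
/// With a factor r_i broadcast into the even lanes, a single `pmuludq'
/// against the expanded vector then yields the pair of (at most 48-bit)
/// products (r_i v'_0; r_i v''_0): one carry register's worth of update
/// from one instruction.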
90
91///--------------------------------------------------------------------------
92/// Macro definitions.
93
71ac8e5e 94.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
444083ae
MW
95 // Load a word r_i from R, multiply by the expanded operand [S], and
96 // leave the pieces of the product in registers D0, D1, D2, D3.
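	// (Writing d'_j = r_i s'_j and d''_j = r_i s''_j, the full product is
	// r_i s = (d'_0 + d''_0 b) + (d'_1 + d''_1 b) B + (d'_2 + d''_2 b) B^2
	// + (d'_3 + d''_3 b) B^3; that is, D0--D3 are the four carry-register
	// contributions at weights 1, B, B^2, and B^3.)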
8e91d6e5 97 movd \d0, \r // (r_i, 0; 0, 0)
444083ae 98 .ifnes "\d1", "nil"
8e91d6e5 99 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
444083ae
MW
100 .endif
101 .ifnes "\d3", "nil"
8e91d6e5 102 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
444083ae 103 .endif
a117c06f 104 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
444083ae 105 .ifnes "\d1", "nil"
8e91d6e5 106 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
444083ae
MW
107 .endif
108 .ifnes "\d2", "nil"
109 .ifnes "\d3", "nil"
8e91d6e5 110 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
444083ae 111 .else
8e91d6e5 112 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
444083ae
MW
113 .endif
114 .endif
115 .ifnes "\d3", "nil"
8e91d6e5 116 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
444083ae
MW
117 .endif
118 .ifnes "\d1", "nil"
8e91d6e5 119 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
444083ae
MW
120 .endif
121 .ifnes "\d3", "nil"
8e91d6e5 122 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
444083ae
MW
123 .endif
124 .ifnes "\d2", "nil"
125 .ifnes "\d3", "nil"
8e91d6e5 126 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
444083ae 127 .else
2aaa07f8 128 pmuludq \d2, [\s + 16]
444083ae
MW
129 .endif
130 .endif
8e91d6e5 131 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
444083ae
MW
132.endm
133
71ac8e5e
MW
134.macro accum c0, c1=nil, c2=nil, c3=nil
135 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
136 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
137 // updating that register.
444083ae
MW
138 paddq \c0, xmm0
139 .ifnes "\c1", "nil"
140 paddq \c1, xmm1
141 .endif
142 .ifnes "\c2", "nil"
143 paddq \c2, xmm2
144 .endif
145 .ifnes "\c3", "nil"
146 paddq \c3, xmm3
147 .endif
148.endm
149
71ac8e5e 150.macro mulacc r, s, c0, c1, c2, c3, z3p=nil
444083ae
MW
151 // Load a word r_i from R, multiply by the expanded operand [S],
152 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
153 // then C3 notionally contains zero, but needs clearing; in practice,
154 // we store the product directly rather than attempting to add. On
155 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
156 // is not `t'.
157 .ifeqs "\z3p", "t"
158 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
71ac8e5e 159 accum \c0, \c1, \c2
444083ae
MW
160 .else
161 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
162 accum \c0, \c1, \c2, \c3
163 .endif
164.endm
165
71ac8e5e 166.macro propout d, c, cc=nil
444083ae
MW
167 // Calculate an output word from C, and store it in D; propagate
168 // carries out from C to CC in preparation for a rotation of the
169 // carry registers. On completion, XMM3 is clobbered. If CC is
170 // `nil', then the contribution which would have been added to it is
171 // left in C.
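	// (In the notation of the `Theory' section: C represents c = c' + c'' b
	// and, writing t = c'' mod B, c = (c' + t b) + floor(c''/B) b B; so the
	// output word is (c' + t b) mod B, and the pair (floor((c' + t b)/B);
	// floor(c''/B)) is the carry which must flow into CC.)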
a117c06f 172 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
8e91d6e5
MW
173 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
174 pslldq xmm3, 2 // (t b; 0)
175 paddq \c, xmm3 // (c' + t b; c'')
444083ae
MW
176 movd \d, \c
177 psrlq \c, 32 // floor(c/B)
178 .ifnes "\cc", "nil"
179 paddq \cc, \c // propagate up
180 .endif
181.endm
182
183.macro endprop d, c, t
184 // On entry, C contains a carry register. On exit, the low 32 bits
185 // of the value represented in C are written to D, and the remaining
186 // bits are left at the bottom of T.
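	// (C represents c = c' + c'' b.  The shifts below assemble that sum in
	// the high qword of T and then move it down, so that D receives c mod B
	// and T is left holding floor(c/B).)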
187 movdqa \t, \c
8e91d6e5
MW
188 psllq \t, 16 // (?; c'' b)
189 pslldq \c, 8 // (0; c')
190 paddq \t, \c // (?; c' + c'' b)
191 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
444083ae 192 movd \d, \t
8e91d6e5 193 psrldq \t, 4 // (floor(c/B); 0)
444083ae
MW
194.endm
195
71ac8e5e 196.macro expand z, a, b, c=nil, d=nil
444083ae
MW
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
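	// (The result is exactly the layout tabulated in the `Theory' section:
	// the punpck[lh]wd steps zero-extend the 16-bit pieces into 32-bit
	// fields, and the pshufd steps sort the primed pieces into the low half
	// and the double-primed pieces into the high half.)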
8e91d6e5 200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
444083ae 201 .ifnes "\c", "nil"
8e91d6e5 202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
444083ae 203 .endif
8e91d6e5
MW
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
444083ae 206 .ifnes "\c", "nil"
8e91d6e5
MW
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
444083ae 209 .endif
a117c06f
MW
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
444083ae 212 .ifnes "\c", "nil"
a117c06f
MW
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
444083ae
MW
215 .endif
216.endm
217
71ac8e5e 218.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
444083ae 219 // On entry, C0, C1, C2, C3 are carry registers representing a value
4b30aca5 220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
444083ae 221 // C3, T, and U are clobbered; and the high bits of Y are stored in
4b30aca5 222 // HI, if this is not `nil'.
444083ae
MW
223
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
 225	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
228 movdqa \t, \c0
229 movdqa \u, \c1
8e91d6e5
MW
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
444083ae
MW
234
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
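	// (Each y''_i carries weight b within its limb, and y''_i b =
	// (y''_i mod b) b + floor(y''_i/b) B: the bottom sixteen bits stay in
	// this limb, while the rest belongs one limb further up.)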
237 movdqa \c2, \c0
238 movdqa \c3, \c1
239 psllq \c2, 48
240 psllq \c3, 48
8e91d6e5
MW
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
4b30aca5
MW
245 .ifnes "\hi", "nil"
246 movdqa \hi, \c1
444083ae 247 .endif
8e91d6e5 248 pslldq \c1, 8 // high part of (0; y''_1)
444083ae
MW
249
250 paddq \t, \c2 // propagate down
251 paddq \u, \c3
8e91d6e5
MW
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
4b30aca5 254 .ifnes "\hi", "nil"
8e91d6e5 255 psrldq \hi, 8 // high part of (y''_3; 0)
444083ae
MW
256 .endif
257
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
8e91d6e5
MW
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
 268	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
 269	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
 270	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
 271	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
 272	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
4b30aca5 273 .ifnes "\hi", "nil"
d2269179 274 movdqa \t, \c3
444083ae
MW
275 pxor \u, \u
276 .endif
8e91d6e5 277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
4b30aca5 278 .ifnes "\hi", "nil"
444083ae 279 psrlq \t, 32 // very high bits of y
4b30aca5
MW
280 paddq \hi, \t
281 punpcklqdq \hi, \u // carry up
444083ae 282 .endif
4b30aca5 283 punpckldq \lo, \c1 // y mod B^4
444083ae
MW
284.endm
285
286.macro carryadd
287 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
288 // hold the incoming carry registers c0, c1, and c2 representing a
289 // carry-in C.
290 //
291 // On exit, the carry registers, including XMM7, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
8e91d6e5
MW
294 movd xmm0, [edi + 0] // (a_0; 0)
295 movd xmm1, [edi + 4] // (a_1; 0)
296 movd xmm2, [edi + 8] // (a_2; 0)
297 movd xmm7, [edi + 12] // (a_3; 0)
298
299 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
300 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
301 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
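
	// Note that a_3 is merely parked in XMM7 rather than added in here:
	// callers either use XMM7 directly as the incoming carry register c3
	// for the following multiplication pass (as in `mla4' and `dmla4'), or
	// fold it into c2 as a_3 b afterwards (as in the tail of the reduction
	// loop below).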
444083ae
MW
302.endm
303
304///--------------------------------------------------------------------------
305/// Primitive multipliers and related utilities.
306
1a517bb3 307INTFUNC(carryprop)
444083ae
MW
308 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
309 // form. Store the low 128 bits of the represented carry to [EDI] as
310 // a packed 128-bit value, and leave the remaining 16 bits in the low
311 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
0923a413
MW
312 endprologue
313
444083ae
MW
314 propout [edi + 0], xmm4, xmm5
315 propout [edi + 4], xmm5, xmm6
316 propout [edi + 8], xmm6, nil
317 endprop [edi + 12], xmm6, xmm4
318 ret
319
1a517bb3
MW
320ENDFUNC
321
322INTFUNC(dmul4)
444083ae
MW
323 // On entry, EDI points to the destination buffer; EAX and EBX point
324 // to the packed operands U and X; ECX and EDX point to the expanded
325 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
326 // registers c0, c1, and c2; c3 is assumed to be zero.
327 //
328 // On exit, we write the low 128 bits of the sum C + U V + X Y to
329 // [EDI], and update the carry registers with the carry out. The
330 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
331 // general-purpose registers are preserved.
0923a413
MW
332 endprologue
333
444083ae 334 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
71ac8e5e 335 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
336 propout [edi + 0], xmm4, xmm5
337
338 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
71ac8e5e 339 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
444083ae
MW
340 propout [edi + 4], xmm5, xmm6
341
342 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
71ac8e5e 343 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
444083ae
MW
344 propout [edi + 8], xmm6, xmm7
345
346 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
71ac8e5e 347 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
444083ae
MW
348 propout [edi + 12], xmm7, xmm4
349
350 ret
351
1a517bb3
MW
352ENDFUNC
353
354INTFUNC(dmla4)
444083ae
MW
355 // On entry, EDI points to the destination buffer, which also
356 // contains an addend A to accumulate; EAX and EBX point to the
357 // packed operands U and X; ECX and EDX point to the expanded
358 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
359 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
360 // to be zero.
361 //
362 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
363 // [EDI], and update the carry registers with the carry out. The
364 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
365 // general-purpose registers are preserved.
0923a413
MW
366 endprologue
367
444083ae
MW
368 carryadd
369
71ac8e5e
MW
370 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
371 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
372 propout [edi + 0], xmm4, xmm5
373
374 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
71ac8e5e 375 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
444083ae
MW
376 propout [edi + 4], xmm5, xmm6
377
378 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
71ac8e5e 379 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
444083ae
MW
380 propout [edi + 8], xmm6, xmm7
381
382 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
71ac8e5e 383 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
444083ae
MW
384 propout [edi + 12], xmm7, xmm4
385
386 ret
387
1a517bb3
MW
388ENDFUNC
389
390INTFUNC(mul4zc)
444083ae
MW
391 // On entry, EDI points to the destination buffer; EBX points to a
392 // packed operand X; and EDX points to an expanded operand Y.
393 //
394 // On exit, we write the low 128 bits of the product X Y to [EDI],
395 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
396 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
397 // general-purpose registers are preserved.
0923a413
MW
398 endprologue
399
444083ae
MW
400 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
401 propout [edi + 0], xmm4, xmm5
402
403 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
404 propout [edi + 4], xmm5, xmm6
405
406 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
407 propout [edi + 8], xmm6, xmm7
408
409 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
410 propout [edi + 12], xmm7, xmm4
411
412 ret
413
1a517bb3
MW
414ENDFUNC
415
416INTFUNC(mul4)
444083ae
MW
417 // On entry, EDI points to the destination buffer; EBX points to a
418 // packed operand X; EDX points to an expanded operand Y; and XMM4,
419 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
420 // representing a carry-in C; c3 is assumed to be zero.
421 //
422 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
423 // and update the carry registers with the carry out. The registers
424 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
425 // general-purpose registers are preserved.
0923a413
MW
426 endprologue
427
444083ae
MW
428 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
429 propout [edi + 0], xmm4, xmm5
430
431 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
432 propout [edi + 4], xmm5, xmm6
433
434 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
435 propout [edi + 8], xmm6, xmm7
436
437 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
438 propout [edi + 12], xmm7, xmm4
439
440 ret
441
1a517bb3
MW
442ENDFUNC
443
444INTFUNC(mla4zc)
444083ae
MW
445 // On entry, EDI points to the destination buffer, which also
446 // contains an addend A to accumulate; EBX points to a packed operand
447 // X; and EDX points to an expanded operand Y.
448 //
449 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
450 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
451 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
452 // general-purpose registers are preserved.
0923a413
MW
453 endprologue
454
444083ae
MW
455 movd xmm4, [edi + 0]
456 movd xmm5, [edi + 4]
457 movd xmm6, [edi + 8]
458 movd xmm7, [edi + 12]
459
71ac8e5e 460 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
461 propout [edi + 0], xmm4, xmm5
462
463 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
464 propout [edi + 4], xmm5, xmm6
465
466 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
467 propout [edi + 8], xmm6, xmm7
468
469 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
470 propout [edi + 12], xmm7, xmm4
471
472 ret
473
1a517bb3
MW
474ENDFUNC
475
476INTFUNC(mla4)
444083ae
MW
477 // On entry, EDI points to the destination buffer, which also
478 // contains an addend A to accumulate; EBX points to a packed operand
479 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
480 // the incoming carry registers c0, c1, and c2, representing a
481 // carry-in C; c3 is assumed to be zero.
482 //
483 // On exit, we write the low 128 bits of the sum A + C + X Y to
484 // [EDI], and update the carry registers with the carry out. The
485 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
486 // general-purpose registers are preserved.
0923a413
MW
487 endprologue
488
444083ae
MW
489 carryadd
490
71ac8e5e 491 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
492 propout [edi + 0], xmm4, xmm5
493
494 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
495 propout [edi + 4], xmm5, xmm6
496
497 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
498 propout [edi + 8], xmm6, xmm7
499
500 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
501 propout [edi + 12], xmm7, xmm4
502
503 ret
504
1a517bb3
MW
505ENDFUNC
506
507INTFUNC(mmul4)
444083ae
MW
508 // On entry, EDI points to the destination buffer; EAX and EBX point
509 // to the packed operands U and N; ECX and ESI point to the expanded
510 // operands V and M; and EDX points to a place to store an expanded
511 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
6ecc0b8f 512 // must be 12 modulo 16, as is usual for modern x86 ABIs.
444083ae
MW
513 //
514 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
515 // of the sum U V + N Y to [EDI], leaving the remaining carry in
516 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
517 // XMM7 are clobbered; the general-purpose registers are preserved.
6ecc0b8f 518 stalloc 48 + 12 // space for the carries
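	// (The extra 12 bytes are alignment padding: SP is 12 modulo 16 on
	// entry, so subtracting 48 + 12 leaves it 16-byte aligned for the
	// movdqa stores of the stashed carries below.  The same applies to
	// `mmla4'.)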
0923a413 519 endprologue
444083ae
MW
520
521 // Calculate W = U V, and leave it in the destination. Stash the
522 // carry pieces for later.
523 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
524 propout [edi + 0], xmm4, xmm5
525 jmp 5f
526
1a517bb3
MW
527ENDFUNC
528
529INTFUNC(mmla4)
444083ae 530 // On entry, EDI points to the destination buffer, which also
14e7b1f5
MW
531 // contains an addend A to accumulate; EAX and EBX point to the
532 // packed operands U and N; ECX and ESI point to the expanded
444083ae
MW
533 // operands V and M; and EDX points to a place to store an expanded
534 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
6ecc0b8f 535 // must be 12 modulo 16, as is usual for modern x86 ABIs.
444083ae
MW
536 //
537 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
538 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
539 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
540 // XMM3, and XMM7 are clobbered; the general-purpose registers are
541 // preserved.
6ecc0b8f 542 stalloc 48 + 12 // space for the carries
0923a413
MW
543 endprologue
544
444083ae
MW
545 movd xmm4, [edi + 0]
546 movd xmm5, [edi + 4]
547 movd xmm6, [edi + 8]
548 movd xmm7, [edi + 12]
ba12677b
MW
549
550 // Calculate W = U V, and leave it in the destination. Stash the
551 // carry pieces for later.
71ac8e5e 552 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
553 propout [edi + 0], xmm4, xmm5
554
555 5:	mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
556 propout [edi + 4], xmm5, xmm6
557
558 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
559 propout [edi + 8], xmm6, xmm7
560
561 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
562 propout [edi + 12], xmm7, xmm4
563
a90d420c
MW
564 movdqa [SP + 0], xmm4
565 movdqa [SP + 16], xmm5
566 movdqa [SP + 32], xmm6
444083ae
MW
567
568 // Calculate Y = W M.
569 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
570
71ac8e5e
MW
571 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
572 accum xmm5, xmm6, xmm7
444083ae 573
71ac8e5e
MW
574 mulcore [edi + 8], esi, xmm0, xmm1
575 accum xmm6, xmm7
444083ae 576
71ac8e5e
MW
577 mulcore [edi + 12], esi, xmm0
578 accum xmm7
444083ae
MW
579
580 // That's lots of pieces. Now we have to assemble the answer.
71ac8e5e 581 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
444083ae
MW
582
583 // Expand it.
584 pxor xmm2, xmm2
71ac8e5e 585 expand xmm2, xmm4, xmm1
444083ae
MW
586 movdqa [edx + 0], xmm4
587 movdqa [edx + 16], xmm1
588
589 // Initialize the carry from the value for W we calculated earlier.
590 movd xmm4, [edi + 0]
591 movd xmm5, [edi + 4]
592 movd xmm6, [edi + 8]
593 movd xmm7, [edi + 12]
594
595 // Finish the calculation by adding the Montgomery product.
71ac8e5e 596 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
597 propout [edi + 0], xmm4, xmm5
598
599 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
600 propout [edi + 4], xmm5, xmm6
601
602 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
603 propout [edi + 8], xmm6, xmm7
604
605 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
606 propout [edi + 12], xmm7, xmm4
607
 608	// Add on the carry we calculated earlier.
a90d420c
MW
609 paddq xmm4, [SP + 0]
610 paddq xmm5, [SP + 16]
611 paddq xmm6, [SP + 32]
444083ae
MW
612
613 // And, with that, we're done.
6ecc0b8f 614 stfree 48 + 12
444083ae
MW
615 ret
616
1a517bb3
MW
617ENDFUNC
618
619INTFUNC(mont4)
444083ae 620 // On entry, EDI points to the destination buffer holding a packed
8e5386aa 621 // value W; EBX points to a packed operand N; ESI points to an
444083ae
MW
622 // expanded operand M; and EDX points to a place to store an expanded
623 // result Y (32 bytes, at a 16-byte boundary).
624 //
625 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
626 // of the sum W + N Y to [EDI], leaving the remaining carry in
627 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
628 // XMM7 are clobbered; the general-purpose registers are preserved.
0923a413 629 endprologue
444083ae
MW
630
631 // Calculate Y = W M.
632 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
633
71ac8e5e
MW
634 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
635 accum xmm5, xmm6, xmm7
444083ae 636
71ac8e5e
MW
637 mulcore [edi + 8], esi, xmm0, xmm1
638 accum xmm6, xmm7
444083ae 639
71ac8e5e
MW
640 mulcore [edi + 12], esi, xmm0
641 accum xmm7
444083ae
MW
642
643 // That's lots of pieces. Now we have to assemble the answer.
71ac8e5e 644 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
444083ae
MW
645
646 // Expand it.
647 pxor xmm2, xmm2
71ac8e5e 648 expand xmm2, xmm4, xmm1
444083ae
MW
649 movdqa [edx + 0], xmm4
650 movdqa [edx + 16], xmm1
651
652 // Initialize the carry from W.
653 movd xmm4, [edi + 0]
654 movd xmm5, [edi + 4]
655 movd xmm6, [edi + 8]
656 movd xmm7, [edi + 12]
657
658 // Finish the calculation by adding the Montgomery product.
71ac8e5e 659 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
660 propout [edi + 0], xmm4, xmm5
661
662 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
663 propout [edi + 4], xmm5, xmm6
664
665 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
666 propout [edi + 8], xmm6, xmm7
667
668 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
669 propout [edi + 12], xmm7, xmm4
670
671 // And, with that, we're done.
672 ret
673
1a517bb3
MW
674ENDFUNC
675
444083ae
MW
676///--------------------------------------------------------------------------
677/// Bulk multipliers.
678
b9b279b4
MW
679FUNC(mpx_umul4_x86_avx)
680 .arch .avx
681 vzeroupper
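	// vzeroupper clears the upper halves of the YMM registers, avoiding
	// the SSE/AVX transition penalty before dropping into the SSE2 code
	// below.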
682 endprologue
683 // and drop through...
684 .arch pentium4
685ENDFUNC
686
444083ae
MW
687FUNC(mpx_umul4_x86_sse2)
688 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
689 // const mpw *bv, const mpw *bvl);
690
a90d420c 691 // Build a stack frame. Arguments will be relative to BP, as
444083ae
MW
692 // follows.
693 //
a90d420c
MW
694 // BP + 20 dv
695 // BP + 24 av
696 // BP + 28 avl
697 // BP + 32 bv
698 // BP + 36 bvl
444083ae 699 //
a90d420c 700 // Locals are relative to SP, as follows.
444083ae 701 //
a90d420c
MW
702 // SP + 0 expanded Y (32 bytes)
703 // SP + 32 (top of locals)
704 pushreg BP
0923a413
MW
705 pushreg ebx
706 pushreg esi
707 pushreg edi
42c44b27 708 setfp
6d2bd7f1 709 stalloc 32
a90d420c 710 and SP, ~15
0923a413 711 endprologue
444083ae
MW
712
713 // Prepare for the first iteration.
a90d420c 714 mov esi, [BP + 32] // -> bv[0]
444083ae
MW
715 pxor xmm7, xmm7
716 movdqu xmm0, [esi] // bv[0]
a90d420c 717 mov edi, [BP + 20] // -> dv[0]
444083ae 718 mov ecx, edi // outer loop dv cursor
71ac8e5e 719 expand xmm7, xmm0, xmm1
a90d420c
MW
720 mov ebx, [BP + 24] // -> av[0]
721 mov eax, [BP + 28] // -> av[m] = av limit
722 mov edx, SP // -> expanded Y = bv[0]
723 movdqa [SP + 0], xmm0 // bv[0] expanded low
724 movdqa [SP + 16], xmm1 // bv[0] expanded high
444083ae
MW
725 call mul4zc
726 add ebx, 16
727 add edi, 16
728 add ecx, 16
729 add esi, 16
730 cmp ebx, eax // all done?
731 jae 8f
732
733 .p2align 4
734 // Continue with the first iteration.
735 0:	call	mul4
736 add ebx, 16
737 add edi, 16
738 cmp ebx, eax // all done?
739 jb 0b
740
741 // Write out the leftover carry. There can be no tail here.
742 8:	call	carryprop
a90d420c 743 cmp esi, [BP + 36] // more passes to do?
444083ae
MW
744 jae 9f
745
746 .p2align 4
747 // Set up for the next pass.
748 1:	movdqu xmm0, [esi]		// bv[i]
749 mov edi, ecx // -> dv[i]
750 pxor xmm7, xmm7
71ac8e5e 751 expand xmm7, xmm0, xmm1
a90d420c
MW
752 mov ebx, [BP + 24] // -> av[0]
753 movdqa [SP + 0], xmm0 // bv[i] expanded low
754 movdqa [SP + 16], xmm1 // bv[i] expanded high
444083ae
MW
755 call mla4zc
756 add edi, 16
757 add ebx, 16
758 add ecx, 16
759 add esi, 16
760 cmp ebx, eax // done yet?
761 jae 8f
762
763 .p2align 4
764 // Continue...
765 0:	call	mla4
766 add ebx, 16
767 add edi, 16
768 cmp ebx, eax
769 jb 0b
770
771 // Finish off this pass. There was no tail on the previous pass, and
772 // there can be none on this pass.
773 8:	call	carryprop
a90d420c 774 cmp esi, [BP + 36]
444083ae
MW
775 jb 1b
776
777 // All over.
0923a413 778 9:	dropfp
444083ae
MW
779 pop edi
780 pop esi
781 pop ebx
a90d420c 782 pop BP
444083ae
MW
783 ret
784
785ENDFUNC
786
b9b279b4
MW
787FUNC(mpxmont_mul4_x86_avx)
788 .arch .avx
789 vzeroupper
790 endprologue
791 // and drop through...
792 .arch pentium4
793ENDFUNC
794
444083ae
MW
795FUNC(mpxmont_mul4_x86_sse2)
796 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
797 // const mpw *nv, size_t n, const mpw *mi);
798
a90d420c 799 // Build a stack frame. Arguments will be relative to BP, as
444083ae
MW
800 // follows.
801 //
a90d420c
MW
802 // BP + 20 dv
803 // BP + 24 av
804 // BP + 28 bv
805 // BP + 32 nv
806 // BP + 36 n (nonzero multiple of 4)
807 // BP + 40 mi
444083ae 808 //
a90d420c 809 // Locals are relative to SP, which 16-byte aligned, as follows.
444083ae 810 //
a90d420c
MW
811 // SP + 0 expanded V (32 bytes)
812 // SP + 32 expanded M (32 bytes)
813 // SP + 64 expanded Y (32 bytes)
814 // SP + 96 outer loop dv
815 // SP + 100 outer loop bv
816 // SP + 104 av limit (mostly in ESI)
817 // SP + 108 bv limit
818 // SP + 112 (top of locals)
819 pushreg BP
0923a413
MW
820 pushreg ebx
821 pushreg esi
822 pushreg edi
42c44b27 823 setfp
6d2bd7f1 824 stalloc 112
a90d420c 825 and SP, ~15
0923a413 826 endprologue
444083ae
MW
827
828 // Establish the expanded operands.
829 pxor xmm7, xmm7
a90d420c
MW
830 mov ecx, [BP + 28] // -> bv
831 mov edx, [BP + 40] // -> mi
444083ae
MW
832 movdqu xmm0, [ecx] // bv[0]
833 movdqu xmm2, [edx] // mi
71ac8e5e 834 expand xmm7, xmm0, xmm1, xmm2, xmm3
a90d420c
MW
835 movdqa [SP + 0], xmm0 // bv[0] expanded low
836 movdqa [SP + 16], xmm1 // bv[0] expanded high
837 movdqa [SP + 32], xmm2 // mi expanded low
838 movdqa [SP + 48], xmm3 // mi expanded high
444083ae
MW
839
840 // Set up the outer loop state and prepare for the first iteration.
a90d420c
MW
841 mov edx, [BP + 36] // n
842 mov eax, [BP + 24] // -> U = av[0]
843 mov ebx, [BP + 32] // -> X = nv[0]
844 mov edi, [BP + 20] // -> Z = dv[0]
845 mov [SP + 100], ecx
444083ae
MW
846 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
847 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
a90d420c
MW
848 mov [SP + 96], edi
849 mov [SP + 104], edx
850 mov [SP + 108], ecx
851 lea ecx, [SP + 0] // -> expanded V = bv[0]
852 lea esi, [SP + 32] // -> expanded M = mi
853 lea edx, [SP + 64] // -> space for Y
444083ae 854 call mmul4
a90d420c 855 mov esi, [SP + 104] // recover av limit
444083ae
MW
856 add edi, 16
857 add eax, 16
858 add ebx, 16
859 cmp eax, esi // done already?
860 jae 8f
a90d420c 861 mov [SP + 96], edi
444083ae
MW
862
863 .p2align 4
864 // Complete the first inner loop.
865 0:	call	dmul4
866 add edi, 16
867 add eax, 16
868 add ebx, 16
869 cmp eax, esi // done yet?
870 jb 0b
871
872 // Still have carries left to propagate.
873 call carryprop
874 movd [edi + 16], xmm4
875
876 .p2align 4
877 // Embark on the next iteration. (There must be one. If n = 1, then
878 // we would have bailed above, to label 8. Similarly, the subsequent
879 // iterations can fall into the inner loop immediately.)
a90d420c
MW
880 1:	mov	eax, [SP + 100]		// -> bv[i - 1]
881 mov edi, [SP + 96] // -> Z = dv[i]
444083ae
MW
882 add eax, 16 // -> bv[i]
883 pxor xmm7, xmm7
a90d420c
MW
884 mov [SP + 100], eax
885 cmp eax, [SP + 108] // done yet?
444083ae 886 jae 9f
6ecc0b8f 887 movdqu xmm0, [eax] // bv[i]
a90d420c
MW
888 mov ebx, [BP + 32] // -> X = nv[0]
889 lea esi, [SP + 32] // -> expanded M = mi
890 mov eax, [BP + 24] // -> U = av[0]
71ac8e5e 891 expand xmm7, xmm0, xmm1
a90d420c
MW
892 movdqa [SP + 0], xmm0 // bv[i] expanded low
893 movdqa [SP + 16], xmm1 // bv[i] expanded high
444083ae 894 call mmla4
a90d420c 895 mov esi, [SP + 104] // recover av limit
444083ae
MW
896 add edi, 16
897 add eax, 16
898 add ebx, 16
a90d420c 899 mov [SP + 96], edi
444083ae
MW
900
901 .p2align 4
902 // Complete the next inner loop.
903 0:	call	dmla4
904 add edi, 16
905 add eax, 16
906 add ebx, 16
907 cmp eax, esi
908 jb 0b
909
910 // Still have carries left to propagate, and they overlap the
911 // previous iteration's final tail, so read that in and add it.
912 movd xmm0, [edi]
913 paddq xmm4, xmm0
914 call carryprop
915 movd [edi + 16], xmm4
916
917 // Back again.
918 jmp 1b
919
920 // First iteration was short. Write out the carries and we're done.
921 // (This could be folded into the main loop structure, but that would
922 // penalize small numbers more.)
923 8:	call	carryprop
924 movd [edi + 16], xmm4
925
926 // All done.
0923a413
MW
927 9:	dropfp
928 popreg edi
929 popreg esi
930 popreg ebx
a90d420c 931 popreg BP
444083ae
MW
932 ret
933
934ENDFUNC
935
b9b279b4
MW
936FUNC(mpxmont_redc4_x86_avx)
937 .arch .avx
938 vzeroupper
939 endprologue
940 // and drop through...
941 .arch pentium4
942ENDFUNC
943
444083ae
MW
944FUNC(mpxmont_redc4_x86_sse2)
945 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
946 // size_t n, const mpw *mi);
947
a90d420c 948 // Build a stack frame. Arguments will be relative to BP, as
444083ae
MW
949 // follows.
950 //
a90d420c
MW
951 // BP + 20 dv
952 // BP + 24 dvl
953 // BP + 28 nv
954 // BP + 32 n (nonzero multiple of 4)
955 // BP + 36 mi
444083ae 956 //
a90d420c 957 // Locals are relative to SP, as follows.
444083ae 958 //
a90d420c
MW
959 // SP + 0 outer loop dv
960 // SP + 4 outer dv limit
961 // SP + 8 blocks-of-4 dv limit
962 // SP + 12 expanded M (32 bytes)
963 // SP + 44 expanded Y (32 bytes)
964 // SP + 76 (top of locals)
965 pushreg BP
0923a413
MW
966 pushreg ebx
967 pushreg esi
968 pushreg edi
42c44b27 969 setfp
a90d420c 970 and SP, ~15
6d2bd7f1 971 stalloc 76
0923a413 972 endprologue
444083ae
MW
973
974 // Establish the expanded operands and the blocks-of-4 dv limit.
a90d420c 975 mov edi, [BP + 20] // -> Z = dv[0]
444083ae 976 pxor xmm7, xmm7
a90d420c 977 mov eax, [BP + 24] // -> dv[n] = dv limit
444083ae 978 sub eax, edi // length of dv in bytes
a90d420c 979 mov edx, [BP + 36] // -> mi
444083ae
MW
980 movdqu xmm0, [edx] // mi
981 and eax, ~15 // mask off the tail end
71ac8e5e 982 expand xmm7, xmm0, xmm1
444083ae 983 add eax, edi // find limit
a90d420c
MW
984 movdqa [SP + 12], xmm0 // mi expanded low
985 movdqa [SP + 28], xmm1 // mi expanded high
986 mov [SP + 8], eax
444083ae
MW
987
988 // Set up the outer loop state and prepare for the first iteration.
a90d420c
MW
989 mov ecx, [BP + 32] // n
990 mov ebx, [BP + 28] // -> X = nv[0]
444083ae
MW
991 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
992 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
a90d420c
MW
993 mov [SP + 0], edi
994 mov [SP + 4], edx
995 lea esi, [SP + 12] // -> expanded M = mi
996 lea edx, [SP + 44] // -> space for Y
444083ae 997 call mont4
444083ae 998 add ebx, 16
a87d6f26 999 add edi, 16
444083ae
MW
1000 cmp ebx, ecx // done already?
1001 jae 8f
1002
1003 .p2align 4
1004 // Complete the first inner loop.
1005 5:	call	mla4
1006 add ebx, 16
1007 add edi, 16
1008 cmp ebx, ecx // done yet?
1009 jb 5b
1010
1011 // Still have carries left to propagate.
1012 8:	carryadd
a90d420c
MW
1013 mov esi, [SP + 8] // -> dv blocks limit
1014 mov edx, [BP + 24] // dv limit
444083ae
MW
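	// XMM7 still holds the addend word a_3 from `carryadd' above.  Fold it
	// into the high half of c2 as a_3 b: entries in c''_2 carry weight
	// b B^2, and b b B^2 = B^3, so this accounts for a_3 in its proper
	// place.  (There is no further multiplication pass here to pick up
	// XMM7 as c3, so it must be dealt with before `carryprop'.)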
1015 psllq xmm7, 16
1016 pslldq xmm7, 8
1017 paddq xmm6, xmm7
1018 call carryprop
1019 movd eax, xmm4
1020 add edi, 16
1021 cmp edi, esi
1022 jae 7f
1023
1024 .p2align 4
1025 // Continue carry propagation until the end of the buffer.
1026 0:	add	[edi], eax
1027 mov eax, 0 // preserves flags
1028 adcd [edi + 4], 0
1029 adcd [edi + 8], 0
1030 adcd [edi + 12], 0
1031 adc eax, 0
1032 add edi, 16
1033 cmp edi, esi
1034 jb 0b
1035
1036 // Deal with the tail end.
1037 7:	add	[edi], eax
1038 mov eax, 0 // preserves flags
1039 add edi, 4
1040 adc eax, 0
1041 cmp edi, edx
1042 jb 7b
1043
1044 // All done for this iteration. Start the next. (This must have at
1045 // least one follow-on iteration, or we'd not have started this outer
1046 // loop.)
a90d420c
MW
1047 8:	mov	edi, [SP + 0]		// -> dv[i - 1]
1048 mov ebx, [BP + 28] // -> X = nv[0]
1049 lea edx, [SP + 44] // -> space for Y
1050 lea esi, [SP + 12] // -> expanded M = mi
444083ae 1051 add edi, 16 // -> Z = dv[i]
a90d420c 1052 cmp edi, [SP + 4] // all done yet?
444083ae 1053 jae 9f
a90d420c 1054 mov [SP + 0], edi
444083ae
MW
1055 call mont4
1056 add edi, 16
1057 add ebx, 16
1058 jmp 5b
1059
1060 // All over.
0923a413
MW
1061 9:	dropfp
1062 popreg edi
1063 popreg esi
1064 popreg ebx
a90d420c 1065 popreg BP
444083ae
MW
1066 ret
1067
1068ENDFUNC
1069
1070///--------------------------------------------------------------------------
1071/// Testing and performance measurement.
1072
1073#ifdef TEST_MUL4
1074
1075.macro cysetup c
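	// Begin a cycle count: stash the current 64-bit TSC value at [C].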
1076 rdtsc
1077 mov [\c], eax
1078 mov [\c + 4], edx
1079.endm
1080
1081.macro cystore c, v, n
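	// Finish a cycle count: subtract the TSC value stashed at [C] from the
	// current one, decrement the iteration counter at [N], and store the
	// 64-bit difference in the corresponding slot of the vector at [V].
	// The Z flag is set when the counter reaches zero, for `testtail'.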
1082 rdtsc
1083 sub eax, [\c]
1084 sbb edx, [\c + 4]
1085 mov ebx, [\v]
1086 mov ecx, [\n]
1087 dec ecx
1088 mov [\n], ecx
1089 mov [ebx + ecx*8], eax
1090 mov [ebx + ecx*8 + 4], edx
1091.endm
1092
6d19758a 1093.macro testprologue n
a90d420c 1094 pushreg BP
0923a413
MW
1095 pushreg ebx
1096 pushreg esi
1097 pushreg edi
42c44b27 1098 setfp
6d2bd7f1 1099 stalloc 3*32 + 4*4
a90d420c 1100 and SP, ~15
0923a413 1101 endprologue
6d19758a 1102 mov eax, \n
a90d420c 1103 mov [SP + 104], eax
444083ae 1104 // vars:
a90d420c
MW
1105 // SP + 0 = v expanded
1106 // SP + 32 = y expanded
1107 // SP + 64 = ? expanded
1108 // SP + 96 = cycles
1109 // SP + 104 = count
444083ae
MW
1110.endm
1111
1112.macro testepilogue
0923a413
MW
1113 dropfp
1114 popreg edi
1115 popreg esi
1116 popreg ebx
a90d420c 1117 popreg BP
444083ae
MW
1118 ret
1119.endm
1120
1121.macro testldcarry c
1122 mov ecx, \c // -> c
8e91d6e5
MW
1123 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1124 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1125 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
444083ae
MW
1126.endm
1127
71ac8e5e 1128.macro testexpand v=nil, y=nil
444083ae
MW
1129 pxor xmm7, xmm7
1130 .ifnes "\v", "nil"
1131 mov ecx, \v
1132 movdqu xmm0, [ecx]
71ac8e5e 1133 expand xmm7, xmm0, xmm1
a90d420c
MW
1134 movdqa [SP + 0], xmm0
1135 movdqa [SP + 16], xmm1
444083ae
MW
1136 .endif
1137 .ifnes "\y", "nil"
1138 mov edx, \y
1139 movdqu xmm2, [edx]
71ac8e5e 1140 expand xmm7, xmm2, xmm3
a90d420c
MW
1141 movdqa [SP + 32], xmm2
1142 movdqa [SP + 48], xmm3
444083ae
MW
1143 .endif
1144.endm
1145
71ac8e5e 1146.macro testtop u=nil, x=nil, mode=nil
444083ae
MW
1147 .p2align 4
1148 0:
1149 .ifnes "\u", "nil"
a90d420c 1150 lea ecx, [SP + 0]
444083ae
MW
1151 .endif
1152 mov ebx, \x
1153 .ifeqs "\mode", "mont"
a90d420c 1154 lea esi, [SP + 32]
444083ae 1155 .endif
a90d420c 1156 cysetup SP + 96
444083ae
MW
1157 .ifnes "\u", "nil"
1158 mov eax, \u
1159 .endif
1160 .ifeqs "\mode", "mont"
a90d420c 1161 lea edx, [SP + 64]
444083ae 1162 .else
a90d420c 1163 lea edx, [SP + 32]
444083ae
MW
1164 .endif
1165.endm
1166
6d19758a 1167.macro testtail cyv
a90d420c 1168 cystore SP + 96, \cyv, SP + 104
444083ae
MW
1169 jnz 0b
1170.endm
1171
1172.macro testcarryout c
1173 mov ecx, \c
1174 movdqu [ecx + 0], xmm4
1175 movdqu [ecx + 16], xmm5
1176 movdqu [ecx + 32], xmm6
1177.endm
1178
0923a413 1179FUNC(test_dmul4)
a90d420c
MW
1180 testprologue [BP + 44]
1181 testldcarry [BP + 24]
1182 testexpand [BP + 36], [BP + 40]
1183 mov edi, [BP + 20]
1184 testtop [BP + 28], [BP + 32]
444083ae 1185 call dmul4
a90d420c
MW
1186 testtail [BP + 48]
1187 testcarryout [BP + 24]
444083ae 1188 testepilogue
0923a413 1189ENDFUNC
444083ae 1190
0923a413 1191FUNC(test_dmla4)
a90d420c
MW
1192 testprologue [BP + 44]
1193 testldcarry [BP + 24]
1194 testexpand [BP + 36], [BP + 40]
1195 mov edi, [BP + 20]
1196 testtop [BP + 28], [BP + 32]
444083ae 1197 call dmla4
a90d420c
MW
1198 testtail [BP + 48]
1199 testcarryout [BP + 24]
444083ae 1200 testepilogue
0923a413 1201ENDFUNC
444083ae 1202
0923a413 1203FUNC(test_mul4)
a90d420c
MW
1204 testprologue [BP + 36]
1205 testldcarry [BP + 24]
1206 testexpand nil, [BP + 32]
1207 mov edi, [BP + 20]
1208 testtop nil, [BP + 28]
444083ae 1209 call mul4
a90d420c
MW
1210 testtail [BP + 40]
1211 testcarryout [BP + 24]
444083ae 1212 testepilogue
0923a413 1213ENDFUNC
444083ae 1214
d0d41c6e 1215FUNC(test_mul4zc)
a90d420c
MW
1216 testprologue [BP + 36]
1217 testldcarry [BP + 24]
1218 testexpand nil, [BP + 32]
1219 mov edi, [BP + 20]
1220 testtop nil, [BP + 28]
d0d41c6e 1221 call mul4zc
a90d420c
MW
1222 testtail [BP + 40]
1223 testcarryout [BP + 24]
d0d41c6e
MW
1224 testepilogue
1225ENDFUNC
1226
0923a413 1227FUNC(test_mla4)
a90d420c
MW
1228 testprologue [BP + 36]
1229 testldcarry [BP + 24]
1230 testexpand nil, [BP + 32]
1231 mov edi, [BP + 20]
1232 testtop nil, [BP + 28]
444083ae 1233 call mla4
a90d420c
MW
1234 testtail [BP + 40]
1235 testcarryout [BP + 24]
444083ae 1236 testepilogue
0923a413 1237ENDFUNC
444083ae 1238
d0d41c6e 1239FUNC(test_mla4zc)
a90d420c
MW
1240 testprologue [BP + 36]
1241 testldcarry [BP + 24]
1242 testexpand nil, [BP + 32]
1243 mov edi, [BP + 20]
1244 testtop nil, [BP + 28]
d0d41c6e 1245 call mla4zc
a90d420c
MW
1246 testtail [BP + 40]
1247 testcarryout [BP + 24]
d0d41c6e
MW
1248 testepilogue
1249ENDFUNC
1250
0923a413 1251FUNC(test_mmul4)
a90d420c
MW
1252 testprologue [BP + 48]
1253 testexpand [BP + 40], [BP + 44]
1254 mov edi, [BP + 20]
1255 testtop [BP + 32], [BP + 36], mont
444083ae 1256 call mmul4
a90d420c
MW
1257 testtail [BP + 52]
1258 mov edi, [BP + 28]
1259 movdqa xmm0, [SP + 64]
1260 movdqa xmm1, [SP + 80]
444083ae
MW
1261 movdqu [edi], xmm0
1262 movdqu [edi + 16], xmm1
a90d420c 1263 testcarryout [BP + 24]
444083ae 1264 testepilogue
0923a413 1265ENDFUNC
444083ae 1266
0923a413 1267FUNC(test_mmla4)
a90d420c
MW
1268 testprologue [BP + 48]
1269 testexpand [BP + 40], [BP + 44]
1270 mov edi, [BP + 20]
1271 testtop [BP + 32], [BP + 36], mont
444083ae 1272 call mmla4
a90d420c
MW
1273 testtail [BP + 52]
1274 mov edi, [BP + 28]
1275 movdqa xmm0, [SP + 64]
1276 movdqa xmm1, [SP + 80]
444083ae
MW
1277 movdqu [edi], xmm0
1278 movdqu [edi + 16], xmm1
a90d420c 1279 testcarryout [BP + 24]
444083ae 1280 testepilogue
0923a413 1281ENDFUNC
444083ae 1282
0923a413 1283FUNC(test_mont4)
a90d420c
MW
1284 testprologue [BP + 40]
1285 testexpand nil, [BP + 36]
1286 mov edi, [BP + 20]
1287 testtop nil, [BP + 32], mont
444083ae 1288 call mont4
a90d420c
MW
1289 testtail [BP + 44]
1290 mov edi, [BP + 28]
1291 movdqa xmm0, [SP + 64]
1292 movdqa xmm1, [SP + 80]
444083ae
MW
1293 movdqu [edi], xmm0
1294 movdqu [edi + 16], xmm1
a90d420c 1295 testcarryout [BP + 24]
444083ae 1296 testepilogue
0923a413 1297ENDFUNC
444083ae
MW
1298
1299#endif
1300
1301///----- That's all, folks --------------------------------------------------