base/asm-common.h: Decorate pseudoregister `nil' as `nil'.
[catacomb] / math / mpx-mul4-x86-sse2.S
CommitLineData
444083ae
MW
1/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2///
3/// Large SIMD-based multiplications
4///
5/// (c) 2016 Straylight/Edgeware
6
7///----- Licensing notice ---------------------------------------------------
8///
9/// This file is part of Catacomb.
10///
11/// Catacomb is free software; you can redistribute it and/or modify
12/// it under the terms of the GNU Library General Public License as
13/// published by the Free Software Foundation; either version 2 of the
14/// License, or (at your option) any later version.
15///
16/// Catacomb is distributed in the hope that it will be useful,
17/// but WITHOUT ANY WARRANTY; without even the implied warranty of
18/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19/// GNU Library General Public License for more details.
20///
21/// You should have received a copy of the GNU Library General Public
22/// License along with Catacomb; if not, write to the Free
23/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24/// MA 02111-1307, USA.
25
26///--------------------------------------------------------------------------
df07f2c0 27/// Preliminaries.
444083ae
MW
28
29#include "config.h"
30#include "asm-common.h"
31
444083ae 32 .arch pentium4
df07f2c0 33
444083ae
MW
34 .text
35
36///--------------------------------------------------------------------------
37/// Theory.
38///
39/// We define a number of primitive fixed-size multipliers from which we can
40/// construct more general variable-length multipliers.
41///
42/// The basic trick is the same throughout. In an operand-scanning
43/// multiplication, the inner multiplication loop multiplies a
44/// multiple-precision operand by a single precision factor, and adds the
45/// result, appropriately shifted, to the result. A `finely integrated
46/// operand scanning' implementation of Montgomery multiplication also adds
47/// the product of a single-precision `Montgomery factor' and the modulus,
48/// calculated in the same pass. The more common `coarsely integrated
49/// operand scanning' alternates main multiplication and Montgomery passes,
50/// which requires additional carry propagation.
51///
52/// Throughout both plain-multiplication and Montgomery stages, then, one of
53/// the factors remains constant throughout the operation, so we can afford
54/// to take a little time to preprocess it. The transformation we perform is
55/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59/// operands, as follows.
60///
61/// Offset 0 4 8 12
62/// 0 v'_0 v'_1 v''_0 v''_1
63/// 16 v'_2 v'_3 v''_2 v''_3
64///
2aaa07f8 65/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
444083ae
MW
66/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69/// results in 64-bit fields. The sixteen bits of headroom allows us to add
70/// many products together before we must deal with carrying; it also allows
71/// for some calculations to be performed on the above expanded form.
72///
73/// On 32-bit x86, we are register starved: the expanded operands are kept in
74/// memory, typically in warm L1 cache.
75///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
90
91///--------------------------------------------------------------------------
92/// Macro definitions.
93
.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	//
	// R is a 32-bit memory operand; S is a register addressing a 32-byte
	// expanded operand, which is read with aligned accesses and so must
	// sit on a 16-byte boundary.  Any of D1, D2, D3 may be `nil', in
	// which case the corresponding piece of the product is simply not
	// calculated; D0 is always produced.
	movd	\d0, \r			// (r_i, 0; 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3; ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?; r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2; r_i s''_2)
    .else
	// D2 holds the broadcast r_i here, so multiply by memory directly.
	pmuludq	\d2, [\s + 16]		// (r_i s'_2; r_i s''_2)
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0; r_i s''_0)
.endm
133
71ac8e5e
MW
.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.  The source registers XMM0--XMM3 are left
	// unchanged.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm
149
.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	// The top piece of the product lands straight in C3, overwriting it.
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm
165
.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	//
	// D is a 32-bit destination (the callers here pass memory operands);
	// C is modified in either case.
	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm
182
.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.  C itself is clobbered.
	movdqa	\t, \c
	psllq	\t, 16			// (?; c'' b)
	pslldq	\c, 8			// (0; c')
	paddq	\t, \c			// (?; c' + c'' b)
	psrldq	\t, 8			// (c' + c'' b; 0) = (c; 0)
	movd	\d, \t
	psrldq	\t, 4			// (floor(c/B); 0)
.endm
195
.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	//
	// Z is only read; it supplies the zero halfwords which are
	// interleaved with the 16-bit pieces of the operands.
	movdqa	\b, \a			// (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1; c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0; a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm
217
.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.
	//
	// C0 is overwritten too; the callers in this file pass the same
	// register for C0 and LO.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0; y'_2)
	punpckhqdq \c0, \c2		// (y''_0; y''_2)
	punpcklqdq \u, \c3		// (y'_1; y'_3)
	punpckhqdq \c1, \c3		// (y''_1; y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0; y''_2)
	psrlq	\c1, 16			// high parts of (y''_1; y''_3)
	psrlq	\c2, 32			// low parts of (y''_0; y''_2)
	psrlq	\c3, 32			// low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0; y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0; y_2)
	paddq	\u, \c0			// (y_1; y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3; 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0; ?)
	movdqa	\lo, \t			// (y^*_0, ?; ?, ?)
	psrldq	\t, 8			// (y_2; 0)
	psrlq	\c3, 32			// (floor(y_0/B); ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B); ?)
	movdqa	\c1, \c3		// (y^*_1, ?; ?, ?)
	psrldq	\u, 8			// (y_3; 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm
285
.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
	// registers are preserved.
	//
	// The top word a_3 is loaded directly into XMM7 (c3), whose previous
	// contents are discarded; nothing is added into c2 for it.
	movd	xmm0, [edi +  0]	// (a_0; 0)
	movd	xmm1, [edi +  4]	// (a_1; 0)
	movd	xmm2, [edi +  8]	// (a_2; 0)
	movd	xmm7, [edi + 12]	// (a_3; 0)

	paddq	xmm4, xmm0		// (c'_0 + a_0; c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1; c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2; c''_2)
.endm
303
304///--------------------------------------------------------------------------
305/// Primitive multipliers and related utilities.
306
INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
	endprologue

	propout	[edi +  0], xmm4, xmm5
	propout	[edi +  4], xmm5, xmm6
	// No further carry register to propagate into: leave the excess in
	// XMM6 and let `endprop' split it into the last word and the tail.
	propout	[edi +  8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4
	ret
ENDFUNC
320
INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	// Each step handles one word of U and X.  The first `mulacc' of a
	// step overwrites the notionally-zero c3 (`t'); the carry registers
	// then rotate by one place for the next step.
	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
351
INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	// Fold the addend into the carries; this also initializes XMM7.
	carryadd

	// XMM7 now holds a_3, so the first step must add rather than store.
	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
386
INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	// There is no carry in, so the first product initializes all four
	// carry registers directly.
	mulcore	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
411
INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	// In every step the register playing c3 is notionally zero, so the
	// top piece of each product is stored into it rather than added.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
438
INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	// There is no carry in: start the carry registers off with the
	// words of the addend.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// XMM7 holds a_3, so the first step must add into all four.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
469
INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	// Fold the addend into the carries; this also initializes XMM7.
	carryadd

	// XMM7 holds a_3, so the first step must add into all four.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
499
INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	//
	// Only the first step is done here; the rest of the work is shared
	// with `mmla4', which we join at its label 5.  The stack allocation
	// below must therefore match the one in `mmla4', which releases it.
	stalloc	48 + 12			// space for the carries
	endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5
	jmp	5f
ENDFUNC
520
INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	// Start the carry registers off with the words of the addend.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Calculate W = A + U V, and leave it in the destination.  Stash
	// the carry pieces for later.
	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	// `mmul4' joins here, having done its own first step without the
	// addend.
5:	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Aligned stores: this is why the stack pointer must be 12 modulo
	// 16 on entry (12 + 48 + 12 is a multiple of 16... i.e., the
	// allocation above leaves SP on a 16-byte boundary).
	movdqa	[SP +  0], xmm4
	movdqa	[SP + 16], xmm5
	movdqa	[SP + 32], xmm6

	// Calculate Y = W M.  Only the low 128 bits are wanted, so each
	// successive row needs one fewer piece of the product.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi +  8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [SP +  0]
	paddq	xmm5, [SP + 16]
	paddq	xmm6, [SP + 32]

	// And, with that, we're done.
	stfree	48 + 12
	ret
ENDFUNC
609
INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	endprologue

	// Calculate Y = W M.  Only the low 128 bits are wanted, so each
	// successive row needs one fewer piece of the product.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi +  8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret
ENDFUNC
665
444083ae
MW
666///--------------------------------------------------------------------------
667/// Bulk multipliers.
668
b9b279b4
MW
FUNC(mpx_umul4_x86_avx)
	// AVX entry point for the multiplier: clear the upper halves of the
	// vector registers, and then continue into the SSE2 implementation
	// which immediately follows.  There is deliberately no `ret' here.
	.arch	.avx
	vzeroupper
  endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC
676
444083ae
MW
FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24 av
	//	BP + 28 avl
	//	BP + 32 bv
	//	BP + 36 bvl
	//
	// Locals are relative to SP, as follows.
	//
	//	SP +  0	expanded Y (32 bytes)
	//	SP + 32	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	stalloc	32
	and	SP, ~15			// the expanded Y needs alignment
  endprologue

	// Prepare for the first iteration.
	mov	esi, [BP + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [BP + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [BP + 24]		// -> av[0]
	mov	eax, [BP + 28]		// -> av[m] = av limit
	mov	edx, SP			// -> expanded Y = bv[0]
	movdqa	[SP + 0], xmm0		// bv[0] expanded low
	movdqa	[SP + 16], xmm1		// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [BP + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [BP + 24]		// -> av[0]
	movdqa	[SP + 0], xmm0		// bv[i] expanded low
	movdqa	[SP + 16], xmm1		// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [BP + 36]
	jb	1b

	// All over.  Use `popreg' to mirror the `pushreg's in the prologue,
	// as the other bulk functions in this file do.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
ENDFUNC
775
b9b279b4
MW
FUNC(mpxmont_mul4_x86_avx)
	// AVX entry point for Montgomery multiplication: clear the upper
	// halves of the vector registers, and then continue into the SSE2
	// implementation which immediately follows.  There is deliberately
	// no `ret' here.
	.arch	.avx
	vzeroupper
  endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC
783
444083ae
MW
FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24 av
	//	BP + 28 bv
	//	BP + 32 nv
	//	BP + 36 n (nonzero multiple of 4)
	//	BP + 40 mi
	//
	// Locals are relative to SP, which is 16-byte aligned, as follows.
	//
	//	SP +   0	expanded V (32 bytes)
	//	SP +  32	expanded M (32 bytes)
	//	SP +  64	expanded Y (32 bytes)
	//	SP +  96	outer loop dv
	//	SP + 100	outer loop bv
	//	SP + 104	av limit (mostly in ESI)
	//	SP + 108	bv limit
	//	SP + 112	(top of locals)
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	stalloc	112
	and	SP, ~15
  endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [BP + 28]		// -> bv
	mov	edx, [BP + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[SP +  0], xmm0		// bv[0] expanded low
	movdqa	[SP + 16], xmm1		// bv[0] expanded high
	movdqa	[SP + 32], xmm2		// mi expanded low
	movdqa	[SP + 48], xmm3		// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [BP + 36]		// n
	mov	eax, [BP + 24]		// -> U = av[0]
	mov	ebx, [BP + 32]		// -> X = nv[0]
	mov	edi, [BP + 20]		// -> Z = dv[0]
	mov	[SP + 100], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[SP + 96], edi
	mov	[SP + 104], edx
	mov	[SP + 108], ecx
	lea	ecx, [SP + 0]		// -> expanded V = bv[0]
	lea	esi, [SP + 32]		// -> expanded M = mi
	lea	edx, [SP + 64]		// -> space for Y
	call	mmul4
	mov	esi, [SP + 104]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[SP + 96], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [SP + 100]		// -> bv[i - 1]
	mov	edi, [SP + 96]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	mov	[SP + 100], eax
	cmp	eax, [SP + 108]		// done yet?
	jae	9f
	movdqu	xmm0, [eax]		// bv[i]
	mov	ebx, [BP + 32]		// -> X = nv[0]
	lea	esi, [SP + 32]		// -> expanded M = mi
	mov	eax, [BP + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[SP + 0], xmm0		// bv[i] expanded low
	movdqa	[SP + 16], xmm1		// bv[i] expanded high
	call	mmla4
	mov	esi, [SP + 104]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[SP + 96], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
ENDFUNC
923
b9b279b4
MW
FUNC(mpxmont_redc4_x86_avx)
	// AVX entry point for the reduction routine which follows.  It does
	// not touch the stack or the argument registers: it only issues
	// VZEROUPPER and then runs straight on into the SSE2 code, which
	// must therefore come immediately after this stub.
	.arch	.avx			// allow VZEROUPPER for this stub only
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4		// back to the baseline set at the top
ENDFUNC
931
444083ae
MW
FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			     size_t n, const mpw *mi);
	//
	// Works in place on the words dv .. dvl.  For each 16-byte block of
	// the first n words of dv, mont4 and then mla4 are run across nv,
	// after which the outstanding carry is rippled through the rest of
	// dv, as far as dvl.  (Going by the name, this is the reduction half
	// of Montgomery multiplication; the helpers called here are defined
	// earlier in the file.)
	//
	// NOTE(review): the word-at-a-time tail loop at label 7 is always
	// entered at least once, so this appears to assume that dvl - dv is
	// not a whole number of 16-byte blocks -- confirm against callers.

	// Build a stack frame.  Arguments will be relative to BP, as
	// follows.
	//
	//	BP + 20	dv
	//	BP + 24	dvl
	//	BP + 28	nv
	//	BP + 32	n (nonzero multiple of 4)
	//	BP + 36	mi
	//
	// Locals are relative to SP, as follows.
	//
	//	SP +  0	outer loop dv
	//	SP +  4	outer dv limit
	//	SP +  8	blocks-of-4 dv limit
	//	SP + 12	expanded M (32 bytes)
	//	SP + 44	expanded Y (32 bytes)
	//	SP + 76	(top of locals)
	//
	// SP is aligned *before* the allocation here.  Presumably stalloc
	// lowers SP by 76, leaving it 4 bytes above a 16-byte boundary, so
	// that SP + 12, SP + 28 and SP + 44 are the aligned addresses which
	// the MOVDQA stores below need -- confirm against asm-common.h.
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	SP, ~15
	stalloc	76
	endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [BP + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [BP + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [BP + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[SP + 12], xmm0		// mi expanded low
	movdqa	[SP + 28], xmm1		// mi expanded high
	mov	[SP + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [BP + 32]		// n
	mov	ebx, [BP + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[SP + 0], edi
	mov	[SP + 4], edx
	lea	esi, [SP + 12]		// -> expanded M = mi
	lea	edx, [SP + 44]		// -> space for Y
	call	mont4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done already?
	jae	8f			// (the nearer label 8: carry fixup)

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [SP + 8]		// -> dv blocks limit
	mov	edx, [BP + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4		// carry still to be added in
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.  EAX is
	// the carry into each block of four words; it is rebuilt from CF
	// before EDI is advanced.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end, a word at a time.  The carry out of the
	// addition must be captured before the pointer is advanced: ADD
	// rewrites CF, so advancing first would always leave EAX zero and
	// drop the carry between tail words.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adc	eax, 0			// eax = carry out of this word
	add	edi, 4			// clobbers CF: must follow the ADC
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [SP + 0]		// -> dv[i - 1]
	mov	ebx, [BP + 28]		// -> X = nv[0]
	lea	edx, [SP + 44]		// -> space for Y
	lea	esi, [SP + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [SP + 4]		// all done yet?
	jae	9f
	mov	[SP + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
ENDFUNC
1056
1057///--------------------------------------------------------------------------
1058/// Testing and performance measurement.
1059
1060#ifdef TEST_MUL4
1061
.macro cysetup c
	// Start a cycle measurement: read the time-stamp counter and save
	// it as a 64-bit value, low word first, at the address given by
	// the argument.  Clobbers eax and edx.
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm
1067
.macro cystore c, v, n
	// Finish a cycle measurement.  Arguments are addresses of: the
	// 64-bit start time written by cysetup; a pointer to an array of
	// 64-bit results; and the count of measurements still to make.
	//
	// The elapsed count is stored in the array slot indexed by the
	// decremented counter, so the array is filled from the top down.
	// Only MOVs follow the DEC, so on exit ZF is set exactly when the
	// counter has reached zero.  Clobbers eax, ebx, ecx and edx.
	rdtsc
	sub	eax, [\c]		// elapsed = now - start (low half)
	sbb	edx, [\c + 4]		//   ... and high half, with borrow
	mov	ebx, [\v]		// -> results array
	mov	ecx, [\n]
	dec	ecx			// sets ZF for the caller to test
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm
1079
.macro testprologue n
	// Common entry sequence for the test functions: save the registers,
	// set up the frame, make room for 112 bytes of locals, align SP to
	// 16 bytes, and store the iteration count given by the argument
	// (a memory operand relative to BP) in the count slot.  Clobbers
	// eax.
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	stalloc	3*32 + 4*4
	and	SP, ~15
	endprologue
	mov	eax, \n
	mov	[SP + 104], eax
	// vars:
	//	SP +   0 = v expanded
	//	SP +  32 = y expanded
	//	SP +  64 = ? expanded
	//	SP +  96 = cycles
	//	SP + 104 = count
.endm
1098
.macro testepilogue
	// Common exit sequence for the test functions: drop the frame,
	// restore the registers saved by testprologue in reverse order, and
	// return to the caller.
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
	ret
.endm
1107
.macro testldcarry c
	// Load the carry registers xmm4, xmm5 and xmm6 from the 48 bytes
	// at the pointer held in the argument.  The loads are unaligned
	// ones, so the buffer needs no particular alignment.  Clobbers ecx.
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx +  0]	// (c'_0; c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1; c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2; c''_2)
.endm
1114
.macro testexpand v=nil, y=nil
	// Expand up to two 16-byte operands into the aligned stack slots
	// set aside by testprologue: the first argument into SP + 0 and
	// SP + 16, the second into SP + 32 and SP + 48.  An argument left
	// as nil is skipped.  xmm7 is always cleared.  Clobbers ecx, xmm0
	// and xmm1 if the first operand is given, and edx, xmm2 and xmm3
	// if the second is.
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[SP +  0], xmm0
	movdqa	[SP + 16], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[SP + 32], xmm2
	movdqa	[SP + 48], xmm3
  .endif
.endm
1132
.macro testtop u=nil, x=nil, mode=nil
	// Head of the timing loop: defines local label 0, which testtail
	// branches back to, loads the argument registers for the routine
	// under test, and starts the cycle counter.
	//
	//	ecx	-> SP + 0, only if the first argument is given
	//	ebx	second argument
	//	esi	-> SP + 32, only in mont mode
	//	eax	first argument, if given
	//	edx	-> SP + 64 in mont mode, otherwise -> SP + 32
	//
	// eax and edx are loaded only after cysetup, because RDTSC
	// overwrites both of them.
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [SP + 0]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [SP + 32]
  .endif
	cysetup	SP + 96
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [SP + 64]
  .else
	lea	edx, [SP + 32]
  .endif
.endm
1153
.macro testtail cyv
	// Foot of the timing loop: record the elapsed cycles in the array
	// whose pointer is held in the argument, count down the iteration
	// counter at SP + 104, and go round again from label 0 (defined by
	// testtop) until the counter reaches zero.  The branch relies on
	// the ZF which cystore leaves behind from its DEC.
	cystore	SP + 96, \cyv, SP + 104
	jnz	0b
.endm
1158
.macro testcarryout c
	// Write the carry registers xmm4, xmm5 and xmm6 to the 48 bytes at
	// the pointer held in the argument; the converse of testldcarry.
	// Clobbers ecx.
	mov	ecx, \c
	movdqu	[ecx +  0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm
1165
FUNC(test_dmul4)
	// Test and timing harness for dmul4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  Stack arguments,
	// as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48-byte carry state: loaded into xmm4--xmm6
	//		   beforehand, and written back afterwards
	//	BP + 28	operand pointer passed in eax
	//	BP + 32	operand pointer passed in ebx
	//	BP + 36	-> 16-byte operand, expanded to SP + 0 (ecx)
	//	BP + 40	-> 16-byte operand, expanded to SP + 32 (edx)
	//	BP + 44	iteration count
	//	BP + 48	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 44]
	testldcarry [BP + 24]
	testexpand [BP + 36], [BP + 40]
	mov	edi, [BP + 20]
	testtop	[BP + 28], [BP + 32]
	call	dmul4
	testtail [BP + 48]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae 1177
FUNC(test_dmla4)
	// Test and timing harness for dmla4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  Stack arguments,
	// as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48-byte carry state: loaded into xmm4--xmm6
	//		   beforehand, and written back afterwards
	//	BP + 28	operand pointer passed in eax
	//	BP + 32	operand pointer passed in ebx
	//	BP + 36	-> 16-byte operand, expanded to SP + 0 (ecx)
	//	BP + 40	-> 16-byte operand, expanded to SP + 32 (edx)
	//	BP + 44	iteration count
	//	BP + 48	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 44]
	testldcarry [BP + 24]
	testexpand [BP + 36], [BP + 40]
	mov	edi, [BP + 20]
	testtop	[BP + 28], [BP + 32]
	call	dmla4
	testtail [BP + 48]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae 1189
FUNC(test_mul4)
	// Test and timing harness for mul4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  Stack arguments,
	// as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48-byte carry state: loaded into xmm4--xmm6
	//		   beforehand, and written back afterwards
	//	BP + 28	operand pointer passed in ebx
	//	BP + 32	-> 16-byte operand, expanded to SP + 32 (edx)
	//	BP + 36	iteration count
	//	BP + 40	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mul4
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae 1201
FUNC(test_mul4zc)
	// Test and timing harness for mul4zc: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  Stack arguments,
	// as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48-byte carry state: loaded into xmm4--xmm6
	//		   beforehand, and written back afterwards
	//	BP + 28	operand pointer passed in ebx
	//	BP + 32	-> 16-byte operand, expanded to SP + 32 (edx)
	//	BP + 36	iteration count
	//	BP + 40	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mul4zc
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
1213
FUNC(test_mla4)
	// Test and timing harness for mla4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  Stack arguments,
	// as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48-byte carry state: loaded into xmm4--xmm6
	//		   beforehand, and written back afterwards
	//	BP + 28	operand pointer passed in ebx
	//	BP + 32	-> 16-byte operand, expanded to SP + 32 (edx)
	//	BP + 36	iteration count
	//	BP + 40	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mla4
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae 1225
FUNC(test_mla4zc)
	// Test and timing harness for mla4zc: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  Stack arguments,
	// as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48-byte carry state: loaded into xmm4--xmm6
	//		   beforehand, and written back afterwards
	//	BP + 28	operand pointer passed in ebx
	//	BP + 32	-> 16-byte operand, expanded to SP + 32 (edx)
	//	BP + 36	iteration count
	//	BP + 40	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 36]
	testldcarry [BP + 24]
	testexpand nil, [BP + 32]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 28]
	call	mla4zc
	testtail [BP + 40]
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
1237
FUNC(test_mmul4)
	// Test and timing harness for mmul4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  No carry state is
	// loaded beforehand; it is only written out afterwards.  Stack
	// arguments, as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48 bytes receiving xmm4--xmm6 afterwards
	//	BP + 28	-> 32 bytes receiving a copy of the SP + 64 area
	//		   (the address handed to mmul4 in edx)
	//	BP + 32	operand pointer passed in eax
	//	BP + 36	operand pointer passed in ebx
	//	BP + 40	-> 16-byte operand, expanded to SP + 0 (ecx)
	//	BP + 44	-> 16-byte operand, expanded to SP + 32 (esi)
	//	BP + 48	iteration count
	//	BP + 52	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 48]
	testexpand [BP + 40], [BP + 44]
	mov	edi, [BP + 20]
	testtop	[BP + 32], [BP + 36], mont
	call	mmul4
	testtail [BP + 52]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae 1253
FUNC(test_mmla4)
	// Test and timing harness for mmla4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  No carry state is
	// loaded beforehand; it is only written out afterwards.  Stack
	// arguments, as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48 bytes receiving xmm4--xmm6 afterwards
	//	BP + 28	-> 32 bytes receiving a copy of the SP + 64 area
	//		   (the address handed to mmla4 in edx)
	//	BP + 32	operand pointer passed in eax
	//	BP + 36	operand pointer passed in ebx
	//	BP + 40	-> 16-byte operand, expanded to SP + 0 (ecx)
	//	BP + 44	-> 16-byte operand, expanded to SP + 32 (esi)
	//	BP + 48	iteration count
	//	BP + 52	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 48]
	testexpand [BP + 40], [BP + 44]
	mov	edi, [BP + 20]
	testtop	[BP + 32], [BP + 36], mont
	call	mmla4
	testtail [BP + 52]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae 1269
FUNC(test_mont4)
	// Test and timing harness for mont4: calls it repeatedly, once per
	// count, recording the cycles taken by each call.  No carry state is
	// loaded beforehand; it is only written out afterwards.  Stack
	// arguments, as consumed below:
	//
	//	BP + 20	-> destination, passed in edi
	//	BP + 24	-> 48 bytes receiving xmm4--xmm6 afterwards
	//	BP + 28	-> 32 bytes receiving a copy of the SP + 64 area
	//		   (the address handed to mont4 in edx)
	//	BP + 32	operand pointer passed in ebx
	//	BP + 36	-> 16-byte operand, expanded to SP + 32 (esi)
	//	BP + 40	iteration count
	//	BP + 44	-> array receiving a 64-bit cycle count per
	//		   iteration
	testprologue [BP + 40]
	testexpand nil, [BP + 36]
	mov	edi, [BP + 20]
	testtop	nil, [BP + 32], mont
	call	mont4
	testtail [BP + 44]
	mov	edi, [BP + 28]
	movdqa	xmm0, [SP + 64]
	movdqa	xmm1, [SP + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [BP + 24]
	testepilogue
ENDFUNC
444083ae
MW
1285
1286#endif
1287
1288///----- That's all, folks --------------------------------------------------