symm/ocb3.h, symm/ocb3-def.h: Implement the OCB3 auth'ned encryption mode.
[catacomb] / math / mpx-mul4-x86-sse2.S
CommitLineData
444083ae
MW
1/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2///
3/// Large SIMD-based multiplications
4///
5/// (c) 2016 Straylight/Edgeware
6
7///----- Licensing notice ---------------------------------------------------
8///
9/// This file is part of Catacomb.
10///
11/// Catacomb is free software; you can redistribute it and/or modify
12/// it under the terms of the GNU Library General Public License as
13/// published by the Free Software Foundation; either version 2 of the
14/// License, or (at your option) any later version.
15///
16/// Catacomb is distributed in the hope that it will be useful,
17/// but WITHOUT ANY WARRANTY; without even the implied warranty of
18/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19/// GNU Library General Public License for more details.
20///
21/// You should have received a copy of the GNU Library General Public
22/// License along with Catacomb; if not, write to the Free
23/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24/// MA 02111-1307, USA.
25
26///--------------------------------------------------------------------------
27/// External definitions.
28
29#include "config.h"
30#include "asm-common.h"
31
32///--------------------------------------------------------------------------
33/// Prologue.
34
35 .arch pentium4
36 .text
37
38///--------------------------------------------------------------------------
39/// Theory.
40///
41/// We define a number of primitive fixed-size multipliers from which we can
42/// construct more general variable-length multipliers.
43///
44/// The basic trick is the same throughout. In an operand-scanning
45/// multiplication, the inner multiplication loop multiplies a
46/// multiple-precision operand by a single precision factor, and adds the
47/// result, appropriately shifted, to the result. A `finely integrated
48/// operand scanning' implementation of Montgomery multiplication also adds
49/// the product of a single-precision `Montgomery factor' and the modulus,
50/// calculated in the same pass. The more common `coarsely integrated
51/// operand scanning' alternates main multiplication and Montgomery passes,
52/// which requires additional carry propagation.
53///
54/// Throughout both plain-multiplication and Montgomery stages, then, one of
55/// the factors remains constant throughout the operation, so we can afford
56/// to take a little time to preprocess it. The transformation we perform is
57/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
61/// operands, as follows.
62///
63/// Offset 0 4 8 12
64/// 0 v'_0 v'_1 v''_0 v''_1
65/// 16 v'_2 v'_3 v''_2 v''_3
66///
2aaa07f8 67/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
444083ae
MW
68/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
69/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
70/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
71/// results in 64-bit fields. The sixteen bits of headroom allows us to add
72/// many products together before we must deal with carrying; it also allows
73/// for some calculations to be performed on the above expanded form.
74///
75/// On 32-bit x86, we are register starved: the expanded operands are kept in
76/// memory, typically in warm L1 cache.
77///
78/// We maintain four `carry' registers accumulating intermediate results.
79/// The registers' precise roles rotate during the computation; we name them
80/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
2aaa07f8 84/// `pmuludq' instruction acting on a scalar operand (broadcast across all
444083ae
MW
85/// lanes of its vector) and an operand in the expanded form above produces a
86/// result which can be added directly to the appropriate carry register.
87/// Following a pass of four multiplications, we perform some limited carry
88/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90/// registers around, so that c1 becomes c0, and the old c0 (implicitly
91/// zeroed) becomes c3.
92
93///--------------------------------------------------------------------------
94/// Macro definitions.
95
71ac8e5e 96.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
444083ae
MW
97 // Load a word r_i from R, multiply by the expanded operand [S], and
98 // leave the pieces of the product in registers D0, D1, D2, D3.
// Each destination D_j receives the two 64-bit products
// (r_i s'_j; r_i s''_j). Any of D1, D2, D3 may be `nil', in which
// case that piece is simply not computed; D0 is always produced.
// If D3 is `nil' but D2 is not, D2 is loaded with a copy of the
// broadcast r_i and multiplied directly against [S + 16] instead of
// going via D3. [S] is read with `movdqa', so it must be 16-byte
// aligned.
8e91d6e5 99 movd \d0, \r // (r_i, 0; 0, 0)
444083ae 100 .ifnes "\d1", "nil"
8e91d6e5 101 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
444083ae
MW
102 .endif
103 .ifnes "\d3", "nil"
8e91d6e5 104 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
444083ae 105 .endif
a117c06f 106 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
444083ae 107 .ifnes "\d1", "nil"
8e91d6e5 108 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
444083ae
MW
109 .endif
110 .ifnes "\d2", "nil"
111 .ifnes "\d3", "nil"
8e91d6e5 112 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
444083ae 113 .else
8e91d6e5 114 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
444083ae
MW
115 .endif
116 .endif
117 .ifnes "\d3", "nil"
8e91d6e5 118 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
444083ae
MW
119 .endif
120 .ifnes "\d1", "nil"
8e91d6e5 121 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
444083ae
MW
122 .endif
123 .ifnes "\d3", "nil"
8e91d6e5 124 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
444083ae
MW
125 .endif
126 .ifnes "\d2", "nil"
127 .ifnes "\d3", "nil"
8e91d6e5 128 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
444083ae 129 .else
2aaa07f8 130 pmuludq \d2, [\s + 16] // (r_i s'_2; r_i s''_2), D2 held r_i
444083ae
MW
131 .endif
132 .endif
8e91d6e5 133 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
444083ae
MW
134.endm
135
71ac8e5e
MW
136.macro accum c0, c1=nil, c2=nil, c3=nil
137 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
138 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
139 // updating that register.
444083ae
MW
140 paddq \c0, xmm0
141 .ifnes "\c1", "nil"
142 paddq \c1, xmm1
143 .endif
144 .ifnes "\c2", "nil"
145 paddq \c2, xmm2
146 .endif
147 .ifnes "\c3", "nil"
148 paddq \c3, xmm3
149 .endif
150.endm
151
71ac8e5e 152.macro mulacc r, s, c0, c1, c2, c3, z3p=nil
444083ae
MW
153 // Load a word r_i from R, multiply by the expanded operand [S],
154 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
155 // then C3 notionally contains zero, but needs clearing; in practice,
156 // we store the product directly rather than attempting to add. On
157 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
158 // is not `t'.
159 .ifeqs "\z3p", "t"
160 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
71ac8e5e 161 accum \c0, \c1, \c2
444083ae
MW
162 .else
163 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
164 accum \c0, \c1, \c2, \c3
165 .endif
166.endm
167
71ac8e5e 168.macro propout d, c, cc=nil
444083ae
MW
169 // Calculate an output word from C, and store it in D; propagate
170 // carries out from C to CC in preparation for a rotation of the
171 // carry registers. On completion, XMM3 is clobbered. If CC is
172 // `nil', then the contribution which would have been added to it is
173 // left in C.
a117c06f 174 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
8e91d6e5
MW
175 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
176 pslldq xmm3, 2 // (t b; 0)
177 paddq \c, xmm3 // (c' + t b; c'')
444083ae
MW
178 movd \d, \c
179 psrlq \c, 32 // floor(c/B)
180 .ifnes "\cc", "nil"
181 paddq \cc, \c // propagate up
182 .endif
183.endm
184
185.macro endprop d, c, t
186 // On entry, C contains a carry register. On exit, the low 32 bits
187 // of the value represented in C are written to D, and the remaining
188 // bits are left at the bottom of T.
// C itself is clobbered: it is shifted in place to line c' up with
// the shifted c'' held in T.
189 movdqa \t, \c // (c'; c'')
8e91d6e5
MW
190 psllq \t, 16 // (?; c'' b)
191 pslldq \c, 8 // (0; c')
192 paddq \t, \c // (?; c' + c'' b)
193 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
444083ae 194 movd \d, \t // output c mod B
8e91d6e5 195 psrldq \t, 4 // (floor(c/B); 0)
444083ae
MW
196.endm
197
71ac8e5e 198.macro expand z, a, b, c=nil, d=nil
444083ae
MW
199 // On entry, A and C hold packed 128-bit values, and Z is zero. On
200 // exit, A:B and C:D together hold the same values in expanded
201 // form. If C is `nil', then only expand A to A:B.
8e91d6e5 202 movdqa \b, \a // (a_0, a_1; a_2, a_3)
444083ae 203 .ifnes "\c", "nil"
8e91d6e5 204 movdqa \d, \c // (c_0, c_1; c_2, c_3)
444083ae 205 .endif
8e91d6e5
MW
206 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
207 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
444083ae 208 .ifnes "\c", "nil"
8e91d6e5
MW
209 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
210 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
444083ae 211 .endif
a117c06f
MW
212 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
213 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
444083ae 214 .ifnes "\c", "nil"
a117c06f
MW
215 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
216 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
444083ae
MW
217 .endif
218.endm
219
71ac8e5e 220.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
444083ae 221 // On entry, C0, C1, C2, C3 are carry registers representing a value
4b30aca5 222 // Y. On exit, LO holds the low 128 bits of the carry value; C0, C1,
444083ae 223 // C2, C3, T, and U are clobbered; and the high bits of Y are stored
4b30aca5 224 // in HI, if this is not `nil'.
// LO may be the same register as C0: C0 is last read before LO is
// first written.
444083ae
MW
225
226 // The first step is to eliminate the `double-prime' pieces -- i.e.,
227 // the ones offset by 16 bits from a 32-bit boundary -- by carrying
228 // them into the 32-bit-aligned pieces above and below. But before
229 // we can do that, we must gather them together.
230 movdqa \t, \c0
231 movdqa \u, \c1
8e91d6e5
MW
232 punpcklqdq \t, \c2 // (y'_0; y'_2)
233 punpckhqdq \c0, \c2 // (y''_0; y''_2)
234 punpcklqdq \u, \c3 // (y'_1; y'_3)
235 punpckhqdq \c1, \c3 // (y''_1; y''_3)
444083ae
MW
236
237 // Now split the double-prime pieces. The high (up to) 48 bits will
238 // go up; the low 16 bits go down.
239 movdqa \c2, \c0
240 movdqa \c3, \c1
241 psllq \c2, 48
242 psllq \c3, 48
8e91d6e5
MW
243 psrlq \c0, 16 // high parts of (y''_0; y''_2)
244 psrlq \c1, 16 // high parts of (y''_1; y''_3)
245 psrlq \c2, 32 // low parts of (y''_0; y''_2)
246 psrlq \c3, 32 // low parts of (y''_1; y''_3)
4b30aca5
MW
247 .ifnes "\hi", "nil"
248 movdqa \hi, \c1
444083ae 249 .endif
8e91d6e5 250 pslldq \c1, 8 // high part of (0; y''_1)
444083ae
MW
251
252 paddq \t, \c2 // propagate down
253 paddq \u, \c3
8e91d6e5
MW
254 paddq \t, \c1 // and up: (y_0; y_2)
255 paddq \u, \c0 // (y_1; y_3)
4b30aca5 256 .ifnes "\hi", "nil"
8e91d6e5 257 psrldq \hi, 8 // high part of (y''_3; 0)
444083ae
MW
258 .endif
259
260 // Finally extract the answer. This complicated dance is better than
261 // storing to memory and loading, because the piecemeal stores
262 // inhibit store forwarding.
8e91d6e5
MW
263 movdqa \c3, \t // (y_0; ?)
264 movdqa \lo, \t // (y^*_0, ?; ?, ?)
265 psrldq \t, 8 // (y_2; 0)
266 psrlq \c3, 32 // (floor(y_0/B); ?)
267 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
268 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
269 psrldq \u, 8 // (y_3; 0)
270 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
271 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
272 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
4b30aca5 275 .ifnes "\hi", "nil"
d2269179 276 movdqa \t, \c3
444083ae
MW
277 pxor \u, \u
278 .endif
8e91d6e5 279 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
4b30aca5 280 .ifnes "\hi", "nil"
444083ae 281 psrlq \t, 32 // very high bits of y
4b30aca5
MW
282 paddq \hi, \t
283 punpcklqdq \hi, \u // carry up
444083ae 284 .endif
4b30aca5 285 punpckldq \lo, \c1 // y mod B^4
444083ae
MW
286.endm
287
288.macro carryadd
289 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
290 // hold the incoming carry registers c0, c1, and c2 representing a
291 // carry-in C.
292 //
293 // On exit, the carry registers, including XMM7, are updated to hold
294 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
295 // registers are preserved.
// Note that c3 = XMM7 is loaded with a_3 rather than added to, so
// its previous contents are discarded.
8e91d6e5
MW
296 movd xmm0, [edi + 0] // (a_0; 0)
297 movd xmm1, [edi + 4] // (a_1; 0)
298 movd xmm2, [edi + 8] // (a_2; 0)
299 movd xmm7, [edi + 12] // (a_3; 0)
300
301 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
302 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
303 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2)
444083ae
MW
304.endm
305
306///--------------------------------------------------------------------------
307/// Primitive multipliers and related utilities.
308
1a517bb3 309INTFUNC(carryprop)
444083ae
MW
310 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
311 // form. Store the low 128 bits of the represented carry to [EDI] as
312 // a packed 128-bit value, and leave the remaining 16 bits in the low
313 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
0923a413
MW
314 endprologue
315
444083ae
MW
316 propout [edi + 0], xmm4, xmm5
317 propout [edi + 4], xmm5, xmm6
318 propout [edi + 8], xmm6, nil
319 endprop [edi + 12], xmm6, xmm4
320 ret
321
1a517bb3
MW
322ENDFUNC
323
324INTFUNC(dmul4)
444083ae
MW
325 // On entry, EDI points to the destination buffer; EAX and EBX point
326 // to the packed operands U and X; ECX and EDX point to the expanded
327 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
328 // registers c0, c1, and c2; c3 is assumed to be zero.
329 //
330 // On exit, we write the low 128 bits of the sum C + U V + X Y to
331 // [EDI], and update the carry registers with the carry out. The
332 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
333 // general-purpose registers are preserved.
0923a413
MW
334 endprologue
335
444083ae 336 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
71ac8e5e 337 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
338 propout [edi + 0], xmm4, xmm5
339
340 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
71ac8e5e 341 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
444083ae
MW
342 propout [edi + 4], xmm5, xmm6
343
344 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
71ac8e5e 345 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
444083ae
MW
346 propout [edi + 8], xmm6, xmm7
347
348 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
71ac8e5e 349 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
444083ae
MW
350 propout [edi + 12], xmm7, xmm4
351
352 ret
353
1a517bb3
MW
354ENDFUNC
355
356INTFUNC(dmla4)
444083ae
MW
357 // On entry, EDI points to the destination buffer, which also
358 // contains an addend A to accumulate; EAX and EBX point to the
359 // packed operands U and X; ECX and EDX point to the expanded
360 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
361 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
362 // to be zero.
363 //
364 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
365 // [EDI], and update the carry registers with the carry out. The
366 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
367 // general-purpose registers are preserved.
0923a413
MW
368 endprologue
369
444083ae
MW
370 carryadd
371
71ac8e5e
MW
372 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
373 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
374 propout [edi + 0], xmm4, xmm5
375
376 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
71ac8e5e 377 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
444083ae
MW
378 propout [edi + 4], xmm5, xmm6
379
380 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
71ac8e5e 381 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
444083ae
MW
382 propout [edi + 8], xmm6, xmm7
383
384 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
71ac8e5e 385 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
444083ae
MW
386 propout [edi + 12], xmm7, xmm4
387
388 ret
389
1a517bb3
MW
390ENDFUNC
391
392INTFUNC(mul4zc)
444083ae
MW
393 // On entry, EDI points to the destination buffer; EBX points to a
394 // packed operand X; and EDX points to an expanded operand Y.
395 //
396 // On exit, we write the low 128 bits of the product X Y to [EDI],
397 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
398 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
399 // general-purpose registers are preserved.
0923a413
MW
400 endprologue
401
444083ae
MW
402 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
403 propout [edi + 0], xmm4, xmm5
404
405 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
406 propout [edi + 4], xmm5, xmm6
407
408 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
409 propout [edi + 8], xmm6, xmm7
410
411 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
412 propout [edi + 12], xmm7, xmm4
413
414 ret
415
1a517bb3
MW
416ENDFUNC
417
418INTFUNC(mul4)
444083ae
MW
419 // On entry, EDI points to the destination buffer; EBX points to a
420 // packed operand X; EDX points to an expanded operand Y; and XMM4,
421 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
422 // representing a carry-in C; c3 is assumed to be zero.
423 //
424 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
425 // and update the carry registers with the carry out. The registers
426 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
427 // general-purpose registers are preserved.
0923a413
MW
428 endprologue
429
444083ae
MW
430 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
431 propout [edi + 0], xmm4, xmm5
432
433 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
434 propout [edi + 4], xmm5, xmm6
435
436 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
437 propout [edi + 8], xmm6, xmm7
438
439 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
440 propout [edi + 12], xmm7, xmm4
441
442 ret
443
1a517bb3
MW
444ENDFUNC
445
446INTFUNC(mla4zc)
444083ae
MW
447 // On entry, EDI points to the destination buffer, which also
448 // contains an addend A to accumulate; EBX points to a packed operand
449 // X; and EDX points to an expanded operand Y.
450 //
451 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
452 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
453 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
454 // general-purpose registers are preserved.
0923a413
MW
455 endprologue
456
444083ae
MW
457 movd xmm4, [edi + 0]
458 movd xmm5, [edi + 4]
459 movd xmm6, [edi + 8]
460 movd xmm7, [edi + 12]
461
71ac8e5e 462 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
463 propout [edi + 0], xmm4, xmm5
464
465 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
466 propout [edi + 4], xmm5, xmm6
467
468 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
469 propout [edi + 8], xmm6, xmm7
470
471 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
472 propout [edi + 12], xmm7, xmm4
473
474 ret
475
1a517bb3
MW
476ENDFUNC
477
478INTFUNC(mla4)
444083ae
MW
479 // On entry, EDI points to the destination buffer, which also
480 // contains an addend A to accumulate; EBX points to a packed operand
481 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
482 // the incoming carry registers c0, c1, and c2, representing a
483 // carry-in C; c3 is assumed to be zero.
484 //
485 // On exit, we write the low 128 bits of the sum A + C + X Y to
486 // [EDI], and update the carry registers with the carry out. The
487 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
488 // general-purpose registers are preserved.
0923a413
MW
489 endprologue
490
444083ae
MW
491 carryadd
492
71ac8e5e 493 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
494 propout [edi + 0], xmm4, xmm5
495
496 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
497 propout [edi + 4], xmm5, xmm6
498
499 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
500 propout [edi + 8], xmm6, xmm7
501
502 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
503 propout [edi + 12], xmm7, xmm4
504
505 ret
506
1a517bb3
MW
507ENDFUNC
508
509INTFUNC(mmul4)
444083ae
MW
510 // On entry, EDI points to the destination buffer; EAX and EBX point
511 // to the packed operands U and N; ECX and ESI point to the expanded
512 // operands V and M; and EDX points to a place to store an expanded
513 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
6ecc0b8f 514 // must be 12 modulo 16, as is usual for modern x86 ABIs.
444083ae
MW
515 //
516 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
517 // of the sum U V + N Y to [EDI], leaving the remaining carry in
518 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
519 // XMM7 are clobbered; the general-purpose registers are preserved.
//
// Only the first word of U V is handled here; the function then
// jumps to label 5 inside `mmla4', which does the rest of the work
// and returns on our behalf.
// The carry area is accessed there with `movdqa' at [ESP + 0..32];
// assuming `stalloc n' lowers ESP by n, the extra 12 bytes bring ESP
// from 12 (mod 16) to a 16-byte boundary. This allocation must stay
// identical to the one in `mmla4', whose `stfree' releases it.
6ecc0b8f 520 stalloc 48 + 12 // space for the carries
0923a413 521 endprologue
444083ae
MW
522
523 // Calculate W = U V, and leave it in the destination. Stash the
524 // carry pieces for later.
525 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
526 propout [edi + 0], xmm4, xmm5
527 jmp 5f // join the common tail in `mmla4'
528
1a517bb3
MW
529ENDFUNC
530
531INTFUNC(mmla4)
444083ae 532 // On entry, EDI points to the destination buffer, which also
14e7b1f5
MW
533 // contains an addend A to accumulate; EAX and EBX point to the
534 // packed operands U and N; ECX and ESI point to the expanded
444083ae
MW
535 // operands V and M; and EDX points to a place to store an expanded
536 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
6ecc0b8f 537 // must be 12 modulo 16, as is usual for modern x86 ABIs.
444083ae
MW
538 //
539 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
540 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
541 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
542 // XMM3, and XMM7 are clobbered; the general-purpose registers are
543 // preserved.
6ecc0b8f 544 stalloc 48 + 12 // space for the carries
0923a413
MW
545 endprologue
546
444083ae
MW
547 movd xmm4, [edi + 0]
548 movd xmm5, [edi + 4]
549 movd xmm6, [edi + 8]
550 movd xmm7, [edi + 12]
ba12677b
MW
551
552 // Calculate W = U V, and leave it in the destination. Stash the
553 // carry pieces for later.
71ac8e5e 554 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
555 propout [edi + 0], xmm4, xmm5
556
5575: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
558 propout [edi + 4], xmm5, xmm6
559
560 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
561 propout [edi + 8], xmm6, xmm7
562
563 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
564 propout [edi + 12], xmm7, xmm4
565
566 movdqa [esp + 0], xmm4
567 movdqa [esp + 16], xmm5
568 movdqa [esp + 32], xmm6
569
570 // Calculate Y = W M.
571 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
572
71ac8e5e
MW
573 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
574 accum xmm5, xmm6, xmm7
444083ae 575
71ac8e5e
MW
576 mulcore [edi + 8], esi, xmm0, xmm1
577 accum xmm6, xmm7
444083ae 578
71ac8e5e
MW
579 mulcore [edi + 12], esi, xmm0
580 accum xmm7
444083ae
MW
581
582 // That's lots of pieces. Now we have to assemble the answer.
71ac8e5e 583 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
444083ae
MW
584
585 // Expand it.
586 pxor xmm2, xmm2
71ac8e5e 587 expand xmm2, xmm4, xmm1
444083ae
MW
588 movdqa [edx + 0], xmm4
589 movdqa [edx + 16], xmm1
590
591 // Initialize the carry from the value for W we calculated earlier.
592 movd xmm4, [edi + 0]
593 movd xmm5, [edi + 4]
594 movd xmm6, [edi + 8]
595 movd xmm7, [edi + 12]
596
597 // Finish the calculation by adding the Montgomery product.
71ac8e5e 598 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
599 propout [edi + 0], xmm4, xmm5
600
601 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
602 propout [edi + 4], xmm5, xmm6
603
604 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
605 propout [edi + 8], xmm6, xmm7
606
607 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
608 propout [edi + 12], xmm7, xmm4
609
610 // And add on the carry we calculated earlier.
611 paddq xmm4, [esp + 0]
612 paddq xmm5, [esp + 16]
613 paddq xmm6, [esp + 32]
614
615 // And, with that, we're done.
6ecc0b8f 616 stfree 48 + 12
444083ae
MW
617 ret
618
1a517bb3
MW
619ENDFUNC
620
621INTFUNC(mont4)
444083ae 622 // On entry, EDI points to the destination buffer holding a packed
8e5386aa 623 // value W; EBX points to a packed operand N; ESI points to an
444083ae
MW
624 // expanded operand M; and EDX points to a place to store an expanded
625 // result Y (32 bytes, at a 16-byte boundary).
626 //
627 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
628 // of the sum W + N Y to [EDI], leaving the remaining carry in
629 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
630 // XMM7 are clobbered; the general-purpose registers are preserved.
0923a413 631 endprologue
444083ae
MW
632
633 // Calculate Y = W M.
634 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
635
71ac8e5e
MW
636 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
637 accum xmm5, xmm6, xmm7
444083ae 638
71ac8e5e
MW
639 mulcore [edi + 8], esi, xmm0, xmm1
640 accum xmm6, xmm7
444083ae 641
71ac8e5e
MW
642 mulcore [edi + 12], esi, xmm0
643 accum xmm7
444083ae
MW
644
645 // That's lots of pieces. Now we have to assemble the answer.
71ac8e5e 646 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
444083ae
MW
647
648 // Expand it.
649 pxor xmm2, xmm2
71ac8e5e 650 expand xmm2, xmm4, xmm1
444083ae
MW
651 movdqa [edx + 0], xmm4
652 movdqa [edx + 16], xmm1
653
654 // Initialize the carry from W.
655 movd xmm4, [edi + 0]
656 movd xmm5, [edi + 4]
657 movd xmm6, [edi + 8]
658 movd xmm7, [edi + 12]
659
660 // Finish the calculation by adding the Montgomery product.
71ac8e5e 661 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
444083ae
MW
662 propout [edi + 0], xmm4, xmm5
663
664 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
665 propout [edi + 4], xmm5, xmm6
666
667 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
668 propout [edi + 8], xmm6, xmm7
669
670 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
671 propout [edi + 12], xmm7, xmm4
672
673 // And, with that, we're done.
674 ret
675
1a517bb3
MW
676ENDFUNC
677
444083ae
MW
678///--------------------------------------------------------------------------
679/// Bulk multipliers.
680
b9b279b4
MW
681FUNC(mpx_umul4_x86_avx)
682 .arch .avx
683 vzeroupper
684 endprologue
685 // and drop through...
686 .arch pentium4
687ENDFUNC
688
444083ae
MW
689FUNC(mpx_umul4_x86_sse2)
690 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
691 // const mpw *bv, const mpw *bvl);
// Multiply the vector [av, avl) by the vector [bv, bvl), 128 bits
// (16 bytes) at a time, writing the product starting at dv.
692
693 // Build a stack frame. Arguments will be relative to EBP, as
694 // follows.
695 //
696 // ebp + 20 dv
697 // ebp + 24 av
698 // ebp + 28 avl
699 // ebp + 32 bv
700 // ebp + 36 bvl
701 //
702 // Locals are relative to ESP, as follows.
703 //
704 // esp + 0 expanded Y (32 bytes)
705 // esp + 32 (top of locals)
//
// Register roles throughout (the primitive multipliers called below
// preserve the general-purpose registers):
//
// eax av limit
// ebx av cursor (inner loop)
// ecx dv cursor for the start of each pass (outer loop)
// edx -> expanded Y on the stack
// esi bv cursor (outer loop)
// edi dv cursor (inner loop)
0923a413
MW
706 pushreg ebp
707 pushreg ebx
708 pushreg esi
709 pushreg edi
710 setfp ebp
444083ae
MW
711 and esp, ~15
712 sub esp, 32
0923a413 713 endprologue
444083ae
MW
714
715 // Prepare for the first iteration.
716 mov esi, [ebp + 32] // -> bv[0]
717 pxor xmm7, xmm7
718 movdqu xmm0, [esi] // bv[0]
719 mov edi, [ebp + 20] // -> dv[0]
720 mov ecx, edi // outer loop dv cursor
71ac8e5e 721 expand xmm7, xmm0, xmm1
444083ae
MW
722 mov ebx, [ebp + 24] // -> av[0]
723 mov eax, [ebp + 28] // -> av[m] = av limit
724 mov edx, esp // -> expanded Y = bv[0]
725 movdqa [esp + 0], xmm0 // bv[0] expanded low
726 movdqa [esp + 16], xmm1 // bv[0] expanded high
727 call mul4zc
728 add ebx, 16
729 add edi, 16
730 add ecx, 16
731 add esi, 16
732 cmp ebx, eax // all done?
733 jae 8f
734
735 .p2align 4
736 // Continue with the first iteration.
7370: call mul4
738 add ebx, 16
739 add edi, 16
740 cmp ebx, eax // all done?
741 jb 0b
742
743 // Write out the leftover carry. There can be no tail here.
7448: call carryprop
745 cmp esi, [ebp + 36] // more passes to do?
746 jae 9f
747
748 .p2align 4
749 // Set up for the next pass.
7501: movdqu xmm0, [esi] // bv[i]
751 mov edi, ecx // -> dv[i]
752 pxor xmm7, xmm7
71ac8e5e 753 expand xmm7, xmm0, xmm1
444083ae
MW
754 mov ebx, [ebp + 24] // -> av[0]
755 movdqa [esp + 0], xmm0 // bv[i] expanded low
756 movdqa [esp + 16], xmm1 // bv[i] expanded high
757 call mla4zc
758 add edi, 16
759 add ebx, 16
760 add ecx, 16
761 add esi, 16
762 cmp ebx, eax // done yet?
763 jae 8f
764
765 .p2align 4
766 // Continue...
7670: call mla4
768 add ebx, 16
769 add edi, 16
770 cmp ebx, eax
771 jb 0b
772
773 // Finish off this pass. There was no tail on the previous pass, and
774 // there can be none on this pass.
7758: call carryprop
776 cmp esi, [ebp + 36]
777 jb 1b
778
779 // All over.
// Use `popreg' to mirror the `pushreg's in the prologue, as the
// epilogue of `mpxmont_mul4_x86_sse2' does, so that whatever
// bookkeeping `pushreg' records (presumably unwind information) is
// undone symmetrically.
0923a413 7809: dropfp
444083ae
MW
781 popreg edi
782 popreg esi
783 popreg ebx
784 popreg ebp
785 ret
786
787ENDFUNC
788
b9b279b4
MW
789FUNC(mpxmont_mul4_x86_avx)
790 .arch .avx
791 vzeroupper
792 endprologue
793 // and drop through...
794 .arch pentium4
795ENDFUNC
796
444083ae
MW
797FUNC(mpxmont_mul4_x86_sse2)
798 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
799 // const mpw *nv, size_t n, const mpw *mi);
800
801 // Build a stack frame. Arguments will be relative to EBP, as
802 // follows.
803 //
804 // ebp + 20 dv
805 // ebp + 24 av
806 // ebp + 28 bv
807 // ebp + 32 nv
808 // ebp + 36 n (nonzero multiple of 4)
809 // ebp + 40 mi
810 //
6ecc0b8f 811 // Locals are relative to ESP, which is 16-byte aligned, as follows.
444083ae 812 //
6ecc0b8f
MW
813 // esp + 0 expanded V (32 bytes)
814 // esp + 32 expanded M (32 bytes)
815 // esp + 64 expanded Y (32 bytes)
816 // esp + 96 outer loop dv
817 // esp + 100 outer loop bv
818 // esp + 104 av limit (mostly in ESI)
444083ae 819 // esp + 108 bv limit
6ecc0b8f 820 // esp + 112 (top of locals)
0923a413
MW
821 pushreg ebp
822 pushreg ebx
823 pushreg esi
824 pushreg edi
825 setfp ebp
444083ae 826 and esp, ~15
6ecc0b8f 827 sub esp, 112
0923a413 828 endprologue
444083ae
MW
829
830 // Establish the expanded operands.
831 pxor xmm7, xmm7
832 mov ecx, [ebp + 28] // -> bv
833 mov edx, [ebp + 40] // -> mi
834 movdqu xmm0, [ecx] // bv[0]
835 movdqu xmm2, [edx] // mi
71ac8e5e 836 expand xmm7, xmm0, xmm1, xmm2, xmm3
6ecc0b8f
MW
837 movdqa [esp + 0], xmm0 // bv[0] expanded low
838 movdqa [esp + 16], xmm1 // bv[0] expanded high
839 movdqa [esp + 32], xmm2 // mi expanded low
840 movdqa [esp + 48], xmm3 // mi expanded high
444083ae
MW
841
842 // Set up the outer loop state and prepare for the first iteration.
843 mov edx, [ebp + 36] // n
844 mov eax, [ebp + 24] // -> U = av[0]
845 mov ebx, [ebp + 32] // -> X = nv[0]
846 mov edi, [ebp + 20] // -> Z = dv[0]
6ecc0b8f 847 mov [esp + 100], ecx
444083ae
MW
848 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
849 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
6ecc0b8f
MW
850 mov [esp + 96], edi
851 mov [esp + 104], edx
444083ae 852 mov [esp + 108], ecx
6ecc0b8f
MW
853 lea ecx, [esp + 0] // -> expanded V = bv[0]
854 lea esi, [esp + 32] // -> expanded M = mi
855 lea edx, [esp + 64] // -> space for Y
444083ae 856 call mmul4
6ecc0b8f 857 mov esi, [esp + 104] // recover av limit
444083ae
MW
858 add edi, 16
859 add eax, 16
860 add ebx, 16
861 cmp eax, esi // done already?
862 jae 8f
6ecc0b8f 863 mov [esp + 96], edi
444083ae
MW
864
865 .p2align 4
866 // Complete the first inner loop.
8670: call dmul4
868 add edi, 16
869 add eax, 16
870 add ebx, 16
871 cmp eax, esi // done yet?
872 jb 0b
873
874 // Still have carries left to propagate.
875 call carryprop
876 movd [edi + 16], xmm4
877
878 .p2align 4
879 // Embark on the next iteration. (There must be one. If n = 1, then
880 // we would have bailed above, to label 8. Similarly, the subsequent
881 // iterations can fall into the inner loop immediately.)
6ecc0b8f
MW
8821: mov eax, [esp + 100] // -> bv[i - 1]
883 mov edi, [esp + 96] // -> Z = dv[i]
444083ae
MW
884 add eax, 16 // -> bv[i]
885 pxor xmm7, xmm7
6ecc0b8f 886 mov [esp + 100], eax
444083ae
MW
887 cmp eax, [esp + 108] // done yet?
888 jae 9f
6ecc0b8f 889 movdqu xmm0, [eax] // bv[i]
444083ae 890 mov ebx, [ebp + 32] // -> X = nv[0]
6ecc0b8f 891 lea esi, [esp + 32] // -> expanded M = mi
444083ae 892 mov eax, [ebp + 24] // -> U = av[0]
71ac8e5e 893 expand xmm7, xmm0, xmm1
6ecc0b8f
MW
894 movdqa [esp + 0], xmm0 // bv[i] expanded low
895 movdqa [esp + 16], xmm1 // bv[i] expanded high
444083ae 896 call mmla4
6ecc0b8f 897 mov esi, [esp + 104] // recover av limit
444083ae
MW
898 add edi, 16
899 add eax, 16
900 add ebx, 16
6ecc0b8f 901 mov [esp + 96], edi
444083ae
MW
902
903 .p2align 4
904 // Complete the next inner loop.
9050: call dmla4
906 add edi, 16
907 add eax, 16
908 add ebx, 16
909 cmp eax, esi
910 jb 0b
911
912 // Still have carries left to propagate, and they overlap the
913 // previous iteration's final tail, so read that in and add it.
914 movd xmm0, [edi]
915 paddq xmm4, xmm0
916 call carryprop
917 movd [edi + 16], xmm4
918
919 // Back again.
920 jmp 1b
921
922 // First iteration was short. Write out the carries and we're done.
923 // (This could be folded into the main loop structure, but that would
924 // penalize small numbers more.)
9258: call carryprop
926 movd [edi + 16], xmm4
927
928 // All done.
0923a413
MW
9299: dropfp
930 popreg edi
931 popreg esi
932 popreg ebx
933 popreg ebp
444083ae
MW
934 ret
935
936ENDFUNC
937
b9b279b4
MW
FUNC(mpxmont_redc4_x86_avx)
	// AVX-flavoured entry point.  All it does is issue `vzeroupper'
	// and then run straight on into the function which follows it in
	// the file: there is deliberately no `ret' here.  The assembler is
	// switched to AVX just long enough to accept the one instruction.
	.arch	.avx
	vzeroupper
	endprologue
	.arch	pentium4
	// ... and fall through into the next function.
ENDFUNC
945
444083ae
MW
FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			     size_t n, const mpw *mi);
	//
	// For each four-word block in the first n words of dv: call `mont4'
	// on that block against the first block of nv, call `mla4' for each
	// remaining block of nv, and then ripple the leftover carry up
	// through the rest of dv, as far as dvl.
	//
	// NOTE(review): the tail loop at label 7 always touches at least one
	// word, so this presumably relies on dvl - dv not being a whole
	// number of four-word blocks -- confirm against the callers.

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	dvl
	//	ebp + 28	nv
	//	ebp + 32	n (nonzero multiple of 4)
	//	ebp + 36	mi
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	outer loop dv
	//	esp +  4	outer dv limit
	//	esp +  8	blocks-of-4 dv limit
	//	esp + 12	expanded M (32 bytes)
	//	esp + 44	expanded Y (32 bytes)
	//	esp + 76	(top of locals)
	//
	// ESP is rounded down to a multiple of 16 and then lowered by 76, so
	// ESP + 12, ESP + 28 and ESP + 44 are all 16-byte aligned, as the
	// `movdqa' stores below need.
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 76
	endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	mov	[esp + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [ebp + 32]		// n
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[esp + 0], edi
	mov	[esp + 4], edx
	lea	esi, [esp + 12]		// -> expanded M = mi
	lea	edx, [esp + 44]		// -> space for Y
	call	mont4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [esp + 8]		// -> dv blocks limit
	mov	edx, [ebp + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.  EAX
	// holds the carry into the current block on entry to each pass, and
	// the carry out of it afterwards.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end, a word at a time.  The carry out of the
	// addition must be collected into EAX /before/ EDI is advanced:
	// `add edi, 4' overwrites CF, so doing it first would throw the
	// carry away and later tail words would never see it.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adc	eax, 0			// carry out of this word
	add	edi, 4
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [esp + 0]		// -> dv[i - 1]
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [esp + 44]		// -> space for Y
	lea	esi, [esp + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [esp + 4]		// all done yet?
	jae	9f
	mov	[esp + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret

ENDFUNC
1071
1072///--------------------------------------------------------------------------
1073/// Testing and performance measurement.
1074
1075#ifdef TEST_MUL4
1076
.macro	cysetup	c
	// Read the time-stamp counter and park it as a 64-bit value at
	// [\c]: low half at the lower address, high half four bytes above.
	// Clobbers EAX and EDX.
	rdtsc
	mov	[\c + 4], edx
	mov	[\c], eax
.endm
1082
.macro	cystore	c, v, n
	// Work out how many cycles have gone by since the time-stamp saved
	// at [\c], decrement the counter at [\n], and file the 64-bit
	// difference in the array whose address is at [\v], in the slot
	// indexed by the freshly decremented counter.
	//
	// Clobbers EAX, EBX, ECX and EDX.  The flags on exit are those left
	// by the `dec' (nothing after it touches them), so ZF is set
	// exactly when the counter has reached zero.
	rdtsc
	mov	ecx, [\n]		// remaining count
	mov	ebx, [\v]		// -> base of the results array
	sub	eax, [\c]		// elapsed cycles, low half
	sbb	edx, [\c + 4]		// ... and high half, with borrow
	dec	ecx			// last instruction to set flags
	mov	[\n], ecx
	mov	[ebx + ecx*8 + 4], edx
	mov	[ebx + ecx*8], eax
.endm
1094
.macro	testprologue n
	// Common entry sequence for the test functions.  Saves EBP, EBX,
	// ESI and EDI, sets EBP up as the frame pointer, aligns ESP to a
	// 16-byte boundary and reserves 112 bytes of locals; then copies
	// the operand \n into the `count' slot.  Clobbers EAX.
	//
	// Layout of the locals:
	//
	//	esp +   0 = v expanded
	//	esp +  32 = y expanded
	//	esp +  64 = ? expanded
	//	esp +  96 = cycles
	//	esp + 104 = count
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp	ebp
	and	esp, ~15
	sub	esp, 3*32 + 4*4
	endprologue
	mov	eax, \n
	mov	[esp + 104], eax	// count := \n
.endm
1113
.macro	testepilogue
	// Common exit sequence for the test functions: drop the frame and
	// restore the registers pushed by `testprologue', in the opposite
	// order, then return.
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
.endm
1122
.macro	testldcarry c
	// Fetch the pointer given by \c into ECX and load the 48 bytes it
	// addresses into the carry registers XMM4, XMM5 and XMM6.  The
	// loads are unaligned ones, so the buffer needs no particular
	// alignment.
	mov	ecx, \c			// -> c
	movdqu	xmm6, [ecx + 32]	// (c'_2; c''_2)
	movdqu	xmm5, [ecx + 16]	// (c'_1; c''_1)
	movdqu	xmm4, [ecx +  0]	// (c'_0; c''_0)
.endm
1129
.macro	testexpand v=nil, y=nil
	// Expand up to two packed 128-bit operands into the frame set up
	// by `testprologue'.  Either operand may be left as `nil', in which
	// case its section is not assembled at all.
	//
	//	\v	pointer operand; expansion stored at esp + 0 .. 31
	//	\y	pointer operand; expansion stored at esp + 32 .. 63
	//
	// XMM7 is zeroed first and handed to `expand' each time.  Uses ECX,
	// XMM0 and XMM1 for \v, and EDX, XMM2 and XMM3 for \y.
	pxor	xmm7, xmm7

	.ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 16], xmm1
	movdqa	[esp +  0], xmm0
	.endif

	.ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[esp + 48], xmm3
	movdqa	[esp + 32], xmm2
	.endif
.endm
1147
.macro	testtop	u=nil, x=nil, mode=nil
	// Head of the timing loop: plants the label `0' which `testtail'
	// branches back to, points the registers at the operands, and
	// starts the cycle counter.
	//
	//	\u	if not `nil': loaded into EAX, and ECX is pointed at
	//		the expanded operand at esp + 0
	//	\x	loaded into EBX
	//	\mode	if `mont': ESI -> esp + 32 and EDX -> esp + 64;
	//		otherwise EDX -> esp + 32 and ESI is left alone
	//
	// EAX and EDX are only set up after `cysetup', because the `rdtsc'
	// in there overwrites both of them.
	.p2align 4
0:
	.ifeqs "\mode", "mont"
	lea	esi, [esp + 32]
	.endif
	mov	ebx, \x
	.ifnes "\u", "nil"
	lea	ecx, [esp + 0]
	.endif

	cysetup	esp + 96

	.ifnes "\mode", "mont"
	lea	edx, [esp + 32]
	.else
	lea	edx, [esp + 64]
	.endif
	.ifnes "\u", "nil"
	mov	eax, \u
	.endif
.endm
1168
6d19758a
MW
.macro	testtail cyv
	// Foot of the timing loop.  Logs the cycles elapsed since the
	// matching `testtop' into the array named by \cyv, counting the
	// `count' local at esp + 104 down by one, and goes round again
	// from label `0' until that count hits zero.  The branch depends on
	// the flags which `cystore' leaves behind.
	cystore	esp + 96, \cyv, esp + 104
	jnz	0b
.endm
1173
.macro	testcarryout c
	// Fetch the pointer given by \c into ECX and write the carry
	// registers XMM4, XMM5 and XMM6 out to the 48 bytes it addresses,
	// using unaligned stores.
	mov	ecx, \c
	movdqu	[ecx + 32], xmm6
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx +  0], xmm4
.endm
1180
FUNC(test_dmul4)
	// Timing harness for `dmul4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte carry block, read on entry and
	//			written back on exit
	//	ebp + 28	loaded into EAX on each pass
	//	ebp + 32	loaded into EBX on each pass
	//	ebp + 36	-> packed operand, expanded to esp + 0
	//	ebp + 40	-> packed operand, expanded to esp + 32
	//	ebp + 44	number of passes
	//	ebp + 48	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 44]
	mov	edi, [ebp + 20]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	testtop	[ebp + 28], [ebp + 32]
	call	dmul4
	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae 1192
FUNC(test_dmla4)
	// Timing harness for `dmla4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte carry block, read on entry and
	//			written back on exit
	//	ebp + 28	loaded into EAX on each pass
	//	ebp + 32	loaded into EBX on each pass
	//	ebp + 36	-> packed operand, expanded to esp + 0
	//	ebp + 40	-> packed operand, expanded to esp + 32
	//	ebp + 44	number of passes
	//	ebp + 48	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 44]
	mov	edi, [ebp + 20]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	testtop	[ebp + 28], [ebp + 32]
	call	dmla4
	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae 1204
FUNC(test_mul4)
	// Timing harness for `mul4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte carry block, read on entry and
	//			written back on exit
	//	ebp + 28	loaded into EBX on each pass
	//	ebp + 32	-> packed operand, expanded to esp + 32
	//	ebp + 36	number of passes
	//	ebp + 40	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 36]
	mov	edi, [ebp + 20]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	testtop	nil, [ebp + 28]
	call	mul4
	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae 1216
FUNC(test_mla4)
	// Timing harness for `mla4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte carry block, read on entry and
	//			written back on exit
	//	ebp + 28	loaded into EBX on each pass
	//	ebp + 32	-> packed operand, expanded to esp + 32
	//	ebp + 36	number of passes
	//	ebp + 40	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 36]
	mov	edi, [ebp + 20]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	testtop	nil, [ebp + 28]
	call	mla4
	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae 1228
FUNC(test_mmul4)
	// Timing harness for `mmul4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte block receiving the carry
	//			registers on exit
	//	ebp + 28	-> 32-byte buffer receiving a copy of the
	//			locals at esp + 64 .. 95
	//	ebp + 32	loaded into EAX on each pass
	//	ebp + 36	loaded into EBX on each pass
	//	ebp + 40	-> packed operand, expanded to esp + 0
	//	ebp + 44	-> packed operand, expanded to esp + 32
	//	ebp + 48	number of passes
	//	ebp + 52	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 48]
	mov	edi, [ebp + 20]
	testexpand [ebp + 40], [ebp + 44]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmul4
	testtail [ebp + 52]

	// Copy the 32 bytes at esp + 64 out to the caller's buffer.
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	mov	edi, [ebp + 28]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1

	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae 1244
FUNC(test_mmla4)
	// Timing harness for `mmla4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte block receiving the carry
	//			registers on exit
	//	ebp + 28	-> 32-byte buffer receiving a copy of the
	//			locals at esp + 64 .. 95
	//	ebp + 32	loaded into EAX on each pass
	//	ebp + 36	loaded into EBX on each pass
	//	ebp + 40	-> packed operand, expanded to esp + 0
	//	ebp + 44	-> packed operand, expanded to esp + 32
	//	ebp + 48	number of passes
	//	ebp + 52	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 48]
	mov	edi, [ebp + 20]
	testexpand [ebp + 40], [ebp + 44]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmla4
	testtail [ebp + 52]

	// Copy the 32 bytes at esp + 64 out to the caller's buffer.
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	mov	edi, [ebp + 28]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1

	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae 1260
FUNC(test_mont4)
	// Timing harness for `mont4'.  Stack arguments, relative to EBP
	// once the prologue has run:
	//
	//	ebp + 20	loaded into EDI
	//	ebp + 24	-> 48-byte block receiving the carry
	//			registers on exit
	//	ebp + 28	-> 32-byte buffer receiving a copy of the
	//			locals at esp + 64 .. 95
	//	ebp + 32	loaded into EBX on each pass
	//	ebp + 36	-> packed operand, expanded to esp + 32
	//	ebp + 40	number of passes
	//	ebp + 44	-> array receiving one 64-bit cycle count
	//			per pass
	testprologue [ebp + 40]
	mov	edi, [ebp + 20]
	testexpand nil, [ebp + 36]
	testtop	nil, [ebp + 32], mont
	call	mont4
	testtail [ebp + 44]

	// Copy the 32 bytes at esp + 64 out to the caller's buffer.
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	mov	edi, [ebp + 28]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1

	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
444083ae
MW
1276
1277#endif
1278
1279///----- That's all, folks --------------------------------------------------