/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Prologue.

	.arch	pentium4
	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the accumulated result.  A `finely
/// integrated operand scanning' implementation of Montgomery multiplication
/// also adds the product of a single-precision `Montgomery factor' and the
/// modulus, calculated in the same pass.  The more common `coarsely
/// integrated operand scanning' alternates main multiplication and
/// Montgomery passes, which requires additional carry propagation.
///
/// In both the plain-multiplication and Montgomery stages, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	     0	      4	      8	     12
///	     0	  v'_0	   v'_1	  v''_0	  v''_1
///	    16	  v'_2	   v'_3	  v''_2	  v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; the headroom
/// also allows some calculations to be performed directly on the expanded
/// form.
///
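/// As a concrete illustration, here is a hedged C sketch (not part of the
/// build; the name `expand_ref' is ours) of the expansion of a packed
/// 128-bit factor into the two vectors shown in the table above:
///
///	#include <stdint.h>
///
///	static void expand_ref(const uint32_t v[4],
///			       uint32_t lo[4], uint32_t hi[4])
///	{
///		lo[0] = v[0] & 0xffff; lo[1] = v[1] & 0xffff; /* v'_0, v'_1 */
///		lo[2] = v[0] >> 16;    lo[3] = v[1] >> 16;    /* v''_0, v''_1 */
///		hi[0] = v[2] & 0xffff; hi[1] = v[3] & 0xffff; /* v'_2, v'_3 */
///		hi[2] = v[2] >> 16;    hi[3] = v[3] >> 16;    /* v''_2, v''_3 */
///	}
///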
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.

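/// This carry-propagation step can be modelled in C as follows (a hedged
/// sketch; the names are ours, and each carry register is represented by
/// its two 64-bit halves):
///
///	#include <stdint.h>
///
///	static uint32_t propout_ref(uint64_t *c_lo, uint64_t *c_hi,
///				    uint64_t *d_lo, uint64_t *d_hi)
///	{
///		uint64_t t = *c_hi & 0xffffffff; /* t = c''_0 mod B */
///		uint64_t d = *c_lo + (t << 16);	 /* d = c'_0 + t b */
///		*d_lo += d >> 32;		 /* floor(d/B) into c'_1 */
///		*d_hi += *c_hi >> 32;		 /* floor(c''_0/B) into c''_1 */
///		return (uint32_t)d;		 /* output word z = d mod B */
///	}
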
///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1, d2, d3
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0, 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1, s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3, s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0, s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3, ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?, r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2, s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1, r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3, r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2, r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
.endm
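
/// In scalar terms, each output vector of `mulcore' holds one pair of
/// products r_i s'_j and r_i s''_j in its two 64-bit lanes.  A hedged C
/// model of a single such pair (the name is ours):
///
///	#include <stdint.h>
///
///	static void mulcore_pair(uint32_t r, uint32_t s_lo, uint32_t s_hi,
///				 uint64_t d[2])
///	{
///		d[0] = (uint64_t)r*s_lo; /* r_i s'_j: at most 48 bits */
///		d[1] = (uint64_t)r*s_hi; /* r_i s''_j: likewise */
///	}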

.macro	accum	c0, c1, c2, c3
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2, nil
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0, 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b, 0)
	paddq	\c, xmm3		// (c' + t b, c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm
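
/// A typical `propout' sequence emits one output word per call, rotating
/// the carry registers as it goes; compare `carryprop' below:
///
///	propout	[edi +  0], xmm4, xmm5	// emit word 0; carry c0 into c1
///	propout	[edi +  4], xmm5, xmm6	// emit word 1; carry c1 into c2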

.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?, c'' b)
	pslldq	\c, 8			// (0, c')
	paddq	\t, \c			// (?, c' + c'' b)
	psrldq	\t, 8			// c' + c'' b
	movd	\d, \t
	psrldq	\t, 4			// floor((c' + c'' b)/B)
.endm

.macro	expand	a, b, c, d, z
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1, a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1, c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0, a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2, a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0, c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2, c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
  .endif
.endm

.macro	squash	c0, c1, c2, c3, h, t, u
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// H, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0, y'_2)
	punpckhqdq \c0, \c2		// (y''_0, y''_2)
	punpcklqdq \u, \c3		// (y'_1, y'_3)
	punpckhqdq \c1, \c3		// (y''_1, y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0, y''_2)
	psrlq	\c1, 16			// high parts of (y''_1, y''_3)
	psrlq	\c2, 32			// low parts of (y''_0, y''_2)
	psrlq	\c3, 32			// low parts of (y''_1, y''_3)
  .ifnes "\h", "nil"
	movdqa	\h, \c1
  .endif
	pslldq	\c1, 8			// high part of (0, y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0, y_2)
	paddq	\u, \c0			// (y_1, y_3)
  .ifnes "\h", "nil"
	psrldq	\h, 8			// high part of (y''_3, 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0, y_1)
	movdqa	\c0, \t			// (y^*_0, ?, ?, ?)
	psrldq	\t, 8			// (y_2, 0)
	psrlq	\c3, 32			// (floor(y_0/B), ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B), ?)
	pslldq	\c0, 12			// (0, 0, 0, y^*_0)
	movdqa	\c1, \c3		// (y^*_1, ?, ?, ?)
	psrldq	\u, 8			// (y_3, 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2), ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2), ?)
	pslldq	\c1, 12			// (0, 0, 0, y^*_1)
	psrldq	\c0, 12			// (y^*_0, 0, 0, 0)
	movdqa	\c2, \c3		// (y^*_2, ?, ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	pslldq	\c2, 12			// (0, 0, 0, y^*_2)
	psrldq	\c1, 8			// (0, y^*_1, 0, 0)
	psrldq	\c2, 4			// (0, 0, y^*_2, 0)
  .ifnes "\h", "nil"
	movdqu	\t, \c3
	pxor	\u, \u
  .endif
	pslldq	\c3, 12			// (0, 0, 0, y^*_3)
	por	\c0, \c1		// (y^*_0, y^*_1, 0, 0)
	por	\c2, \c3		// (0, 0, y^*_2, y^*_3)
	por	\c0, \c2		// y mod B^4
  .ifnes "\h", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\h, \t
	punpcklqdq \h, \u		// carry up
  .endif
.endm
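
/// The overall effect of `squash', as a hedged C model (the names are ours;
/// `unsigned __int128' is a GCC/Clang extension): combine the four carries
/// y_i = y'_i + y''_i b into a 128-bit value plus a small carry-out.
///
///	#include <stdint.h>
///
///	static void squash_ref(const uint64_t ylo[4], const uint64_t yhi[4],
///			       uint32_t z[4], uint64_t *carry)
///	{
///		unsigned __int128 c = 0;
///		for (int i = 0; i < 4; i++) {
///			/* y_i = y'_i + y''_i b, plus the carry so far */
///			unsigned __int128 y = ylo[i] + c +
///				((unsigned __int128)yhi[i] << 16);
///			z[i] = (uint32_t)y;	/* y^*_i = y_i mod B */
///			c = y >> 32;		/* carry floor(y_i/B) up */
///		}
///		*carry = (uint64_t)c;		/* high bits of Y */
///	}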

.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, and XMM2 are clobbered.  The other registers
	// are preserved.
	movd	xmm0, [edi +  0]	// (a_0, 0)
	movd	xmm1, [edi +  4]	// (a_1, 0)
	movd	xmm2, [edi +  8]	// (a_2, 0)
	movd	xmm7, [edi + 12]	// (a_3, 0)
	paddq	xmm4, xmm0		// (c'_0 + a_0, c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1, c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2, c''_2)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
	propout	[edi +  0], xmm4, xmm5
	propout	[edi +  4], xmm5, xmm6
	propout	[edi +  8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4
	ret

ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	carryadd

	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi +  0], xmm4, xmm5

	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	mulcore	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC
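
/// As a hedged C reference for what `mul4' computes -- the low 128 bits of
/// C + X Y written out, the rest returned as carry (the names are ours;
/// `unsigned __int128' is a GCC/Clang extension):
///
///	#include <stdint.h>
///
///	static unsigned __int128 mul4_ref(uint32_t z[4], unsigned __int128 c,
///					  const uint32_t x[4],
///					  const uint32_t y[4])
///	{
///		uint64_t col[8] = { 0 };
///		for (int i = 0; i < 4; i++)
///			for (int j = 0; j < 4; j++) {
///				uint64_t p = (uint64_t)x[i]*y[j];
///				col[i + j] += (uint32_t)p;
///				col[i + j + 1] += p >> 32;
///			}
///		for (int k = 0; k < 4; k++) {
///			c += col[k];
///			z[k] = (uint32_t)c;	/* low 128 bits out */
///			c >>= 32;
///		}
///		for (int k = 4; k < 8; k++)	/* high columns: carry out */
///			c += (unsigned __int128)col[k] << (32*(k - 4));
///		return c;
///	}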

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	carryadd

	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret

ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 16-byte aligned.  (This is not the usual convention, which
	// requires alignment before the call.)
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	sub	esp, 64			// space for the carries

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	jmp	5f

ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 16-byte aligned.  (This is not the usual convention, which
	// requires alignment before the call.)
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	sub	esp, 64			// space for the carries
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi + 0], xmm4, xmm5

5:	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[esp +  0], xmm4
	movdqa	[esp + 16], xmm5
	movdqa	[esp + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2, nil
	accum	xmm5, xmm6, xmm7, nil

	mulcore	[edi +  8], esi, xmm0, xmm1, nil, nil
	accum	xmm6, xmm7, nil, nil

	mulcore	[edi + 12], esi, xmm0, nil, nil, nil
	accum	xmm7, nil, nil, nil

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm4, xmm1, nil, nil, xmm2
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value of W we calculated earlier.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [esp +  0]
	paddq	xmm5, [esp + 16]
	paddq	xmm6, [esp + 32]

	// And, with that, we're done.
	add	esp, 64
	ret

ENDFUNC

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.

	// Calculate Y = W M.
	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2, nil
	accum	xmm5, xmm6, xmm7, nil

	mulcore	[edi +  8], esi, xmm0, xmm1, nil, nil
	accum	xmm6, xmm7, nil, nil

	mulcore	[edi + 12], esi, xmm0, nil, nil, nil
	accum	xmm7, nil, nil, nil

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm4, xmm1, nil, nil, xmm2
	movdqa	[edx +  0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi +  0]
	movd	xmm5, [edi +  4]
	movd	xmm6, [edi +  8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
	propout	[edi +  0], xmm4, xmm5

	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi +  4], xmm5, xmm6

	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi +  8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret

ENDFUNC
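
/// The Montgomery step relies on M = -N^{-1} (mod B^4): for any W, setting
/// Y = W M mod B^4 makes W + N Y vanish mod B^4.  A hedged C check of this
/// identity (the name is ours; `unsigned __int128' wraps mod 2^128 = B^4 of
/// its own accord):
///
///	#include <assert.h>
///
///	static unsigned __int128 mont4_ref(unsigned __int128 w,
///					   unsigned __int128 n,
///					   unsigned __int128 m)
///	{
///		unsigned __int128 y = w*m;	/* Y = W M mod B^4 */
///		assert(w + n*y == 0);		/* low 128 bits cancel */
///		return y;
///	}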

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	avl
	//	ebp + 32	bv
	//	ebp + 36	bvl
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	expanded Y (32 bytes)
	//	esp + 32	(top of locals)
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp, esp
	and	esp, ~15
	sub	esp, 32

	// Prepare for the first iteration.
	mov	esi, [ebp + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [ebp + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm0, xmm1, nil, nil, xmm7
	mov	ebx, [ebp + 24]		// -> av[0]
	mov	eax, [ebp + 28]		// -> av[m] = av limit
	mov	edx, esp		// -> expanded Y = bv[0]
	movdqa	[esp + 0], xmm0		// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [ebp + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm0, xmm1, nil, nil, xmm7
	mov	ebx, [ebp + 24]		// -> av[0]
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [ebp + 36]
	jb	1b

	// All over.
9:	mov	esp, ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

ENDFUNC
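
/// For reference, `mpx_umul4_x86_sse2' computes an ordinary schoolbook
/// product, a pass of four words of bv at a time; a hedged scalar C
/// equivalent, one word at a time (the name is ours; `mpw' assumed to be
/// 32 bits wide):
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	static void umul_ref(uint32_t *dv,
///			     const uint32_t *av, const uint32_t *avl,
///			     const uint32_t *bv, const uint32_t *bvl)
///	{
///		size_t m = avl - av, n = bvl - bv;
///		for (size_t j = 0; j < n; j++) {
///			uint64_t c = 0;
///			for (size_t i = 0; i < m; i++) {
///				uint64_t t = (uint64_t)av[i]*bv[j] + c;
///				if (j) t += dv[i + j]; /* later passes accumulate */
///				dv[i + j] = (uint32_t)t;
///				c = t >> 32;
///			}
///			dv[m + j] = (uint32_t)c; /* leftover carry; no tail */
///		}
///	}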

FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	bv
	//	ebp + 32	nv
	//	ebp + 36	n (nonzero multiple of 4)
	//	ebp + 40	mi
	//
	// Locals are relative to ESP, which is 4 mod 16, as follows.
	//
	//	esp +   0	outer loop dv
	//	esp +   4	outer loop bv
	//	esp +   8	av limit (mostly in ESI)
	//	esp +  12	expanded V (32 bytes)
	//	esp +  44	expanded M (32 bytes)
	//	esp +  76	expanded Y (32 bytes)
	//	esp + 108	bv limit
	//	esp + 112	(gap)
	//	esp + 124	(top of locals)
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp, esp
	and	esp, ~15
	sub	esp, 124

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [ebp + 28]		// -> bv
	mov	edx, [ebp + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm0, xmm1, xmm2, xmm3, xmm7
	movdqa	[esp + 12], xmm0	// bv[0] expanded low
	movdqa	[esp + 28], xmm1	// bv[0] expanded high
	movdqa	[esp + 44], xmm2	// mi expanded low
	movdqa	[esp + 60], xmm3	// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [ebp + 36]		// n
	mov	eax, [ebp + 24]		// -> U = av[0]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	mov	[esp + 4], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[esp + 0], edi
	mov	[esp + 108], ecx
	mov	[esp + 8], edx
	lea	ecx, [esp + 12]		// -> expanded V = bv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	lea	edx, [esp + 76]		// -> space for Y
	call	mmul4
	mov	esi, [esp + 8]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[esp + 0], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [esp + 4]		// -> bv[i - 1]
	mov	edi, [esp + 0]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	movdqu	xmm0, [eax]		// bv[i]
	mov	[esp + 4], eax
	cmp	eax, [esp + 108]	// done yet?
	jae	9f
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	mov	eax, [ebp + 24]		// -> U = av[0]
	expand	xmm0, xmm1, nil, nil, xmm7
	movdqa	[esp + 12], xmm0	// bv[i] expanded low
	movdqa	[esp + 28], xmm1	// bv[i] expanded high
	call	mmla4
	mov	esi, [esp + 8]		// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[esp + 0], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	mov	esp, ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

ENDFUNC
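
/// A hedged scalar analogue of the Montgomery multiplication above (the
/// name is ours): a word-by-word CIOS loop with a one-word Montgomery
/// factor mi0 = -nv[0]^{-1} mod B, rather than the four-word factor the
/// SIMD code uses.  The buffer d must hold n + 2 words, initially zero;
/// the final conditional subtraction of N is left to the caller.
///
///	#include <stddef.h>
///	#include <stdint.h>
///	#include <string.h>
///
///	static void monty_ref(uint32_t *d, const uint32_t *a,
///			      const uint32_t *b, const uint32_t *nv,
///			      size_t n, uint32_t mi0)
///	{
///		for (size_t i = 0; i < n; i++) {
///			uint64_t c = 0, t;
///			for (size_t j = 0; j < n; j++) {     /* d += a_i b */
///				t = d[j] + (uint64_t)a[i]*b[j] + c;
///				d[j] = (uint32_t)t;  c = t >> 32;
///			}
///			t = d[n] + c;  d[n] = (uint32_t)t;  d[n + 1] += t >> 32;
///
///			uint32_t y = d[0]*mi0;	/* kill the low word... */
///			c = 0;
///			for (size_t j = 0; j < n; j++) {     /* ...d += y N */
///				t = d[j] + (uint64_t)y*nv[j] + c;
///				d[j] = (uint32_t)t;  c = t >> 32;
///			}
///			t = d[n] + c;  d[n] = (uint32_t)t;  d[n + 1] += t >> 32;
///			memmove(d, d + 1, (n + 1)*sizeof(*d)); /* divide by B */
///			d[n + 1] = 0;
///		}
///	}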

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	dvl
	//	ebp + 28	nv
	//	ebp + 32	n (nonzero multiple of 4)
	//	ebp + 36	mi
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	outer loop dv
	//	esp +  4	outer dv limit
	//	esp +  8	blocks-of-4 dv limit
	//	esp + 12	expanded M (32 bytes)
	//	esp + 44	expanded Y (32 bytes)
	//	esp + 76	(top of locals)
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp, esp
	and	esp, ~15
	sub	esp, 76

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm0, xmm1, nil, nil, xmm7
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	mov	[esp + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [ebp + 32]		// n
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[esp + 0], edi
	mov	[esp + 4], edx
	lea	esi, [esp + 12]		// -> expanded M = mi
	lea	edx, [esp + 44]		// -> space for Y
	call	mont4
	add	edi, 16
	add	ebx, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [esp + 8]		// -> dv blocks limit
	mov	edx, [ebp + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [esp + 0]		// -> dv[i - 1]
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [esp + 44]		// -> space for Y
	lea	esi, [esp + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [esp + 4]		// all done yet?
	jae	9f
	mov	[esp + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	mov	esp, ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

ENDFUNC
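
/// And a matching hedged scalar reference for the reduction (the name is
/// ours; again mi0 = -nv[0]^{-1} mod B, and dv must be long enough to
/// absorb the carries):
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	static void redc_ref(uint32_t *dv, uint32_t *dvl,
///			     const uint32_t *nv, size_t n, uint32_t mi0)
///	{
///		for (size_t i = 0; i < n; i++) {
///			uint32_t y = dv[i]*mi0;	/* kill word i */
///			uint64_t c = 0, t;
///			for (size_t j = 0; j < n; j++) {
///				t = dv[i + j] + (uint64_t)y*nv[j] + c;
///				dv[i + j] = (uint32_t)t;  c = t >> 32;
///			}
///			/* ripple the carry out to the end of the buffer */
///			for (uint32_t *p = dv + i + n; c && p < dvl; p++) {
///				t = *p + c;  *p = (uint32_t)t;  c = t >> 32;
///			}
///		}
///		/* the reduced result is dv[n], ..., dv[2n - 1] */
///	}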

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm

.macro	testprologue
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp, esp
	and	esp, ~15
	sub	esp, 3*32 + 12
	// vars:
	//	esp +  0 = cycles
	//	esp + 12 = v expanded
	//	esp + 44 = y expanded
	//	esp + 76 = ? expanded
.endm

.macro	testepilogue
	mov	esp, ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
.endm

.macro	testldcarry	c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx +  0]	// (c'_0, c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1, c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2, c''_2)
.endm

.macro	testexpand	v, y
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm0, xmm1, nil, nil, xmm7
	movdqa	[esp + 12], xmm0
	movdqa	[esp + 28], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm2, xmm3, nil, nil, xmm7
	movdqa	[esp + 44], xmm2
	movdqa	[esp + 60], xmm3
  .endif
.endm

.macro	testtop	u, x, mode
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [esp + 12]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [esp + 44]
  .endif
	cysetup	esp + 0
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [esp + 76]
  .else
	lea	edx, [esp + 44]
  .endif
.endm

.macro	testtail	cyv, n
	cystore	esp + 0, \cyv, \n
	jnz	0b
.endm

.macro	testcarryout	c
	mov	ecx, \c
	movdqu	[ecx +  0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

	.globl	test_dmul4
test_dmul4:
	testprologue
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmul4
	testtail [ebp + 48], [ebp + 44]
	testcarryout [ebp + 24]
	testepilogue

	.globl	test_dmla4
test_dmla4:
	testprologue
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmla4
	testtail [ebp + 48], [ebp + 44]
	testcarryout [ebp + 24]
	testepilogue

	.globl	test_mul4
test_mul4:
	testprologue
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mul4
	testtail [ebp + 40], [ebp + 36]
	testcarryout [ebp + 24]
	testepilogue

	.globl	test_mla4
test_mla4:
	testprologue
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mla4
	testtail [ebp + 40], [ebp + 36]
	testcarryout [ebp + 24]
	testepilogue

	.globl	test_mmul4
test_mmul4:
	testprologue
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmul4
	testtail [ebp + 52], [ebp + 48]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue

	.globl	test_mmla4
test_mmla4:
	testprologue
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmla4
	testtail [ebp + 52], [ebp + 48]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue

	.globl	test_mont4
test_mont4:
	testprologue
	testexpand nil, [ebp + 36]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 32], mont
	call	mont4
	testtail [ebp + 44], [ebp + 40]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 76]
	movdqa	xmm1, [esp + 92]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue

#endif

///----- That's all, folks --------------------------------------------------