1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .arch pentium4
34
35 .text
36
37 ///--------------------------------------------------------------------------
38 /// Theory.
39 ///
40 /// We define a number of primitive fixed-size multipliers from which we can
41 /// construct more general variable-length multipliers.
42 ///
43 /// The basic trick is the same throughout. In an operand-scanning
44 /// multiplication, the inner multiplication loop multiplies a
45 /// multiple-precision operand by a single-precision factor, and adds the
46 /// product, appropriately shifted, to the result. A `finely integrated
47 /// operand scanning' implementation of Montgomery multiplication also adds
48 /// the product of a single-precision `Montgomery factor' and the modulus,
49 /// calculated in the same pass. The more common `coarsely integrated
50 /// operand scanning' alternates main multiplication and Montgomery passes,
51 /// which requires additional carry propagation.
52 ///
53 /// In both the plain-multiplication and Montgomery stages, then, one of the
54 /// factors remains constant throughout the operation, so we can afford to
55 /// take a little time to preprocess it. The transformation we perform is
56 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
57 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
58 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
59 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
60 /// operands, as follows.
61 ///
62 /// Offset 0 4 8 12
63 /// 0 v'_0 v'_1 v''_0 v''_1
64 /// 16 v'_2 v'_3 v''_2 v''_3
65 ///
66 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
67 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
68 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
69 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
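///
/// As a concrete illustration (the value is made up, not taken from the
/// code): if v_0 = 0x89abcdef, then v'_0 = 0xcdef and v''_0 = 0x89ab,
/// since v_0 = v'_0 + v''_0 b with b = 2^16. After expansion, v'_0 sits
/// in lane 0 and v''_0 in lane 2 of the first operand, so a single
/// `pmuludq' by a scalar broadcast into lanes 0 and 2 yields the two
/// 48-bit products in 64-bit fields, as described above.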
73 ///
74 /// ...
75 ///
76 /// We maintain four `carry' registers accumulating intermediate results.
77 /// The registers' precise roles rotate during the computation; we name them
78 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
83 /// lanes of its vector) and an operand in the expanded form above produces a
84 /// result which can be added directly to the appropriate carry register.
85 /// Following a pass of four multiplications, we perform some limited carry
86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
89 /// zeroed and becomes c3.
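///
/// A small worked instance of that carry step (illustrative numbers, not
/// taken from the code): suppose c'_0 = 0x1_2345_6789 and
/// c''_0 = 0x1_9abc_def0. Then t = c''_0 mod B = 0x9abc_def0, and
/// d = c'_0 + t b = 0x1_2345_6789 + 0x9abc_def0_0000 = 0x9abe_0235_6789;
/// we output z = d mod B = 0x0235_6789, add floor(d/B) = 0x9abe to c'_1
/// and floor(c''_0/B) = 1 to c''_1, and then rotate the carry registers.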
90
91 ///--------------------------------------------------------------------------
92 /// Macro definitions.
93
94 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
95 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
96 // of the product in registers D0, D1, D2, D3.
97 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
98 .ifnes "\d1", "nil"
99 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
100 .endif
101 .ifnes "\d3", "nil"
102 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
103 .endif
104 .ifnes "\d1", "nil"
105 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
106 .endif
107 .ifnes "\d2", "nil"
108 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
109 .endif
110 .ifnes "\d3", "nil"
111 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
112 .endif
113 .ifnes "\d1", "nil"
114 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
115 .endif
116 .ifnes "\d3", "nil"
117 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
118 .endif
119 .ifnes "\d2", "nil"
120 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
121 .endif
122 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
123 .endm
124
125 .macro accum c0, c1=nil, c2=nil, c3=nil
126 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
127 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
128 // updating that register.
129 paddq \c0, xmm0
130 .ifnes "\c1", "nil"
131 paddq \c1, xmm1
132 .endif
133 .ifnes "\c2", "nil"
134 paddq \c2, xmm2
135 .endif
136 .ifnes "\c3", "nil"
137 paddq \c3, xmm3
138 .endif
139 .endm
140
141 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
142 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
143 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
144 // contains zero, but needs clearing; in practice, we store the
145 // product directly rather than attempting to add. On completion,
146 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
147 .ifeqs "\z3p", "t"
148 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
149 accum \c0, \c1, \c2
150 .else
151 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
152 accum \c0, \c1, \c2, \c3
153 .endif
154 .endm
155
156 .macro propout d, pos, c, cc=nil
157 // Calculate an output word from C, and store it at POS in D;
158 // propagate carries out from C to CC in preparation for a rotation
159 // of the carry registers. D is an XMM register; the POS is either
160 // `lo' or `hi' according to whether the output word should be in
161 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
162 // completion, XMM3 is clobbered. If CC is `nil', then the
163 // contribution which would have been added to it is left in C.
164 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
165 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
166 pslldq xmm3, 2 // (t b; 0)
167 paddq \c, xmm3 // (c' + t b; c'')
168 .ifeqs "\pos", "lo"
169 movdqa \d, \c
170 .else
171 punpckldq \d, \c
172 .endif
173 psrlq \c, 32 // floor(c/B)
174 .ifnes "\cc", "nil"
175 paddq \cc, \c // propagate up
176 .endif
177 .endm
178
179 .macro endprop d, pos, c, t
180 // On entry, C contains a carry register. On exit, the low 32 bits
181 // of the value represented in C are written at POS in D, and the
182 // remaining bits are left at the bottom of T.
183 movdqa \t, \c
184 psllq \t, 16 // (?; c'' b)
185 pslldq \c, 8 // (0; c')
186 paddq \t, \c // (?; c' + c'' b)
187 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
188 .ifeqs "\pos", "lo"
189 movdqa \d, \t
190 .else
191 punpckldq \d, \t
192 .endif
193 psrldq \t, 4 // (floor(c/B); 0)
194 .endm
195
196 .macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
201 .ifnes "\c", "nil"
202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
203 .endif
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
206 .ifnes "\c", "nil"
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
209 .endif
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
212 .ifnes "\c", "nil"
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
215 .endif
216 .endm
217
218 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
223
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
225 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
228 movdqa \t, \c0
229 movdqa \u, \c1
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
234
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
237 movdqa \c2, \c0
238 movdqa \c3, \c1
239 psllq \c2, 48
240 psllq \c3, 48
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
245 .ifnes "\hi", "nil"
246 movdqa \hi, \c1
247 .endif
248 pslldq \c1, 8 // high part of (0; y''_1)
249
250 paddq \t, \c2 // propagate down
251 paddq \u, \c3
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
254 .ifnes "\hi", "nil"
255 psrldq \hi, 8 // high part of (y''_3; 0)
256 .endif
257
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
271 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
272 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
273 .ifnes "\hi", "nil"
274 movdqa \t, \c3
275 pxor \u, \u
276 .endif
277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
278 .ifnes "\hi", "nil"
279 psrlq \t, 32 // very high bits of y
280 paddq \hi, \t
281 punpcklqdq \hi, \u // carry up
282 .endif
283 punpckldq \lo, \c1 // y mod B^4
284 .endm
285
286 .macro carryadd
287 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
288 // hold the incoming carry registers c0, c1, and c2 representing a
289 // carry-in C.
290 //
291 // On exit, the carry registers, including XMM15, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
294 movd xmm0, [rdi + 0] // (a_0; 0)
295 movd xmm1, [rdi + 4] // (a_1; 0)
296 movd xmm2, [rdi + 8] // (a_2; 0)
297 movd xmm15, [rdi + 12] // (a_3; 0)
298 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
299 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
300 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2)
301 .endm
302
303 ///--------------------------------------------------------------------------
304 /// Primitive multipliers and related utilities.
305
306 INTFUNC(carryprop)
307 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
308 // expanded form. Store the low 128 bits of the represented carry to
309 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
310 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
311 // XMM14 are clobbered.
312 endprologue
313
314 propout xmm0, lo, xmm12, xmm13
315 propout xmm1, lo, xmm13, xmm14
316 propout xmm0, hi, xmm14, nil
317 endprop xmm1, hi, xmm14, xmm12
318 punpckldq xmm0, xmm1
319 movdqu [rdi], xmm0
320
321 ret
322
323 ENDFUNC
324
325 INTFUNC(dmul4)
326 // On entry, RDI points to the destination buffer; RAX and RBX point
327 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
328 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
329 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
330 //
331 // On exit, we write the low 128 bits of the sum C + U V + X Y to
332 // [RDI], and update the carry registers with the carry out. The
333 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
334 // registers are preserved.
335 endprologue
336
337 movdqu xmm4, [rax]
338 movdqu xmm5, [rbx]
339
340 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
341 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
342 propout xmm6, lo, xmm12, xmm13
343
344 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
345 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
346 propout xmm7, lo, xmm13, xmm14
347
348 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
349 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
350 propout xmm6, hi, xmm14, xmm15
351
352 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
353 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
354 propout xmm7, hi, xmm15, xmm12
355
356 punpckldq xmm6, xmm7
357 movdqu [rdi], xmm6
358
359 ret
360
361 ENDFUNC
362
363 INTFUNC(dmla4)
364 // On entry, RDI points to the destination buffer, which also
365 // contains an addend A to accumulate; RAX and RBX point to the
366 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
367 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
368 // incoming carry registers c0, c1, and c2 representing a carry-in C;
369 // c3 is assumed to be zero.
370 //
371 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
372 // [RDI], and update the carry registers with the carry out. The
373 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
374 // registers are preserved.
375 endprologue
376
377 movdqu xmm4, [rax]
378 movdqu xmm5, [rbx]
379 carryadd
380
381 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
382 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
383 propout xmm6, lo, xmm12, xmm13
384
385 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
386 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
387 propout xmm7, lo, xmm13, xmm14
388
389 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
390 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
391 propout xmm6, hi, xmm14, xmm15
392
393 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
394 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
395 propout xmm7, hi, xmm15, xmm12
396
397 punpckldq xmm6, xmm7
398 movdqu [rdi], xmm6
399
400 ret
401
402 ENDFUNC
403
404 INTFUNC(mul4zc)
405 // On entry, RDI points to the destination buffer; RBX points to a
406 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
407 //
408 // On exit, we write the low 128 bits of the product X Y to [RDI],
409 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
410 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
411 // general-purpose registers are preserved.
412 endprologue
413
414 movdqu xmm5, [rbx]
415
416 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
417 propout xmm6, lo, xmm12, xmm13
418
419 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
420 propout xmm7, lo, xmm13, xmm14
421
422 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
423 propout xmm6, hi, xmm14, xmm15
424
425 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
426 propout xmm7, hi, xmm15, xmm12
427
428 punpckldq xmm6, xmm7
429 movdqu [rdi], xmm6
430
431 ret
432
433 ENDFUNC
434
435 INTFUNC(mul4)
436 // On entry, RDI points to the destination buffer; RBX points to a
437 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
438 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
439 // c2, representing a carry-in C; c3 is assumed to be zero.
440 //
441 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
442 // and update the carry registers with the carry out. The registers
443 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
444 // general-purpose registers are preserved.
445 endprologue
446
447 movdqu xmm5, [rbx]
448
449 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
450 propout xmm6, lo, xmm12, xmm13
451
452 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
453 propout xmm7, lo, xmm13, xmm14
454
455 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
456 propout xmm6, hi, xmm14, xmm15
457
458 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
459 propout xmm7, hi, xmm15, xmm12
460
461 punpckldq xmm6, xmm7
462 movdqu [rdi], xmm6
463
464 ret
465
466 ENDFUNC
467
468 INTFUNC(mla4zc)
469 // On entry, RDI points to the destination buffer, which also
470 // contains an addend A to accumulate; RBX points to a packed operand
471 // X; and XMM10/XMM11 hold an expanded operand Y.
472 //
473 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
474 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
475 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
476 // general-purpose registers are preserved.
477 endprologue
478
479 movdqu xmm5, [rbx]
480 movd xmm12, [rdi + 0]
481 movd xmm13, [rdi + 4]
482 movd xmm14, [rdi + 8]
483 movd xmm15, [rdi + 12]
484
485 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
486 propout xmm6, lo, xmm12, xmm13
487
488 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
489 propout xmm7, lo, xmm13, xmm14
490
491 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
492 propout xmm6, hi, xmm14, xmm15
493
494 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
495 propout xmm7, hi, xmm15, xmm12
496
497 punpckldq xmm6, xmm7
498 movdqu [rdi], xmm6
499
500 ret
501
502 ENDFUNC
503
504 INTFUNC(mla4)
505 // On entry, RDI points to the destination buffer, which also
506 // contains an addend A to accumulate; RBX points to a packed operand
507 // X; XMM10/XMM11 hold an expanded operand Y; and XMM12, XMM13,
508 // XMM14 hold the incoming carry registers c0, c1, and c2,
509 // representing a carry-in C; c3 is assumed to be zero.
510 //
511 // On exit, we write the low 128 bits of the sum A + C + X Y to
512 // [RDI], and update the carry registers with the carry out. The
513 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
514 // general-purpose registers are preserved.
515 endprologue
516
517 movdqu xmm5, [rbx]
518 carryadd
519
520 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
521 propout xmm6, lo, xmm12, xmm13
522
523 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
524 propout xmm7, lo, xmm13, xmm14
525
526 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
527 propout xmm6, hi, xmm14, xmm15
528
529 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
530 propout xmm7, hi, xmm15, xmm12
531
532 punpckldq xmm6, xmm7
533 movdqu [rdi], xmm6
534
535 ret
536
537 ENDFUNC
538
539 INTFUNC(mmul4)
540 // On entry, RDI points to the destination buffer; RAX and RBX point
541 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
542 // the expanded operands V and M. The stack pointer must be 8 modulo 16
543 // (as usual for AMD64 ABIs).
544 //
545 // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
546 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
547 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
548 // XMM15 are clobbered; the general-purpose registers are preserved.
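	//
	// (Here M is assumed to be the usual Montgomery factor, with
	// N M == -1 (mod B), so that B divides U V + N Y.)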
549 movdqu xmm4, [rax]
550 #if ABI_WIN
551 stalloc 48 + 8 // space for the carries
552 #endif
553 endprologue
554
555 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
556 // for later.
557 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
558 propout xmm7, lo, xmm12, xmm13
559 jmp 5f
560
561 ENDFUNC
562
563 INTFUNC(mmla4)
564 // On entry, RDI points to the destination buffer, which also
565 // contains an addend A to accumulate; RAX and RBX point to the
566 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
567 // expanded operands V and M. The stack pointer must be 8 modulo 16
568 // (as usual for AMD64 ABIs).
569 //
570 // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
571 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
572 // remaining carry in XMM12, XMM13, and XMM14. The registers
573 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
574 // are preserved.
575 movdqu xmm4, [rax]
576 #if ABI_WIN
577 stalloc 48 + 8 // space for the carries
578 # define STKTMP(i) [rsp + i]
579 #endif
580 #if ABI_SYSV
581 # define STKTMP(i) [rsp + i - 48 - 8] // use red zone
582 #endif
583 endprologue
584
585 movd xmm12, [rdi + 0]
586 movd xmm13, [rdi + 4]
587 movd xmm14, [rdi + 8]
588 movd xmm15, [rdi + 12]
589
590 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
591 // for later.
592 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
593 propout xmm7, lo, xmm12, xmm13
594
595 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
596 propout xmm6, lo, xmm13, xmm14
597
598 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
599 propout xmm7, hi, xmm14, xmm15
600
601 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
602 propout xmm6, hi, xmm15, xmm12
603
604 // Prepare W, and stash carries for later.
605 punpckldq xmm7, xmm6
606 movdqa STKTMP( 0), xmm12
607 movdqa STKTMP(16), xmm13
608 movdqa STKTMP(32), xmm14
609
610 // Calculate Y = W M. We just about have enough spare registers to
611 // make this work.
612 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
613
614 // Start expanding W back into the main carry registers...
615 pxor xmm15, xmm15
616 movdqa xmm12, xmm7
617 movdqa xmm14, xmm7
618
619 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
620 accum xmm4, xmm5, xmm6
621
622 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
623 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
624
625 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
626 accum xmm5, xmm6
627
628 pxor xmm2, xmm2
629 movdqa xmm13, xmm12
630 movdqa xmm15, xmm14
631
632 mulcore xmm7, 3, xmm10, xmm11, xmm0
633 accum xmm6
634
635 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
636 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
637 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
638 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
639
640 // That's lots of pieces. Now we have to assemble the answer.
641 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
642
643 // Expand it.
644 movdqu xmm5, [rbx]
645 expand xmm2, xmm10, xmm11
646
647 // Finish the calculation by adding the Montgomery product.
648 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
649 propout xmm6, lo, xmm12, xmm13
650
651 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
652 propout xmm7, lo, xmm13, xmm14
653
654 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
655 propout xmm6, hi, xmm14, xmm15
656
657 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
658 propout xmm7, hi, xmm15, xmm12
659
660 punpckldq xmm6, xmm7
661
662 // Add on the carry we calculated earlier.
663 paddq xmm12, STKTMP( 0)
664 paddq xmm13, STKTMP(16)
665 paddq xmm14, STKTMP(32)
666
667 // And, with that, we're done.
668 movdqu [rdi], xmm6
669 #if ABI_WIN
670 stfree 56
671 #endif
672 ret
673
674 #undef STKTMP
675
676 ENDFUNC
677
678 INTFUNC(mont4)
679 // On entry, RDI points to the destination buffer holding a packed
680 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
681 // expanded operand M.
682 //
683 // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
684 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
685 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
686 // and XMM15 are clobbered; the general-purpose registers are
687 // preserved.
688 endprologue
689
690 movdqu xmm7, [rdi]
691
692 // Calculate Y = W M. Avoid the standard carry registers, because
693 // we're setting something else up there.
694 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
695
696 // Start expanding W back into the main carry registers...
697 pxor xmm15, xmm15
698 movdqa xmm12, xmm7
699 movdqa xmm14, xmm7
700
701 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
702 accum xmm4, xmm5, xmm6
703
704 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
705 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
706
707 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
708 accum xmm5, xmm6
709
710 pxor xmm2, xmm2
711 movdqa xmm13, xmm12
712 movdqa xmm15, xmm14
713
714 mulcore xmm7, 3, xmm8, xmm9, xmm0
715 accum xmm6
716
717 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
718 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
719 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
720 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
721
722 // That's lots of pieces. Now we have to assemble the answer.
723 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
724
725 // Expand it.
726 movdqu xmm5, [rbx]
727 expand xmm2, xmm10, xmm11
728
729 // Finish the calculation by adding the Montgomery product.
730 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
731 propout xmm6, lo, xmm12, xmm13
732
733 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
734 propout xmm7, lo, xmm13, xmm14
735
736 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
737 propout xmm6, hi, xmm14, xmm15
738
739 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
740 propout xmm7, hi, xmm15, xmm12
741
742 punpckldq xmm6, xmm7
743
744 // And, with that, we're done.
745 movdqu [rdi], xmm6
746 ret
747
748 ENDFUNC
749
750 ///--------------------------------------------------------------------------
751 /// Bulk multipliers.
752
753 FUNC(mpx_umul4_amd64_avx)
754 .arch .avx
755 vzeroupper
756 endprologue
757 .arch pentium4
758 ENDFUNC
759
760 FUNC(mpx_umul4_amd64_sse2)
761 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
762 // const mpw *bv, const mpw *bvl);
763
764 // Establish the arguments and do initial setup.
765 //
766 // sysv win
767 // inner loop dv rdi rdi*
768 // inner loop av rbx* rbx*
769 // outer loop dv r10 rcx
770 // outer loop bv rcx r9
771 // av base rsi rdx
772 // av limit rdx r8
773 // bv limit r8 r10
774
775 #if ABI_SYSV
776 # define DV r10
777 # define AV rsi
778 # define AVL rdx
779 # define BV rcx
780 # define BVL r8
781
782 pushreg rbx
783 endprologue
784
785 mov DV, rdi
786
787 #endif
788
789 #if ABI_WIN
790 # define DV rcx
791 # define AV rdx
792 # define AVL r8
793 # define BV r9
794 # define BVL r10
795
796 pushreg rbx
797 pushreg rdi
798 stalloc 160 + 8
799
800 savexmm xmm6, 0
801 savexmm xmm7, 16
802 savexmm xmm8, 32
803 savexmm xmm9, 48
804 savexmm xmm10, 64
805 savexmm xmm11, 80
806 savexmm xmm12, 96
807 savexmm xmm13, 112
808 savexmm xmm14, 128
809 savexmm xmm15, 144
810
811 endprologue
812
813 mov rdi, DV
814 mov BVL, [rsp + 224]
815
816 #endif
817
818 // Prepare for the first iteration.
819 pxor xmm0, xmm0
820 movdqu xmm10, [BV] // bv[0]
821 mov rbx, AV
822 add DV, 16
823 add BV, 16
824 expand xmm0, xmm10, xmm11
825 call mul4zc
826 add rbx, 16
827 add rdi, 16
828 cmp rbx, AVL // all done?
829 jae 8f
830
831 .p2align 4
832 // Continue with the first iteration.
833 0: call mul4
834 add rbx, 16
835 add rdi, 16
836 cmp rbx, AVL // all done?
837 jb 0b
838
839 // Write out the leftover carry. There can be no tail here.
840 8: call carryprop
841 cmp BV, BVL // more passes to do?
842 jae 9f
843
844 .p2align 4
845 // Set up for the next pass.
846 1: movdqu xmm10, [BV] // bv[i]
847 mov rdi, DV // -> dv[i]
848 pxor xmm0, xmm0
849 expand xmm0, xmm10, xmm11
850 mov rbx, AV // -> av[0]
851 add DV, 16
852 add BV, 16
853 call mla4zc
854 add rbx, 16
855 add rdi, 16
856 cmp rbx, AVL // done yet?
857 jae 8f
858
859 .p2align 4
860 // Continue...
861 0: call mla4
862 add rbx, 16
863 add rdi, 16
864 cmp rbx, AVL
865 jb 0b
866
867 // Finish off this pass. There was no tail on the previous pass, and
868 // there can be none on this pass.
869 8: call carryprop
870 cmp BV, BVL
871 jb 1b
872
873 // All over.
874 9:
875
876 #if ABI_SYSV
877 popreg rbx
878 #endif
879
880 #if ABI_WIN
881
882 rstrxmm xmm6, 0
883 rstrxmm xmm7, 16
884 rstrxmm xmm8, 32
885 rstrxmm xmm9, 48
886 rstrxmm xmm10, 64
887 rstrxmm xmm11, 80
888 rstrxmm xmm12, 96
889 rstrxmm xmm13, 112
890 rstrxmm xmm14, 128
891 rstrxmm xmm15, 144
892
893 stfree 160 + 8
894 popreg rdi
895 popreg rbx
896
897 #endif
898
899 ret
900
901 #undef DV
902 #undef AV
903 #undef AVL
904 #undef BV
905 #undef BVL
906
907 ENDFUNC
908
909 FUNC(mpxmont_mul4_amd64_avx)
910 .arch .avx
911 vzeroupper
912 endprologue
913 .arch pentium4
914 ENDFUNC
915
916 FUNC(mpxmont_mul4_amd64_sse2)
917 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
918 // const mpw *nv, size_t n, const mpw *mi);
919
920 // Establish the arguments and do initial setup.
921 //
922 // sysv win
923 // inner loop dv rdi rdi*
924 // inner loop av rax rax
925 // inner loop nv rbx* rbx*
926 // mi r9 r10
927 // outer loop dv r10 rcx
928 // outer loop bv rdx r8
929 // av base rsi rdx
930 // av limit r11 r11
931 // bv limit r8 r12*
932 // nv base rcx r9
933 // n r8 r12*
934
935 #if ABI_SYSV
936 # define DV r10
937 # define AV rsi
938 # define AVL r11
939 # define BV rdx
940 # define BVL r8
941 # define NV rcx
942 # define N r8
943 # define MI r9
944
945 pushreg rbx
946 endprologue
947
948 mov DV, rdi
949
950 #endif
951
952 #if ABI_WIN
953 # define DV rcx
954 # define AV rdx
955 # define AVL r11
956 # define BV r8
957 # define BVL r12
958 # define NV r9
959 # define N r12
960 # define MI r10
961
962 pushreg rbx
963 pushreg rdi
964 pushreg r12
965 stalloc 160
966
967 savexmm xmm6, 0
968 savexmm xmm7, 16
969 savexmm xmm8, 32
970 savexmm xmm9, 48
971 savexmm xmm10, 64
972 savexmm xmm11, 80
973 savexmm xmm12, 96
974 savexmm xmm13, 112
975 savexmm xmm14, 128
976 savexmm xmm15, 144
977
978 endprologue
979
980 mov rdi, DV
981 mov N, [rsp + 224]
982 mov MI, [rsp + 232]
983
984 #endif
985
986 // Establish the expanded operands.
987 pxor xmm0, xmm0
988 movdqu xmm8, [BV] // bv[0]
989 movdqu xmm10, [MI] // mi
990 expand xmm0, xmm8, xmm9, xmm10, xmm11
991
992 // Set up the outer loop state and prepare for the first iteration.
993 mov rax, AV // -> U = av[0]
994 mov rbx, NV // -> X = nv[0]
995 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
996 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
997 add BV, 16
998 add DV, 16
999 call mmul4
1000 add rdi, 16
1001 add rax, 16
1002 add rbx, 16
1003 cmp rax, AVL // done already?
1004 jae 8f
1005
1006 .p2align 4
1007 // Complete the first inner loop.
1008 0: call dmul4
1009 add rdi, 16
1010 add rax, 16
1011 add rbx, 16
1012 cmp rax, AVL // done yet?
1013 jb 0b
1014
1015 // Still have carries left to propagate.
1016 call carryprop
1017 movd [rdi + 16], xmm12
1018
1019 .p2align 4
1020 // Embark on the next iteration. (There must be one. If n = 1, then
1021 // we would have bailed above, to label 8. Similarly, the subsequent
1022 // iterations can fall into the inner loop immediately.)
1023 1: pxor xmm0, xmm0
1024 movdqu xmm8, [BV] // bv[i]
1025 movdqu xmm10, [MI] // mi
1026 mov rdi, DV // -> Z = dv[i]
1027 mov rax, AV // -> U = av[0]
1028 mov rbx, NV // -> X = nv[0]
1029 expand xmm0, xmm8, xmm9, xmm10, xmm11
1030 add BV, 16
1031 add DV, 16
1032 call mmla4
1033 add rdi, 16
1034 add rax, 16
1035 add rbx, 16
1036
1037 .p2align 4
1038 // Complete the next inner loop.
1039 0: call dmla4
1040 add rdi, 16
1041 add rax, 16
1042 add rbx, 16
1043 cmp rax, AVL
1044 jb 0b
1045
1046 // Still have carries left to propagate, and they overlap the
1047 // previous iteration's final tail, so read that in and add it.
1048 movd xmm0, [rdi]
1049 paddq xmm12, xmm0
1050 call carryprop
1051 movd [rdi + 16], xmm12
1052
1053 // Back again, maybe.
1054 cmp BV, BVL
1055 jb 1b
1056
1057 // All done.
1058 9:
1059
1060 #if ABI_SYSV
1061 popreg rbx
1062 #endif
1063
1064 #if ABI_WIN
1065
1066 rstrxmm xmm6, 0
1067 rstrxmm xmm7, 16
1068 rstrxmm xmm8, 32
1069 rstrxmm xmm9, 48
1070 rstrxmm xmm10, 64
1071 rstrxmm xmm11, 80
1072 rstrxmm xmm12, 96
1073 rstrxmm xmm13, 112
1074 rstrxmm xmm14, 128
1075 rstrxmm xmm15, 144
1076
1077 stfree 160
1078 popreg r12
1079 popreg rdi
1080 popreg rbx
1081
1082 #endif
1083
1084 ret
1085
1086 // First iteration was short. Write out the carries and we're done.
1087 // (This could be folded into the main loop structure, but that would
1088 // penalize small numbers more.)
1089 8: call carryprop
1090 movd [rdi + 16], xmm12
1091 #if ABI_SYSV
1092 popreg rbx
1093 ret
1094 #endif
1095 #if ABI_WIN
1096 jmp 9b
1097 #endif
1098
1099 #undef DV
1100 #undef AV
1101 #undef AVL
1102 #undef BV
1103 #undef BVL
1104 #undef NV
1105 #undef N
1106 #undef MI
1107
1108 ENDFUNC
1109
1110 FUNC(mpxmont_redc4_amd64_avx)
1111 .arch .avx
1112 vzeroupper
1113 endprologue
1114 .arch pentium4
1115 ENDFUNC
1116
1117 FUNC(mpxmont_redc4_amd64_sse2)
1118 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1119 // size_t n, const mpw *mi);
1120
1121 // Establish the arguments and do initial setup.
1122 //
1123 // sysv win
1124 // inner loop dv rdi rdi*
1125 // dv limit rax rax
1126 // blocks-of-4 dv limit rsi rdx
1127 // inner loop nv rbx* rbx*
1128 // mi r8 r10
1129 // outer loop dv r10 rcx
1130 // outer loop dv limit r11 r11
1131 // nv base rdx r8
1132 // nv limit r9 r12*
1133 // n rcx r9
1134 // c rcx r9
1135
1136 #if ABI_SYSV
1137
1138 # define DVL rax
1139 # define DVL4 rsi
1140 # define MI r8
1141 # define DV r10
1142 # define DVLO r11
1143 # define NV rdx
1144 # define NVL r9
1145 # define N rcx
1146 # define C ecx
1147
1148 pushreg rbx
1149 endprologue
1150
1151 mov DV, rdi
1152
1153 #endif
1154
1155 #if ABI_WIN
1156
1157 # define DVL rax
1158 # define DVL4 rdx
1159 # define MI r10
1160 # define DV rcx
1161 # define DVLO r11
1162 # define NV r8
1163 # define NVL r12
1164 # define N r9
1165 # define C r9d
1166
1167 pushreg rbx
1168 pushreg rdi
1169 pushreg r12
1170 stalloc 160
1171
1172 savexmm xmm6, 0
1173 savexmm xmm7, 16
1174 savexmm xmm8, 32
1175 savexmm xmm9, 48
1176 savexmm xmm10, 64
1177 savexmm xmm11, 80
1178 savexmm xmm12, 96
1179 savexmm xmm13, 112
1180 savexmm xmm14, 128
1181 savexmm xmm15, 144
1182
1183 endprologue
1184
1185 mov rdi, DV
1186 mov MI, [rsp + 224]
1187
1188 #endif
1189
1190 // Establish the expanded operands and the blocks-of-4 dv limit.
1191 pxor xmm0, xmm0
1192 mov DVL, DVL4 // -> dv[n] = dv limit
1193 sub DVL4, DV // length of dv in bytes
1194 movdqu xmm8, [MI] // mi
1195 and DVL4, ~15 // mask off the tail end
1196 expand xmm0, xmm8, xmm9
1197 add DVL4, DV // find limit
1198
1199 // Set up the outer loop state and prepare for the first iteration.
1200 mov rbx, NV // -> X = nv[0]
1201 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1202 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1203 add DV, 16
1204 call mont4
1205 add rbx, 16
1206 add rdi, 16
1207 cmp rbx, NVL // done already?
1208 jae 8f
1209
1210 .p2align 4
1211 // Complete the first inner loop.
1212 5: call mla4
1213 add rbx, 16
1214 add rdi, 16
1215 cmp rbx, NVL // done yet?
1216 jb 5b
1217
1218 // Still have carries left to propagate.
1219 8: carryadd
1220 psllq xmm15, 16
1221 pslldq xmm15, 8
1222 paddq xmm14, xmm15
1223 call carryprop
1224 movd C, xmm12
1225 add rdi, 16
1226 cmp rdi, DVL4
1227 jae 7f
1228
1229 .p2align 4
1230 // Continue carry propagation until the end of the buffer.
1231 0: add [rdi], C
1232 mov C, 0 // preserves flags
1233 adcd [rdi + 4], 0
1234 adcd [rdi + 8], 0
1235 adcd [rdi + 12], 0
1236 adc C, 0
1237 add rdi, 16
1238 cmp rdi, DVL4
1239 jb 0b
1240
1241 // Deal with the tail end.
1242 7: add [rdi], C
1243 mov C, 0 // preserves flags
1244 adc C, 0
1245 add rdi, 4
1246 cmp rdi, DVL
1247 jb 7b
1248
1249 // All done for this iteration. Start the next. (This must have at
1250 // least one follow-on iteration, or we'd not have started this outer
1251 // loop.)
1252 8: mov rdi, DV // -> Z = dv[i]
1253 mov rbx, NV // -> X = nv[0]
1254 cmp rdi, DVLO // all done yet?
1255 jae 9f
1256 add DV, 16
1257 call mont4
1258 add rdi, 16
1259 add rbx, 16
1260 jmp 5b
1261
1262 // All over.
1263 9:
1264
1265 #if ABI_SYSV
1266 popreg rbx
1267 #endif
1268
1269 #if ABI_WIN
1270
1271 rstrxmm xmm6, 0
1272 rstrxmm xmm7, 16
1273 rstrxmm xmm8, 32
1274 rstrxmm xmm9, 48
1275 rstrxmm xmm10, 64
1276 rstrxmm xmm11, 80
1277 rstrxmm xmm12, 96
1278 rstrxmm xmm13, 112
1279 rstrxmm xmm14, 128
1280 rstrxmm xmm15, 144
1281
1282 stfree 160
1283 popreg r12
1284 popreg rdi
1285 popreg rbx
1286
1287 #endif
1288
1289 ret
1290
1291 #undef DVL
1292 #undef DVL4
1293 #undef MI
1294 #undef DV
1295 #undef DVLO
1296 #undef NV
1297 #undef NVL
1298 #undef N
1299 #undef C
1300
1301 ENDFUNC
1302
1303 ///--------------------------------------------------------------------------
1304 /// Testing and performance measurement.
1305
1306 #ifdef TEST_MUL4
1307
1308 #if ABI_SYSV
1309 # define ARG0 rdi
1310 # define ARG1 rsi
1311 # define ARG2 rdx
1312 # define ARG3 rcx
1313 # define ARG4 r8
1314 # define ARG5 r9
1315 # define ARG6 STKARG(0)
1316 # define ARG7 STKARG(1)
1317 # define ARG8 STKARG(2)
1318 # define STKARG_OFFSET 16
1319 #endif
1320 #if ABI_WIN
1321 # define ARG0 rcx
1322 # define ARG1 rdx
1323 # define ARG2 r8
1324 # define ARG3 r9
1325 # define ARG4 STKARG(0)
1326 # define ARG5 STKARG(1)
1327 # define ARG6 STKARG(2)
1328 # define ARG7 STKARG(3)
1329 # define ARG8 STKARG(4)
1330 # define STKARG_OFFSET 224
1331 #endif
1332 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
1333
1334 // sysv win
1335 // dmul smul mmul mont dmul smul mmul mont
1336 // A rax
1337 // D rdx
1338 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1339 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1340 // y r10 -- -- rdx rdx -- -- r8 r8
1341 // u r11 rdx -- rcx -- r8 -- r9 --
1342 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1343 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1344 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1345 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1346 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1347
1348 .macro cysetup v, n
1349 rdtsc
1350 shl rdx, 32
1351 or rax, rdx
1352 mov [\v + 8*\n - 8], rax
1353 .endm
1354
1355 .macro cystore v, n
1356 rdtsc
1357 shl rdx, 32
1358 or rax, rdx
1359 sub rax, [\v + 8*\n - 8]
1360 mov [\v + 8*\n - 8], rax
1361 dec \n
1362 .endm
1363
1364 .macro testprologue mode
1365 pushreg rbx
1366 #if ABI_SYSV
1367 endprologue
1368 .ifeqs "\mode", "dmul"
1369 mov rbx, rcx
1370 movdqu xmm8, [r8]
1371 movdqu xmm10, [r9]
1372 mov r8d, STKARG(0)
1373 mov r9, STKARG(1)
1374 mov r11, rdx
1375 mov rcx, rsi
1376 .endif
1377 .ifeqs "\mode", "smul"
1378 mov rbx, rdx
1379 movdqu xmm10, [rcx]
1380 mov rcx, rsi
1381 .endif
1382 .ifeqs "\mode", "mmul"
1383 mov rax, STKARG(0)
1384 mov rbx, r8
1385 movdqu xmm8, [r9]
1386 movdqu xmm10, [rax]
1387 mov r8d, STKARG(1)
1388 mov r9, STKARG(2)
1389 mov r10, rdx
1390 mov r11, rcx
1391 mov rcx, rsi
1392 .endif
1393 .ifeqs "\mode", "mont"
1394 mov rbx, rcx
1395 movdqu xmm8, [r8]
1396 mov r8d, r9d
1397 mov r9, STKARG(0)
1398 mov r10, rdx
1399 mov rcx, rsi
1400 .endif
1401 #endif
1402 #if ABI_WIN
1403 pushreg rdi
1404 stalloc 168
1405 savexmm xmm6, 0
1406 savexmm xmm7, 16
1407 savexmm xmm8, 32
1408 savexmm xmm9, 48
1409 savexmm xmm10, 64
1410 savexmm xmm11, 80
1411 savexmm xmm12, 96
1412 savexmm xmm13, 112
1413 savexmm xmm14, 128
1414 savexmm xmm15, 144
1415 endprologue
1416 .ifeqs "\mode", "dmul"
1417 mov r10, STKARG(0)
1418 mov r11, STKARG(1)
1419 mov rdi, rcx
1420 mov rcx, rdx
1421 mov rbx, r9
1422 movdqu xmm8, [r10]
1423 movdqu xmm10, [r11]
1424 mov r11, r8
1425 mov r8d, STKARG(2)
1426 mov r9, STKARG(3)
1427 .endif
1428 .ifeqs "\mode", "smul"
1429 mov rdi, rcx
1430 mov rcx, rdx
1431 mov rbx, r8
1432 movdqu xmm10, [r9]
1433 mov r8d, STKARG(0)
1434 mov r9, STKARG(1)
1435 .endif
1436 .ifeqs "\mode", "mmul"
1437 mov r10, STKARG(1)
1438 mov r11, STKARG(2)
1439 mov rdi, rcx
1440 mov rcx, rdx
1441 mov rbx, STKARG(0)
1442 movdqu xmm8, [r10]
1443 movdqu xmm10, [r11]
1444 mov r10, r8
1445 mov r11, r9
1446 mov r8d, STKARG(3)
1447 mov r9, STKARG(4)
1448 .endif
1449 .ifeqs "\mode", "mont"
1450 mov r10, STKARG(0)
1451 mov rdi, rcx
1452 mov rcx, rdx
1453 mov rbx, r9
1454 movdqu xmm8, [r10]
1455 mov r10, r8
1456 mov r8d, STKARG(1)
1457 mov r9, STKARG(2)
1458 .endif
1459 #endif
1460
1461 pxor xmm0, xmm0
1462 .ifeqs "\mode", "dmul"
1463 expand xmm0, xmm8, xmm9, xmm10, xmm11
1464 .endif
1465 .ifeqs "\mode", "smul"
1466 expand xmm0, xmm10, xmm11
1467 .endif
1468 .ifeqs "\mode", "mmul"
1469 expand xmm0, xmm8, xmm9, xmm10, xmm11
1470 .endif
1471 .ifeqs "\mode", "mont"
1472 expand xmm0, xmm8, xmm9
1473 .endif
1474 .endm
1475
1476 .macro testepilogue
1477 #if ABI_WIN
1478 rstrxmm xmm6, 0
1479 rstrxmm xmm7, 16
1480 rstrxmm xmm8, 32
1481 rstrxmm xmm9, 48
1482 rstrxmm xmm10, 64
1483 rstrxmm xmm11, 80
1484 rstrxmm xmm12, 96
1485 rstrxmm xmm13, 112
1486 rstrxmm xmm14, 128
1487 rstrxmm xmm15, 144
1488 stfree 168
1489 popreg rdi
1490 #endif
1491 popreg rbx
1492 ret
1493 .endm
1494
1495 .macro testldcarry
1496 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1497 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1498 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1499 .endm
1500
1501 .macro testtop u=nil
1502 .p2align 4
1503 0:
1504 cysetup r9, r8
1505 .ifnes "\u", "nil"
1506 mov rax, \u
1507 .endif
1508 .endm
1509
1510 .macro testtail
1511 cystore r9, r8
1512 jnz 0b
1513 .endm
1514
1515 .macro testcarryout
1516 movdqu [rcx + 0], xmm12
1517 movdqu [rcx + 16], xmm13
1518 movdqu [rcx + 32], xmm14
1519 .endm
1520
1521 FUNC(test_dmul4)
1522 testprologue dmul
1523 testldcarry
1524 testtop r11
1525 call dmul4
1526 testtail
1527 testcarryout
1528 testepilogue
1529 ENDFUNC
1530
1531 FUNC(test_dmla4)
1532 testprologue dmul
1533 testldcarry
1534 testtop r11
1535 call dmla4
1536 testtail
1537 testcarryout
1538 testepilogue
1539 ENDFUNC
1540
1541 FUNC(test_mul4)
1542 testprologue smul
1543 testldcarry
1544 testtop nil
1545 call mul4
1546 testtail
1547 testcarryout
1548 testepilogue
1549 ENDFUNC
1550
1551 FUNC(test_mul4zc)
1552 testprologue smul
1553 testldcarry
1554 testtop nil
1555 call mul4zc
1556 testtail
1557 testcarryout
1558 testepilogue
1559 ENDFUNC
1560
1561 FUNC(test_mla4)
1562 testprologue smul
1563 testldcarry
1564 testtop nil
1565 call mla4
1566 testtail
1567 testcarryout
1568 testepilogue
1569 ENDFUNC
1570
1571 FUNC(test_mla4zc)
1572 testprologue smul
1573 testldcarry
1574 testtop nil
1575 call mla4zc
1576 testtail
1577 testcarryout
1578 testepilogue
1579 ENDFUNC
1580
1581 FUNC(test_mmul4)
1582 testprologue mmul
1583 testtop r11
1584 call mmul4
1585 testtail
1586 movdqu [r10 + 0], xmm10
1587 movdqu [r10 + 16], xmm11
1588 testcarryout
1589 testepilogue
1590 ENDFUNC
1591
1592 FUNC(test_mmla4)
1593 testprologue mmul
1594 testtop r11
1595 call mmla4
1596 testtail
1597 movdqu [r10 + 0], xmm10
1598 movdqu [r10 + 16], xmm11
1599 testcarryout
1600 testepilogue
1601 ENDFUNC
1602
1603 FUNC(test_mont4)
1604 testprologue mont
1605 testtop
1606 call mont4
1607 testtail
1608 movdqu [r10 + 0], xmm10
1609 movdqu [r10 + 16], xmm11
1610 testcarryout
1611 testepilogue
1612 ENDFUNC
1613
1614 #endif
1615
1616 ///----- That's all, folks --------------------------------------------------