Merge branch '2.4.x'
[catacomb] / math / mpx-mul4-amd64-sse2.S
1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 ///--------------------------------------------------------------------------
34 /// Prologue.
35
36 .arch pentium4
37 .text
38
39 ///--------------------------------------------------------------------------
40 /// Theory.
41 ///
42 /// We define a number of primitive fixed-size multipliers from which we can
43 /// construct more general variable-length multipliers.
44 ///
45 /// The basic trick is the same throughout. In an operand-scanning
46 /// multiplication, the inner multiplication loop multiplies a
47 /// multiple-precision operand by a single precision factor, and adds the
48 /// result, appropriately shifted, to the result. A `finely integrated
49 /// operand scanning' implementation of Montgomery multiplication also adds
50 /// the product of a single-precision `Montgomery factor' and the modulus,
51 /// calculated in the same pass. The more common `coarsely integrated
52 /// operand scanning' alternates main multiplication and Montgomery passes,
53 /// which requires additional carry propagation.
54 ///
55 /// Throughout both plain-multiplication and Montgomery stages, then, one of
56 /// the factors remains constant throughout the operation, so we can afford
57 /// to take a little time to preprocess it. The transformation we perform is
58 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
59 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
60 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
61 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
62 /// operands, as follows.
63 ///
64 /// Offset 0 4 8 12
65 /// 0 v'_0 v'_1 v''_0 v''_1
66 /// 16 v'_2 v'_3 v''_2 v''_3
67 ///
68 /// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
69 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
70 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
71 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
72 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
73 /// many products together before we must deal with carrying; it also allows
74 /// for some calculations to be performed on the above expanded form.
75 ///
76 /// ...
77 ///
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
91 /// zeroed becomes c3.
92
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
95
96 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
97 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
98 // of the product in registers D0, D1, D2, D3.
99 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
100 .ifnes "\d1", "nil"
101 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
102 .endif
103 .ifnes "\d3", "nil"
104 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
105 .endif
106 .ifnes "\d1", "nil"
107 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
108 .endif
109 .ifnes "\d2", "nil"
110 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
111 .endif
112 .ifnes "\d3", "nil"
113 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
114 .endif
115 .ifnes "\d1", "nil"
116 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
117 .endif
118 .ifnes "\d3", "nil"
119 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
120 .endif
121 .ifnes "\d2", "nil"
122 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
123 .endif
124 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
125 .endm
126
127 .macro accum c0, c1=nil, c2=nil, c3=nil
128 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
129 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
130 // updating that register.
131 paddq \c0, xmm0
132 .ifnes "\c1", "nil"
133 paddq \c1, xmm1
134 .endif
135 .ifnes "\c2", "nil"
136 paddq \c2, xmm2
137 .endif
138 .ifnes "\c3", "nil"
139 paddq \c3, xmm3
140 .endif
141 .endm
142
143 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
144 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
145 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
146 // contains zero, but needs clearing; in practice, we store the
147 // product directly rather than attempting to add. On completion,
148 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
149 .ifeqs "\z3p", "t"
150 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
151 accum \c0, \c1, \c2
152 .else
153 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
154 accum \c0, \c1, \c2, \c3
155 .endif
156 .endm
157
158 .macro propout d, pos, c, cc=nil
159 // Calculate an output word from C, and store it at POS in D;
160 // propagate carries out from C to CC in preparation for a rotation
161 // of the carry registers. D is an XMM register; the POS is either
162 // `lo' or `hi' according to whether the output word should be in
163 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
164 // completion, XMM3 is clobbered. If CC is `nil', then the
165 // contribution which would have been added to it is left in C.
166 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
167 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
168 pslldq xmm3, 2 // (t b; 0)
169 paddq \c, xmm3 // (c' + t b; c'')
170 .ifeqs "\pos", "lo"
171 movdqa \d, \c
172 .else
173 punpckldq \d, \c
174 .endif
175 psrlq \c, 32 // floor(c/B)
176 .ifnes "\cc", "nil"
177 paddq \cc, \c // propagate up
178 .endif
179 .endm
180
181 .macro endprop d, pos, c, t
182 // On entry, C contains a carry register. On exit, the low 32 bits
183 // of the value represented in C are written at POS in D, and the
184 // remaining bits are left at the bottom of T.
185 movdqa \t, \c
186 psllq \t, 16 // (?; c'' b)
187 pslldq \c, 8 // (0; c')
188 paddq \t, \c // (?; c' + c'' b)
189 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
190 .ifeqs "\pos", "lo"
191 movdqa \d, \t
192 .else
193 punpckldq \d, \t
194 .endif
195 psrldq \t, 4 // (floor(c/B); 0)
196 .endm
197
198 .macro expand z, a, b, c=nil, d=nil
199 // On entry, A and C hold packed 128-bit values, and Z is zero. On
200 // exit, A:B and C:D together hold the same values in expanded
201 // form. If C is `nil', then only expand A to A:B.
202 movdqa \b, \a // (a_0, a_1; a_2, a_3)
203 .ifnes "\c", "nil"
204 movdqa \d, \c // (c_0, c_1; c_2, c_3)
205 .endif
206 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
207 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
208 .ifnes "\c", "nil"
209 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
210 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
211 .endif
212 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
213 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
214 .ifnes "\c", "nil"
215 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
216 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
217 .endif
218 .endm
219
220 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
221 // On entry, C0, C1, C2, C3 are carry registers representing a value
222 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
223 // C3, T, and U are clobbered; and the high bits of Y are stored in
224 // HI, if this is not `nil'.
225
226 // The first step is to eliminate the `double-prime' pieces -- i.e.,
227 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
228 // them into the 32-bit-aligned pieces above and below. But before
229 // we can do that, we must gather them together.
230 movdqa \t, \c0
231 movdqa \u, \c1
232 punpcklqdq \t, \c2 // (y'_0; y'_2)
233 punpckhqdq \c0, \c2 // (y''_0; y''_2)
234 punpcklqdq \u, \c3 // (y'_1; y'_3)
235 punpckhqdq \c1, \c3 // (y''_1; y''_3)
236
237 // Now split the double-prime pieces. The high (up to) 48 bits will
238 // go up; the low 16 bits go down.
239 movdqa \c2, \c0
240 movdqa \c3, \c1
241 psllq \c2, 48
242 psllq \c3, 48
243 psrlq \c0, 16 // high parts of (y''_0; y''_2)
244 psrlq \c1, 16 // high parts of (y''_1; y''_3)
245 psrlq \c2, 32 // low parts of (y''_0; y''_2)
246 psrlq \c3, 32 // low parts of (y''_1; y''_3)
247 .ifnes "\hi", "nil"
248 movdqa \hi, \c1
249 .endif
250 pslldq \c1, 8 // high part of (0; y''_1)
251
252 paddq \t, \c2 // propagate down
253 paddq \u, \c3
254 paddq \t, \c1 // and up: (y_0; y_2)
255 paddq \u, \c0 // (y_1; y_3)
256 .ifnes "\hi", "nil"
257 psrldq \hi, 8 // high part of (y''_3; 0)
258 .endif
259
260 // Finally extract the answer. This complicated dance is better than
261 // storing to memory and loading, because the piecemeal stores
262 // inhibit store forwarding.
263 movdqa \c3, \t // (y_0; ?)
264 movdqa \lo, \t // (y^*_0, ?; ?, ?)
265 psrldq \t, 8 // (y_2; 0)
266 psrlq \c3, 32 // (floor(y_0/B); ?)
267 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
268 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
269 psrldq \u, 8 // (y_3; 0)
270 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?)
271 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?)
272 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
275 .ifnes "\hi", "nil"
276 movdqa \t, \c3
277 pxor \u, \u
278 .endif
279 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
280 .ifnes "\hi", "nil"
281 psrlq \t, 32 // very high bits of y
282 paddq \hi, \t
283 punpcklqdq \hi, \u // carry up
284 .endif
285 punpckldq \lo, \c1 // y mod B^4
286 .endm
287
288 .macro carryadd
289 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
290 // hold the incoming carry registers c0, c1, and c2 representing a
291 // carry-in C.
292 //
293 // On exit, the carry registers, including XMM15, are updated to hold
294 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
295 // registers are preserved.
296 movd xmm0, [rdi + 0] // (a_0; 0)
297 movd xmm1, [rdi + 4] // (a_1; 0)
298 movd xmm2, [rdi + 8] // (a_2; 0)
299 movd xmm15, [rdi + 12] // (a_3; 0)
300 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
301 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
302 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
303 .endm
304
305 ///--------------------------------------------------------------------------
306 /// Primitive multipliers and related utilities.
307
308 INTFUNC(carryprop)
309 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
310 // expanded form. Store the low 128 bits of the represented carry to
311 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
312 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
313 // XMM14 are clobbered.
314 endprologue
315
316 propout xmm0, lo, xmm12, xmm13
317 propout xmm1, lo, xmm13, xmm14
318 propout xmm0, hi, xmm14, nil
319 endprop xmm1, hi, xmm14, xmm12
320 punpckldq xmm0, xmm1
321 movdqu [rdi], xmm0
322
323 ret
324
325 ENDFUNC
326
327 INTFUNC(dmul4)
328 // On entry, RDI points to the destination buffer; RAX and RBX point
329 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
330 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
331 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
332 //
333 // On exit, we write the low 128 bits of the sum C + U V + X Y to
334 // [RDI], and update the carry registers with the carry out. The
335 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
336 // registers are preserved.
337 endprologue
338
339 movdqu xmm4, [rax]
340 movdqu xmm5, [rbx]
341
342 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
343 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
344 propout xmm6, lo, xmm12, xmm13
345
346 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
347 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
348 propout xmm7, lo, xmm13, xmm14
349
350 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
351 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
352 propout xmm6, hi, xmm14, xmm15
353
354 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
355 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
356 propout xmm7, hi, xmm15, xmm12
357
358 punpckldq xmm6, xmm7
359 movdqu [rdi], xmm6
360
361 ret
362
363 ENDFUNC
364
365 INTFUNC(dmla4)
366 // On entry, RDI points to the destination buffer, which also
367 // contains an addend A to accumulate; RAX and RBX point to the
368 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
369 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
370 // incoming carry registers c0, c1, and c2 representing a carry-in C;
371 // c3 is assumed to be zero.
372 //
373 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
374 // [RDI], and update the carry registers with the carry out. The
375 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
376 // registers are preserved.
377 endprologue
378
379 movdqu xmm4, [rax]
380 movdqu xmm5, [rbx]
381 carryadd
382
383 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
384 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
385 propout xmm6, lo, xmm12, xmm13
386
387 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
388 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
389 propout xmm7, lo, xmm13, xmm14
390
391 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
392 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
393 propout xmm6, hi, xmm14, xmm15
394
395 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
396 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
397 propout xmm7, hi, xmm15, xmm12
398
399 punpckldq xmm6, xmm7
400 movdqu [rdi], xmm6
401
402 ret
403
404 ENDFUNC
405
406 INTFUNC(mul4zc)
407 // On entry, RDI points to the destination buffer; RBX points to a
408 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
409 //
410 // On exit, we write the low 128 bits of the product X Y to [RDI],
411 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
412 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
413 // general-purpose registers are preserved.
414 endprologue
415
416 movdqu xmm5, [rbx]
417
418 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
419 propout xmm6, lo, xmm12, xmm13
420
421 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
422 propout xmm7, lo, xmm13, xmm14
423
424 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
425 propout xmm6, hi, xmm14, xmm15
426
427 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
428 propout xmm7, hi, xmm15, xmm12
429
430 punpckldq xmm6, xmm7
431 movdqu [rdi], xmm6
432
433 ret
434
435 ENDFUNC
436
437 INTFUNC(mul4)
438 // On entry, RDI points to the destination buffer; RBX points to a
439 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
440 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
441 // c2, representing a carry-in C; c3 is assumed to be zero.
442 //
443 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
444 // and update the carry registers with the carry out. The registers
445 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
446 // general-purpose registers are preserved.
447 endprologue
448
449 movdqu xmm5, [rbx]
450
451 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
452 propout xmm6, lo, xmm12, xmm13
453
454 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
455 propout xmm7, lo, xmm13, xmm14
456
457 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
458 propout xmm6, hi, xmm14, xmm15
459
460 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
461 propout xmm7, hi, xmm15, xmm12
462
463 punpckldq xmm6, xmm7
464 movdqu [rdi], xmm6
465
466 ret
467
468 ENDFUNC
469
470 INTFUNC(mla4zc)
471 // On entry, RDI points to the destination buffer, which also
472 // contains an addend A to accumulate; RBX points to a packed operand
473 // X; and XMM10/XMM11 points to an expanded operand Y.
474 //
475 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
476 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
477 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
478 // general-purpose registers are preserved.
479 endprologue
480
481 movdqu xmm5, [rbx]
482 movd xmm12, [rdi + 0]
483 movd xmm13, [rdi + 4]
484 movd xmm14, [rdi + 8]
485 movd xmm15, [rdi + 12]
486
487 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
488 propout xmm6, lo, xmm12, xmm13
489
490 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
491 propout xmm7, lo, xmm13, xmm14
492
493 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
494 propout xmm6, hi, xmm14, xmm15
495
496 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
497 propout xmm7, hi, xmm15, xmm12
498
499 punpckldq xmm6, xmm7
500 movdqu [rdi], xmm6
501
502 ret
503
504 ENDFUNC
505
506 INTFUNC(mla4)
507 // On entry, RDI points to the destination buffer, which also
508 // contains an addend A to accumulate; RBX points to a packed operand
509 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
510 // XMM14 hold the incoming carry registers c0, c1, and c2,
511 // representing a carry-in C; c3 is assumed to be zero.
512 //
513 // On exit, we write the low 128 bits of the sum A + C + X Y to
514 // [RDI], and update the carry registers with the carry out. The
515 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
516 // general-purpose registers are preserved.
517 endprologue
518
519 movdqu xmm5, [rbx]
520 carryadd
521
522 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
523 propout xmm6, lo, xmm12, xmm13
524
525 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
526 propout xmm7, lo, xmm13, xmm14
527
528 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
529 propout xmm6, hi, xmm14, xmm15
530
531 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
532 propout xmm7, hi, xmm15, xmm12
533
534 punpckldq xmm6, xmm7
535 movdqu [rdi], xmm6
536
537 ret
538
539 ENDFUNC
540
541 INTFUNC(mmul4)
542 // On entry, RDI points to the destination buffer; RAX and RBX point
543 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
544 // the expanded operands V and M. The stack pointer must be 8 modulo 16
545 // (as usual for AMD64 ABIs).
546 //
547 // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
548 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
549 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
550 // XMM15 are clobbered; the general-purpose registers are preserved.
551 movdqu xmm4, [rax]
552 #if ABI_WIN
553 stalloc 48 + 8 // space for the carries
554 #endif
555 endprologue
556
557 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
558 // for later.
559 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
560 propout xmm7, lo, xmm12, xmm13
561 jmp 5f
562
563 ENDFUNC
564
565 INTFUNC(mmla4)
566 // On entry, RDI points to the destination buffer, which also
567 // contains an addend A to accumulate; RAX and RBX point to the
568 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
569 // expanded operands V and M. The stack pointer must be 8 modulo 16
570 // (as usual for AMD64 ABIs).
571 //
572 // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
573 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
574 // remaining carry in XMM12, XMM13, and XMM14. The registers
575 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
576 // are preserved.
577 movdqu xmm4, [rax]
578 #if ABI_WIN
579 stalloc 48 + 8 // space for the carries
580 # define STKTMP(i) [rsp + i]
581 #endif
582 #if ABI_SYSV
583 # define STKTMP(i) [rsp + i - 48 - 8] // use red zone
584 #endif
585 endprologue
586
587 movd xmm12, [rdi + 0]
588 movd xmm13, [rdi + 4]
589 movd xmm14, [rdi + 8]
590 movd xmm15, [rdi + 12]
591
592 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
593 // for later.
594 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
595 propout xmm7, lo, xmm12, xmm13
596
597 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
598 propout xmm6, lo, xmm13, xmm14
599
600 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
601 propout xmm7, hi, xmm14, xmm15
602
603 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
604 propout xmm6, hi, xmm15, xmm12
605
606 // Prepare W, and stash carries for later.
607 punpckldq xmm7, xmm6
608 movdqa STKTMP( 0), xmm12
609 movdqa STKTMP(16), xmm13
610 movdqa STKTMP(32), xmm14
611
612 // Calculate Y = W M. We just about have enough spare registers to
613 // make this work.
614 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
615
616 // Start expanding W back into the main carry registers...
617 pxor xmm15, xmm15
618 movdqa xmm12, xmm7
619 movdqa xmm14, xmm7
620
621 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
622 accum xmm4, xmm5, xmm6
623
624 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
625 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
626
627 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
628 accum xmm5, xmm6
629
630 pxor xmm2, xmm2
631 movdqa xmm13, xmm12
632 movdqa xmm15, xmm14
633
634 mulcore xmm7, 3, xmm10, xmm11, xmm0
635 accum xmm6
636
637 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
638 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
639 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
640 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
641
642 // That's lots of pieces. Now we have to assemble the answer.
643 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
644
645 // Expand it.
646 movdqu xmm5, [rbx]
647 expand xmm2, xmm10, xmm11
648
649 // Finish the calculation by adding the Montgomery product.
650 mulacc xmm5, 0 xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
651 propout xmm6, lo, xmm12, xmm13
652
653 mulacc xmm5, 1 xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
654 propout xmm7, lo, xmm13, xmm14
655
656 mulacc xmm5, 2 xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
657 propout xmm6, hi, xmm14, xmm15
658
659 mulacc xmm5, 3 xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
660 propout xmm7, hi, xmm15, xmm12
661
662 punpckldq xmm6, xmm7
663
664 // Add add on the carry we calculated earlier.
665 paddq xmm12, STKTMP( 0)
666 paddq xmm13, STKTMP(16)
667 paddq xmm14, STKTMP(32)
668
669 // And, with that, we're done.
670 movdqu [rdi], xmm6
671 #if ABI_WIN
672 stfree 56
673 #endif
674 ret
675
676 #undef STKTMP
677
678 ENDFUNC
679
680 INTFUNC(mont4)
681 // On entry, RDI points to the destination buffer holding a packed
682 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
683 // expanded operand M.
684 //
685 // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
686 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
687 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
688 // and XMM15 are clobbered; the general-purpose registers are
689 // preserved.
690 endprologue
691
692 movdqu xmm7, [rdi]
693
694 // Calculate Y = W M. Avoid the standard carry registers, because
695 // we're setting something else up there.
696 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
697
698 // Start expanding W back into the main carry registers...
699 pxor xmm15, xmm15
700 movdqa xmm12, xmm7
701 movdqa xmm14, xmm7
702
703 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
704 accum xmm4, xmm5, xmm6
705
706 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
707 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
708
709 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
710 accum xmm5, xmm6
711
712 pxor xmm2, xmm2
713 movdqa xmm13, xmm12
714 movdqa xmm15, xmm14
715
716 mulcore xmm7, 3, xmm8, xmm9, xmm0
717 accum xmm6
718
719 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
720 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
721 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
722 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
723
724 // That's lots of pieces. Now we have to assemble the answer.
725 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
726
727 // Expand it.
728 movdqu xmm5, [rbx]
729 expand xmm2, xmm10, xmm11
730
731 // Finish the calculation by adding the Montgomery product.
732 mulacc xmm5, 0 xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
733 propout xmm6, lo, xmm12, xmm13
734
735 mulacc xmm5, 1 xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
736 propout xmm7, lo, xmm13, xmm14
737
738 mulacc xmm5, 2 xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
739 propout xmm6, hi, xmm14, xmm15
740
741 mulacc xmm5, 3 xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
742 propout xmm7, hi, xmm15, xmm12
743
744 punpckldq xmm6, xmm7
745
746 // And, with that, we're done.
747 movdqu [rdi], xmm6
748 ret
749
750 ENDFUNC
751
752 ///--------------------------------------------------------------------------
753 /// Bulk multipliers.
754
755 FUNC(mpx_umul4_amd64_avx)
756 .arch .avx
757 vzeroupper
758 endprologue
759 .arch pentium4
760 ENDFUNC
761
762 FUNC(mpx_umul4_amd64_sse2)
763 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
764 // const mpw *bv, const mpw *bvl);
765
766 // Establish the arguments and do initial setup.
767 //
768 // sysv win
769 // inner loop dv rdi rdi*
770 // inner loop av rbx* rbx*
771 // outer loop dv r10 rcx
772 // outer loop bv rcx r9
773 // av base rsi rdx
774 // av limit rdx r8
775 // bv limit r8 r10
776
777 #if ABI_SYSV
778 # define DV r10
779 # define AV rsi
780 # define AVL rdx
781 # define BV rcx
782 # define BVL r8
783
784 pushreg rbx
785 endprologue
786
787 mov DV, rdi
788
789 #endif
790
791 #if ABI_WIN
792 # define DV rcx
793 # define AV rdx
794 # define AVL r8
795 # define BV r9
796 # define BVL r10
797
798 pushreg rbx
799 pushreg rdi
800 stalloc 160 + 8
801
802 savexmm xmm6, 0
803 savexmm xmm7, 16
804 savexmm xmm8, 32
805 savexmm xmm9, 48
806 savexmm xmm10, 64
807 savexmm xmm11, 80
808 savexmm xmm12, 96
809 savexmm xmm13, 112
810 savexmm xmm14, 128
811 savexmm xmm15, 144
812
813 endprologue
814
815 mov rdi, DV
816 mov BVL, [rsp + 224]
817
818 #endif
819
820 // Prepare for the first iteration.
821 pxor xmm0, xmm0
822 movdqu xmm10, [BV] // bv[0]
823 mov rbx, AV
824 add DV, 16
825 add BV, 16
826 expand xmm0, xmm10, xmm11
827 call mul4zc
828 add rbx, 16
829 add rdi, 16
830 cmp rbx, AVL // all done?
831 jae 8f
832
833 .p2align 4
834 // Continue with the first iteration.
835 0: call mul4
836 add rbx, 16
837 add rdi, 16
838 cmp rbx, AVL // all done?
839 jb 0b
840
841 // Write out the leftover carry. There can be no tail here.
842 8: call carryprop
843 cmp BV, BVL // more passes to do?
844 jae 9f
845
846 .p2align 4
847 // Set up for the next pass.
848 1: movdqu xmm10, [BV] // bv[i]
849 mov rdi, DV // -> dv[i]
850 pxor xmm0, xmm0
851 expand xmm0, xmm10, xmm11
852 mov rbx, AV // -> av[0]
853 add DV, 16
854 add BV, 16
855 call mla4zc
856 add rbx, 16
857 add rdi, 16
858 cmp rbx, AVL // done yet?
859 jae 8f
860
861 .p2align 4
862 // Continue...
863 0: call mla4
864 add rbx, 16
865 add rdi, 16
866 cmp rbx, AVL
867 jb 0b
868
869 // Finish off this pass. There was no tail on the previous pass, and
870 // there can be none on this pass.
871 8: call carryprop
872 cmp BV, BVL
873 jb 1b
874
875 // All over.
876 9:
877
878 #if ABI_SYSV
879 popreg rbx
880 #endif
881
882 #if ABI_WIN
883
884 rstrxmm xmm6, 0
885 rstrxmm xmm7, 16
886 rstrxmm xmm8, 32
887 rstrxmm xmm9, 48
888 rstrxmm xmm10, 64
889 rstrxmm xmm11, 80
890 rstrxmm xmm12, 96
891 rstrxmm xmm13, 112
892 rstrxmm xmm14, 128
893 rstrxmm xmm15, 144
894
895 stfree 160 + 8
896 popreg rdi
897 popreg rbx
898
899 #endif
900
901 ret
902
903 #undef DV
904 #undef AV
905 #undef AVL
906 #undef BV
907 #undef BVL
908
909 ENDFUNC
910
911 FUNC(mpxmont_mul4_amd64_avx)
912 .arch .avx
913 vzeroupper
914 endprologue
915 .arch pentium4
916 ENDFUNC
917
918 FUNC(mpxmont_mul4_amd64_sse2)
919 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
920 // const mpw *nv, size_t n, const mpw *mi);
921
922 // Establish the arguments and do initial setup.
923 //
924 // sysv win
925 // inner loop dv rdi rdi*
926 // inner loop av rax rax
927 // inner loop nv rbx* rbx*
928 // mi r9 r10
929 // outer loop dv r10 rcx
930 // outer loop bv rdx r8
931 // av base rsi rdx
932 // av limit r11 r11
933 // bv limit r8 r12*
934 // nv base rcx r9
935 // n r8 r12*
936
937 #if ABI_SYSV
938 # define DV r10
939 # define AV rsi
940 # define AVL r11
941 # define BV rdx
942 # define BVL r8
943 # define NV rcx
944 # define N r8
945 # define MI r9
946
947 pushreg rbx
948 endprologue
949
950 mov DV, rdi
951
952 #endif
953
954 #if ABI_WIN
955 # define DV rcx
956 # define AV rdx
957 # define AVL r11
958 # define BV r8
959 # define BVL r12
960 # define NV r9
961 # define N r12
962 # define MI r10
963
964 pushreg rbx
965 pushreg rdi
966 pushreg r12
967 stalloc 160
968
969 savexmm xmm6, 0
970 savexmm xmm7, 16
971 savexmm xmm8, 32
972 savexmm xmm9, 48
973 savexmm xmm10, 64
974 savexmm xmm11, 80
975 savexmm xmm12, 96
976 savexmm xmm13, 112
977 savexmm xmm14, 128
978 savexmm xmm15, 144
979
980 endprologue
981
982 mov rdi, DV
983 mov N, [rsp + 224]
984 mov MI, [rsp + 232]
985
986 #endif
987
988 // Establish the expanded operands.
989 pxor xmm0, xmm0
990 movdqu xmm8, [BV] // bv[0]
991 movdqu xmm10, [MI] // mi
992 expand xmm0, xmm8, xmm9, xmm10, xmm11
993
994 // Set up the outer loop state and prepare for the first iteration.
995 mov rax, AV // -> U = av[0]
996 mov rbx, NV // -> X = nv[0]
997 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
998 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
999 add BV, 16
1000 add DV, 16
1001 call mmul4
1002 add rdi, 16
1003 add rax, 16
1004 add rbx, 16
1005 cmp rax, AVL // done already?
1006 jae 8f
1007
1008 .p2align 4
1009 // Complete the first inner loop.
1010 0: call dmul4
1011 add rdi, 16
1012 add rax, 16
1013 add rbx, 16
1014 cmp rax, AVL // done yet?
1015 jb 0b
1016
1017 // Still have carries left to propagate.
1018 call carryprop
1019 movd [rdi + 16], xmm12
1020
1021 .p2align 4
1022 // Embark on the next iteration. (There must be one. If n = 1, then
1023 // we would have bailed above, to label 8. Similarly, the subsequent
1024 // iterations can fall into the inner loop immediately.)
1025 1: pxor xmm0, xmm0
1026 movdqu xmm8, [BV] // bv[i]
1027 movdqu xmm10, [MI] // mi
1028 mov rdi, DV // -> Z = dv[i]
1029 mov rax, AV // -> U = av[0]
1030 mov rbx, NV // -> X = nv[0]
1031 expand xmm0, xmm8, xmm9, xmm10, xmm11
1032 add BV, 16
1033 add DV, 16
1034 call mmla4
1035 add rdi, 16
1036 add rax, 16
1037 add rbx, 16
1038
1039 .p2align 4
1040 // Complete the next inner loop.
1041 0: call dmla4
1042 add rdi, 16
1043 add rax, 16
1044 add rbx, 16
1045 cmp rax, AVL
1046 jb 0b
1047
1048 // Still have carries left to propagate, and they overlap the
1049 // previous iteration's final tail, so read that in and add it.
1050 movd xmm0, [rdi]
1051 paddq xmm12, xmm0
1052 call carryprop
1053 movd [rdi + 16], xmm12
1054
1055 // Back again, maybe.
1056 cmp BV, BVL
1057 jb 1b
1058
1059 // All done.
1060 9:
1061
1062 #if ABI_SYSV
1063 popreg rbx
1064 #endif
1065
1066 #if ABI_WIN
1067
1068 rstrxmm xmm6, 0
1069 rstrxmm xmm7, 16
1070 rstrxmm xmm8, 32
1071 rstrxmm xmm9, 48
1072 rstrxmm xmm10, 64
1073 rstrxmm xmm11, 80
1074 rstrxmm xmm12, 96
1075 rstrxmm xmm13, 112
1076 rstrxmm xmm14, 128
1077 rstrxmm xmm15, 144
1078
1079 stfree 160
1080 popreg r12
1081 popreg rdi
1082 popreg rbx
1083
1084 #endif
1085
1086 ret
1087
1088 // First iteration was short. Write out the carries and we're done.
1089 // (This could be folded into the main loop structure, but that would
1090 // penalize small numbers more.)
1091 8: call carryprop
1092 movd [rdi + 16], xmm12
1093 #if ABI_SYSV
1094 popreg rbx
1095 ret
1096 #endif
1097 #if ABI_WIN
1098 jmp 9b
1099 #endif
1100
1101 #undef DV
1102 #undef AV
1103 #undef AVL
1104 #undef BV
1105 #undef BVL
1106 #undef NV
1107 #undef N
1108 #undef MI
1109
1110 ENDFUNC
1111
1112 FUNC(mpxmont_redc4_amd64_avx)
1113 .arch .avx
1114 vzeroupper
1115 endprologue
1116 .arch pentium4
1117 ENDFUNC
1118
1119 FUNC(mpxmont_redc4_amd64_sse2)
1120 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1121 // size_t n, const mpw *mi);
1122
1123 // Establish the arguments and do initial setup.
1124 //
1125 // sysv win
1126 // inner loop dv rdi rdi*
1127 // dv limit rax rax
1128 // blocks-of-4 dv limit rsi rdx
1129 // inner loop nv rbx* rbx*
1130 // mi r8 r10
1131 // outer loop dv r10 rcx
1132 // outer loop dv limit r11 r11
1133 // nv base rdx r8
1134 // nv limit r9 r12*
1135 // n rcx r9
1136 // c rcx r9
1137
1138 #if ABI_SYSV
1139
1140 # define DVL rax
1141 # define DVL4 rsi
1142 # define MI r8
1143 # define DV r10
1144 # define DVLO r11
1145 # define NV rdx
1146 # define NVL r9
1147 # define N rcx
1148 # define C ecx
1149
1150 pushreg rbx
1151 endprologue
1152
1153 mov DV, rdi
1154
1155 #endif
1156
1157 #if ABI_WIN
1158
1159 # define DVL rax
1160 # define DVL4 rdx
1161 # define MI r10
1162 # define DV rcx
1163 # define DVLO r11
1164 # define NV r8
1165 # define NVL r12
1166 # define N r9
1167 # define C r9d
1168
1169 pushreg rbx
1170 pushreg rdi
1171 pushreg r12
1172 stalloc 160
1173
1174 savexmm xmm6, 0
1175 savexmm xmm7, 16
1176 savexmm xmm8, 32
1177 savexmm xmm9, 48
1178 savexmm xmm10, 64
1179 savexmm xmm11, 80
1180 savexmm xmm12, 96
1181 savexmm xmm13, 112
1182 savexmm xmm14, 128
1183 savexmm xmm15, 144
1184
1185 endprologue
1186
1187 mov rdi, DV
1188 mov MI, [rsp + 224]
1189
1190 #endif
1191
1192 // Establish the expanded operands and the blocks-of-4 dv limit.
1193 pxor xmm0, xmm0
1194 mov DVL, DVL4 // -> dv[n] = dv limit
1195 sub DVL4, DV // length of dv in bytes
1196 movdqu xmm8, [MI] // mi
1197 and DVL4, ~15 // mask off the tail end
1198 expand xmm0, xmm8, xmm9
1199 add DVL4, DV // find limit
1200
1201 // Set up the outer loop state and prepare for the first iteration.
1202 mov rbx, NV // -> X = nv[0]
1203 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1204 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1205 add DV, 16
1206 call mont4
1207 add rbx, 16
1208 add rdi, 16
1209 cmp rbx, NVL // done already?
1210 jae 8f
1211
1212 .p2align 4
1213 // Complete the first inner loop.
1214 5: call mla4
1215 add rbx, 16
1216 add rdi, 16
1217 cmp rbx, NVL // done yet?
1218 jb 5b
1219
1220 // Still have carries left to propagate.
1221 8: carryadd
1222 psllq xmm15, 16
1223 pslldq xmm15, 8
1224 paddq xmm14, xmm15
1225 call carryprop
1226 movd C, xmm12
1227 add rdi, 16
1228 cmp rdi, DVL4
1229 jae 7f
1230
1231 .p2align 4
1232 // Continue carry propagation until the end of the buffer.
1233 0: add [rdi], C
1234 mov C, 0 // preserves flags
1235 adcd [rdi + 4], 0
1236 adcd [rdi + 8], 0
1237 adcd [rdi + 12], 0
1238 adc C, 0
1239 add rdi, 16
1240 cmp rdi, DVL4
1241 jb 0b
1242
1243 // Deal with the tail end.
1244 7: add [rdi], C
1245 mov C, 0 // preserves flags
1246 add rdi, 4
1247 adc C, 0
1248 cmp rdi, DVL
1249 jb 7b
1250
1251 // All done for this iteration. Start the next. (This must have at
1252 // least one follow-on iteration, or we'd not have started this outer
1253 // loop.)
1254 8: mov rdi, DV // -> Z = dv[i]
1255 mov rbx, NV // -> X = nv[0]
1256 cmp rdi, DVLO // all done yet?
1257 jae 9f
1258 add DV, 16
1259 call mont4
1260 add rdi, 16
1261 add rbx, 16
1262 jmp 5b
1263
1264 // All over.
1265 9:
1266
1267 #if ABI_SYSV
1268 popreg rbx
1269 #endif
1270
1271 #if ABI_WIN
1272
1273 rstrxmm xmm6, 0
1274 rstrxmm xmm7, 16
1275 rstrxmm xmm8, 32
1276 rstrxmm xmm9, 48
1277 rstrxmm xmm10, 64
1278 rstrxmm xmm11, 80
1279 rstrxmm xmm12, 96
1280 rstrxmm xmm13, 112
1281 rstrxmm xmm14, 128
1282 rstrxmm xmm15, 144
1283
1284 stfree 160
1285 popreg r12
1286 popreg rdi
1287 popreg rbx
1288
1289 #endif
1290
1291 ret
1292
1293 #undef DVL
1294 #undef DVL4
1295 #undef MI
1296 #undef DV
1297 #undef DVLO
1298 #undef NV
1299 #undef NVL
1300 #undef N
1301 #undef C
1302
1303 ENDFUNC
1304
1305 ///--------------------------------------------------------------------------
1306 /// Testing and performance measurement.
1307
1308 #ifdef TEST_MUL4
1309
1310 #if ABI_SYSV
1311 # define ARG0 rdi
1312 # define ARG1 rsi
1313 # define ARG2 rdx
1314 # define ARG3 rcx
1315 # define ARG4 r8
1316 # define ARG5 r9
1317 # define ARG6 STKARG(0)
1318 # define ARG7 STKARG(1)
1319 # define ARG8 STKARG(2)
1320 # define STKARG_OFFSET 16
1321 #endif
1322 #if ABI_WIN
1323 # define ARG0 rcx
1324 # define ARG1 rdx
1325 # define ARG2 r8
1326 # define ARG3 r9
1327 # define ARG4 STKARG(0)
1328 # define ARG5 STKARG(1)
1329 # define ARG6 STKARG(2)
1330 # define ARG7 STKARG(3)
1331 # define ARG8 STKARG(4)
1332 # define STKARG_OFFSET 224
1333 #endif
1334 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
1335
1336 // sysv win
1337 // dmul smul mmul mont dmul smul mmul mont
1338 // A rax
1339 // D rdx
1340 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1341 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1342 // y r10 -- -- rdx rdx -- -- r8 r8
1343 // u r11 rdx -- rcx -- r8 -- r9 --
1344 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1345 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1346 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1347 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1348 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1349
1350 .macro cysetup v, n
1351 rdtsc
1352 shl rdx, 32
1353 or rax, rdx
1354 mov [\v + 8*\n - 8], rax
1355 .endm
1356
1357 .macro cystore v, n
1358 rdtsc
1359 shl rdx, 32
1360 or rax, rdx
1361 sub rax, [\v + 8*\n - 8]
1362 mov [\v + 8*\n - 8], rax
1363 dec \n
1364 .endm
1365
1366 .macro testprologue mode
1367 pushreg rbx
1368 #if ABI_SYSV
1369 endprologue
1370 .ifeqs "\mode", "dmul"
1371 mov rbx, rcx
1372 movdqu xmm8, [r8]
1373 movdqu xmm10, [r9]
1374 mov r8d, STKARG(0)
1375 mov r9, STKARG(1)
1376 mov r11, rdx
1377 mov rcx, rsi
1378 .endif
1379 .ifeqs "\mode", "smul"
1380 mov rbx, rdx
1381 movdqu xmm10, [rcx]
1382 mov rcx, rsi
1383 .endif
1384 .ifeqs "\mode", "mmul"
1385 mov rax, STKARG(0)
1386 mov rbx, r8
1387 movdqu xmm8, [r9]
1388 movdqu xmm10, [rax]
1389 mov r8d, STKARG(1)
1390 mov r9, STKARG(2)
1391 mov r10, rdx
1392 mov r11, rcx
1393 mov rcx, rsi
1394 .endif
1395 .ifeqs "\mode", "mont"
1396 mov rbx, rcx
1397 movdqu xmm8, [r8]
1398 mov r8d, r9d
1399 mov r9, STKARG(0)
1400 mov r10, rdx
1401 mov rcx, rsi
1402 .endif
1403 #endif
1404 #if ABI_WIN
1405 pushreg rdi
1406 stalloc 168
1407 savexmm xmm6, 0
1408 savexmm xmm7, 16
1409 savexmm xmm8, 32
1410 savexmm xmm9, 48
1411 savexmm xmm10, 64
1412 savexmm xmm11, 80
1413 savexmm xmm12, 96
1414 savexmm xmm13, 112
1415 savexmm xmm14, 128
1416 savexmm xmm15, 144
1417 endprologue
1418 .ifeqs "\mode", "dmul"
1419 mov r10, STKARG(0)
1420 mov r11, STKARG(1)
1421 mov rdi, rcx
1422 mov rcx, rdx
1423 mov rbx, r9
1424 movdqu xmm8, [r10]
1425 movdqu xmm10, [r11]
1426 mov r11, r8
1427 mov r8d, STKARG(2)
1428 mov r9, STKARG(3)
1429 .endif
1430 .ifeqs "\mode", "smul"
1431 mov rdi, rcx
1432 mov rcx, rdx
1433 mov rbx, r8
1434 movdqu xmm10, [r9]
1435 mov r8d, STKARG(0)
1436 mov r9, STKARG(1)
1437 .endif
1438 .ifeqs "\mode", "mmul"
1439 mov r10, STKARG(1)
1440 mov r11, STKARG(2)
1441 mov rdi, rcx
1442 mov rcx, rdx
1443 mov rbx, STKARG(0)
1444 movdqu xmm8, [r10]
1445 movdqu xmm10, [r11]
1446 mov r10, r8
1447 mov r11, r9
1448 mov r8d, STKARG(3)
1449 mov r9, STKARG(4)
1450 .endif
1451 .ifeqs "\mode", "mont"
1452 mov r10, STKARG(0)
1453 mov rdi, rcx
1454 mov rcx, rdx
1455 mov rbx, r9
1456 movdqu xmm8, [r10]
1457 mov r10, r8
1458 mov r8d, STKARG(1)
1459 mov r9, STKARG(2)
1460 .endif
1461 #endif
1462
1463 pxor xmm0, xmm0
1464 .ifeqs "\mode", "dmul"
1465 expand xmm0, xmm8, xmm9, xmm10, xmm11
1466 .endif
1467 .ifeqs "\mode", "smul"
1468 expand xmm0, xmm10, xmm11
1469 .endif
1470 .ifeqs "\mode", "mmul"
1471 expand xmm0, xmm8, xmm9, xmm10, xmm11
1472 .endif
1473 .ifeqs "\mode", "mont"
1474 expand xmm0, xmm8, xmm9
1475 .endif
1476 .endm
1477
1478 .macro testepilogue
1479 #if ABI_WIN
1480 rstrxmm xmm6, 0
1481 rstrxmm xmm7, 16
1482 rstrxmm xmm8, 32
1483 rstrxmm xmm9, 48
1484 rstrxmm xmm10, 64
1485 rstrxmm xmm11, 80
1486 rstrxmm xmm12, 96
1487 rstrxmm xmm13, 112
1488 rstrxmm xmm14, 128
1489 rstrxmm xmm15, 144
1490 stfree 168
1491 popreg rdi
1492 #endif
1493 popreg rbx
1494 ret
1495 .endm
1496
1497 .macro testldcarry
1498 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1499 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1500 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1501 .endm
1502
1503 .macro testtop u=nil
1504 .p2align 4
1505 0:
1506 cysetup r9, r8
1507 .ifnes "\u", "nil"
1508 mov rax, \u
1509 .endif
1510 .endm
1511
1512 .macro testtail
1513 cystore r9, r8
1514 jnz 0b
1515 .endm
1516
1517 .macro testcarryout
1518 movdqu [rcx + 0], xmm12
1519 movdqu [rcx + 16], xmm13
1520 movdqu [rcx + 32], xmm14
1521 .endm
1522
1523 FUNC(test_dmul4)
1524 testprologue dmul
1525 testldcarry
1526 testtop r11
1527 call dmul4
1528 testtail
1529 testcarryout
1530 testepilogue
1531 ENDFUNC
1532
1533 FUNC(test_dmla4)
1534 testprologue dmul
1535 testldcarry
1536 testtop r11
1537 call dmla4
1538 testtail
1539 testcarryout
1540 testepilogue
1541 ENDFUNC
1542
1543 FUNC(test_mul4)
1544 testprologue smul
1545 testldcarry
1546 testtop nil
1547 call mul4
1548 testtail
1549 testcarryout
1550 testepilogue
1551 ENDFUNC
1552
1553 FUNC(test_mla4)
1554 testprologue smul
1555 testldcarry
1556 testtop nil
1557 call mla4
1558 testtail
1559 testcarryout
1560 testepilogue
1561 ENDFUNC
1562
1563 FUNC(test_mmul4)
1564 testprologue mmul
1565 testtop r11
1566 call mmul4
1567 testtail
1568 movdqu [r10 + 0], xmm10
1569 movdqu [r10 + 16], xmm11
1570 testcarryout
1571 testepilogue
1572 ENDFUNC
1573
1574 FUNC(test_mmla4)
1575 testprologue mmul
1576 testtop r11
1577 call mmla4
1578 testtail
1579 movdqu [r10 + 0], xmm10
1580 movdqu [r10 + 16], xmm11
1581 testcarryout
1582 testepilogue
1583 ENDFUNC
1584
1585 FUNC(test_mont4)
1586 testprologue mont
1587 testtop
1588 call mont4
1589 testtail
1590 movdqu [r10 + 0], xmm10
1591 movdqu [r10 + 16], xmm11
1592 testcarryout
1593 testepilogue
1594 ENDFUNC
1595
1596 #endif
1597
1598 ///----- That's all, folks --------------------------------------------------