1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 ///--------------------------------------------------------------------------
34 /// Prologue.
35
36 .arch pentium4
37 .text
38
39 ///--------------------------------------------------------------------------
40 /// Theory.
41 ///
42 /// We define a number of primitive fixed-size multipliers from which we can
43 /// construct more general variable-length multipliers.
44 ///
45 /// The basic trick is the same throughout. In an operand-scanning
46 /// multiplication, the inner multiplication loop multiplies a
47 /// multiple-precision operand by a single precision factor, and adds the
48 /// result, appropriately shifted, to the result. A `finely integrated
49 /// operand scanning' implementation of Montgomery multiplication also adds
50 /// the product of a single-precision `Montgomery factor' and the modulus,
51 /// calculated in the same pass. The more common `coarsely integrated
52 /// operand scanning' alternates main multiplication and Montgomery passes,
53 /// which requires additional carry propagation.
54 ///
55 /// Throughout both plain-multiplication and Montgomery stages, then, one of
56 /// the factors remains constant throughout the operation, so we can afford
57 /// to take a little time to preprocess it. The transformation we perform is
58 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
59 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
60 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
61 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
62 /// operands, as follows.
63 ///
64 /// Offset 0 4 8 12
65 /// 0 v'_0 v'_1 v''_0 v''_1
66 /// 16 v'_2 v'_3 v''_2 v''_3
67 ///
68 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
69 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
70 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
71 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
72 /// results in 64-bit fields. The sixteen bits of headroom allow us to add
73 /// many products together before we must deal with carrying; it also allows
74 /// for some calculations to be performed on the above expanded form.
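/// As a rough scalar illustration (not assembled; the C names `expand128'
/// and `mul_even' are invented for this sketch only), the expansion and a
/// single `pmuludq'-style step look like this:
///
///	#include <stdint.h>
///
///	/* lo = (v'_0, v'_1, v''_0, v''_1); hi = (v'_2, v'_3, v''_2, v''_3) */
///	static void expand128(const uint32_t v[4],
///			      uint32_t lo[4], uint32_t hi[4])
///	{
///	  lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff;
///	  lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;
///	  hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff;
///	  hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;
///	}
///
///	/* What one `pmuludq' of a broadcast 32-bit scalar r against LO
///	 * computes: two 48-bit products in 64-bit fields.  */
///	static void mul_even(uint32_t r, const uint32_t s[4], uint64_t d[2])
///	  { d[0] = (uint64_t)r*s[0]; d[1] = (uint64_t)r*s[2]; }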
75 ///
76 /// ...
77 ///
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
91 /// zeroed becomes c3.
92
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
95
96 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
97 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
98 // of the product in registers D0, D1, D2, D3.
99 pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
100 .ifnes "\d1", "nil"
101 movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1)
102 .endif
103 .ifnes "\d3", "nil"
104 movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3)
105 .endif
106 .ifnes "\d1", "nil"
107 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
108 .endif
109 .ifnes "\d2", "nil"
110 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
111 .endif
112 .ifnes "\d3", "nil"
113 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
114 .endif
115 .ifnes "\d1", "nil"
116 pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
117 .endif
118 .ifnes "\d3", "nil"
119 pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
120 .endif
121 .ifnes "\d2", "nil"
122 pmuludq \d2, \shi // (r_i s'_2, r_i s''_2)
123 .endif
124 pmuludq \d0, \slo // (r_i s'_0, r_i s''_0)
125 .endm
126
127 .macro accum c0, c1=nil, c2=nil, c3=nil
128 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
129 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
130 // updating that register.
131 paddq \c0, xmm0
132 .ifnes "\c1", "nil"
133 paddq \c1, xmm1
134 .endif
135 .ifnes "\c2", "nil"
136 paddq \c2, xmm2
137 .endif
138 .ifnes "\c3", "nil"
139 paddq \c3, xmm3
140 .endif
141 .endm
142
143 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
144 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
145 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
146 // contains zero, but needs clearing; in practice, we store the
147 // product directly rather than attempting to add. On completion,
148 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
149 .ifeqs "\z3p", "t"
150 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
151 accum \c0, \c1, \c2
152 .else
153 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
154 accum \c0, \c1, \c2, \c3
155 .endif
156 .endm
157
158 .macro propout d, pos, c, cc=nil
159 // Calculate an output word from C, and store it at POS in D;
160 // propagate carries out from C to CC in preparation for a rotation
161 // of the carry registers. D is an XMM register; POS is either
162 // `lo' or `hi' according to whether the output word should be in
163 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
164 // completion, XMM3 is clobbered. If CC is `nil', then the
165 // contribution which would have been added to it is left in C.
166 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
167 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
168 pslldq xmm3, 2 // (t b, 0)
169 paddq \c, xmm3 // (c' + t b, c'')
170 .ifeqs "\pos", "lo"
171 movdqa \d, \c
172 .else
173 punpckldq \d, \c
174 .endif
175 psrlq \c, 32 // floor(c/B)
176 .ifnes "\cc", "nil"
177 paddq \cc, \c // propagate up
178 .endif
179 .endm
180
181 .macro endprop d, pos, c, t
182 // On entry, C contains a carry register. On exit, the low 32 bits
183 // of the value represented in C are written at POS in D, and the
184 // remaining bits are left at the bottom of T.
185 movdqa \t, \c
186 psllq \t, 16 // (?, c'' b)
187 pslldq \c, 8 // (0, c')
188 paddq \t, \c // (?, c' + c'' b)
189 psrldq \t, 8 // c' + c'' b
190 .ifeqs "\pos", "lo"
191 movdqa \d, \t
192 .else
193 punpckldq \d, \t
194 .endif
195 psrldq \t, 4 // floor((c' + c'' b)/B)
196 .endm
197
198 .macro expand z, a, b, c=nil, d=nil
199 // On entry, A and C hold packed 128-bit values, and Z is zero. On
200 // exit, A:B and C:D together hold the same values in expanded
201 // form. If C is `nil', then only expand A to A:B.
202 movdqa \b, \a // (a_0, a_1, a_2, a_3)
203 .ifnes "\c", "nil"
204 movdqa \d, \c // (c_0, c_1, c_2, c_3)
205 .endif
206 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
207 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
208 .ifnes "\c", "nil"
209 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
210 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
211 .endif
212 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
213 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
214 .ifnes "\c", "nil"
215 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
216 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
217 .endif
218 .endm
219
220 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
221 // On entry, C0, C1, C2, C3 are carry registers representing a value
222 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
223 // C3, T, and U are clobbered; and the high bits of Y are stored in
224 // HI, if this is not `nil'.
225
226 // The first step is to eliminate the `double-prime' pieces -- i.e.,
227 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
228 // them into the 32-bit-aligned pieces above and below. But before
229 // we can do that, we must gather them together.
230 movdqa \t, \c0
231 movdqa \u, \c1
232 punpcklqdq \t, \c2 // (y'_0, y'_2)
233 punpckhqdq \c0, \c2 // (y''_0, y''_2)
234 punpcklqdq \u, \c3 // (y'_1, y'_3)
235 punpckhqdq \c1, \c3 // (y''_1, y''_3)
236
237 // Now split the double-prime pieces. The high (up to) 48 bits will
238 // go up; the low 16 bits go down.
239 movdqa \c2, \c0
240 movdqa \c3, \c1
241 psllq \c2, 48
242 psllq \c3, 48
243 psrlq \c0, 16 // high parts of (y''_0, y''_2)
244 psrlq \c1, 16 // high parts of (y''_1, y''_3)
245 psrlq \c2, 32 // low parts of (y''_0, y''_2)
246 psrlq \c3, 32 // low parts of (y''_1, y''_3)
247 .ifnes "\hi", "nil"
248 movdqa \hi, \c1
249 .endif
250 pslldq \c1, 8 // high part of (0, y''_1)
251
252 paddq \t, \c2 // propagate down
253 paddq \u, \c3
254 paddq \t, \c1 // and up: (y_0, y_2)
255 paddq \u, \c0 // (y_1, y_3)
256 .ifnes "\hi", "nil"
257 psrldq \hi, 8 // high part of (y''_3, 0)
258 .endif
259
260 // Finally extract the answer. This complicated dance is better than
261 // storing to memory and loading, because the piecemeal stores
262 // inhibit store forwarding.
263 movdqa \c3, \t // (y_0, y_1)
264 movdqa \lo, \t // (y^*_0, ?, ?, ?)
265 psrldq \t, 8 // (y_2, 0)
266 psrlq \c3, 32 // (floor(y_0/B), ?)
267 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
268 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
269 psrldq \u, 8 // (y_3, 0)
270 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
271 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
272 punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
275 .ifnes "\hi", "nil"
276 movdqa \t, \c3
277 pxor \u, \u
278 .endif
279 punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
280 .ifnes "\hi", "nil"
281 psrlq \t, 32 // very high bits of y
282 paddq \hi, \t
283 punpcklqdq \hi, \u // carry up
284 .endif
285 punpckldq \lo, \c1 // y mod B^4
286 .endm
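
// As a hypothetical scalar model of what `squash' computes, using GCC's
// unsigned __int128 purely for brevity; the names below are invented for
// this sketch:
//
//	#include <stdint.h>
//
//	/* y_lo[i] = y'_i, y_hi[i] = y''_i, so that
//	 * Y = SUM_i (y'_i + y''_i b) B^i.  */
//	static void squash_model(const uint64_t y_lo[4],
//				 const uint64_t y_hi[4],
//				 uint32_t lo[4], uint64_t *hi)
//	{
//	  unsigned __int128 c = 0;
//	  for (int i = 0; i < 4; i++) {
//	    c += y_lo[i] + ((unsigned __int128)y_hi[i] << 16);
//	    lo[i] = (uint32_t)c;	/* y^*_i = i'th base-B digit */
//	    c >>= 32;
//	  }
//	  if (hi) *hi = (uint64_t)c;	/* bits of Y above B^4 */
//	}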
287
288 .macro carryadd
289 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
290 // hold the incoming carry registers c0, c1, and c2 representing a
291 // carry-in C.
292 //
293 // On exit, the carry registers, including XMM15, are updated to hold
294 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
295 // registers are preserved.
296 movd xmm0, [rdi + 0] // (a_0, 0)
297 movd xmm1, [rdi + 4] // (a_1, 0)
298 movd xmm2, [rdi + 8] // (a_2, 0)
299 movd xmm15, [rdi + 12] // (a_3, 0)
300 paddq xmm12, xmm0 // (c'_0 + a_0, c''_0)
301 paddq xmm13, xmm1 // (c'_1 + a_1, c''_1)
302 paddq xmm14, xmm2 // (c'_2 + a_2, c''_2); a_3 is left in xmm15
303 .endm
304
305 ///--------------------------------------------------------------------------
306 /// Primitive multipliers and related utilities.
307
308 INTFUNC(carryprop)
309 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
310 // expanded form. Store the low 128 bits of the represented carry to
311 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
312 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
313 // XMM14 are clobbered.
314 endprologue
315
316 propout xmm0, lo, xmm12, xmm13
317 propout xmm1, lo, xmm13, xmm14
318 propout xmm0, hi, xmm14, nil
319 endprop xmm1, hi, xmm14, xmm12
320 punpckldq xmm0, xmm1
321 movdqu [rdi], xmm0
322
323 ret
324
325 ENDFUNC
326
327 INTFUNC(dmul4)
328 // On entry, RDI points to the destination buffer; RAX and RBX point
329 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
330 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
331 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
332 //
333 // On exit, we write the low 128 bits of the sum C + U V + X Y to
334 // [RDI], and update the carry registers with the carry out. The
335 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
336 // registers are preserved.
337 endprologue
338
339 movdqu xmm4, [rax]
340 movdqu xmm5, [rbx]
341
342 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
343 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
344 propout xmm6, lo, xmm12, xmm13
345
346 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
347 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
348 propout xmm7, lo, xmm13, xmm14
349
350 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
351 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
352 propout xmm6, hi, xmm14, xmm15
353
354 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
355 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
356 propout xmm7, hi, xmm15, xmm12
357
358 punpckldq xmm6, xmm7
359 movdqu [rdi], xmm6
360
361 ret
362
363 ENDFUNC
364
365 INTFUNC(dmla4)
366 // On entry, RDI points to the destination buffer, which also
367 // contains an addend A to accumulate; RAX and RBX point to the
368 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
369 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
370 // incoming carry registers c0, c1, and c2 representing a carry-in C;
371 // c3 is assumed to be zero.
372 //
373 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
374 // [RDI], and update the carry registers with the carry out. The
375 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
376 // registers are preserved.
377 endprologue
378
379 movdqu xmm4, [rax]
380 movdqu xmm5, [rbx]
381 carryadd
382
383 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
384 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
385 propout xmm6, lo, xmm12, xmm13
386
387 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
388 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
389 propout xmm7, lo, xmm13, xmm14
390
391 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
392 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
393 propout xmm6, hi, xmm14, xmm15
394
395 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
396 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
397 propout xmm7, hi, xmm15, xmm12
398
399 punpckldq xmm6, xmm7
400 movdqu [rdi], xmm6
401
402 ret
403
404 ENDFUNC
405
406 INTFUNC(mul4zc)
407 // On entry, RDI points to the destination buffer; RBX points to a
408 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
409 //
410 // On exit, we write the low 128 bits of the product X Y to [RDI],
411 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
412 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
413 // general-purpose registers are preserved.
414 endprologue
415
416 movdqu xmm5, [rbx]
417
418 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
419 propout xmm6, lo, xmm12, xmm13
420
421 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
422 propout xmm7, lo, xmm13, xmm14
423
424 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
425 propout xmm6, hi, xmm14, xmm15
426
427 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
428 propout xmm7, hi, xmm15, xmm12
429
430 punpckldq xmm6, xmm7
431 movdqu [rdi], xmm6
432
433 ret
434
435 ENDFUNC
436
437 INTFUNC(mul4)
438 // On entry, RDI points to the destination buffer; RBX points to a
439 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
440 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
441 // c2, representing a carry-in C; c3 is assumed to be zero.
442 //
443 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
444 // and update the carry registers with the carry out. The registers
445 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
446 // general-purpose registers are preserved.
447 endprologue
448
449 movdqu xmm5, [rbx]
450
451 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
452 propout xmm6, lo, xmm12, xmm13
453
454 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
455 propout xmm7, lo, xmm13, xmm14
456
457 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
458 propout xmm6, hi, xmm14, xmm15
459
460 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
461 propout xmm7, hi, xmm15, xmm12
462
463 punpckldq xmm6, xmm7
464 movdqu [rdi], xmm6
465
466 ret
467
468 ENDFUNC
469
470 INTFUNC(mla4zc)
471 // On entry, RDI points to the destination buffer, which also
472 // contains an addend A to accumulate; RBX points to a packed operand
473 // X; and XMM10/XMM11 hold an expanded operand Y.
474 //
475 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
476 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
477 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
478 // general-purpose registers are preserved.
479 endprologue
480
481 movdqu xmm5, [rbx]
482 movd xmm12, [rdi + 0]
483 movd xmm13, [rdi + 4]
484 movd xmm14, [rdi + 8]
485 movd xmm15, [rdi + 12]
486
487 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
488 propout xmm6, lo, xmm12, xmm13
489
490 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
491 propout xmm7, lo, xmm13, xmm14
492
493 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
494 propout xmm6, hi, xmm14, xmm15
495
496 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
497 propout xmm7, hi, xmm15, xmm12
498
499 punpckldq xmm6, xmm7
500 movdqu [rdi], xmm6
501
502 ret
503
504 ENDFUNC
505
506 INTFUNC(mla4)
507 // On entry, RDI points to the destination buffer, which also
508 // contains an addend A to accumulate; RBX points to a packed operand
509 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
510 // XMM14 hold the incoming carry registers c0, c1, and c2,
511 // representing a carry-in C; c3 is assumed to be zero.
512 //
513 // On exit, we write the low 128 bits of the sum A + C + X Y to
514 // [RDI], and update the carry registers with the carry out. The
515 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
516 // general-purpose registers are preserved.
517 endprologue
518
519 movdqu xmm5, [rbx]
520 carryadd
521
522 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
523 propout xmm6, lo, xmm12, xmm13
524
525 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
526 propout xmm7, lo, xmm13, xmm14
527
528 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
529 propout xmm6, hi, xmm14, xmm15
530
531 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
532 propout xmm7, hi, xmm15, xmm12
533
534 punpckldq xmm6, xmm7
535 movdqu [rdi], xmm6
536
537 ret
538
539 ENDFUNC
540
541 INTFUNC(mmul4)
542 // On entry, RDI points to the destination buffer; RAX and RBX point
543 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
544 // the expanded operands V and M. The stack pointer must be 8 modulo 16
545 // (as usual for AMD64 ABIs).
546 //
547 // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
548 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
549 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
550 // XMM15 are clobbered; the general-purpose registers are preserved.
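//
// The defining property, shown at single-word scale (a hedged
// illustration only; the routine below works on 4-word blocks, and the
// name `mont_step' is invented for this sketch): with
// m = -n^{-1} mod 2^32, choosing y = u v m makes u v + n y divisible
// by 2^32.
//
//	#include <stdint.h>
//
//	static uint64_t mont_step(uint32_t u, uint32_t v,
//				  uint32_t n, uint32_t m)
//	{
//	  unsigned __int128 t = (uint64_t)u*v;
//	  uint32_t y = (uint32_t)t*m;		/* y = u v m mod 2^32 */
//	  t += (unsigned __int128)n*y;		/* low 32 bits now zero */
//	  return (uint64_t)(t >> 32);		/* carry out */
//	}
//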
551 movdqu xmm4, [rax]
552 #if ABI_WIN
553 stalloc 48 + 8 // space for the carries
554 #endif
555 endprologue
556
557 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
558 // for later.
559 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
560 propout xmm7, lo, xmm12, xmm13
561 jmp 5f
562
563 ENDFUNC
564
565 INTFUNC(mmla4)
566 // On entry, RDI points to the destination buffer, which also
567 // contains an addend A to accumulate; RAX and RBX point to the
568 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
569 // expanded operands V and M. The stack pointer must be 8 modulo 16
570 // (as usual for AMD64 ABIs).
571 //
572 // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
573 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
574 // remaining carry in XMM12, XMM13, and XMM14. The registers
575 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
576 // are preserved.
577 movdqu xmm4, [rax]
578 #if ABI_WIN
579 stalloc 48 + 8 // space for the carries
580 # define STKTMP(i) [rsp + i]
581 #endif
582 #if ABI_SYSV
583 # define STKTMP(i) [rsp + i - 48 - 8] // use red zone
584 #endif
585 endprologue
586
587 movd xmm12, [rdi + 0]
588 movd xmm13, [rdi + 4]
589 movd xmm14, [rdi + 8]
590 movd xmm15, [rdi + 12]
591
592 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
593 // for later.
594 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
595 propout xmm7, lo, xmm12, xmm13
596
597 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
598 propout xmm6, lo, xmm13, xmm14
599
600 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
601 propout xmm7, hi, xmm14, xmm15
602
603 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
604 propout xmm6, hi, xmm15, xmm12
605
606 // Prepare W, and stash carries for later.
607 punpckldq xmm7, xmm6
608 movdqa STKTMP( 0), xmm12
609 movdqa STKTMP(16), xmm13
610 movdqa STKTMP(32), xmm14
611
612 // Calculate Y = W M. We just about have enough spare registers to
613 // make this work.
614 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
615
616 // Start expanding W back into the main carry registers...
617 pxor xmm15, xmm15
618 movdqa xmm12, xmm7
619 movdqa xmm14, xmm7
620
621 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
622 accum xmm4, xmm5, xmm6
623
624 punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
625 punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
626
627 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
628 accum xmm5, xmm6
629
630 pxor xmm2, xmm2
631 movdqa xmm13, xmm12
632 movdqa xmm15, xmm14
633
634 mulcore xmm7, 3, xmm10, xmm11, xmm0
635 accum xmm6
636
637 punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
638 punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
639 punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
640 punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
641
642 // That's lots of pieces. Now we have to assemble the answer.
643 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
644
645 // Expand it.
646 movdqu xmm5, [rbx]
647 expand xmm2, xmm10, xmm11
648
649 // Finish the calculation by adding the Montgomery product.
650 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
651 propout xmm6, lo, xmm12, xmm13
652
653 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
654 propout xmm7, lo, xmm13, xmm14
655
656 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
657 propout xmm6, hi, xmm14, xmm15
658
659 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
660 propout xmm7, hi, xmm15, xmm12
661
662 punpckldq xmm6, xmm7
663
664 // Add on the carry we calculated earlier.
665 paddq xmm12, STKTMP( 0)
666 paddq xmm13, STKTMP(16)
667 paddq xmm14, STKTMP(32)
668
669 // And, with that, we're done.
670 movdqu [rdi], xmm6
671 #if ABI_WIN
672 stfree 56
673 #endif
674 ret
675
676 #undef STKTMP
677
678 ENDFUNC
679
680 INTFUNC(mont4)
681 // On entry, RDI points to the destination buffer holding a packed
682 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
683 // expanded operand M.
684 //
685 // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
686 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
687 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
688 // and XMM15 are clobbered; the general-purpose registers are
689 // preserved.
690 endprologue
691
692 movdqu xmm7, [rdi]
693
694 // Calculate Y = W M. Avoid the standard carry registers, because
695 // we're setting something else up there.
696 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
697
698 // Start expanding W back into the main carry registers...
699 pxor xmm15, xmm15
700 movdqa xmm12, xmm7
701 movdqa xmm14, xmm7
702
703 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
704 accum xmm4, xmm5, xmm6
705
706 punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
707 punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
708
709 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
710 accum xmm5, xmm6
711
712 pxor xmm2, xmm2
713 movdqa xmm13, xmm12
714 movdqa xmm15, xmm14
715
716 mulcore xmm7, 3, xmm8, xmm9, xmm0
717 accum xmm6
718
719 punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
720 punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
721 punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
722 punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
723
724 // That's lots of pieces. Now we have to assemble the answer.
725 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
726
727 // Expand it.
728 movdqu xmm5, [rbx]
729 expand xmm2, xmm10, xmm11
730
731 // Finish the calculation by adding the Montgomery product.
732 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
733 propout xmm6, lo, xmm12, xmm13
734
735 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
736 propout xmm7, lo, xmm13, xmm14
737
738 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
739 propout xmm6, hi, xmm14, xmm15
740
741 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
742 propout xmm7, hi, xmm15, xmm12
743
744 punpckldq xmm6, xmm7
745
746 // And, with that, we're done.
747 movdqu [rdi], xmm6
748 ret
749
750 ENDFUNC
751
752 ///--------------------------------------------------------------------------
753 /// Bulk multipliers.
754
755 FUNC(mpx_umul4_amd64_sse2)
756 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
757 // const mpw *bv, const mpw *bvl);
758
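// A rough word-at-a-time C model of the structure this routine
// implements (hedged; the assembler works on 4-word blocks, expanding
// each block of bv once per pass, and the name `umul_model' is invented
// here):
//
//	#include <stdint.h>
//	#include <stddef.h>
//
//	static void umul_model(uint32_t *dv,
//			       const uint32_t *av, const uint32_t *avl,
//			       const uint32_t *bv, const uint32_t *bvl)
//	{
//	  size_t n = avl - av, m = bvl - bv;
//	  for (size_t i = 0; i < m; i++) {	/* one pass per bv word */
//	    uint64_t c = 0;
//	    for (size_t j = 0; j < n; j++) {
//	      c += (uint64_t)av[j]*bv[i] + (i ? dv[i + j] : 0);
//	      dv[i + j] = (uint32_t)c; c >>= 32;
//	    }
//	    dv[i + n] = (uint32_t)c;		/* leftover carry; no tail */
//	  }
//	}
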
759 // Establish the arguments and do initial setup.
760 //
761 // sysv win
762 // inner loop dv rdi rdi*
763 // inner loop av rbx* rbx*
764 // outer loop dv r10 rcx
765 // outer loop bv rcx r9
766 // av base rsi rdx
767 // av limit rdx r8
768 // bv limit r8 r10
769
770 #if ABI_SYSV
771 # define DV r10
772 # define AV rsi
773 # define AVL rdx
774 # define BV rcx
775 # define BVL r8
776
777 pushreg rbx
778 endprologue
779
780 mov DV, rdi
781
782 #endif
783
784 #if ABI_WIN
785 # define DV rcx
786 # define AV rdx
787 # define AVL r8
788 # define BV r9
789 # define BVL r10
790
791 pushreg rbx
792 pushreg rdi
793 stalloc 160 + 8
794
795 savexmm xmm6, 0
796 savexmm xmm7, 16
797 savexmm xmm8, 32
798 savexmm xmm9, 48
799 savexmm xmm10, 64
800 savexmm xmm11, 80
801 savexmm xmm12, 96
802 savexmm xmm13, 112
803 savexmm xmm14, 128
804 savexmm xmm15, 144
805
806 endprologue
807
808 mov rdi, DV
809 mov BVL, [rsp + 224]
810
811 #endif
812
813 // Prepare for the first iteration.
814 pxor xmm0, xmm0
815 movdqu xmm10, [BV] // bv[0]
816 mov rbx, AV
817 add DV, 16
818 add BV, 16
819 expand xmm0, xmm10, xmm11
820 call mul4zc
821 add rbx, 16
822 add rdi, 16
823 cmp rbx, AVL // all done?
824 jae 8f
825
826 .p2align 4
827 // Continue with the first iteration.
828 0: call mul4
829 add rbx, 16
830 add rdi, 16
831 cmp rbx, AVL // all done?
832 jb 0b
833
834 // Write out the leftover carry. There can be no tail here.
835 8: call carryprop
836 cmp BV, BVL // more passes to do?
837 jae 9f
838
839 .p2align 4
840 // Set up for the next pass.
841 1: movdqu xmm10, [BV] // bv[i]
842 mov rdi, DV // -> dv[i]
843 pxor xmm0, xmm0
844 expand xmm0, xmm10, xmm11
845 mov rbx, AV // -> av[0]
846 add DV, 16
847 add BV, 16
848 call mla4zc
849 add rbx, 16
850 add rdi, 16
851 cmp rbx, AVL // done yet?
852 jae 8f
853
854 .p2align 4
855 // Continue...
856 0: call mla4
857 add rbx, 16
858 add rdi, 16
859 cmp rbx, AVL
860 jb 0b
861
862 // Finish off this pass. There was no tail on the previous pass, and
863 // there can be none on this pass.
864 8: call carryprop
865 cmp BV, BVL
866 jb 1b
867
868 // All over.
869 9:
870
871 #if ABI_SYSV
872 popreg rbx
873 #endif
874
875 #if ABI_WIN
876
877 rstrxmm xmm6, 0
878 rstrxmm xmm7, 16
879 rstrxmm xmm8, 32
880 rstrxmm xmm9, 48
881 rstrxmm xmm10, 64
882 rstrxmm xmm11, 80
883 rstrxmm xmm12, 96
884 rstrxmm xmm13, 112
885 rstrxmm xmm14, 128
886 rstrxmm xmm15, 144
887
888 stfree 160 + 8
889 popreg rdi
890 popreg rbx
891
892 #endif
893
894 ret
895
896 #undef DV
897 #undef AV
898 #undef AVL
899 #undef BV
900 #undef BVL
901
902 ENDFUNC
903
904 FUNC(mpxmont_mul4_amd64_sse2)
905 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
906 // const mpw *nv, size_t n, const mpw *mi);
907
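// A rough word-at-a-time C model of the coarsely-integrated structure
// this routine implements (hedged; the assembler works on 4-word blocks,
// mi is the low block of -nv^{-1} mod 2^128, and the name
// `mont_mul_model' is invented here). Assume dv has at least 2n + 1
// words, initially zero; in this model the reduced product, possibly
// still needing one final subtraction of nv, accumulates at dv[n].
//
//	#include <stdint.h>
//	#include <stddef.h>
//
//	static void mont_mul_model(uint32_t *dv,
//				   const uint32_t *av, const uint32_t *bv,
//				   const uint32_t *nv, size_t n, uint32_t mi)
//	{
//	  for (size_t i = 0; i < n; i++) {
//	    uint32_t y = (uint32_t)(dv[i] + (uint64_t)av[0]*bv[i])*mi;
//	    unsigned __int128 c = 0;
//	    for (size_t j = 0; j < n; j++) {
//	      c += dv[i + j] + (uint64_t)av[j]*bv[i] + (uint64_t)nv[j]*y;
//	      dv[i + j] = (uint32_t)c; c >>= 32;
//	    }
//	    /* dv[i] is now zero: y was chosen to kill the low word */
//	    c += dv[i + n];		/* overlaps the previous pass's tail */
//	    dv[i + n] = (uint32_t)c;
//	    dv[i + n + 1] = (uint32_t)(c >> 32);
//	  }
//	}
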
908 // Establish the arguments and do initial setup.
909 //
910 // sysv win
911 // inner loop dv rdi rdi*
912 // inner loop av rax rax
913 // inner loop nv rbx* rbx*
914 // mi r9 r10
915 // outer loop dv r10 rcx
916 // outer loop bv rdx r8
917 // av base rsi rdx
918 // av limit r11 r11
919 // bv limit r8 r12*
920 // nv base rcx r9
921 // n r8 r12*
922
923 #if ABI_SYSV
924 # define DV r10
925 # define AV rsi
926 # define AVL r11
927 # define BV rdx
928 # define BVL r8
929 # define NV rcx
930 # define N r8
931 # define MI r9
932
933 pushreg rbx
934 endprologue
935
936 mov DV, rdi
937
938 #endif
939
940 #if ABI_WIN
941 # define DV rcx
942 # define AV rdx
943 # define AVL r11
944 # define BV r8
945 # define BVL r12
946 # define NV r9
947 # define N r12
948 # define MI r10
949
950 pushreg rbx
951 pushreg rdi
952 pushreg r12
953 stalloc 160
954
955 savexmm xmm6, 0
956 savexmm xmm7, 16
957 savexmm xmm8, 32
958 savexmm xmm9, 48
959 savexmm xmm10, 64
960 savexmm xmm11, 80
961 savexmm xmm12, 96
962 savexmm xmm13, 112
963 savexmm xmm14, 128
964 savexmm xmm15, 144
965
966 endprologue
967
968 mov rdi, DV
969 mov N, [rsp + 224]
970 mov MI, [rsp + 232]
971
972 #endif
973
974 // Establish the expanded operands.
975 pxor xmm0, xmm0
976 movdqu xmm8, [BV] // bv[0]
977 movdqu xmm10, [MI] // mi
978 expand xmm0, xmm8, xmm9, xmm10, xmm11
979
980 // Set up the outer loop state and prepare for the first iteration.
981 mov rax, AV // -> U = av[0]
982 mov rbx, NV // -> X = nv[0]
983 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
984 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
985 add BV, 16
986 add DV, 16
987 call mmul4
988 add rdi, 16
989 add rax, 16
990 add rbx, 16
991 cmp rax, AVL // done already?
992 jae 8f
993
994 .p2align 4
995 // Complete the first inner loop.
996 0: call dmul4
997 add rdi, 16
998 add rax, 16
999 add rbx, 16
1000 cmp rax, AVL // done yet?
1001 jb 0b
1002
1003 // Still have carries left to propagate.
1004 call carryprop
1005 movd [rdi + 16], xmm12
1006
1007 .p2align 4
1008 // Embark on the next iteration. (There must be one. If n = 1, then
1009 // we would have bailed above, to label 8. Similarly, the subsequent
1010 // iterations can fall into the inner loop immediately.)
1011 1: pxor xmm0, xmm0
1012 movdqu xmm8, [BV] // bv[i]
1013 movdqu xmm10, [MI] // mi
1014 mov rdi, DV // -> Z = dv[i]
1015 mov rax, AV // -> U = av[0]
1016 mov rbx, NV // -> X = nv[0]
1017 expand xmm0, xmm8, xmm9, xmm10, xmm11
1018 add BV, 16
1019 add DV, 16
1020 call mmla4
1021 add rdi, 16
1022 add rax, 16
1023 add rbx, 16
1024
1025 .p2align 4
1026 // Complete the next inner loop.
1027 0: call dmla4
1028 add rdi, 16
1029 add rax, 16
1030 add rbx, 16
1031 cmp rax, AVL
1032 jb 0b
1033
1034 // Still have carries left to propagate, and they overlap the
1035 // previous iteration's final tail, so read that in and add it.
1036 movd xmm0, [rdi]
1037 paddq xmm12, xmm0
1038 call carryprop
1039 movd [rdi + 16], xmm12
1040
1041 // Back again, maybe.
1042 cmp BV, BVL
1043 jb 1b
1044
1045 // All done.
1046 9:
1047
1048 #if ABI_SYSV
1049 popreg rbx
1050 #endif
1051
1052 #if ABI_WIN
1053
1054 rstrxmm xmm6, 0
1055 rstrxmm xmm7, 16
1056 rstrxmm xmm8, 32
1057 rstrxmm xmm9, 48
1058 rstrxmm xmm10, 64
1059 rstrxmm xmm11, 80
1060 rstrxmm xmm12, 96
1061 rstrxmm xmm13, 112
1062 rstrxmm xmm14, 128
1063 rstrxmm xmm15, 144
1064
1065 stfree 160
1066 popreg r12
1067 popreg rdi
1068 popreg rbx
1069
1070 #endif
1071
1072 ret
1073
1074 // First iteration was short. Write out the carries and we're done.
1075 // (This could be folded into the main loop structure, but that would
1076 // penalize small numbers more.)
1077 8: call carryprop
1078 movd [rdi + 16], xmm12
1079 #if ABI_SYSV
1080 popreg rbx
1081 ret
1082 #endif
1083 #if ABI_WIN
1084 jmp 9b
1085 #endif
1086
1087 #undef DV
1088 #undef AV
1089 #undef AVL
1090 #undef BV
1091 #undef BVL
1092 #undef NV
1093 #undef N
1094 #undef MI
1095
1096 ENDFUNC
1097
1098 FUNC(mpxmont_redc4_amd64_sse2)
1099 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1100 // size_t n, const mpw *mi);
1101
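// A rough word-at-a-time C model of the reduction (hedged; the assembler
// works in 4-word blocks, mi is the low block of -nv^{-1} mod 2^128, and
// the name `mont_redc_model' is invented here). As in the assembler, each
// pass pushes its carry up to the end of the dv buffer; any final
// subtraction of nv is left to the caller.
//
//	#include <stdint.h>
//	#include <stddef.h>
//
//	static void mont_redc_model(uint32_t *dv, uint32_t *dvl,
//				    const uint32_t *nv, size_t n, uint32_t mi)
//	{
//	  for (size_t i = 0; i < n; i++) {
//	    uint32_t y = dv[i]*mi;	/* chosen to kill word i */
//	    uint64_t c = 0;
//	    for (size_t j = 0; j < n; j++) {
//	      c += (uint64_t)nv[j]*y + dv[i + j];
//	      dv[i + j] = (uint32_t)c; c >>= 32;
//	    }
//	    for (size_t k = i + n; dv + k < dvl; k++) {
//	      c += dv[k]; dv[k] = (uint32_t)c; c >>= 32;
//	    }
//	  }
//	}
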
1102 // Establish the arguments and do initial setup.
1103 //
1104 // sysv win
1105 // inner loop dv rdi rdi*
1106 // dv limit rax rax
1107 // blocks-of-4 dv limit rsi rdx
1108 // inner loop nv rbx* rbx*
1109 // mi r8 r10
1110 // outer loop dv r10 rcx
1111 // outer loop dv limit r11 r11
1112 // nv base rdx r8
1113 // nv limit r9 r12*
1114 // n rcx r9
1115 // c rcx r9
1116
1117 #if ABI_SYSV
1118
1119 # define DVL rax
1120 # define DVL4 rsi
1121 # define MI r8
1122 # define DV r10
1123 # define DVLO r11
1124 # define NV rdx
1125 # define NVL r9
1126 # define N rcx
1127 # define C ecx
1128
1129 pushreg rbx
1130 endprologue
1131
1132 mov DV, rdi
1133
1134 #endif
1135
1136 #if ABI_WIN
1137
1138 # define DVL rax
1139 # define DVL4 rdx
1140 # define MI r10
1141 # define DV rcx
1142 # define DVLO r11
1143 # define NV r8
1144 # define NVL r12
1145 # define N r9
1146 # define C r9d
1147
1148 pushreg rbx
1149 pushreg rdi
1150 pushreg r12
1151 stalloc 160
1152
1153 savexmm xmm6, 0
1154 savexmm xmm7, 16
1155 savexmm xmm8, 32
1156 savexmm xmm9, 48
1157 savexmm xmm10, 64
1158 savexmm xmm11, 80
1159 savexmm xmm12, 96
1160 savexmm xmm13, 112
1161 savexmm xmm14, 128
1162 savexmm xmm15, 144
1163
1164 endprologue
1165
1166 mov rdi, DV
1167 mov MI, [rsp + 224]
1168
1169 #endif
1170
1171 // Establish the expanded operands and the blocks-of-4 dv limit.
1172 pxor xmm0, xmm0
1173 mov DVL, DVL4 // -> dv[n] = dv limit
1174 sub DVL4, DV // length of dv in bytes
1175 movdqu xmm8, [MI] // mi
1176 and DVL4, ~15 // mask off the tail end
1177 expand xmm0, xmm8, xmm9
1178 add DVL4, DV // find limit
1179
1180 // Set up the outer loop state and prepare for the first iteration.
1181 mov rbx, NV // -> X = nv[0]
1182 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1183 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1184 add DV, 16
1185 call mont4
1186 add rbx, 16
1187 add rdi, 16
1188 cmp rbx, NVL // done already?
1189 jae 8f
1190
1191 .p2align 4
1192 // Complete the first inner loop.
1193 5: call mla4
1194 add rbx, 16
1195 add rdi, 16
1196 cmp rbx, NVL // done yet?
1197 jb 5b
1198
1199 // Still have carries left to propagate.
1200 8: carryadd
1201 psllq xmm15, 16
1202 pslldq xmm15, 8
1203 paddq xmm14, xmm15
1204 call carryprop
1205 movd C, xmm12
1206 add rdi, 16
1207 cmp rdi, DVL4
1208 jae 7f
1209
1210 .p2align 4
1211 // Continue carry propagation until the end of the buffer.
1212 0: add [rdi], C
1213 mov C, 0 // preserves flags
1214 adcd [rdi + 4], 0
1215 adcd [rdi + 8], 0
1216 adcd [rdi + 12], 0
1217 adc C, 0
1218 add rdi, 16
1219 cmp rdi, DVL4
1220 jb 0b
1221
1222 // Deal with the tail end.
1223 7: add [rdi], C
1224 mov C, 0 // preserves flags
1225 adc C, 0
1226 add rdi, 4
1227 cmp rdi, DVL
1228 jb 7b
1229
1230 // All done for this iteration. Start the next. (This must have at
1231 // least one follow-on iteration, or we'd not have started this outer
1232 // loop.)
1233 8: mov rdi, DV // -> Z = dv[i]
1234 mov rbx, NV // -> X = nv[0]
1235 cmp rdi, DVLO // all done yet?
1236 jae 9f
1237 add DV, 16
1238 call mont4
1239 add rdi, 16
1240 add rbx, 16
1241 jmp 5b
1242
1243 // All over.
1244 9:
1245
1246 #if ABI_SYSV
1247 popreg rbx
1248 #endif
1249
1250 #if ABI_WIN
1251
1252 rstrxmm xmm6, 0
1253 rstrxmm xmm7, 16
1254 rstrxmm xmm8, 32
1255 rstrxmm xmm9, 48
1256 rstrxmm xmm10, 64
1257 rstrxmm xmm11, 80
1258 rstrxmm xmm12, 96
1259 rstrxmm xmm13, 112
1260 rstrxmm xmm14, 128
1261 rstrxmm xmm15, 144
1262
1263 stfree 160
1264 popreg r12
1265 popreg rdi
1266 popreg rbx
1267
1268 #endif
1269
1270 ret
1271
1272 #undef DVL
1273 #undef DVL4
1274 #undef MI
1275 #undef DV
1276 #undef DVLO
1277 #undef NV
1278 #undef NVL
1279 #undef N
1280 #undef C
1281
1282 ENDFUNC
1283
1284 ///--------------------------------------------------------------------------
1285 /// Testing and performance measurement.
1286
1287 #ifdef TEST_MUL4
1288
1289 #if ABI_SYSV
1290 # define ARG0 rdi
1291 # define ARG1 rsi
1292 # define ARG2 rdx
1293 # define ARG3 rcx
1294 # define ARG4 r8
1295 # define ARG5 r9
1296 # define ARG6 STKARG(0)
1297 # define ARG7 STKARG(1)
1298 # define ARG8 STKARG(2)
1299 # define STKARG_OFFSET 16
1300 #endif
1301 #if ABI_WIN
1302 # define ARG0 rcx
1303 # define ARG1 rdx
1304 # define ARG2 r8
1305 # define ARG3 r9
1306 # define ARG4 STKARG(0)
1307 # define ARG5 STKARG(1)
1308 # define ARG6 STKARG(2)
1309 # define ARG7 STKARG(3)
1310 # define ARG8 STKARG(4)
1311 # define STKARG_OFFSET 224
1312 #endif
1313 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
1314
1315 // sysv win
1316 // dmul smul mmul mont dmul smul mmul mont
1317 // A rax
1318 // D rdx
1319 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1320 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1321 // y r10 -- -- rdx rdx -- -- r8 r8
1322 // u r11 rdx -- rcx -- r8 -- r9 --
1323 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1324 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1325 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1326 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1327 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1328
1329 .macro cysetup v, n
1330 rdtsc
1331 shl rdx, 32
1332 or rax, rdx
1333 mov [\v + 8*\n - 8], rax
1334 .endm
1335
1336 .macro cystore v, n
1337 rdtsc
1338 shl rdx, 32
1339 or rax, rdx
1340 sub rax, [\v + 8*\n - 8]
1341 mov [\v + 8*\n - 8], rax
1342 dec \n
1343 .endm
1344
1345 .macro testprologue mode
1346 pushreg rbx
1347 #if ABI_SYSV
1348 endprologue
1349 .ifeqs "\mode", "dmul"
1350 mov rbx, rcx
1351 movdqu xmm8, [r8]
1352 movdqu xmm10, [r9]
1353 mov r8d, STKARG(0)
1354 mov r9, STKARG(1)
1355 mov r11, rdx
1356 mov rcx, rsi
1357 .endif
1358 .ifeqs "\mode", "smul"
1359 mov rbx, rdx
1360 movdqu xmm10, [rcx]
1361 mov rcx, rsi
1362 .endif
1363 .ifeqs "\mode", "mmul"
1364 mov rax, STKARG(0)
1365 mov rbx, r8
1366 movdqu xmm8, [r9]
1367 movdqu xmm10, [rax]
1368 mov r8d, STKARG(1)
1369 mov r9, STKARG(2)
1370 mov r10, rdx
1371 mov r11, rcx
1372 mov rcx, rsi
1373 .endif
1374 .ifeqs "\mode", "mont"
1375 mov rbx, rcx
1376 movdqu xmm8, [r8]
1377 mov r8d, r9d
1378 mov r9, STKARG(0)
1379 mov r10, rdx
1380 mov rcx, rsi
1381 .endif
1382 #endif
1383 #if ABI_WIN
1384 pushreg rdi
1385 stalloc 168
1386 savexmm xmm6, 0
1387 savexmm xmm7, 16
1388 savexmm xmm8, 32
1389 savexmm xmm9, 48
1390 savexmm xmm10, 64
1391 savexmm xmm11, 80
1392 savexmm xmm12, 96
1393 savexmm xmm13, 112
1394 savexmm xmm14, 128
1395 savexmm xmm15, 144
1396 endprologue
1397 .ifeqs "\mode", "dmul"
1398 mov r10, STKARG(0)
1399 mov r11, STKARG(1)
1400 mov rdi, rcx
1401 mov rcx, rdx
1402 mov rbx, r9
1403 movdqu xmm8, [r10]
1404 movdqu xmm10, [r11]
1405 mov r11, r8
1406 mov r8d, STKARG(2)
1407 mov r9, STKARG(3)
1408 .endif
1409 .ifeqs "\mode", "smul"
1410 mov rdi, rcx
1411 mov rcx, rdx
1412 mov rbx, r8
1413 movdqu xmm10, [r9]
1414 mov r8d, STKARG(0)
1415 mov r9, STKARG(1)
1416 .endif
1417 .ifeqs "\mode", "mmul"
1418 mov r10, STKARG(1)
1419 mov r11, STKARG(2)
1420 mov rdi, rcx
1421 mov rcx, rdx
1422 mov rbx, STKARG(0)
1423 movdqu xmm8, [r10]
1424 movdqu xmm10, [r11]
1425 mov r10, r8
1426 mov r11, r9
1427 mov r8d, STKARG(3)
1428 mov r9, STKARG(4)
1429 .endif
1430 .ifeqs "\mode", "mont"
1431 mov r10, STKARG(0)
1432 mov rdi, rcx
1433 mov rcx, rdx
1434 mov rbx, r9
1435 movdqu xmm8, [r10]
1436 mov r10, r8
1437 mov r8d, STKARG(1)
1438 mov r9, STKARG(2)
1439 .endif
1440 #endif
1441
1442 pxor xmm0, xmm0
1443 .ifeqs "\mode", "dmul"
1444 expand xmm0, xmm8, xmm9, xmm10, xmm11
1445 .endif
1446 .ifeqs "\mode", "smul"
1447 expand xmm0, xmm10, xmm11
1448 .endif
1449 .ifeqs "\mode", "mmul"
1450 expand xmm0, xmm8, xmm9, xmm10, xmm11
1451 .endif
1452 .ifeqs "\mode", "mont"
1453 expand xmm0, xmm8, xmm9
1454 .endif
1455 .endm
1456
1457 .macro testepilogue
1458 #if ABI_WIN
1459 rstrxmm xmm6, 0
1460 rstrxmm xmm7, 16
1461 rstrxmm xmm8, 32
1462 rstrxmm xmm9, 48
1463 rstrxmm xmm10, 64
1464 rstrxmm xmm11, 80
1465 rstrxmm xmm12, 96
1466 rstrxmm xmm13, 112
1467 rstrxmm xmm14, 128
1468 rstrxmm xmm15, 144
1469 stfree 168
1470 popreg rdi
1471 #endif
1472 popreg rbx
1473 ret
1474 .endm
1475
1476 .macro testldcarry
1477 movdqu xmm12, [rcx + 0] // (c'_0, c''_0)
1478 movdqu xmm13, [rcx + 16] // (c'_1, c''_1)
1479 movdqu xmm14, [rcx + 32] // (c'_2, c''_2)
1480 .endm
1481
1482 .macro testtop u=nil
1483 .p2align 4
1484 0:
1485 cysetup r9, r8
1486 .ifnes "\u", "nil"
1487 mov rax, \u
1488 .endif
1489 .endm
1490
1491 .macro testtail
1492 cystore r9, r8
1493 jnz 0b
1494 .endm
1495
1496 .macro testcarryout
1497 movdqu [rcx + 0], xmm12
1498 movdqu [rcx + 16], xmm13
1499 movdqu [rcx + 32], xmm14
1500 .endm
1501
1502 FUNC(test_dmul4)
1503 testprologue dmul
1504 testldcarry
1505 testtop r11
1506 call dmul4
1507 testtail
1508 testcarryout
1509 testepilogue
1510 ENDFUNC
1511
1512 FUNC(test_dmla4)
1513 testprologue dmul
1514 testldcarry
1515 testtop r11
1516 call dmla4
1517 testtail
1518 testcarryout
1519 testepilogue
1520 ENDFUNC
1521
1522 FUNC(test_mul4)
1523 testprologue smul
1524 testldcarry
1525 testtop nil
1526 call mul4
1527 testtail
1528 testcarryout
1529 testepilogue
1530 ENDFUNC
1531
1532 FUNC(test_mla4)
1533 testprologue smul
1534 testldcarry
1535 testtop nil
1536 call mla4
1537 testtail
1538 testcarryout
1539 testepilogue
1540 ENDFUNC
1541
1542 FUNC(test_mmul4)
1543 testprologue mmul
1544 testtop r11
1545 call mmul4
1546 testtail
1547 movdqu [r10 + 0], xmm10
1548 movdqu [r10 + 16], xmm11
1549 testcarryout
1550 testepilogue
1551 ENDFUNC
1552
1553 FUNC(test_mmla4)
1554 testprologue mmul
1555 testtop r11
1556 call mmla4
1557 testtail
1558 movdqu [r10 + 0], xmm10
1559 movdqu [r10 + 16], xmm11
1560 testcarryout
1561 testepilogue
1562 ENDFUNC
1563
1564 FUNC(test_mont4)
1565 testprologue mont
1566 testtop
1567 call mont4
1568 testtail
1569 movdqu [r10 + 0], xmm10
1570 movdqu [r10 + 16], xmm11
1571 testcarryout
1572 testepilogue
1573 ENDFUNC
1574
1575 #endif
1576
1577 ///----- That's all, folks --------------------------------------------------