1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .arch pentium4
34
35 .text
36
37 ///--------------------------------------------------------------------------
38 /// Theory.
39 ///
40 /// We define a number of primitive fixed-size multipliers from which we can
41 /// construct more general variable-length multipliers.
42 ///
43 /// The basic trick is the same throughout. In an operand-scanning
44 /// multiplication, the inner multiplication loop multiplies a
  45 /// multiple-precision operand by a single-precision factor, and adds the
  46 /// product, appropriately shifted, to the result. A `finely integrated
47 /// operand scanning' implementation of Montgomery multiplication also adds
48 /// the product of a single-precision `Montgomery factor' and the modulus,
49 /// calculated in the same pass. The more common `coarsely integrated
50 /// operand scanning' alternates main multiplication and Montgomery passes,
51 /// which requires additional carry propagation.
52 ///
  53 /// In both the plain-multiplication and Montgomery stages, then, one of
54 /// the factors remains constant throughout the operation, so we can afford
55 /// to take a little time to preprocess it. The transformation we perform is
56 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
57 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
58 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
59 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
60 /// operands, as follows.
61 ///
62 /// Offset 0 4 8 12
63 /// 0 v'_0 v'_1 v''_0 v''_1
64 /// 16 v'_2 v'_3 v''_2 v''_3
65 ///
  66 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
67 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
68 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
69 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
73 ///
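/// As an illustration only -- a plain-C model, not anything the assembler
/// sees -- the expansion of a packed 128-bit factor v into the two vectors
/// tabulated above might be written as follows (`expand_model' and its
/// argument layout are ours, purely for exposition):
///
///	#include <stdint.h>
///
///	/* v[0..3] are the 32-bit words v_0, ..., v_3; lo and hi receive
///	 * the 32-bit lanes of the two SSE operands shown in the table. */
///	static void expand_model(const uint32_t v[4],
///				 uint32_t lo[4], uint32_t hi[4])
///	{
///		lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff; /* v'_0, v'_1 */
///		lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;  /* v''_0, v''_1 */
///		hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff; /* v'_2, v'_3 */
///		hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;  /* v''_2, v''_3 */
///	}
///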
74 /// ...
75 ///
76 /// We maintain four `carry' registers accumulating intermediate results.
77 /// The registers' precise roles rotate during the computation; we name them
78 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
  82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
83 /// lanes of its vector) and an operand in the expanded form above produces a
84 /// result which can be added directly to the appropriate carry register.
85 /// Following a pass of four multiplications, we perform some limited carry
86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  89 /// zeroed and becomes c3.
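///
/// Again purely for illustration (a plain-C model; the real work is done by
/// the `propout' macro below, and the names here are ours), one such
/// propagation step looks like this:
///
///	#include <stdint.h>
///
///	struct carry { uint64_t lo, hi; };	/* represents lo + hi*2^16 */
///
///	/* Emit one 32-bit output word from c0 and fold the excess into c1;
///	 * the caller then rotates the registers, reusing the old c0 as a
///	 * fresh, zero c3. */
///	static uint32_t propout_model(struct carry *c0, struct carry *c1)
///	{
///		uint64_t t = c0->hi&0xffffffff;		/* t = c''_0 mod B */
///		uint64_t d = c0->lo + (t << 16);	/* d = c'_0 + t b */
///		c1->lo += d >> 32;			/* floor(d/B) */
///		c1->hi += c0->hi >> 32;			/* floor(c''_0/B) */
///		return (uint32_t)d;			/* z = d mod B */
///	}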
90
91 ///--------------------------------------------------------------------------
92 /// Macro definitions.
93
94 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
95 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
96 // of the product in registers D0, D1, D2, D3.
97 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
98 .ifnes "\d1", "nil"
99 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
100 .endif
101 .ifnes "\d3", "nil"
102 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
103 .endif
104 .ifnes "\d1", "nil"
105 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
106 .endif
107 .ifnes "\d2", "nil"
108 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
109 .endif
110 .ifnes "\d3", "nil"
111 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
112 .endif
113 .ifnes "\d1", "nil"
114 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
115 .endif
116 .ifnes "\d3", "nil"
117 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
118 .endif
119 .ifnes "\d2", "nil"
120 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
121 .endif
122 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
123 .endm
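
/// For illustration only, a scalar model of `mulcore' (the names are ours,
/// not part of any interface): given r_i and an operand expanded as SLO/SHI,
/// it computes the four pairs of 48-bit products which end up in D0--D3.
///
///	#include <stdint.h>
///
///	static void mulcore_model(uint32_t r, const uint32_t slo[4],
///				  const uint32_t shi[4], uint64_t d[4][2])
///	{
///		d[0][0] = (uint64_t)r*slo[0]; d[0][1] = (uint64_t)r*slo[2];
///		d[1][0] = (uint64_t)r*slo[1]; d[1][1] = (uint64_t)r*slo[3];
///		d[2][0] = (uint64_t)r*shi[0]; d[2][1] = (uint64_t)r*shi[2];
///		d[3][0] = (uint64_t)r*shi[1]; d[3][1] = (uint64_t)r*shi[3];
///	}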
124
125 .macro accum c0, c1=nil, c2=nil, c3=nil
126 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
127 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
128 // updating that register.
129 paddq \c0, xmm0
130 .ifnes "\c1", "nil"
131 paddq \c1, xmm1
132 .endif
133 .ifnes "\c2", "nil"
134 paddq \c2, xmm2
135 .endif
136 .ifnes "\c3", "nil"
137 paddq \c3, xmm3
138 .endif
139 .endm
140
141 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
142 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
143 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
144 // contains zero, but needs clearing; in practice, we store the
145 // product directly rather than attempting to add. On completion,
146 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
147 .ifeqs "\z3p", "t"
148 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
149 accum \c0, \c1, \c2
150 .else
151 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
152 accum \c0, \c1, \c2, \c3
153 .endif
154 .endm
155
156 .macro propout d, pos, c, cc=nil
157 // Calculate an output word from C, and store it at POS in D;
158 // propagate carries out from C to CC in preparation for a rotation
159 // of the carry registers. D is an XMM register; the POS is either
160 // `lo' or `hi' according to whether the output word should be in
161 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
162 // completion, XMM3 is clobbered. If CC is `nil', then the
163 // contribution which would have been added to it is left in C.
164 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
165 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
166 pslldq xmm3, 2 // (t b; 0)
167 paddq \c, xmm3 // (c' + t b; c'')
168 .ifeqs "\pos", "lo"
169 movdqa \d, \c
170 .else
171 punpckldq \d, \c
172 .endif
173 psrlq \c, 32 // floor(c/B)
174 .ifnes "\cc", "nil"
175 paddq \cc, \c // propagate up
176 .endif
177 .endm
178
179 .macro endprop d, pos, c, t
180 // On entry, C contains a carry register. On exit, the low 32 bits
181 // of the value represented in C are written at POS in D, and the
182 // remaining bits are left at the bottom of T.
183 movdqa \t, \c
184 psllq \t, 16 // (?; c'' b)
185 pslldq \c, 8 // (0; c')
186 paddq \t, \c // (?; c' + c'' b)
187 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
188 .ifeqs "\pos", "lo"
189 movdqa \d, \t
190 .else
191 punpckldq \d, \t
192 .endif
193 psrldq \t, 4 // (floor(c/B); 0)
194 .endm
195
196 .macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
201 .ifnes "\c", "nil"
202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
203 .endif
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
206 .ifnes "\c", "nil"
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
209 .endif
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
212 .ifnes "\c", "nil"
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
215 .endif
216 .endm
217
218 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
223
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
225 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
228 movdqa \t, \c0
229 movdqa \u, \c1
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
234
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
237 movdqa \c2, \c0
238 movdqa \c3, \c1
239 psllq \c2, 48
240 psllq \c3, 48
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
245 .ifnes "\hi", "nil"
246 movdqa \hi, \c1
247 .endif
248 pslldq \c1, 8 // high part of (0; y''_1)
249
250 paddq \t, \c2 // propagate down
251 paddq \u, \c3
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
254 .ifnes "\hi", "nil"
255 psrldq \hi, 8 // high part of (y''_3; 0)
256 .endif
257
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
 268	psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
 269	paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
 271	psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
 272	paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
273 .ifnes "\hi", "nil"
274 movdqa \t, \c3
275 pxor \u, \u
276 .endif
277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
278 .ifnes "\hi", "nil"
279 psrlq \t, 32 // very high bits of y
280 paddq \hi, \t
281 punpcklqdq \hi, \u // carry up
282 .endif
283 punpckldq \lo, \c1 // y mod B^4
284 .endm
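
/// For illustration only, a scalar model of `squash' (names are ours): the
/// four carries c_i = c'_i + c''_i 2^16 together represent
/// Y = c_0 + c_1 B + c_2 B^2 + c_3 B^3; we want Y mod B^4 and the overflow
/// above it.
///
///	#include <stdint.h>
///
///	static void squash_model(const uint64_t cp[4], const uint64_t cpp[4],
///				 uint32_t lo[4], uint64_t *hi)
///	{
///		uint64_t t = 0;			/* carry into the next word */
///		int i;
///
///		for (i = 0; i < 4; i++) {
///			t += cp[i] + ((cpp[i]&0xffff) << 16);
///			lo[i] = (uint32_t)t;
///			t = (t >> 32) + (cpp[i] >> 16);
///		}
///		*hi = t;			/* bits of Y above B^4 */
///	}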
285
286 .macro carryadd
287 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
288 // hold the incoming carry registers c0, c1, and c2 representing a
289 // carry-in C.
290 //
291 // On exit, the carry registers, including XMM15, are updated to hold
 292 // C + A; XMM0, XMM1, and XMM2 are clobbered. The other
293 // registers are preserved.
294 movd xmm0, [rdi + 0] // (a_0; 0)
295 movd xmm1, [rdi + 4] // (a_1; 0)
296 movd xmm2, [rdi + 8] // (a_2; 0)
297 movd xmm15, [rdi + 12] // (a_3; 0)
298 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
299 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
300 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
301 .endm
302
303 ///--------------------------------------------------------------------------
304 /// Primitive multipliers and related utilities.
305
306 INTFUNC(carryprop)
307 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
308 // expanded form. Store the low 128 bits of the represented carry to
309 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
310 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
311 // XMM14 are clobbered.
312 endprologue
313
314 propout xmm0, lo, xmm12, xmm13
315 propout xmm1, lo, xmm13, xmm14
316 propout xmm0, hi, xmm14, nil
317 endprop xmm1, hi, xmm14, xmm12
318 punpckldq xmm0, xmm1
319 movdqu [rdi], xmm0
320
321 ret
322 ENDFUNC
323
324 INTFUNC(dmul4)
325 // On entry, RDI points to the destination buffer; RAX and RBX point
326 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
327 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
328 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
329 //
330 // On exit, we write the low 128 bits of the sum C + U V + X Y to
331 // [RDI], and update the carry registers with the carry out. The
332 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
333 // registers are preserved.
334 endprologue
335
336 movdqu xmm4, [rax]
337 movdqu xmm5, [rbx]
338
339 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
340 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
341 propout xmm6, lo, xmm12, xmm13
342
343 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
344 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
345 propout xmm7, lo, xmm13, xmm14
346
347 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
348 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
349 propout xmm6, hi, xmm14, xmm15
350
351 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
352 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
353 propout xmm7, hi, xmm15, xmm12
354
355 punpckldq xmm6, xmm7
356 movdqu [rdi], xmm6
357
358 ret
359 ENDFUNC
360
361 INTFUNC(dmla4)
362 // On entry, RDI points to the destination buffer, which also
363 // contains an addend A to accumulate; RAX and RBX point to the
364 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
365 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
366 // incoming carry registers c0, c1, and c2 representing a carry-in C;
367 // c3 is assumed to be zero.
368 //
369 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
370 // [RDI], and update the carry registers with the carry out. The
371 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
372 // registers are preserved.
373 endprologue
374
375 movdqu xmm4, [rax]
376 movdqu xmm5, [rbx]
377 carryadd
378
379 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
380 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
381 propout xmm6, lo, xmm12, xmm13
382
383 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
384 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
385 propout xmm7, lo, xmm13, xmm14
386
387 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
388 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
389 propout xmm6, hi, xmm14, xmm15
390
391 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
392 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
393 propout xmm7, hi, xmm15, xmm12
394
395 punpckldq xmm6, xmm7
396 movdqu [rdi], xmm6
397
398 ret
399 ENDFUNC
400
401 INTFUNC(mul4zc)
402 // On entry, RDI points to the destination buffer; RBX points to a
403 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
404 //
405 // On exit, we write the low 128 bits of the product X Y to [RDI],
406 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
407 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
408 // general-purpose registers are preserved.
409 endprologue
410
411 movdqu xmm5, [rbx]
412
413 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
414 propout xmm6, lo, xmm12, xmm13
415
416 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
417 propout xmm7, lo, xmm13, xmm14
418
419 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
420 propout xmm6, hi, xmm14, xmm15
421
422 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
423 propout xmm7, hi, xmm15, xmm12
424
425 punpckldq xmm6, xmm7
426 movdqu [rdi], xmm6
427
428 ret
429 ENDFUNC
430
431 INTFUNC(mul4)
432 // On entry, RDI points to the destination buffer; RBX points to a
433 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
434 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
435 // c2, representing a carry-in C; c3 is assumed to be zero.
436 //
437 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
438 // and update the carry registers with the carry out. The registers
439 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
440 // general-purpose registers are preserved.
441 endprologue
442
443 movdqu xmm5, [rbx]
444
445 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
446 propout xmm6, lo, xmm12, xmm13
447
448 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
449 propout xmm7, lo, xmm13, xmm14
450
451 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
452 propout xmm6, hi, xmm14, xmm15
453
454 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
455 propout xmm7, hi, xmm15, xmm12
456
457 punpckldq xmm6, xmm7
458 movdqu [rdi], xmm6
459
460 ret
461 ENDFUNC
462
463 INTFUNC(mla4zc)
464 // On entry, RDI points to the destination buffer, which also
465 // contains an addend A to accumulate; RBX points to a packed operand
 466 // X; and XMM10/XMM11 hold an expanded operand Y.
467 //
468 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
469 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
470 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
471 // general-purpose registers are preserved.
472 endprologue
473
474 movdqu xmm5, [rbx]
475 movd xmm12, [rdi + 0]
476 movd xmm13, [rdi + 4]
477 movd xmm14, [rdi + 8]
478 movd xmm15, [rdi + 12]
479
480 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
481 propout xmm6, lo, xmm12, xmm13
482
483 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
484 propout xmm7, lo, xmm13, xmm14
485
486 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
487 propout xmm6, hi, xmm14, xmm15
488
489 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
490 propout xmm7, hi, xmm15, xmm12
491
492 punpckldq xmm6, xmm7
493 movdqu [rdi], xmm6
494
495 ret
496 ENDFUNC
497
498 INTFUNC(mla4)
499 // On entry, RDI points to the destination buffer, which also
500 // contains an addend A to accumulate; RBX points to a packed operand
501 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
502 // XMM14 hold the incoming carry registers c0, c1, and c2,
503 // representing a carry-in C; c3 is assumed to be zero.
504 //
505 // On exit, we write the low 128 bits of the sum A + C + X Y to
506 // [RDI], and update the carry registers with the carry out. The
507 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
508 // general-purpose registers are preserved.
509 endprologue
510
511 movdqu xmm5, [rbx]
512 carryadd
513
514 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
515 propout xmm6, lo, xmm12, xmm13
516
517 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
518 propout xmm7, lo, xmm13, xmm14
519
520 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
521 propout xmm6, hi, xmm14, xmm15
522
523 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
524 propout xmm7, hi, xmm15, xmm12
525
526 punpckldq xmm6, xmm7
527 movdqu [rdi], xmm6
528
529 ret
530 ENDFUNC
531
532 INTFUNC(mmul4)
533 // On entry, RDI points to the destination buffer; RAX and RBX point
534 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
535 // the expanded operands V and M. The stack pointer must be 8 modulo 16
536 // (as usual for AMD64 ABIs).
537 //
 538 // On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
539 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
540 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
541 // XMM15 are clobbered; the general-purpose registers are preserved.
542 movdqu xmm4, [rax]
543 #if ABI_WIN
544 stalloc 48 + 8 // space for the carries
545 #endif
546 endprologue
547
548 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
549 // for later.
550 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
551 propout xmm7, lo, xmm12, xmm13
552 jmp 5f
553 ENDFUNC
554
555 INTFUNC(mmla4)
556 // On entry, RDI points to the destination buffer, which also
557 // contains an addend A to accumulate; RAX and RBX point to the
558 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
559 // expanded operands V and M. The stack pointer must be 8 modulo 16
560 // (as usual for AMD64 ABIs).
561 //
 562 // On exit, we store Y = (A + U V) M mod B^4 in XMM10/XMM11, and write
563 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
564 // remaining carry in XMM12, XMM13, and XMM14. The registers
565 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
566 // are preserved.
567 movdqu xmm4, [rax]
568 #if ABI_WIN
569 stalloc 48 + 8 // space for the carries
570 # define STKTMP(i) [SP + i]
571 #endif
572 #if ABI_SYSV
573 # define STKTMP(i) [SP + i - 48 - 8] // use red zone
574 #endif
575 endprologue
576
577 movd xmm12, [rdi + 0]
578 movd xmm13, [rdi + 4]
579 movd xmm14, [rdi + 8]
580 movd xmm15, [rdi + 12]
581
582 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
583 // for later.
584 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
585 propout xmm7, lo, xmm12, xmm13
586
587 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
588 propout xmm6, lo, xmm13, xmm14
589
590 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
591 propout xmm7, hi, xmm14, xmm15
592
593 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
594 propout xmm6, hi, xmm15, xmm12
595
596 // Prepare W, and stash carries for later.
597 punpckldq xmm7, xmm6
598 movdqa STKTMP( 0), xmm12
599 movdqa STKTMP(16), xmm13
600 movdqa STKTMP(32), xmm14
601
602 // Calculate Y = W M. We just about have enough spare registers to
603 // make this work.
604 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
605
606 // Start expanding W back into the main carry registers...
607 pxor xmm15, xmm15
608 movdqa xmm12, xmm7
609 movdqa xmm14, xmm7
610
611 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
612 accum xmm4, xmm5, xmm6
613
614 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
615 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
616
617 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
618 accum xmm5, xmm6
619
620 pxor xmm2, xmm2
621 movdqa xmm13, xmm12
622 movdqa xmm15, xmm14
623
624 mulcore xmm7, 3, xmm10, xmm11, xmm0
625 accum xmm6
626
627 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
628 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
629 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
630 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
631
632 // That's lots of pieces. Now we have to assemble the answer.
633 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
634
635 // Expand it.
636 movdqu xmm5, [rbx]
637 expand xmm2, xmm10, xmm11
638
639 // Finish the calculation by adding the Montgomery product.
 640 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
641 propout xmm6, lo, xmm12, xmm13
642
 643 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
644 propout xmm7, lo, xmm13, xmm14
645
 646 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
647 propout xmm6, hi, xmm14, xmm15
648
 649 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
650 propout xmm7, hi, xmm15, xmm12
651
652 punpckldq xmm6, xmm7
653
 654 // Add on the carry we calculated earlier.
655 paddq xmm12, STKTMP( 0)
656 paddq xmm13, STKTMP(16)
657 paddq xmm14, STKTMP(32)
658
659 // And, with that, we're done.
660 movdqu [rdi], xmm6
661 #if ABI_WIN
662 stfree 56
663 #endif
664 ret
665
666 #undef STKTMP
667
668 ENDFUNC
669
670 INTFUNC(mont4)
671 // On entry, RDI points to the destination buffer holding a packed
672 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
673 // expanded operand M.
674 //
 675 // On exit, we store Y = W M mod B^4 in XMM10/XMM11, and write the low
676 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
677 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
678 // and XMM15 are clobbered; the general-purpose registers are
679 // preserved.
680 endprologue
681
682 movdqu xmm7, [rdi]
683
684 // Calculate Y = W M. Avoid the standard carry registers, because
685 // we're setting something else up there.
686 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
687
688 // Start expanding W back into the main carry registers...
689 pxor xmm15, xmm15
690 movdqa xmm12, xmm7
691 movdqa xmm14, xmm7
692
693 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
694 accum xmm4, xmm5, xmm6
695
696 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
697 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
698
699 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
700 accum xmm5, xmm6
701
702 pxor xmm2, xmm2
703 movdqa xmm13, xmm12
704 movdqa xmm15, xmm14
705
706 mulcore xmm7, 3, xmm8, xmm9, xmm0
707 accum xmm6
708
709 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
710 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
711 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
712 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
713
714 // That's lots of pieces. Now we have to assemble the answer.
715 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
716
717 // Expand it.
718 movdqu xmm5, [rbx]
719 expand xmm2, xmm10, xmm11
720
721 // Finish the calculation by adding the Montgomery product.
 722 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
723 propout xmm6, lo, xmm12, xmm13
724
 725 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
726 propout xmm7, lo, xmm13, xmm14
727
 728 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
729 propout xmm6, hi, xmm14, xmm15
730
 731 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
732 propout xmm7, hi, xmm15, xmm12
733
734 punpckldq xmm6, xmm7
735
736 // And, with that, we're done.
737 movdqu [rdi], xmm6
738 ret
739 ENDFUNC
740
741 ///--------------------------------------------------------------------------
742 /// Bulk multipliers.
743
744 FUNC(mpx_umul4_amd64_avx)
745 .arch .avx
746 vzeroupper
747 endprologue
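	// No `ret' here: control drops through to the SSE2 code below.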
748 .arch pentium4
749 ENDFUNC
750
751 FUNC(mpx_umul4_amd64_sse2)
752 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
753 // const mpw *bv, const mpw *bvl);
754
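	// For reference only, a portable C model of the computation this
	// function performs (the real code consumes its operands in blocks
	// of four words, so it has stricter expectations about lengths;
	// `umul_model' and `mpw_model' are illustrative names):
	//
	//	#include <stddef.h>
	//	#include <stdint.h>
	//
	//	typedef uint32_t mpw_model;	/* stands in for mpw */
	//
	//	static void umul_model(mpw_model *dv,
	//			       const mpw_model *av, const mpw_model *avl,
	//			       const mpw_model *bv, const mpw_model *bvl)
	//	{
	//		size_t na = avl - av, nb = bvl - bv, i, j;
	//
	//		for (i = 0; i < na + nb; i++) dv[i] = 0;
	//		for (j = 0; j < nb; j++) {
	//			uint64_t c = 0;
	//			for (i = 0; i < na; i++) {
	//				c += (uint64_t)av[i]*bv[j] + dv[i + j];
	//				dv[i + j] = (mpw_model)c; c >>= 32;
	//			}
	//			dv[j + na] = (mpw_model)c;
	//		}
	//	}
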
755 // Establish the arguments and do initial setup.
756 //
757 // sysv win
758 // inner loop dv rdi rdi*
759 // inner loop av rbx* rbx*
760 // outer loop dv r10 rcx
761 // outer loop bv rcx r9
762 // av base rsi rdx
763 // av limit rdx r8
764 // bv limit r8 r10
765
766 #if ABI_SYSV
767 # define DV r10
768 # define AV rsi
769 # define AVL rdx
770 # define BV rcx
771 # define BVL r8
772
773 pushreg rbx
774 endprologue
775
776 mov DV, rdi
777 #endif
778
779 #if ABI_WIN
780 # define DV rcx
781 # define AV rdx
782 # define AVL r8
783 # define BV r9
784 # define BVL r10
785
786 pushreg rbx
787 pushreg rdi
788 stalloc 160 + 8
789
790 savexmm xmm6, 0
791 savexmm xmm7, 16
792 savexmm xmm8, 32
793 savexmm xmm9, 48
794 savexmm xmm10, 64
795 savexmm xmm11, 80
796 savexmm xmm12, 96
797 savexmm xmm13, 112
798 savexmm xmm14, 128
799 savexmm xmm15, 144
800
801 endprologue
802
803 mov rdi, DV
804 mov BVL, [SP + 224]
805 #endif
806
807 // Prepare for the first iteration.
808 pxor xmm0, xmm0
809 movdqu xmm10, [BV] // bv[0]
810 mov rbx, AV
811 add DV, 16
812 add BV, 16
813 expand xmm0, xmm10, xmm11
814 call mul4zc
815 add rbx, 16
816 add rdi, 16
817 cmp rbx, AVL // all done?
818 jae 8f
819
820 .p2align 4
821 // Continue with the first iteration.
822 0: call mul4
823 add rbx, 16
824 add rdi, 16
825 cmp rbx, AVL // all done?
826 jb 0b
827
828 // Write out the leftover carry. There can be no tail here.
829 8: call carryprop
830 cmp BV, BVL // more passes to do?
831 jae 9f
832
833 .p2align 4
834 // Set up for the next pass.
835 1: movdqu xmm10, [BV] // bv[i]
836 mov rdi, DV // -> dv[i]
837 pxor xmm0, xmm0
838 expand xmm0, xmm10, xmm11
839 mov rbx, AV // -> av[0]
840 add DV, 16
841 add BV, 16
842 call mla4zc
843 add rbx, 16
844 add rdi, 16
845 cmp rbx, AVL // done yet?
846 jae 8f
847
848 .p2align 4
849 // Continue...
850 0: call mla4
851 add rbx, 16
852 add rdi, 16
853 cmp rbx, AVL
854 jb 0b
855
856 // Finish off this pass. There was no tail on the previous pass, and
857 // there can be none on this pass.
858 8: call carryprop
859 cmp BV, BVL
860 jb 1b
861
862 // All over.
863 9:
864
865 #if ABI_SYSV
866 popreg rbx
867 #endif
868
869 #if ABI_WIN
870 rstrxmm xmm6, 0
871 rstrxmm xmm7, 16
872 rstrxmm xmm8, 32
873 rstrxmm xmm9, 48
874 rstrxmm xmm10, 64
875 rstrxmm xmm11, 80
876 rstrxmm xmm12, 96
877 rstrxmm xmm13, 112
878 rstrxmm xmm14, 128
879 rstrxmm xmm15, 144
880
881 stfree 160 + 8
882 popreg rdi
883 popreg rbx
884 #endif
885
886 ret
887
888 #undef DV
889 #undef AV
890 #undef AVL
891 #undef BV
892 #undef BVL
893
894 ENDFUNC
895
896 FUNC(mpxmont_mul4_amd64_avx)
897 .arch .avx
898 vzeroupper
899 endprologue
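	// No `ret' here: control drops through to the SSE2 code below.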
900 .arch pentium4
901 ENDFUNC
902
903 FUNC(mpxmont_mul4_amd64_sse2)
904 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
905 // const mpw *nv, size_t n, const mpw *mi);
906
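	// For reference only, a textbook word-at-a-time model of the
	// operation (the real code works in blocks of four words, with MI
	// the four-word factor -N^{-1} mod B^4; the names and the
	// single-word m0inv below are illustrative simplifications):
	//
	//	#include <stddef.h>
	//	#include <stdint.h>
	//
	//	/* dv needs room for n + 2 words; on exit dv[0..n] holds
	//	 * av bv B^{-n} mod nv, possibly plus one extra copy of nv --
	//	 * callers conventionally finish with a conditional
	//	 * subtraction.  m0inv = -nv[0]^{-1} mod B. */
	//	static void monty_mul_model(uint32_t *dv, const uint32_t *av,
	//				    const uint32_t *bv,
	//				    const uint32_t *nv,
	//				    size_t n, uint32_t m0inv)
	//	{
	//		size_t i, j;
	//		uint64_t c;
	//		uint32_t y;
	//
	//		for (i = 0; i < n + 2; i++) dv[i] = 0;
	//		for (i = 0; i < n; i++) {
	//			c = 0;			/* dv += av bv[i] */
	//			for (j = 0; j < n; j++) {
	//				c += (uint64_t)av[j]*bv[i] + dv[j];
	//				dv[j] = (uint32_t)c; c >>= 32;
	//			}
	//			c += dv[n]; dv[n] = (uint32_t)c;
	//			dv[n + 1] = (uint32_t)(c >> 32);
	//			y = (uint32_t)((uint64_t)dv[0]*m0inv);
	//			c = ((uint64_t)y*nv[0] + dv[0]) >> 32;
	//			for (j = 1; j < n; j++) {
	//				c += (uint64_t)y*nv[j] + dv[j];
	//				dv[j - 1] = (uint32_t)c; c >>= 32;
	//			}
	//			c += dv[n]; dv[n - 1] = (uint32_t)c;
	//			dv[n] = dv[n + 1] + (uint32_t)(c >> 32);
	//		}
	//	}
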
907 // Establish the arguments and do initial setup.
908 //
909 // sysv win
910 // inner loop dv rdi rdi*
911 // inner loop av rax rax
912 // inner loop nv rbx* rbx*
913 // mi r9 r10
914 // outer loop dv r10 rcx
915 // outer loop bv rdx r8
916 // av base rsi rdx
917 // av limit r11 r11
918 // bv limit r8 r12*
919 // nv base rcx r9
920 // n r8 r12*
921
922 #if ABI_SYSV
923 # define DV r10
924 # define AV rsi
925 # define AVL r11
926 # define BV rdx
927 # define BVL r8
928 # define NV rcx
929 # define N r8
930 # define MI r9
931
932 pushreg rbx
933 endprologue
934
935 mov DV, rdi
936 #endif
937
938 #if ABI_WIN
939 # define DV rcx
940 # define AV rdx
941 # define AVL r11
942 # define BV r8
943 # define BVL r12
944 # define NV r9
945 # define N r12
946 # define MI r10
947
948 pushreg rbx
949 pushreg rdi
950 pushreg r12
951 stalloc 160
952
953 savexmm xmm6, 0
954 savexmm xmm7, 16
955 savexmm xmm8, 32
956 savexmm xmm9, 48
957 savexmm xmm10, 64
958 savexmm xmm11, 80
959 savexmm xmm12, 96
960 savexmm xmm13, 112
961 savexmm xmm14, 128
962 savexmm xmm15, 144
963
964 endprologue
965
966 mov rdi, DV
967 mov N, [SP + 224]
968 mov MI, [SP + 232]
969 #endif
970
971 // Establish the expanded operands.
972 pxor xmm0, xmm0
973 movdqu xmm8, [BV] // bv[0]
974 movdqu xmm10, [MI] // mi
975 expand xmm0, xmm8, xmm9, xmm10, xmm11
976
977 // Set up the outer loop state and prepare for the first iteration.
978 mov rax, AV // -> U = av[0]
979 mov rbx, NV // -> X = nv[0]
980 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
981 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
982 add BV, 16
983 add DV, 16
984 call mmul4
985 add rdi, 16
986 add rax, 16
987 add rbx, 16
988 cmp rax, AVL // done already?
989 jae 8f
990
991 .p2align 4
992 // Complete the first inner loop.
993 0: call dmul4
994 add rdi, 16
995 add rax, 16
996 add rbx, 16
997 cmp rax, AVL // done yet?
998 jb 0b
999
1000 // Still have carries left to propagate.
1001 call carryprop
1002 movd [rdi + 16], xmm12
1003
1004 .p2align 4
1005 // Embark on the next iteration. (There must be one. If n = 1, then
1006 // we would have bailed above, to label 8. Similarly, the subsequent
1007 // iterations can fall into the inner loop immediately.)
1008 1: pxor xmm0, xmm0
1009 movdqu xmm8, [BV] // bv[i]
1010 movdqu xmm10, [MI] // mi
1011 mov rdi, DV // -> Z = dv[i]
1012 mov rax, AV // -> U = av[0]
1013 mov rbx, NV // -> X = nv[0]
1014 expand xmm0, xmm8, xmm9, xmm10, xmm11
1015 add BV, 16
1016 add DV, 16
1017 call mmla4
1018 add rdi, 16
1019 add rax, 16
1020 add rbx, 16
1021
1022 .p2align 4
1023 // Complete the next inner loop.
1024 0: call dmla4
1025 add rdi, 16
1026 add rax, 16
1027 add rbx, 16
1028 cmp rax, AVL
1029 jb 0b
1030
1031 // Still have carries left to propagate, and they overlap the
1032 // previous iteration's final tail, so read that in and add it.
1033 movd xmm0, [rdi]
1034 paddq xmm12, xmm0
1035 call carryprop
1036 movd [rdi + 16], xmm12
1037
1038 // Back again, maybe.
1039 cmp BV, BVL
1040 jb 1b
1041
1042 // All done.
1043 9:
1044
1045 #if ABI_SYSV
1046 popreg rbx
1047 #endif
1048
1049 #if ABI_WIN
1050 rstrxmm xmm6, 0
1051 rstrxmm xmm7, 16
1052 rstrxmm xmm8, 32
1053 rstrxmm xmm9, 48
1054 rstrxmm xmm10, 64
1055 rstrxmm xmm11, 80
1056 rstrxmm xmm12, 96
1057 rstrxmm xmm13, 112
1058 rstrxmm xmm14, 128
1059 rstrxmm xmm15, 144
1060
1061 stfree 160
1062 popreg r12
1063 popreg rdi
1064 popreg rbx
1065 #endif
1066
1067 ret
1068
1069 // First iteration was short. Write out the carries and we're done.
1070 // (This could be folded into the main loop structure, but that would
1071 // penalize small numbers more.)
1072 8: call carryprop
1073 movd [rdi + 16], xmm12
1074 #if ABI_SYSV
1075 popreg rbx
1076 ret
1077 #endif
1078 #if ABI_WIN
1079 jmp 9b
1080 #endif
1081
1082 #undef DV
1083 #undef AV
1084 #undef AVL
1085 #undef BV
1086 #undef BVL
1087 #undef NV
1088 #undef N
1089 #undef MI
1090
1091 ENDFUNC
1092
1093 FUNC(mpxmont_redc4_amd64_avx)
1094 .arch .avx
1095 vzeroupper
1096 endprologue
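	// No `ret' here: control drops through to the SSE2 code below.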
1097 .arch pentium4
1098 ENDFUNC
1099
1100 FUNC(mpxmont_redc4_amd64_sse2)
1101 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1102 // size_t n, const mpw *mi);
1103
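	// For reference only, a word-at-a-time model of the reduction (the
	// real code again works in blocks of four words; names and the
	// single-word m0inv are illustrative simplifications):
	//
	//	#include <stddef.h>
	//	#include <stdint.h>
	//
	//	/* Clear the low n words of dv by adding multiples of nv,
	//	 * propagating carries no further than dvl; the reduced value,
	//	 * which may still need one final subtraction of nv, is then in
	//	 * the words above the cleared region.  m0inv = -nv[0]^{-1}
	//	 * mod B. */
	//	static void monty_redc_model(uint32_t *dv, uint32_t *dvl,
	//				     const uint32_t *nv, size_t n,
	//				     uint32_t m0inv)
	//	{
	//		size_t i, j, len = dvl - dv;
	//		uint64_t c;
	//		uint32_t y;
	//
	//		for (i = 0; i < n; i++) {
	//			y = (uint32_t)((uint64_t)dv[i]*m0inv);
	//			c = 0;
	//			for (j = 0; j < n; j++) {
	//				c += (uint64_t)y*nv[j] + dv[i + j];
	//				dv[i + j] = (uint32_t)c; c >>= 32;
	//			}
	//			for (j = i + n; c && j < len; j++) {
	//				c += dv[j];
	//				dv[j] = (uint32_t)c; c >>= 32;
	//			}
	//		}
	//	}
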
1104 // Establish the arguments and do initial setup.
1105 //
1106 // sysv win
1107 // inner loop dv rdi rdi*
1108 // dv limit rax rax
1109 // blocks-of-4 dv limit rsi rdx
1110 // inner loop nv rbx* rbx*
1111 // mi r8 r10
1112 // outer loop dv r10 rcx
1113 // outer loop dv limit r11 r11
1114 // nv base rdx r8
1115 // nv limit r9 r12*
1116 // n rcx r9
1117 // c rcx r9
1118
1119 #if ABI_SYSV
1120 # define DVL rax
1121 # define DVL4 rsi
1122 # define MI r8
1123 # define DV r10
1124 # define DVLO r11
1125 # define NV rdx
1126 # define NVL r9
1127 # define N rcx
1128 # define C ecx
1129
1130 pushreg rbx
1131 endprologue
1132
1133 mov DV, rdi
1134 #endif
1135
1136 #if ABI_WIN
1137 # define DVL rax
1138 # define DVL4 rdx
1139 # define MI r10
1140 # define DV rcx
1141 # define DVLO r11
1142 # define NV r8
1143 # define NVL r12
1144 # define N r9
1145 # define C r9d
1146
1147 pushreg rbx
1148 pushreg rdi
1149 pushreg r12
1150 stalloc 160
1151
1152 savexmm xmm6, 0
1153 savexmm xmm7, 16
1154 savexmm xmm8, 32
1155 savexmm xmm9, 48
1156 savexmm xmm10, 64
1157 savexmm xmm11, 80
1158 savexmm xmm12, 96
1159 savexmm xmm13, 112
1160 savexmm xmm14, 128
1161 savexmm xmm15, 144
1162
1163 endprologue
1164
1165 mov rdi, DV
1166 mov MI, [SP + 224]
1167 #endif
1168
1169 // Establish the expanded operands and the blocks-of-4 dv limit.
1170 pxor xmm0, xmm0
1171 mov DVL, DVL4 // -> dv[n] = dv limit
1172 sub DVL4, DV // length of dv in bytes
1173 movdqu xmm8, [MI] // mi
1174 and DVL4, ~15 // mask off the tail end
1175 expand xmm0, xmm8, xmm9
1176 add DVL4, DV // find limit
1177
1178 // Set up the outer loop state and prepare for the first iteration.
1179 mov rbx, NV // -> X = nv[0]
1180 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1181 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1182 add DV, 16
1183 call mont4
1184 add rbx, 16
1185 add rdi, 16
1186 cmp rbx, NVL // done already?
1187 jae 8f
1188
1189 .p2align 4
1190 // Complete the first inner loop.
1191 5: call mla4
1192 add rbx, 16
1193 add rdi, 16
1194 cmp rbx, NVL // done yet?
1195 jb 5b
1196
1197 // Still have carries left to propagate.
1198 8: carryadd
1199 psllq xmm15, 16
1200 pslldq xmm15, 8
1201 paddq xmm14, xmm15
1202 call carryprop
1203 movd C, xmm12
1204 add rdi, 16
1205 cmp rdi, DVL4
1206 jae 7f
1207
1208 .p2align 4
1209 // Continue carry propagation until the end of the buffer.
1210 0: add [rdi], C
1211 mov C, 0 // preserves flags
1212 adcd [rdi + 4], 0
1213 adcd [rdi + 8], 0
1214 adcd [rdi + 12], 0
1215 adc C, 0
1216 add rdi, 16
1217 cmp rdi, DVL4
1218 jb 0b
1219
1220 // Deal with the tail end.
1221 7: add [rdi], C
1222 mov C, 0 // preserves flags
1223 add rdi, 4
1224 adc C, 0
1225 cmp rdi, DVL
1226 jb 7b
1227
1228 // All done for this iteration. Start the next. (This must have at
1229 // least one follow-on iteration, or we'd not have started this outer
1230 // loop.)
1231 8: mov rdi, DV // -> Z = dv[i]
1232 mov rbx, NV // -> X = nv[0]
1233 cmp rdi, DVLO // all done yet?
1234 jae 9f
1235 add DV, 16
1236 call mont4
1237 add rdi, 16
1238 add rbx, 16
1239 jmp 5b
1240
1241 // All over.
1242 9:
1243
1244 #if ABI_SYSV
1245 popreg rbx
1246 #endif
1247
1248 #if ABI_WIN
1249 rstrxmm xmm6, 0
1250 rstrxmm xmm7, 16
1251 rstrxmm xmm8, 32
1252 rstrxmm xmm9, 48
1253 rstrxmm xmm10, 64
1254 rstrxmm xmm11, 80
1255 rstrxmm xmm12, 96
1256 rstrxmm xmm13, 112
1257 rstrxmm xmm14, 128
1258 rstrxmm xmm15, 144
1259
1260 stfree 160
1261 popreg r12
1262 popreg rdi
1263 popreg rbx
1264 #endif
1265
1266 ret
1267
1268 #undef DVL
1269 #undef DVL4
1270 #undef MI
1271 #undef DV
1272 #undef DVLO
1273 #undef NV
1274 #undef NVL
1275 #undef N
1276 #undef C
1277
1278 ENDFUNC
1279
1280 ///--------------------------------------------------------------------------
1281 /// Testing and performance measurement.
1282
1283 #ifdef TEST_MUL4
1284
1285 #if ABI_SYSV
1286 # define ARG0 rdi
1287 # define ARG1 rsi
1288 # define ARG2 rdx
1289 # define ARG3 rcx
1290 # define ARG4 r8
1291 # define ARG5 r9
1292 # define ARG6 STKARG(0)
1293 # define ARG7 STKARG(1)
1294 # define ARG8 STKARG(2)
1295 # define STKARG_OFFSET 16
1296 #endif
1297 #if ABI_WIN
1298 # define ARG0 rcx
1299 # define ARG1 rdx
1300 # define ARG2 r8
1301 # define ARG3 r9
1302 # define ARG4 STKARG(0)
1303 # define ARG5 STKARG(1)
1304 # define ARG6 STKARG(2)
1305 # define ARG7 STKARG(3)
1306 # define ARG8 STKARG(4)
1307 # define STKARG_OFFSET 224
1308 #endif
1309 #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
1310
1311 // sysv win
1312 // dmul smul mmul mont dmul smul mmul mont
1313 // A rax
1314 // D rdx
1315 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1316 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1317 // y r10 -- -- rdx rdx -- -- r8 r8
1318 // u r11 rdx -- rcx -- r8 -- r9 --
1319 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1320 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1321 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1322 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1323 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1324
1325 .macro cysetup v, n
1326 rdtsc
1327 shl rdx, 32
1328 or rax, rdx
1329 mov [\v + 8*\n - 8], rax
1330 .endm
1331
1332 .macro cystore v, n
1333 rdtsc
1334 shl rdx, 32
1335 or rax, rdx
1336 sub rax, [\v + 8*\n - 8]
1337 mov [\v + 8*\n - 8], rax
1338 dec \n
1339 .endm
1340
1341 .macro testprologue mode
1342 pushreg rbx
1343 #if ABI_SYSV
1344 endprologue
1345 .ifeqs "\mode", "dmul"
1346 mov rbx, rcx
1347 movdqu xmm8, [r8]
1348 movdqu xmm10, [r9]
1349 mov r8d, STKARG(0)
1350 mov r9, STKARG(1)
1351 mov r11, rdx
1352 mov rcx, rsi
1353 .endif
1354 .ifeqs "\mode", "smul"
1355 mov rbx, rdx
1356 movdqu xmm10, [rcx]
1357 mov rcx, rsi
1358 .endif
1359 .ifeqs "\mode", "mmul"
1360 mov rax, STKARG(0)
1361 mov rbx, r8
1362 movdqu xmm8, [r9]
1363 movdqu xmm10, [rax]
1364 mov r8d, STKARG(1)
1365 mov r9, STKARG(2)
1366 mov r10, rdx
1367 mov r11, rcx
1368 mov rcx, rsi
1369 .endif
1370 .ifeqs "\mode", "mont"
1371 mov rbx, rcx
1372 movdqu xmm8, [r8]
1373 mov r8d, r9d
1374 mov r9, STKARG(0)
1375 mov r10, rdx
1376 mov rcx, rsi
1377 .endif
1378 #endif
1379 #if ABI_WIN
1380 pushreg rdi
1381 stalloc 168
1382 savexmm xmm6, 0
1383 savexmm xmm7, 16
1384 savexmm xmm8, 32
1385 savexmm xmm9, 48
1386 savexmm xmm10, 64
1387 savexmm xmm11, 80
1388 savexmm xmm12, 96
1389 savexmm xmm13, 112
1390 savexmm xmm14, 128
1391 savexmm xmm15, 144
1392 endprologue
1393 .ifeqs "\mode", "dmul"
1394 mov r10, STKARG(0)
1395 mov r11, STKARG(1)
1396 mov rdi, rcx
1397 mov rcx, rdx
1398 mov rbx, r9
1399 movdqu xmm8, [r10]
1400 movdqu xmm10, [r11]
1401 mov r11, r8
1402 mov r8d, STKARG(2)
1403 mov r9, STKARG(3)
1404 .endif
1405 .ifeqs "\mode", "smul"
1406 mov rdi, rcx
1407 mov rcx, rdx
1408 mov rbx, r8
1409 movdqu xmm10, [r9]
1410 mov r8d, STKARG(0)
1411 mov r9, STKARG(1)
1412 .endif
1413 .ifeqs "\mode", "mmul"
1414 mov r10, STKARG(1)
1415 mov r11, STKARG(2)
1416 mov rdi, rcx
1417 mov rcx, rdx
1418 mov rbx, STKARG(0)
1419 movdqu xmm8, [r10]
1420 movdqu xmm10, [r11]
1421 mov r10, r8
1422 mov r11, r9
1423 mov r8d, STKARG(3)
1424 mov r9, STKARG(4)
1425 .endif
1426 .ifeqs "\mode", "mont"
1427 mov r10, STKARG(0)
1428 mov rdi, rcx
1429 mov rcx, rdx
1430 mov rbx, r9
1431 movdqu xmm8, [r10]
1432 mov r10, r8
1433 mov r8d, STKARG(1)
1434 mov r9, STKARG(2)
1435 .endif
1436 #endif
1437
1438 pxor xmm0, xmm0
1439 .ifeqs "\mode", "dmul"
1440 expand xmm0, xmm8, xmm9, xmm10, xmm11
1441 .endif
1442 .ifeqs "\mode", "smul"
1443 expand xmm0, xmm10, xmm11
1444 .endif
1445 .ifeqs "\mode", "mmul"
1446 expand xmm0, xmm8, xmm9, xmm10, xmm11
1447 .endif
1448 .ifeqs "\mode", "mont"
1449 expand xmm0, xmm8, xmm9
1450 .endif
1451 .endm
1452
1453 .macro testepilogue
1454 #if ABI_WIN
1455 rstrxmm xmm6, 0
1456 rstrxmm xmm7, 16
1457 rstrxmm xmm8, 32
1458 rstrxmm xmm9, 48
1459 rstrxmm xmm10, 64
1460 rstrxmm xmm11, 80
1461 rstrxmm xmm12, 96
1462 rstrxmm xmm13, 112
1463 rstrxmm xmm14, 128
1464 rstrxmm xmm15, 144
1465 stfree 168
1466 popreg rdi
1467 #endif
1468 popreg rbx
1469 ret
1470 .endm
1471
1472 .macro testldcarry
1473 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1474 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1475 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1476 .endm
1477
1478 .macro testtop u=nil
1479 .p2align 4
1480 0:
1481 cysetup r9, r8
1482 .ifnes "\u", "nil"
1483 mov rax, \u
1484 .endif
1485 .endm
1486
1487 .macro testtail
1488 cystore r9, r8
1489 jnz 0b
1490 .endm
1491
1492 .macro testcarryout
1493 movdqu [rcx + 0], xmm12
1494 movdqu [rcx + 16], xmm13
1495 movdqu [rcx + 32], xmm14
1496 .endm
1497
1498 FUNC(test_dmul4)
1499 testprologue dmul
1500 testldcarry
1501 testtop r11
1502 call dmul4
1503 testtail
1504 testcarryout
1505 testepilogue
1506 ENDFUNC
1507
1508 FUNC(test_dmla4)
1509 testprologue dmul
1510 testldcarry
1511 testtop r11
1512 call dmla4
1513 testtail
1514 testcarryout
1515 testepilogue
1516 ENDFUNC
1517
1518 FUNC(test_mul4)
1519 testprologue smul
1520 testldcarry
1521 testtop nil
1522 call mul4
1523 testtail
1524 testcarryout
1525 testepilogue
1526 ENDFUNC
1527
1528 FUNC(test_mul4zc)
1529 testprologue smul
1530 testldcarry
1531 testtop nil
1532 call mul4zc
1533 testtail
1534 testcarryout
1535 testepilogue
1536 ENDFUNC
1537
1538 FUNC(test_mla4)
1539 testprologue smul
1540 testldcarry
1541 testtop nil
1542 call mla4
1543 testtail
1544 testcarryout
1545 testepilogue
1546 ENDFUNC
1547
1548 FUNC(test_mla4zc)
1549 testprologue smul
1550 testldcarry
1551 testtop nil
1552 call mla4zc
1553 testtail
1554 testcarryout
1555 testepilogue
1556 ENDFUNC
1557
1558 FUNC(test_mmul4)
1559 testprologue mmul
1560 testtop r11
1561 call mmul4
1562 testtail
1563 movdqu [r10 + 0], xmm10
1564 movdqu [r10 + 16], xmm11
1565 testcarryout
1566 testepilogue
1567 ENDFUNC
1568
1569 FUNC(test_mmla4)
1570 testprologue mmul
1571 testtop r11
1572 call mmla4
1573 testtail
1574 movdqu [r10 + 0], xmm10
1575 movdqu [r10 + 16], xmm11
1576 testcarryout
1577 testepilogue
1578 ENDFUNC
1579
1580 FUNC(test_mont4)
1581 testprologue mont
1582 testtop
1583 call mont4
1584 testtail
1585 movdqu [r10 + 0], xmm10
1586 movdqu [r10 + 16], xmm11
1587 testcarryout
1588 testepilogue
1589 ENDFUNC
1590
1591 #endif
1592
1593 ///----- That's all, folks --------------------------------------------------