math/mpx-mul4-*.S: Use more portable type syntax for ambiguous instructions.
catacomb/math/mpx-mul4-amd64-sse2.S
1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .arch pentium4
34
35 .text
36
37 ///--------------------------------------------------------------------------
38 /// Theory.
39 ///
40 /// We define a number of primitive fixed-size multipliers from which we can
41 /// construct more general variable-length multipliers.
42 ///
43 /// The basic trick is the same throughout. In an operand-scanning
44 /// multiplication, the inner multiplication loop multiplies a multiple-
45 /// precision operand by a single-precision factor, and adds the product,
46 /// appropriately shifted, to the result. A `finely integrated operand
47 /// scanning' implementation of Montgomery multiplication also adds the
48 /// product of a single-precision `Montgomery factor' and the modulus,
49 /// calculated in the same pass. The more common `coarsely integrated
50 /// operand scanning' alternates main multiplication and Montgomery passes,
51 /// which requires additional carry propagation.
52 ///
53 /// In both the plain-multiplication and Montgomery stages, then, one of
54 /// the factors remains constant throughout the operation, so we can afford
55 /// to take a little time to preprocess it. The transformation we perform is
56 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
57 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
58 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
59 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
60 /// operands, as follows.
61 ///
62 /// Offset 0 4 8 12
63 /// 0 v'_0 v'_1 v''_0 v''_1
64 /// 16 v'_2 v'_3 v''_2 v''_3
65 ///
66 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
67 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
68 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
69 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
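///
/// As a rough C sketch (using <stdint.h> types; the function name is ours,
/// and 32-bit digits are assumed), the expansion of one 128-bit factor v
/// into the two SSE operands above amounts to this:
///
///   static void expand_sketch(const uint32_t v[4],
///                             uint32_t lo[4], uint32_t hi[4])
///   {
///     lo[0] = v[0]&0xffff; lo[1] = v[1]&0xffff; /* v'_0, v'_1 */
///     lo[2] = v[0] >> 16;  lo[3] = v[1] >> 16;  /* v''_0, v''_1 */
///     hi[0] = v[2]&0xffff; hi[1] = v[3]&0xffff; /* v'_2, v'_3 */
///     hi[2] = v[2] >> 16;  hi[3] = v[3] >> 16;  /* v''_2, v''_3 */
///   }
///
/// The `expand' macro below carries out the same rearrangement in XMM
/// registers using `punpcklwd', `punpckhwd', and `pshufd'.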
73 ///
74 /// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
75 /// results. The registers' precise roles rotate during the computation; we
76 /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two
77 /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
78 /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
79 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
80 /// c_3 B^3. The `pmuludq' instruction acting on a scalar operand
81 /// (broadcast across all lanes of its vector) and an operand in the expanded
82 /// form above produces a result which can be added directly to the
83 /// appropriate carry register. Following a pass of four multiplications, we
84 /// perform some limited carry propagation: let t = c''_0 mod B, and let d =
85 /// c'_0 + t b; then we output z = d mod B, add (floor(d/B), floor(c''_0/B))
86 /// to c1, and cycle the carry registers around, so that c1 becomes c0, and
87 /// the old (implicitly) zeroed c0 becomes c3.
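///
/// As a rough C sketch of one such output-and-carry step (using <stdint.h>
/// types; the function name is ours, and each carry half is assumed to
/// occupy a 64-bit lane):
///
///   static uint32_t propout_sketch(uint64_t *c0lo, uint64_t *c0hi,
///                                  uint64_t *c1lo, uint64_t *c1hi)
///   {
///     uint64_t t = *c0hi&0xffffffff;  /* t = c''_0 mod B */
///     uint64_t d = *c0lo + (t << 16); /* d = c'_0 + t b */
///     *c1lo += d >> 32;               /* carry floor(d/B) up... */
///     *c1hi += *c0hi >> 32;           /* ...and floor(c''_0/B) */
///     return (uint32_t)d;             /* output z = d mod B */
///   }
///
/// The `propout' macro below does the same job directly on the XMM carry
/// registers.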
88 ///
89 /// On 64-bit AMD64, we have a reasonable number of registers: the expanded
90 /// operands are kept in registers. The packed operands are read from memory
91 /// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
92 /// multiplications; and XMM6 and XMM7 are used for combining the results.
93 /// The following conventional argument names and locations are used
94 /// throughout.
95 ///
96 /// Arg Format Location Notes
97 ///
98 /// U packed [RAX]
99 /// X packed [RBX] In Montgomery multiplication, X = N
100 /// V expanded XMM8/XMM9
101 /// Y expanded XMM10/XMM11 In Montgomery multiplication, Y = (A + U V) M
102 /// M expanded (see below) Montgomery factor, M = -N^{-1} (mod B^4)
103 /// N Modulus, for Montgomery multiplication
104 /// A packed [RDI] Destination/accumulator
105 /// C carry XMM12--XMM15
106 ///
107 /// The calculation is some variant of
108 ///
109 /// A' + C' B^4 <- U V + X Y + A + C
110 ///
111 /// The low-level functions fit into a fairly traditional (finely-integrated)
112 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
113 /// (indexed by i).
114 ///
115 /// The variants are as follows.
116 ///
117 /// Function Variant Use i j
118 ///
119 /// mmul4 A = C = 0, Y = M Montgomery 0 0
120 /// dmul4 A = 0 Montgomery 0 +
121 /// mmla4 C = 0, Y = M Montgomery + 0
122 /// dmla4 exactly as shown Montgomery + +
123 /// mont4 U = C = 0, V = M Montgomery any 0
124 ///
125 /// mul4zc U = V = A = C = 0 Plain 0 0
126 /// mul4 U = V = A = 0 Plain 0 +
127 /// mla4zc U = V = C = 0 Plain + 0
128 /// mla4 U = V = 0 Plain + +
129 ///
130 /// The `mmul4' and `mmla4' functions are also responsible for calculating
131 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
132 /// inner loop.
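///
/// As a rough sketch in C-like pseudocode (argument passing, which really
/// happens in registers, is omitted, and `nblocks' is the operand length in
/// four-digit blocks), the Montgomery multiplier `mpxmont_mul4_amd64_sse2'
/// below composes these primitives as follows.
///
///   for (i = 0; i < nblocks; i++) {   /* outer loop over (V, Y) blocks */
///     if (i) mmla4(); else mmul4();   /* j = 0: also computes Y */
///     for (j = 1; j < nblocks; j++)
///       if (i) dmla4(); else dmul4();
///     carryprop();                    /* flush the leftover carry */
///   }
///
/// The plain multiplier `mpx_umul4_amd64_sse2' is analogous, using `mul4zc'
/// and `mul4' on its first pass and `mla4zc' and `mla4' thereafter.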
133
134 ///--------------------------------------------------------------------------
135 /// Macro definitions.
136
137 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
138 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
139 // of the product in registers D0, D1, D2, D3.
140 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
141 .ifnes "\d1", "nil"
142 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
143 .endif
144 .ifnes "\d3", "nil"
145 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
146 .endif
147 .ifnes "\d1", "nil"
148 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
149 .endif
150 .ifnes "\d2", "nil"
151 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
152 .endif
153 .ifnes "\d3", "nil"
154 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
155 .endif
156 .ifnes "\d1", "nil"
157 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
158 .endif
159 .ifnes "\d3", "nil"
160 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
161 .endif
162 .ifnes "\d2", "nil"
163 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
164 .endif
165 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
166 .endm
167
168 .macro accum c0, c1=nil, c2=nil, c3=nil
169 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
170 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
171 // updating that register.
172 paddq \c0, xmm0
173 .ifnes "\c1", "nil"
174 paddq \c1, xmm1
175 .endif
176 .ifnes "\c2", "nil"
177 paddq \c2, xmm2
178 .endif
179 .ifnes "\c3", "nil"
180 paddq \c3, xmm3
181 .endif
182 .endm
183
184 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
185 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
186 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
187 // contains zero, but needs clearing; in practice, we store the
188 // product directly rather than attempting to add. On completion,
189 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
190 .ifeqs "\z3p", "t"
191 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
192 accum \c0, \c1, \c2
193 .else
194 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
195 accum \c0, \c1, \c2, \c3
196 .endif
197 .endm
198
199 .macro propout d, pos, c, cc=nil
200 // Calculate an output word from C, and store it at POS in D;
201 // propagate carries out from C to CC in preparation for a rotation
202 // of the carry registers. D is an XMM register; the POS is either
203 // `lo' or `hi' according to whether the output word should be in
204 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
205 // completion, XMM3 is clobbered. If CC is `nil', then the
206 // contribution which would have been added to it is left in C.
207 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
208 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
209 pslldq xmm3, 2 // (t b; 0)
210 paddq \c, xmm3 // (c' + t b; c'')
211 .ifeqs "\pos", "lo"
212 movdqa \d, \c
213 .else
214 punpckldq \d, \c
215 .endif
216 psrlq \c, 32 // floor(c/B)
217 .ifnes "\cc", "nil"
218 paddq \cc, \c // propagate up
219 .endif
220 .endm
221
222 .macro endprop d, pos, c, t
223 // On entry, C contains a carry register. On exit, the low 32 bits
224 // of the value represented in C are written at POS in D, and the
225 // remaining bits are left at the bottom of T.
226 movdqa \t, \c
227 psllq \t, 16 // (?; c'' b)
228 pslldq \c, 8 // (0; c')
229 paddq \t, \c // (?; c' + c'' b)
230 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
231 .ifeqs "\pos", "lo"
232 movdqa \d, \t
233 .else
234 punpckldq \d, \t
235 .endif
236 psrldq \t, 4 // (floor(c/B); 0)
237 .endm
238
239 .macro expand z, a, b, c=nil, d=nil
240 // On entry, A and C hold packed 128-bit values, and Z is zero. On
241 // exit, A:B and C:D together hold the same values in expanded
242 // form. If C is `nil', then only expand A to A:B.
243 movdqa \b, \a // (a_0, a_1; a_2, a_3)
244 .ifnes "\c", "nil"
245 movdqa \d, \c // (c_0, c_1; c_2, c_3)
246 .endif
247 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
248 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
249 .ifnes "\c", "nil"
250 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
251 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
252 .endif
253 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
254 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
255 .ifnes "\c", "nil"
256 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
257 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
258 .endif
259 .endm
260
261 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
262 // On entry, C0, C1, C2, C3 are carry registers representing a value
263 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
264 // C3, T, and U are clobbered; and the high bits of Y are stored in
265 // HI, if this is not `nil'.
266
267 // The first step is to eliminate the `double-prime' pieces -- i.e.,
268 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
269 // them into the 32-bit-aligned pieces above and below. But before
270 // we can do that, we must gather them together.
271 movdqa \t, \c0
272 movdqa \u, \c1
273 punpcklqdq \t, \c2 // (y'_0; y'_2)
274 punpckhqdq \c0, \c2 // (y''_0; y''_2)
275 punpcklqdq \u, \c3 // (y'_1; y'_3)
276 punpckhqdq \c1, \c3 // (y''_1; y''_3)
277
278 // Now split the double-prime pieces. The high (up to) 48 bits will
279 // go up; the low 16 bits go down.
280 movdqa \c2, \c0
281 movdqa \c3, \c1
282 psllq \c2, 48
283 psllq \c3, 48
284 psrlq \c0, 16 // high parts of (y''_0; y''_2)
285 psrlq \c1, 16 // high parts of (y''_1; y''_3)
286 psrlq \c2, 32 // low parts of (y''_0; y''_2)
287 psrlq \c3, 32 // low parts of (y''_1; y''_3)
288 .ifnes "\hi", "nil"
289 movdqa \hi, \c1
290 .endif
291 pslldq \c1, 8 // high part of (0; y''_1)
292
293 paddq \t, \c2 // propagate down
294 paddq \u, \c3
295 paddq \t, \c1 // and up: (y_0; y_2)
296 paddq \u, \c0 // (y_1; y_3)
297 .ifnes "\hi", "nil"
298 psrldq \hi, 8 // high part of (y''_3; 0)
299 .endif
300
301 // Finally extract the answer. This complicated dance is better than
302 // storing to memory and loading, because the piecemeal stores
303 // inhibit store forwarding.
304 movdqa \c3, \t // (y_0; ?)
305 movdqa \lo, \t // (y^*_0, ?; ?, ?)
306 psrldq \t, 8 // (y_2; 0)
307 psrlq \c3, 32 // (floor(y_0/B); ?)
308 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
309 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
310 psrldq \u, 8 // (y_3; 0)
311 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
312 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
313 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
314 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
315 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
316 .ifnes "\hi", "nil"
317 movdqa \t, \c3
318 pxor \u, \u
319 .endif
320 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
321 .ifnes "\hi", "nil"
322 psrlq \t, 32 // very high bits of y
323 paddq \hi, \t
324 punpcklqdq \hi, \u // carry up
325 .endif
326 punpckldq \lo, \c1 // y mod B^4
327 .endm
328
329 .macro carryadd
330 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
331 // hold the incoming carry registers c0, c1, and c2 representing a
332 // carry-in C.
333 //
334 // On exit, the carry registers, including XMM15, are updated to hold
335 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
336 // registers are preserved.
337 movd xmm0, [rdi + 0] // (a_0; 0)
338 movd xmm1, [rdi + 4] // (a_1; 0)
339 movd xmm2, [rdi + 8] // (a_2; 0)
340 movd xmm15, [rdi + 12] // (a_3; 0)
341 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
342 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
343 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2)
344 .endm
345
346 ///--------------------------------------------------------------------------
347 /// Primitive multipliers and related utilities.
348
349 INTFUNC(carryprop)
350 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
351 // expanded form. Store the low 128 bits of the represented carry to
352 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
353 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
354 // XMM14 are clobbered.
355 endprologue
356
357 propout xmm0, lo, xmm12, xmm13
358 propout xmm1, lo, xmm13, xmm14
359 propout xmm0, hi, xmm14, nil
360 endprop xmm1, hi, xmm14, xmm12
361 punpckldq xmm0, xmm1
362 movdqu [rdi], xmm0
363
364 ret
365 ENDFUNC
366
367 INTFUNC(dmul4)
368 // On entry, RDI points to the destination buffer; RAX and RBX point
369 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
370 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
371 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
372 //
373 // On exit, we write the low 128 bits of the sum C + U V + X Y to
374 // [RDI], and update the carry registers with the carry out. The
375 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
376 // registers are preserved.
377 endprologue
378
379 movdqu xmm4, [rax]
380 movdqu xmm5, [rbx]
381
382 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
383 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
384 propout xmm6, lo, xmm12, xmm13
385
386 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
387 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
388 propout xmm7, lo, xmm13, xmm14
389
390 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
391 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
392 propout xmm6, hi, xmm14, xmm15
393
394 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
395 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
396 propout xmm7, hi, xmm15, xmm12
397
398 punpckldq xmm6, xmm7
399 movdqu [rdi], xmm6
400
401 ret
402 ENDFUNC
403
404 INTFUNC(dmla4)
405 // On entry, RDI points to the destination buffer, which also
406 // contains an addend A to accumulate; RAX and RBX point to the
407 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
408 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
409 // incoming carry registers c0, c1, and c2 representing a carry-in C;
410 // c3 is assumed to be zero.
411 //
412 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
413 // [RDI], and update the carry registers with the carry out. The
414 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
415 // registers are preserved.
416 endprologue
417
418 movdqu xmm4, [rax]
419 movdqu xmm5, [rbx]
420 carryadd
421
422 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
423 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
424 propout xmm6, lo, xmm12, xmm13
425
426 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
427 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
428 propout xmm7, lo, xmm13, xmm14
429
430 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
431 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
432 propout xmm6, hi, xmm14, xmm15
433
434 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
435 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
436 propout xmm7, hi, xmm15, xmm12
437
438 punpckldq xmm6, xmm7
439 movdqu [rdi], xmm6
440
441 ret
442 ENDFUNC
443
444 INTFUNC(mul4zc)
445 // On entry, RDI points to the destination buffer; RBX points to a
446 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
447 //
448 // On exit, we write the low 128 bits of the product X Y to [RDI],
449 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
450 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
451 // general-purpose registers are preserved.
452 endprologue
453
454 movdqu xmm5, [rbx]
455
456 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
457 propout xmm6, lo, xmm12, xmm13
458
459 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
460 propout xmm7, lo, xmm13, xmm14
461
462 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
463 propout xmm6, hi, xmm14, xmm15
464
465 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
466 propout xmm7, hi, xmm15, xmm12
467
468 punpckldq xmm6, xmm7
469 movdqu [rdi], xmm6
470
471 ret
472 ENDFUNC
473
474 INTFUNC(mul4)
475 // On entry, RDI points to the destination buffer; RBX points to a
476 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
477 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
478 // c2, representing a carry-in C; c3 is assumed to be zero.
479 //
480 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
481 // and update the carry registers with the carry out. The registers
482 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
483 // general-purpose registers are preserved.
484 endprologue
485
486 movdqu xmm5, [rbx]
487
488 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
489 propout xmm6, lo, xmm12, xmm13
490
491 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
492 propout xmm7, lo, xmm13, xmm14
493
494 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
495 propout xmm6, hi, xmm14, xmm15
496
497 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
498 propout xmm7, hi, xmm15, xmm12
499
500 punpckldq xmm6, xmm7
501 movdqu [rdi], xmm6
502
503 ret
504 ENDFUNC
505
506 INTFUNC(mla4zc)
507 // On entry, RDI points to the destination buffer, which also
508 // contains an addend A to accumulate; RBX points to a packed operand
509 // X; and XMM10/XMM11 hold an expanded operand Y.
510 //
511 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
512 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
513 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
514 // general-purpose registers are preserved.
515 endprologue
516
517 movdqu xmm5, [rbx]
518 movd xmm12, [rdi + 0]
519 movd xmm13, [rdi + 4]
520 movd xmm14, [rdi + 8]
521 movd xmm15, [rdi + 12]
522
523 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
524 propout xmm6, lo, xmm12, xmm13
525
526 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
527 propout xmm7, lo, xmm13, xmm14
528
529 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
530 propout xmm6, hi, xmm14, xmm15
531
532 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
533 propout xmm7, hi, xmm15, xmm12
534
535 punpckldq xmm6, xmm7
536 movdqu [rdi], xmm6
537
538 ret
539 ENDFUNC
540
541 INTFUNC(mla4)
542 // On entry, RDI points to the destination buffer, which also
543 // contains an addend A to accumulate; RBX points to a packed operand
544 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
545 // XMM14 hold the incoming carry registers c0, c1, and c2,
546 // representing a carry-in C; c3 is assumed to be zero.
547 //
548 // On exit, we write the low 128 bits of the sum A + C + X Y to
549 // [RDI], and update the carry registers with the carry out. The
550 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
551 // general-purpose registers are preserved.
552 endprologue
553
554 movdqu xmm5, [rbx]
555 carryadd
556
557 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
558 propout xmm6, lo, xmm12, xmm13
559
560 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
561 propout xmm7, lo, xmm13, xmm14
562
563 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
564 propout xmm6, hi, xmm14, xmm15
565
566 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
567 propout xmm7, hi, xmm15, xmm12
568
569 punpckldq xmm6, xmm7
570 movdqu [rdi], xmm6
571
572 ret
573 ENDFUNC
574
575 INTFUNC(mmul4)
576 // On entry, RDI points to the destination buffer; RAX and RBX point
577 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
578 // the expanded operands V and M. The stack pointer must be 8 modulo
579 // 16 (as usual for AMD64 ABIs).
580 //
581 // On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
582 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
583 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
584 // XMM15 are clobbered; the general-purpose registers are preserved.
585 movdqu xmm4, [rax]
586 #if ABI_WIN
587 stalloc 48 + 8 // space for the carries
588 #endif
589 endprologue
590
591 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
592 // for later.
593 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
594 propout xmm7, lo, xmm12, xmm13
595 jmp 5f
596 ENDFUNC
597
598 INTFUNC(mmla4)
599 // On entry, RDI points to the destination buffer, which also
600 // contains an addend A to accumulate; RAX and RBX point to the
601 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
602 // expanded operands V and M. The stack pointer must be 8 modulo 16
603 // (as usual for AMD64 ABIs).
604 //
605 // On exit, we store Y = (A + U V) M mod B^4 in XMM10/XMM11, and write
606 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
607 // remaining carry in XMM12, XMM13, and XMM14. The registers
608 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
609 // are preserved.
610 movdqu xmm4, [rax]
611 #if ABI_WIN
612 stalloc 48 + 8 // space for the carries
613 # define STKTMP(i) [SP + i]
614 #endif
615 #if ABI_SYSV
616 # define STKTMP(i) [SP + i - 48 - 8] // use red zone
617 #endif
618 endprologue
619
620 movd xmm12, [rdi + 0]
621 movd xmm13, [rdi + 4]
622 movd xmm14, [rdi + 8]
623 movd xmm15, [rdi + 12]
624
625 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
626 // for later.
627 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
628 propout xmm7, lo, xmm12, xmm13
629
630 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
631 propout xmm6, lo, xmm13, xmm14
632
633 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
634 propout xmm7, hi, xmm14, xmm15
635
636 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
637 propout xmm6, hi, xmm15, xmm12
638
639 // Prepare W, and stash carries for later.
640 punpckldq xmm7, xmm6
641 movdqa STKTMP( 0), xmm12
642 movdqa STKTMP(16), xmm13
643 movdqa STKTMP(32), xmm14
644
645 // Calculate Y = W M. We just about have enough spare registers to
646 // make this work.
647 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
648
649 // Start expanding W back into the main carry registers...
650 pxor xmm15, xmm15
651 movdqa xmm12, xmm7
652 movdqa xmm14, xmm7
653
654 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
655 accum xmm4, xmm5, xmm6
656
657 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
658 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
659
660 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
661 accum xmm5, xmm6
662
663 pxor xmm2, xmm2
664 movdqa xmm13, xmm12
665 movdqa xmm15, xmm14
666
667 mulcore xmm7, 3, xmm10, xmm11, xmm0
668 accum xmm6
669
670 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
671 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
672 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
673 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
674
675 // That's lots of pieces. Now we have to assemble the answer.
676 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
677
678 // Expand it.
679 movdqu xmm5, [rbx]
680 expand xmm2, xmm10, xmm11
681
682 // Finish the calculation by adding the Montgomery product.
683 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
684 propout xmm6, lo, xmm12, xmm13
685
686 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
687 propout xmm7, lo, xmm13, xmm14
688
689 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
690 propout xmm6, hi, xmm14, xmm15
691
692 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
693 propout xmm7, hi, xmm15, xmm12
694
695 punpckldq xmm6, xmm7
696
697 // And add on the carry we calculated earlier.
698 paddq xmm12, STKTMP( 0)
699 paddq xmm13, STKTMP(16)
700 paddq xmm14, STKTMP(32)
701
702 // And, with that, we're done.
703 movdqu [rdi], xmm6
704 #if ABI_WIN
705 stfree 56
706 #endif
707 ret
708
709 #undef STKTMP
710
711 ENDFUNC
712
713 INTFUNC(mont4)
714 // On entry, RDI points to the destination buffer holding a packed
715 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
716 // expanded operand M.
717 //
718 // On exit, we store Y = W M mod B^4 in XMM10/XMM11, and write the low
719 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
720 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
721 // and XMM15 are clobbered; the general-purpose registers are
722 // preserved.
723 endprologue
724
725 movdqu xmm7, [rdi]
726
727 // Calculate Y = W M. Avoid the standard carry registers, because
728 // we're setting something else up there.
729 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
730
731 // Start expanding W back into the main carry registers...
732 pxor xmm15, xmm15
733 movdqa xmm12, xmm7
734 movdqa xmm14, xmm7
735
736 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
737 accum xmm4, xmm5, xmm6
738
739 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
740 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
741
742 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
743 accum xmm5, xmm6
744
745 pxor xmm2, xmm2
746 movdqa xmm13, xmm12
747 movdqa xmm15, xmm14
748
749 mulcore xmm7, 3, xmm8, xmm9, xmm0
750 accum xmm6
751
752 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
753 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
754 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
755 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
756
757 // That's lots of pieces. Now we have to assemble the answer.
758 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
759
760 // Expand it.
761 movdqu xmm5, [rbx]
762 expand xmm2, xmm10, xmm11
763
764 // Finish the calculation by adding the Montgomery product.
765 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
766 propout xmm6, lo, xmm12, xmm13
767
768 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
769 propout xmm7, lo, xmm13, xmm14
770
771 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
772 propout xmm6, hi, xmm14, xmm15
773
774 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
775 propout xmm7, hi, xmm15, xmm12
776
777 punpckldq xmm6, xmm7
778
779 // And, with that, we're done.
780 movdqu [rdi], xmm6
781 ret
782 ENDFUNC
783
784 ///--------------------------------------------------------------------------
785 /// Bulk multipliers.
786
787 FUNC(mpx_umul4_amd64_avx)
788 .arch .avx
789 vzeroupper
790 endprologue
791 .arch pentium4
792 ENDFUNC
793
794 FUNC(mpx_umul4_amd64_sse2)
795 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
796 // const mpw *bv, const mpw *bvl);
797
798 // Establish the arguments and do initial setup.
799 //
800 // sysv win
801 // inner loop dv rdi rdi*
802 // inner loop av rbx* rbx*
803 // outer loop dv r10 rcx
804 // outer loop bv rcx r9
805 // av base rsi rdx
806 // av limit rdx r8
807 // bv limit r8 r10
808
809 #if ABI_SYSV
810 # define DV r10
811 # define AV rsi
812 # define AVL rdx
813 # define BV rcx
814 # define BVL r8
815
816 pushreg rbx
817 endprologue
818
819 mov DV, rdi
820 #endif
821
822 #if ABI_WIN
823 # define DV rcx
824 # define AV rdx
825 # define AVL r8
826 # define BV r9
827 # define BVL r10
828
829 pushreg rbx
830 pushreg rdi
831 stalloc 160 + 8
832
833 savexmm xmm6, 0
834 savexmm xmm7, 16
835 savexmm xmm8, 32
836 savexmm xmm9, 48
837 savexmm xmm10, 64
838 savexmm xmm11, 80
839 savexmm xmm12, 96
840 savexmm xmm13, 112
841 savexmm xmm14, 128
842 savexmm xmm15, 144
843
844 endprologue
845
846 mov rdi, DV
847 mov BVL, [SP + 224]
848 #endif
849
850 // Prepare for the first iteration.
851 pxor xmm0, xmm0
852 movdqu xmm10, [BV] // bv[0]
853 mov rbx, AV
854 add DV, 16
855 add BV, 16
856 expand xmm0, xmm10, xmm11
857 call mul4zc
858 add rbx, 16
859 add rdi, 16
860 cmp rbx, AVL // all done?
861 jae 8f
862
863 .p2align 4
864 // Continue with the first iteration.
865 0: call mul4
866 add rbx, 16
867 add rdi, 16
868 cmp rbx, AVL // all done?
869 jb 0b
870
871 // Write out the leftover carry. There can be no tail here.
872 8: call carryprop
873 cmp BV, BVL // more passes to do?
874 jae 9f
875
876 .p2align 4
877 // Set up for the next pass.
878 1: movdqu xmm10, [BV] // bv[i]
879 mov rdi, DV // -> dv[i]
880 pxor xmm0, xmm0
881 expand xmm0, xmm10, xmm11
882 mov rbx, AV // -> av[0]
883 add DV, 16
884 add BV, 16
885 call mla4zc
886 add rbx, 16
887 add rdi, 16
888 cmp rbx, AVL // done yet?
889 jae 8f
890
891 .p2align 4
892 // Continue...
893 0: call mla4
894 add rbx, 16
895 add rdi, 16
896 cmp rbx, AVL
897 jb 0b
898
899 // Finish off this pass. There was no tail on the previous pass, and
900 // there can be none on this pass.
901 8: call carryprop
902 cmp BV, BVL
903 jb 1b
904
905 // All over.
906 9:
907
908 #if ABI_SYSV
909 popreg rbx
910 #endif
911
912 #if ABI_WIN
913 rstrxmm xmm6, 0
914 rstrxmm xmm7, 16
915 rstrxmm xmm8, 32
916 rstrxmm xmm9, 48
917 rstrxmm xmm10, 64
918 rstrxmm xmm11, 80
919 rstrxmm xmm12, 96
920 rstrxmm xmm13, 112
921 rstrxmm xmm14, 128
922 rstrxmm xmm15, 144
923
924 stfree 160 + 8
925 popreg rdi
926 popreg rbx
927 #endif
928
929 ret
930
931 #undef DV
932 #undef AV
933 #undef AVL
934 #undef BV
935 #undef BVL
936
937 ENDFUNC
938
939 FUNC(mpxmont_mul4_amd64_avx)
940 .arch .avx
941 vzeroupper
942 endprologue
943 .arch pentium4
944 ENDFUNC
945
946 FUNC(mpxmont_mul4_amd64_sse2)
947 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
948 // const mpw *nv, size_t n, const mpw *mi);
949
950 // Establish the arguments and do initial setup.
951 //
952 // sysv win
953 // inner loop dv rdi rdi*
954 // inner loop av rax rax
955 // inner loop nv rbx* rbx*
956 // mi r9 r10
957 // outer loop dv r10 rcx
958 // outer loop bv rdx r8
959 // av base rsi rdx
960 // av limit r11 r11
961 // bv limit r8 r12*
962 // nv base rcx r9
963 // n r8 r12*
964
965 #if ABI_SYSV
966 # define DV r10
967 # define AV rsi
968 # define AVL r11
969 # define BV rdx
970 # define BVL r8
971 # define NV rcx
972 # define N r8
973 # define MI r9
974
975 pushreg rbx
976 endprologue
977
978 mov DV, rdi
979 #endif
980
981 #if ABI_WIN
982 # define DV rcx
983 # define AV rdx
984 # define AVL r11
985 # define BV r8
986 # define BVL r12
987 # define NV r9
988 # define N r12
989 # define MI r10
990
991 pushreg rbx
992 pushreg rdi
993 pushreg r12
994 stalloc 160
995
996 savexmm xmm6, 0
997 savexmm xmm7, 16
998 savexmm xmm8, 32
999 savexmm xmm9, 48
1000 savexmm xmm10, 64
1001 savexmm xmm11, 80
1002 savexmm xmm12, 96
1003 savexmm xmm13, 112
1004 savexmm xmm14, 128
1005 savexmm xmm15, 144
1006
1007 endprologue
1008
1009 mov rdi, DV
1010 mov N, [SP + 224]
1011 mov MI, [SP + 232]
1012 #endif
1013
1014 // Establish the expanded operands.
1015 pxor xmm0, xmm0
1016 movdqu xmm8, [BV] // bv[0]
1017 movdqu xmm10, [MI] // mi
1018 expand xmm0, xmm8, xmm9, xmm10, xmm11
1019
1020 // Set up the outer loop state and prepare for the first iteration.
1021 mov rax, AV // -> U = av[0]
1022 mov rbx, NV // -> X = nv[0]
1023 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
1024 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
1025 add BV, 16
1026 add DV, 16
1027 call mmul4
1028 add rdi, 16
1029 add rax, 16
1030 add rbx, 16
1031 cmp rax, AVL // done already?
1032 jae 8f
1033
1034 .p2align 4
1035 // Complete the first inner loop.
1036 0: call dmul4
1037 add rdi, 16
1038 add rax, 16
1039 add rbx, 16
1040 cmp rax, AVL // done yet?
1041 jb 0b
1042
1043 // Still have carries left to propagate.
1044 call carryprop
1045 movd [rdi + 16], xmm12
1046
1047 .p2align 4
1048 // Embark on the next iteration. (There must be one. If n = 1, then
1049 // we would have bailed above, to label 8. Similarly, the subsequent
1050 // iterations can fall into the inner loop immediately.)
1051 1: pxor xmm0, xmm0
1052 movdqu xmm8, [BV] // bv[i]
1053 movdqu xmm10, [MI] // mi
1054 mov rdi, DV // -> Z = dv[i]
1055 mov rax, AV // -> U = av[0]
1056 mov rbx, NV // -> X = nv[0]
1057 expand xmm0, xmm8, xmm9, xmm10, xmm11
1058 add BV, 16
1059 add DV, 16
1060 call mmla4
1061 add rdi, 16
1062 add rax, 16
1063 add rbx, 16
1064
1065 .p2align 4
1066 // Complete the next inner loop.
1067 0: call dmla4
1068 add rdi, 16
1069 add rax, 16
1070 add rbx, 16
1071 cmp rax, AVL
1072 jb 0b
1073
1074 // Still have carries left to propagate, and they overlap the
1075 // previous iteration's final tail, so read that in and add it.
1076 movd xmm0, [rdi]
1077 paddq xmm12, xmm0
1078 call carryprop
1079 movd [rdi + 16], xmm12
1080
1081 // Back again, maybe.
1082 cmp BV, BVL
1083 jb 1b
1084
1085 // All done.
1086 9:
1087
1088 #if ABI_SYSV
1089 popreg rbx
1090 #endif
1091
1092 #if ABI_WIN
1093 rstrxmm xmm6, 0
1094 rstrxmm xmm7, 16
1095 rstrxmm xmm8, 32
1096 rstrxmm xmm9, 48
1097 rstrxmm xmm10, 64
1098 rstrxmm xmm11, 80
1099 rstrxmm xmm12, 96
1100 rstrxmm xmm13, 112
1101 rstrxmm xmm14, 128
1102 rstrxmm xmm15, 144
1103
1104 stfree 160
1105 popreg r12
1106 popreg rdi
1107 popreg rbx
1108 #endif
1109
1110 ret
1111
1112 // First iteration was short. Write out the carries and we're done.
1113 // (This could be folded into the main loop structure, but that would
1114 // penalize small numbers more.)
1115 8: call carryprop
1116 movd [rdi + 16], xmm12
1117 #if ABI_SYSV
1118 popreg rbx
1119 ret
1120 #endif
1121 #if ABI_WIN
1122 jmp 9b
1123 #endif
1124
1125 #undef DV
1126 #undef AV
1127 #undef AVL
1128 #undef BV
1129 #undef BVL
1130 #undef NV
1131 #undef N
1132 #undef MI
1133
1134 ENDFUNC
1135
1136 FUNC(mpxmont_redc4_amd64_avx)
1137 .arch .avx
1138 vzeroupper
1139 endprologue
1140 .arch pentium4
1141 ENDFUNC
1142
1143 FUNC(mpxmont_redc4_amd64_sse2)
1144 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1145 // size_t n, const mpw *mi);
1146
1147 // Establish the arguments and do initial setup.
1148 //
1149 // sysv win
1150 // inner loop dv rdi rdi*
1151 // dv limit rax rax
1152 // blocks-of-4 dv limit rsi rdx
1153 // inner loop nv rbx* rbx*
1154 // mi r8 r10
1155 // outer loop dv r10 rcx
1156 // outer loop dv limit r11 r11
1157 // nv base rdx r8
1158 // nv limit r9 r10*
1159 // n rcx r9
1160 // c rcx r9
1161
1162 #if ABI_SYSV
1163 # define DVL rax
1164 # define DVL4 rsi
1165 # define MI r8
1166 # define DV r10
1167 # define DVLO r11
1168 # define NV rdx
1169 # define NVL r9
1170 # define N rcx
1171 # define C ecx
1172
1173 pushreg rbx
1174 endprologue
1175
1176 mov DV, rdi
1177 #endif
1178
1179 #if ABI_WIN
1180 # define DVL rax
1181 # define DVL4 rdx
1182 # define MI r10
1183 # define DV rcx
1184 # define DVLO r11
1185 # define NV r8
1186 # define NVL r10
1187 # define N r9
1188 # define C r9d
1189
1190 pushreg rbx
1191 pushreg rdi
1192 stalloc 168
1193
1194 savexmm xmm6, 0
1195 savexmm xmm7, 16
1196 savexmm xmm8, 32
1197 savexmm xmm9, 48
1198 savexmm xmm10, 64
1199 savexmm xmm11, 80
1200 savexmm xmm12, 96
1201 savexmm xmm13, 112
1202 savexmm xmm14, 128
1203 savexmm xmm15, 144
1204
1205 endprologue
1206
1207 mov rdi, DV
1208 mov MI, [SP + 224]
1209 #endif
1210
1211 // Establish the expanded operands and the blocks-of-4 dv limit.
1212 pxor xmm0, xmm0
1213 mov DVL, DVL4 // -> dv[n] = dv limit
1214 sub DVL4, DV // length of dv in bytes
1215 movdqu xmm8, [MI] // mi
1216 and DVL4, ~15 // mask off the tail end
1217 expand xmm0, xmm8, xmm9
1218 add DVL4, DV // find limit
1219
1220 // Set up the outer loop state and prepare for the first iteration.
1221 mov rbx, NV // -> X = nv[0]
1222 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1223 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1224 add DV, 16
1225 call mont4
1226 add rbx, 16
1227 add rdi, 16
1228 cmp rbx, NVL // done already?
1229 jae 8f
1230
1231 .p2align 4
1232 // Complete the first inner loop.
1233 5: call mla4
1234 add rbx, 16
1235 add rdi, 16
1236 cmp rbx, NVL // done yet?
1237 jb 5b
1238
1239 // Still have carries left to propagate.
1240 8: carryadd
1241 psllq xmm15, 16
1242 pslldq xmm15, 8
1243 paddq xmm14, xmm15
1244 call carryprop
1245 movd C, xmm12
1246 add rdi, 16
1247 cmp rdi, DVL4
1248 jae 7f
1249
1250 .p2align 4
1251 // Continue carry propagation until the end of the buffer.
1252 0: add [rdi], C
1253 mov C, 0 // preserves flags
1254 adc dword ptr [rdi + 4], 0
1255 adc dword ptr [rdi + 8], 0
1256 adc dword ptr [rdi + 12], 0
1257 adc C, 0
1258 add rdi, 16
1259 cmp rdi, DVL4
1260 jb 0b
1261
1262 // Deal with the tail end. Note that the actual destination length
1263 // won't be an exact number of blocks of four, so it's safe to just
1264 // drop through here.
1265 7: add [rdi], C
1266 mov C, 0
1267 add rdi, 4
1268 adc C, 0
1269 cmp rdi, DVL
1270 jb 7b
1271
1272 // All done for this iteration. Start the next.
1273 cmp DV, DVLO // all done yet?
1274 jae 9f
1275 mov rdi, DV // -> Z = dv[i]
1276 mov rbx, NV // -> X = nv[0]
1277 add DV, 16
1278 call mont4
1279 add rdi, 16
1280 add rbx, 16
1281 jmp 5b
1282
1283 // All over.
1284 9:
1285
1286 #if ABI_SYSV
1287 popreg rbx
1288 #endif
1289
1290 #if ABI_WIN
1291 rstrxmm xmm6, 0
1292 rstrxmm xmm7, 16
1293 rstrxmm xmm8, 32
1294 rstrxmm xmm9, 48
1295 rstrxmm xmm10, 64
1296 rstrxmm xmm11, 80
1297 rstrxmm xmm12, 96
1298 rstrxmm xmm13, 112
1299 rstrxmm xmm14, 128
1300 rstrxmm xmm15, 144
1301
1302 stfree 168
1303 popreg rdi
1304 popreg rbx
1305 #endif
1306
1307 ret
1308
1309 #undef DVL
1310 #undef DVL4
1311 #undef MI
1312 #undef DV
1313 #undef DVLO
1314 #undef NV
1315 #undef NVL
1316 #undef N
1317 #undef C
1318
1319 ENDFUNC
1320
1321 ///--------------------------------------------------------------------------
1322 /// Testing and performance measurement.
1323
1324 #ifdef TEST_MUL4
1325
1326 #if ABI_SYSV
1327 # define ARG0 rdi
1328 # define ARG1 rsi
1329 # define ARG2 rdx
1330 # define ARG3 rcx
1331 # define ARG4 r8
1332 # define ARG5 r9
1333 # define ARG6 STKARG(0)
1334 # define ARG7 STKARG(1)
1335 # define ARG8 STKARG(2)
1336 # define STKARG_OFFSET 16
1337 #endif
1338 #if ABI_WIN
1339 # define ARG0 rcx
1340 # define ARG1 rdx
1341 # define ARG2 r8
1342 # define ARG3 r9
1343 # define ARG4 STKARG(0)
1344 # define ARG5 STKARG(1)
1345 # define ARG6 STKARG(2)
1346 # define ARG7 STKARG(3)
1347 # define ARG8 STKARG(4)
1348 # define STKARG_OFFSET 224
1349 #endif
1350 #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
1351
1352 // sysv win
1353 // dmul smul mmul mont dmul smul mmul mont
1354 // A rax
1355 // D rdx
1356 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1357 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1358 // y r10 -- -- rdx rdx -- -- r8 r8
1359 // u r11 rdx -- rcx -- r8 -- r9 --
1360 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1361 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1362 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1363 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1364 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1365
1366 .macro cysetup v, n
1367 rdtsc
1368 shl rdx, 32
1369 or rax, rdx
1370 mov [\v + 8*\n - 8], rax
1371 .endm
1372
1373 .macro cystore v, n
1374 rdtsc
1375 shl rdx, 32
1376 or rax, rdx
1377 sub rax, [\v + 8*\n - 8]
1378 mov [\v + 8*\n - 8], rax
1379 dec \n
1380 .endm
1381
1382 .macro testprologue mode
1383 pushreg rbx
1384 #if ABI_SYSV
1385 endprologue
1386 .ifeqs "\mode", "dmul"
1387 mov rbx, rcx
1388 movdqu xmm8, [r8]
1389 movdqu xmm10, [r9]
1390 mov r8d, STKARG(0)
1391 mov r9, STKARG(1)
1392 mov r11, rdx
1393 mov rcx, rsi
1394 .endif
1395 .ifeqs "\mode", "smul"
1396 mov rbx, rdx
1397 movdqu xmm10, [rcx]
1398 mov rcx, rsi
1399 .endif
1400 .ifeqs "\mode", "mmul"
1401 mov rax, STKARG(0)
1402 mov rbx, r8
1403 movdqu xmm8, [r9]
1404 movdqu xmm10, [rax]
1405 mov r8d, STKARG(1)
1406 mov r9, STKARG(2)
1407 mov r10, rdx
1408 mov r11, rcx
1409 mov rcx, rsi
1410 .endif
1411 .ifeqs "\mode", "mont"
1412 mov rbx, rcx
1413 movdqu xmm8, [r8]
1414 mov r8d, r9d
1415 mov r9, STKARG(0)
1416 mov r10, rdx
1417 mov rcx, rsi
1418 .endif
1419 #endif
1420 #if ABI_WIN
1421 pushreg rdi
1422 stalloc 168
1423 savexmm xmm6, 0
1424 savexmm xmm7, 16
1425 savexmm xmm8, 32
1426 savexmm xmm9, 48
1427 savexmm xmm10, 64
1428 savexmm xmm11, 80
1429 savexmm xmm12, 96
1430 savexmm xmm13, 112
1431 savexmm xmm14, 128
1432 savexmm xmm15, 144
1433 endprologue
1434 .ifeqs "\mode", "dmul"
1435 mov r10, STKARG(0)
1436 mov r11, STKARG(1)
1437 mov rdi, rcx
1438 mov rcx, rdx
1439 mov rbx, r9
1440 movdqu xmm8, [r10]
1441 movdqu xmm10, [r11]
1442 mov r11, r8
1443 mov r8d, STKARG(2)
1444 mov r9, STKARG(3)
1445 .endif
1446 .ifeqs "\mode", "smul"
1447 mov rdi, rcx
1448 mov rcx, rdx
1449 mov rbx, r8
1450 movdqu xmm10, [r9]
1451 mov r8d, STKARG(0)
1452 mov r9, STKARG(1)
1453 .endif
1454 .ifeqs "\mode", "mmul"
1455 mov r10, STKARG(1)
1456 mov r11, STKARG(2)
1457 mov rdi, rcx
1458 mov rcx, rdx
1459 mov rbx, STKARG(0)
1460 movdqu xmm8, [r10]
1461 movdqu xmm10, [r11]
1462 mov r10, r8
1463 mov r11, r9
1464 mov r8d, STKARG(3)
1465 mov r9, STKARG(4)
1466 .endif
1467 .ifeqs "\mode", "mont"
1468 mov r10, STKARG(0)
1469 mov rdi, rcx
1470 mov rcx, rdx
1471 mov rbx, r9
1472 movdqu xmm8, [r10]
1473 mov r10, r8
1474 mov r8d, STKARG(1)
1475 mov r9, STKARG(2)
1476 .endif
1477 #endif
1478
1479 pxor xmm0, xmm0
1480 .ifeqs "\mode", "dmul"
1481 expand xmm0, xmm8, xmm9, xmm10, xmm11
1482 .endif
1483 .ifeqs "\mode", "smul"
1484 expand xmm0, xmm10, xmm11
1485 .endif
1486 .ifeqs "\mode", "mmul"
1487 expand xmm0, xmm8, xmm9, xmm10, xmm11
1488 .endif
1489 .ifeqs "\mode", "mont"
1490 expand xmm0, xmm8, xmm9
1491 .endif
1492 .endm
1493
1494 .macro testepilogue
1495 #if ABI_WIN
1496 rstrxmm xmm6, 0
1497 rstrxmm xmm7, 16
1498 rstrxmm xmm8, 32
1499 rstrxmm xmm9, 48
1500 rstrxmm xmm10, 64
1501 rstrxmm xmm11, 80
1502 rstrxmm xmm12, 96
1503 rstrxmm xmm13, 112
1504 rstrxmm xmm14, 128
1505 rstrxmm xmm15, 144
1506 stfree 168
1507 popreg rdi
1508 #endif
1509 popreg rbx
1510 ret
1511 .endm
1512
1513 .macro testldcarry
1514 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1515 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1516 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1517 .endm
1518
1519 .macro testtop u=nil
1520 .p2align 4
1521 0:
1522 cysetup r9, r8
1523 .ifnes "\u", "nil"
1524 mov rax, \u
1525 .endif
1526 .endm
1527
1528 .macro testtail
1529 cystore r9, r8
1530 jnz 0b
1531 .endm
1532
1533 .macro testcarryout
1534 movdqu [rcx + 0], xmm12
1535 movdqu [rcx + 16], xmm13
1536 movdqu [rcx + 32], xmm14
1537 .endm
1538
1539 FUNC(test_dmul4)
1540 testprologue dmul
1541 testldcarry
1542 testtop r11
1543 call dmul4
1544 testtail
1545 testcarryout
1546 testepilogue
1547 ENDFUNC
1548
1549 FUNC(test_dmla4)
1550 testprologue dmul
1551 testldcarry
1552 testtop r11
1553 call dmla4
1554 testtail
1555 testcarryout
1556 testepilogue
1557 ENDFUNC
1558
1559 FUNC(test_mul4)
1560 testprologue smul
1561 testldcarry
1562 testtop nil
1563 call mul4
1564 testtail
1565 testcarryout
1566 testepilogue
1567 ENDFUNC
1568
1569 FUNC(test_mul4zc)
1570 testprologue smul
1571 testldcarry
1572 testtop nil
1573 call mul4zc
1574 testtail
1575 testcarryout
1576 testepilogue
1577 ENDFUNC
1578
1579 FUNC(test_mla4)
1580 testprologue smul
1581 testldcarry
1582 testtop nil
1583 call mla4
1584 testtail
1585 testcarryout
1586 testepilogue
1587 ENDFUNC
1588
1589 FUNC(test_mla4zc)
1590 testprologue smul
1591 testldcarry
1592 testtop nil
1593 call mla4zc
1594 testtail
1595 testcarryout
1596 testepilogue
1597 ENDFUNC
1598
1599 FUNC(test_mmul4)
1600 testprologue mmul
1601 testtop r11
1602 call mmul4
1603 testtail
1604 pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
1605 pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
1606 movdqu [r10 + 0], xmm10
1607 movdqu [r10 + 16], xmm11
1608 testcarryout
1609 testepilogue
1610 ENDFUNC
1611
1612 FUNC(test_mmla4)
1613 testprologue mmul
1614 testtop r11
1615 call mmla4
1616 testtail
1617 pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
1618 pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
1619 movdqu [r10 + 0], xmm10
1620 movdqu [r10 + 16], xmm11
1621 testcarryout
1622 testepilogue
1623 ENDFUNC
1624
1625 FUNC(test_mont4)
1626 testprologue mont
1627 testtop
1628 call mont4
1629 testtail
1630 pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
1631 pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
1632 movdqu [r10 + 0], xmm10
1633 movdqu [r10 + 16], xmm11
1634 testcarryout
1635 testepilogue
1636 ENDFUNC
1637
1638 #endif
1639
1640 ///----- That's all, folks --------------------------------------------------