[catacomb] / math / mpx-mul4-amd64-sse2.S
1/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2///
3/// Large SIMD-based multiplications
4///
5/// (c) 2016 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
 28///--------------------------------------------------------------------------
 28/// Preliminaries.
 29
 30#include "config.h"
 31#include "asm-common.h"
 32
 33 .arch pentium4
 34
 35 .text
36
37///--------------------------------------------------------------------------
38/// Theory.
39///
40/// We define a number of primitive fixed-size multipliers from which we can
41/// construct more general variable-length multipliers.
42///
43/// The basic trick is the same throughout. In an operand-scanning
44/// multiplication, the inner multiplication loop multiplies a
45/// multiple-precision operand by a single precision factor, and adds the
46/// result, appropriately shifted, to the result. A `finely integrated
47/// operand scanning' implementation of Montgomery multiplication also adds
48/// the product of a single-precision `Montgomery factor' and the modulus,
49/// calculated in the same pass. The more common `coarsely integrated
50/// operand scanning' alternates main multiplication and Montgomery passes,
51/// which requires additional carry propagation.
52///
 53/// In both the plain-multiplication and Montgomery stages, then, one of
54/// the factors remains constant throughout the operation, so we can afford
55/// to take a little time to preprocess it. The transformation we perform is
56/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
57/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
58/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
59/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
60/// operands, as follows.
61///
62/// Offset 0 4 8 12
63/// 0 v'_0 v'_1 v''_0 v''_1
64/// 16 v'_2 v'_3 v''_2 v''_3
65///
 66/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
67/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
68/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
69/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
70/// results in 64-bit fields. The sixteen bits of headroom allows us to add
71/// many products together before we must deal with carrying; it also allows
72/// for some calculations to be performed on the above expanded form.
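///
/// For example, with b = 2^16, the 32-bit piece v_0 = 0x89ab4567 splits as
/// v'_0 = 0x4567 and v''_0 = 0x89ab, since v_0 = v'_0 + v''_0 b.  A 32-bit
/// scalar times one of these 16-bit pieces is less than 2^48, so a 64-bit
/// lane holding such a product still has sixteen clear bits at the top, and
/// around 2^16 of these products can be accumulated before the lane can
/// overflow.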
73///
74/// ...
75///
76/// We maintain four `carry' registers accumulating intermediate results.
77/// The registers' precise roles rotate during the computation; we name them
78/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
 82/// `pmuludq' instruction acting on a scalar operand (broadcast across all
83/// lanes of its vector) and an operand in the expanded form above produces a
84/// result which can be added directly to the appropriate carry register.
85/// Following a pass of four multiplications, we perform some limited carry
86/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
89/// zeroed becomes c3.
90
91///--------------------------------------------------------------------------
92/// Macro definitions.
93
94.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
95 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
96 // of the product in registers D0, D1, D2, D3.
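	// (In the carry-register convention described above, the pieces
	// satisfy r_i s = D0 + D1 B + D2 B^2 + D3 B^3, where s is the value
	// represented by SLO/SHI.)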
 97 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
 98 .ifnes "\d1", "nil"
 99 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
 100 .endif
 101 .ifnes "\d3", "nil"
 102 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
 103 .endif
 104 .ifnes "\d1", "nil"
 105 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
 106 .endif
 107 .ifnes "\d2", "nil"
 108 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
 109 .endif
 110 .ifnes "\d3", "nil"
 111 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
 112 .endif
 113 .ifnes "\d1", "nil"
 114 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
 115 .endif
 116 .ifnes "\d3", "nil"
 117 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
 118 .endif
 119 .ifnes "\d2", "nil"
 120 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
 121 .endif
 122 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
 123.endm
124
125.macro accum c0, c1=nil, c2=nil, c3=nil
126 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
127 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
128 // updating that register.
129 paddq \c0, xmm0
130 .ifnes "\c1", "nil"
131 paddq \c1, xmm1
132 .endif
133 .ifnes "\c2", "nil"
134 paddq \c2, xmm2
135 .endif
136 .ifnes "\c3", "nil"
137 paddq \c3, xmm3
138 .endif
139.endm
140
141.macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
142 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
143 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
144 // contains zero, but needs clearing; in practice, we store the
145 // product directly rather than attempting to add. On completion,
146 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
147 .ifeqs "\z3p", "t"
148 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
149 accum \c0, \c1, \c2
150 .else
151 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
152 accum \c0, \c1, \c2, \c3
153 .endif
154.endm
155
156.macro propout d, pos, c, cc=nil
157 // Calculate an output word from C, and store it at POS in D;
158 // propagate carries out from C to CC in preparation for a rotation
159 // of the carry registers. D is an XMM register; the POS is either
160 // `lo' or `hi' according to whether the output word should be in
161 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
162 // completion, XMM3 is clobbered. If CC is `nil', then the
163 // contribution which would have been added to it is left in C.
 164 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
 165 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
 166 pslldq xmm3, 2 // (t b; 0)
 167 paddq \c, xmm3 // (c' + t b; c'')
168 .ifeqs "\pos", "lo"
169 movdqa \d, \c
170 .else
171 punpckldq \d, \c
172 .endif
173 psrlq \c, 32 // floor(c/B)
174 .ifnes "\cc", "nil"
175 paddq \cc, \c // propagate up
176 .endif
177.endm
178
179.macro endprop d, pos, c, t
180 // On entry, C contains a carry register. On exit, the low 32 bits
181 // of the value represented in C are written at POS in D, and the
182 // remaining bits are left at the bottom of T.
183 movdqa \t, \c
184 psllq \t, 16 // (?; c'' b)
185 pslldq \c, 8 // (0; c')
186 paddq \t, \c // (?; c' + c'' b)
187 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
188 .ifeqs "\pos", "lo"
189 movdqa \d, \t
190 .else
191 punpckldq \d, \t
192 .endif
 193 psrldq \t, 4 // (floor(c/B); 0)
194.endm
195
196.macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
 200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
 201 .ifnes "\c", "nil"
 202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
 203 .endif
 204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
 205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
 206 .ifnes "\c", "nil"
 207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
 208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
 209 .endif
 210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
 211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 212 .ifnes "\c", "nil"
 213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
 214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 215 .endif
 216.endm
217
218.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
223
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
 225 // the ones offset by 16 bits from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
228 movdqa \t, \c0
229 movdqa \u, \c1
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
234
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
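	// (In symbols: y_i = y'_i + y''_i b
	//	= (y'_i + (y''_i mod b) b) + floor(y''_i/b) B,
	// so the low sixteen bits stay in this word and the rest carries
	// into the word above.)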
237 movdqa \c2, \c0
238 movdqa \c3, \c1
239 psllq \c2, 48
240 psllq \c3, 48
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
245 .ifnes "\hi", "nil"
246 movdqa \hi, \c1
247 .endif
 248 pslldq \c1, 8 // high part of (0; y''_1)
249
250 paddq \t, \c2 // propagate down
251 paddq \u, \c3
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
 254 .ifnes "\hi", "nil"
 255 psrldq \hi, 8 // high part of (y''_3; 0)
256 .endif
257
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
 268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
 269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
 270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
 271 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
 272 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
273 .ifnes "\hi", "nil"
274 movdqa \t, \c3
275 pxor \u, \u
276 .endif
 277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
278 .ifnes "\hi", "nil"
279 psrlq \t, 32 // very high bits of y
280 paddq \hi, \t
281 punpcklqdq \hi, \u // carry up
282 .endif
283 punpckldq \lo, \c1 // y mod B^4
284.endm
285
286.macro carryadd
287 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
288 // hold the incoming carry registers c0, c1, and c2 representing a
289 // carry-in C.
290 //
291 // On exit, the carry registers, including XMM15, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
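	// (Note that XMM15 is not accumulated into: callers assume c3 is
	// zero at this point, so it is simply loaded with a_3.)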
294 movd xmm0, [rdi + 0] // (a_0; 0)
295 movd xmm1, [rdi + 4] // (a_1; 0)
296 movd xmm2, [rdi + 8] // (a_2; 0)
297 movd xmm15, [rdi + 12] // (a_3; 0)
298 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
299 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
300 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
301.endm
302
303///--------------------------------------------------------------------------
304/// Primitive multipliers and related utilities.
305
306INTFUNC(carryprop)
307 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
308 // expanded form. Store the low 128 bits of the represented carry to
309 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
310 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
311 // XMM14 are clobbered.
312 endprologue
313
314 propout xmm0, lo, xmm12, xmm13
315 propout xmm1, lo, xmm13, xmm14
316 propout xmm0, hi, xmm14, nil
317 endprop xmm1, hi, xmm14, xmm12
318 punpckldq xmm0, xmm1
319 movdqu [rdi], xmm0
320
321 ret
322ENDFUNC
323
324INTFUNC(dmul4)
325 // On entry, RDI points to the destination buffer; RAX and RBX point
326 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
327 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
328 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
329 //
330 // On exit, we write the low 128 bits of the sum C + U V + X Y to
331 // [RDI], and update the carry registers with the carry out. The
332 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
333 // registers are preserved.
334 endprologue
335
336 movdqu xmm4, [rax]
337 movdqu xmm5, [rbx]
338
339 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
340 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
341 propout xmm6, lo, xmm12, xmm13
342
343 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
344 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
345 propout xmm7, lo, xmm13, xmm14
346
347 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
348 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
349 propout xmm6, hi, xmm14, xmm15
350
351 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
352 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
353 propout xmm7, hi, xmm15, xmm12
354
355 punpckldq xmm6, xmm7
356 movdqu [rdi], xmm6
357
358 ret
359ENDFUNC
360
361INTFUNC(dmla4)
362 // On entry, RDI points to the destination buffer, which also
363 // contains an addend A to accumulate; RAX and RBX point to the
364 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
365 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
366 // incoming carry registers c0, c1, and c2 representing a carry-in C;
367 // c3 is assumed to be zero.
368 //
369 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
370 // [RDI], and update the carry registers with the carry out. The
371 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
372 // registers are preserved.
373 endprologue
374
375 movdqu xmm4, [rax]
376 movdqu xmm5, [rbx]
377 carryadd
378
379 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
380 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
381 propout xmm6, lo, xmm12, xmm13
382
383 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
384 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
385 propout xmm7, lo, xmm13, xmm14
386
387 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
388 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
389 propout xmm6, hi, xmm14, xmm15
390
391 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
392 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
393 propout xmm7, hi, xmm15, xmm12
394
395 punpckldq xmm6, xmm7
396 movdqu [rdi], xmm6
397
398 ret
399ENDFUNC
400
401INTFUNC(mul4zc)
402 // On entry, RDI points to the destination buffer; RBX points to a
403 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
404 //
405 // On exit, we write the low 128 bits of the product X Y to [RDI],
406 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
407 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
408 // general-purpose registers are preserved.
409 endprologue
410
411 movdqu xmm5, [rbx]
412
413 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
414 propout xmm6, lo, xmm12, xmm13
415
416 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
417 propout xmm7, lo, xmm13, xmm14
418
419 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
420 propout xmm6, hi, xmm14, xmm15
421
422 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
423 propout xmm7, hi, xmm15, xmm12
424
425 punpckldq xmm6, xmm7
426 movdqu [rdi], xmm6
427
428 ret
429ENDFUNC
430
431INTFUNC(mul4)
432 // On entry, RDI points to the destination buffer; RBX points to a
433 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
434 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
435 // c2, representing a carry-in C; c3 is assumed to be zero.
436 //
437 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
438 // and update the carry registers with the carry out. The registers
439 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
440 // general-purpose registers are preserved.
441 endprologue
442
443 movdqu xmm5, [rbx]
444
445 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
446 propout xmm6, lo, xmm12, xmm13
447
448 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
449 propout xmm7, lo, xmm13, xmm14
450
451 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
452 propout xmm6, hi, xmm14, xmm15
453
454 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
455 propout xmm7, hi, xmm15, xmm12
456
457 punpckldq xmm6, xmm7
458 movdqu [rdi], xmm6
459
460 ret
461ENDFUNC
462
463INTFUNC(mla4zc)
464 // On entry, RDI points to the destination buffer, which also
465 // contains an addend A to accumulate; RBX points to a packed operand
 466 // X; and XMM10/XMM11 hold an expanded operand Y.
467 //
468 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
469 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
470 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
471 // general-purpose registers are preserved.
472 endprologue
473
474 movdqu xmm5, [rbx]
475 movd xmm12, [rdi + 0]
476 movd xmm13, [rdi + 4]
477 movd xmm14, [rdi + 8]
478 movd xmm15, [rdi + 12]
479
480 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
481 propout xmm6, lo, xmm12, xmm13
482
483 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
484 propout xmm7, lo, xmm13, xmm14
485
486 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
487 propout xmm6, hi, xmm14, xmm15
488
489 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
490 propout xmm7, hi, xmm15, xmm12
491
492 punpckldq xmm6, xmm7
493 movdqu [rdi], xmm6
494
495 ret
496ENDFUNC
497
498INTFUNC(mla4)
499 // On entry, RDI points to the destination buffer, which also
500 // contains an addend A to accumulate; RBX points to a packed operand
501 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
502 // XMM14 hold the incoming carry registers c0, c1, and c2,
503 // representing a carry-in C; c3 is assumed to be zero.
504 //
505 // On exit, we write the low 128 bits of the sum A + C + X Y to
506 // [RDI], and update the carry registers with the carry out. The
507 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
508 // general-purpose registers are preserved.
509 endprologue
510
511 movdqu xmm5, [rbx]
512 carryadd
513
514 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
515 propout xmm6, lo, xmm12, xmm13
516
517 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
518 propout xmm7, lo, xmm13, xmm14
519
520 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
521 propout xmm6, hi, xmm14, xmm15
522
523 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
524 propout xmm7, hi, xmm15, xmm12
525
526 punpckldq xmm6, xmm7
527 movdqu [rdi], xmm6
528
529 ret
530ENDFUNC
531
532INTFUNC(mmul4)
533 // On entry, RDI points to the destination buffer; RAX and RBX point
534 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
535 // the expanded operands V and M. The stack pointer must be 8 modulo 16
536 // (as usual for AMD64 ABIs).
537 //
538 // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
539 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
540 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
541 // XMM15 are clobbered; the general-purpose registers are preserved.
542 movdqu xmm4, [rax]
543#if ABI_WIN
544 stalloc 48 + 8 // space for the carries
545#endif
546 endprologue
547
548 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
549 // for later.
550 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
551 propout xmm7, lo, xmm12, xmm13
552 jmp 5f
553ENDFUNC
554
555INTFUNC(mmla4)
556 // On entry, RDI points to the destination buffer, which also
557 // contains an addend A to accumulate; RAX and RBX point to the
558 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
559 // expanded operands V and M. The stack pointer must be 8 modulo 16
560 // (as usual for AMD64 ABIs).
561 //
562 // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
563 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
564 // remaining carry in XMM12, XMM13, and XMM14. The registers
565 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
566 // are preserved.
567 movdqu xmm4, [rax]
568#if ABI_WIN
569 stalloc 48 + 8 // space for the carries
 570# define STKTMP(i) [SP + i]
571#endif
572#if ABI_SYSV
 573# define STKTMP(i) [SP + i - 48 - 8] // use red zone
574#endif
575 endprologue
576
577 movd xmm12, [rdi + 0]
578 movd xmm13, [rdi + 4]
579 movd xmm14, [rdi + 8]
580 movd xmm15, [rdi + 12]
581
582 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
583 // for later.
584 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
585 propout xmm7, lo, xmm12, xmm13
586
5875: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
588 propout xmm6, lo, xmm13, xmm14
589
590 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
591 propout xmm7, hi, xmm14, xmm15
592
593 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
594 propout xmm6, hi, xmm15, xmm12
595
596 // Prepare W, and stash carries for later.
597 punpckldq xmm7, xmm6
598 movdqa STKTMP( 0), xmm12
599 movdqa STKTMP(16), xmm13
600 movdqa STKTMP(32), xmm14
601
602 // Calculate Y = W M. We just about have enough spare registers to
603 // make this work.
604 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
605
606 // Start expanding W back into the main carry registers...
607 pxor xmm15, xmm15
608 movdqa xmm12, xmm7
609 movdqa xmm14, xmm7
610
611 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
612 accum xmm4, xmm5, xmm6
613
614 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
615 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
616
617 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
618 accum xmm5, xmm6
619
620 pxor xmm2, xmm2
621 movdqa xmm13, xmm12
622 movdqa xmm15, xmm14
623
624 mulcore xmm7, 3, xmm10, xmm11, xmm0
625 accum xmm6
626
627 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
628 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
629 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
630 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
631
632 // That's lots of pieces. Now we have to assemble the answer.
633 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
634
635 // Expand it.
636 movdqu xmm5, [rbx]
637 expand xmm2, xmm10, xmm11
638
639 // Finish the calculation by adding the Montgomery product.
 640 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 641 propout xmm6, lo, xmm12, xmm13
 642
 643 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 644 propout xmm7, lo, xmm13, xmm14
 645
 646 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 647 propout xmm6, hi, xmm14, xmm15
 648
 649 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
650 propout xmm7, hi, xmm15, xmm12
651
652 punpckldq xmm6, xmm7
653
 654 // Add on the carry we calculated earlier.
655 paddq xmm12, STKTMP( 0)
656 paddq xmm13, STKTMP(16)
657 paddq xmm14, STKTMP(32)
658
659 // And, with that, we're done.
660 movdqu [rdi], xmm6
661#if ABI_WIN
662 stfree 56
663#endif
664 ret
665
666#undef STKTMP
667
668ENDFUNC
669
670INTFUNC(mont4)
671 // On entry, RDI points to the destination buffer holding a packed
672 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
673 // expanded operand M.
674 //
675 // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
676 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
677 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
678 // and XMM15 are clobbered; the general-purpose registers are
679 // preserved.
680 endprologue
681
682 movdqu xmm7, [rdi]
683
684 // Calculate Y = W M. Avoid the standard carry registers, because
685 // we're setting something else up there.
686 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
687
688 // Start expanding W back into the main carry registers...
689 pxor xmm15, xmm15
690 movdqa xmm12, xmm7
691 movdqa xmm14, xmm7
692
693 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
694 accum xmm4, xmm5, xmm6
695
696 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
697 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
698
699 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
700 accum xmm5, xmm6
701
702 pxor xmm2, xmm2
703 movdqa xmm13, xmm12
704 movdqa xmm15, xmm14
705
706 mulcore xmm7, 3, xmm8, xmm9, xmm0
707 accum xmm6
708
709 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
710 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
711 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
712 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
713
714 // That's lots of pieces. Now we have to assemble the answer.
715 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
716
717 // Expand it.
718 movdqu xmm5, [rbx]
719 expand xmm2, xmm10, xmm11
720
721 // Finish the calculation by adding the Montgomery product.
 722 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 723 propout xmm6, lo, xmm12, xmm13
 724
 725 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 726 propout xmm7, lo, xmm13, xmm14
 727
 728 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 729 propout xmm6, hi, xmm14, xmm15
 730
 731 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
732 propout xmm7, hi, xmm15, xmm12
733
734 punpckldq xmm6, xmm7
735
736 // And, with that, we're done.
737 movdqu [rdi], xmm6
738 ret
739ENDFUNC
740
741///--------------------------------------------------------------------------
742/// Bulk multipliers.
743
744FUNC(mpx_umul4_amd64_avx)
745 .arch .avx
746 vzeroupper
747 endprologue
748 .arch pentium4
749ENDFUNC
750
751FUNC(mpx_umul4_amd64_sse2)
752 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
753 // const mpw *bv, const mpw *bvl);
754
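	// As a rough reference model: the routine computes the plain product
	// dv := av * bv, schoolbook fashion.  (Illustrative C only, not part
	// of the build; it ignores the four-word blocking and padding that
	// this kernel relies on, and `ref_umul4' and the explicit 32-bit
	// word type are assumptions made for the sketch.)
	//
	//	#include <stddef.h>
	//	#include <stdint.h>
	//
	//	void ref_umul4(uint32_t *dv,
	//		       const uint32_t *av, const uint32_t *avl,
	//		       const uint32_t *bv, const uint32_t *bvl)
	//	{
	//	  size_t na = avl - av, nb = bvl - bv, i, j;
	//	  for (i = 0; i < na + nb; i++) dv[i] = 0;
	//	  for (i = 0; i < nb; i++) {
	//	    uint64_t c = 0;		// double-width accumulator/carry
	//	    for (j = 0; j < na; j++) {
	//	      c += (uint64_t)av[j]*bv[i] + dv[i + j];
	//	      dv[i + j] = (uint32_t)c; c >>= 32;
	//	    }
	//	    dv[i + na] = (uint32_t)c;	// top word of this pass
	//	  }
	//	}
	//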
755 // Establish the arguments and do initial setup.
756 //
757 // sysv win
758 // inner loop dv rdi rdi*
759 // inner loop av rbx* rbx*
760 // outer loop dv r10 rcx
761 // outer loop bv rcx r9
762 // av base rsi rdx
763 // av limit rdx r8
764 // bv limit r8 r10
765
766#if ABI_SYSV
767# define DV r10
768# define AV rsi
769# define AVL rdx
770# define BV rcx
771# define BVL r8
772
773 pushreg rbx
774 endprologue
775
776 mov DV, rdi
777#endif
778
779#if ABI_WIN
780# define DV rcx
781# define AV rdx
782# define AVL r8
783# define BV r9
784# define BVL r10
785
786 pushreg rbx
787 pushreg rdi
788 stalloc 160 + 8
789
790 savexmm xmm6, 0
791 savexmm xmm7, 16
792 savexmm xmm8, 32
793 savexmm xmm9, 48
794 savexmm xmm10, 64
795 savexmm xmm11, 80
796 savexmm xmm12, 96
797 savexmm xmm13, 112
798 savexmm xmm14, 128
799 savexmm xmm15, 144
800
801 endprologue
802
803 mov rdi, DV
 804 mov BVL, [SP + 224]
805#endif
806
807 // Prepare for the first iteration.
808 pxor xmm0, xmm0
809 movdqu xmm10, [BV] // bv[0]
810 mov rbx, AV
811 add DV, 16
812 add BV, 16
813 expand xmm0, xmm10, xmm11
814 call mul4zc
815 add rbx, 16
816 add rdi, 16
817 cmp rbx, AVL // all done?
818 jae 8f
819
820 .p2align 4
821 // Continue with the first iteration.
8220: call mul4
823 add rbx, 16
824 add rdi, 16
825 cmp rbx, AVL // all done?
826 jb 0b
827
828 // Write out the leftover carry. There can be no tail here.
8298: call carryprop
830 cmp BV, BVL // more passes to do?
831 jae 9f
832
833 .p2align 4
834 // Set up for the next pass.
8351: movdqu xmm10, [BV] // bv[i]
836 mov rdi, DV // -> dv[i]
837 pxor xmm0, xmm0
838 expand xmm0, xmm10, xmm11
839 mov rbx, AV // -> av[0]
840 add DV, 16
841 add BV, 16
842 call mla4zc
843 add rbx, 16
844 add rdi, 16
845 cmp rbx, AVL // done yet?
846 jae 8f
847
848 .p2align 4
849 // Continue...
8500: call mla4
851 add rbx, 16
852 add rdi, 16
853 cmp rbx, AVL
854 jb 0b
855
856 // Finish off this pass. There was no tail on the previous pass, and
857 // there can be none on this pass.
8588: call carryprop
859 cmp BV, BVL
860 jb 1b
861
862 // All over.
8639:
864
865#if ABI_SYSV
866 popreg rbx
867#endif
868
869#if ABI_WIN
870 rstrxmm xmm6, 0
871 rstrxmm xmm7, 16
872 rstrxmm xmm8, 32
873 rstrxmm xmm9, 48
874 rstrxmm xmm10, 64
875 rstrxmm xmm11, 80
876 rstrxmm xmm12, 96
877 rstrxmm xmm13, 112
878 rstrxmm xmm14, 128
879 rstrxmm xmm15, 144
880
881 stfree 160 + 8
882 popreg rdi
883 popreg rbx
884#endif
885
886 ret
887
888#undef DV
889#undef AV
890#undef AVL
891#undef BV
892#undef BVL
893
894ENDFUNC
895
896FUNC(mpxmont_mul4_amd64_avx)
897 .arch .avx
898 vzeroupper
899 endprologue
900 .arch pentium4
901ENDFUNC
902
903FUNC(mpxmont_mul4_amd64_sse2)
904 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
905 // const mpw *nv, size_t n, const mpw *mi);
906
907 // Establish the arguments and do initial setup.
908 //
909 // sysv win
910 // inner loop dv rdi rdi*
911 // inner loop av rax rax
912 // inner loop nv rbx* rbx*
913 // mi r9 r10
914 // outer loop dv r10 rcx
915 // outer loop bv rdx r8
916 // av base rsi rdx
917 // av limit r11 r11
918 // bv limit r8 r12*
919 // nv base rcx r9
920 // n r8 r12*
921
922#if ABI_SYSV
923# define DV r10
924# define AV rsi
925# define AVL r11
926# define BV rdx
927# define BVL r8
928# define NV rcx
929# define N r8
930# define MI r9
931
932 pushreg rbx
933 endprologue
934
935 mov DV, rdi
936#endif
937
938#if ABI_WIN
939# define DV rcx
940# define AV rdx
941# define AVL r11
942# define BV r8
943# define BVL r12
944# define NV r9
945# define N r12
946# define MI r10
947
948 pushreg rbx
949 pushreg rdi
950 pushreg r12
951 stalloc 160
952
953 savexmm xmm6, 0
954 savexmm xmm7, 16
955 savexmm xmm8, 32
956 savexmm xmm9, 48
957 savexmm xmm10, 64
958 savexmm xmm11, 80
959 savexmm xmm12, 96
960 savexmm xmm13, 112
961 savexmm xmm14, 128
962 savexmm xmm15, 144
963
964 endprologue
965
966 mov rdi, DV
967 mov N, [SP + 224]
968 mov MI, [SP + 232]
969#endif
970
971 // Establish the expanded operands.
972 pxor xmm0, xmm0
973 movdqu xmm8, [BV] // bv[0]
974 movdqu xmm10, [MI] // mi
975 expand xmm0, xmm8, xmm9, xmm10, xmm11
976
977 // Set up the outer loop state and prepare for the first iteration.
978 mov rax, AV // -> U = av[0]
979 mov rbx, NV // -> X = nv[0]
980 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
981 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
982 add BV, 16
983 add DV, 16
984 call mmul4
985 add rdi, 16
986 add rax, 16
987 add rbx, 16
988 cmp rax, AVL // done already?
989 jae 8f
990
991 .p2align 4
992 // Complete the first inner loop.
9930: call dmul4
994 add rdi, 16
995 add rax, 16
996 add rbx, 16
997 cmp rax, AVL // done yet?
998 jb 0b
999
1000 // Still have carries left to propagate.
1001 call carryprop
1002 movd [rdi + 16], xmm12
1003
1004 .p2align 4
1005 // Embark on the next iteration. (There must be one. If n = 1, then
1006 // we would have bailed above, to label 8. Similarly, the subsequent
1007 // iterations can fall into the inner loop immediately.)
10081: pxor xmm0, xmm0
1009 movdqu xmm8, [BV] // bv[i]
1010 movdqu xmm10, [MI] // mi
1011 mov rdi, DV // -> Z = dv[i]
1012 mov rax, AV // -> U = av[0]
1013 mov rbx, NV // -> X = nv[0]
1014 expand xmm0, xmm8, xmm9, xmm10, xmm11
1015 add BV, 16
1016 add DV, 16
1017 call mmla4
1018 add rdi, 16
1019 add rax, 16
1020 add rbx, 16
1021
1022 .p2align 4
1023 // Complete the next inner loop.
10240: call dmla4
1025 add rdi, 16
1026 add rax, 16
1027 add rbx, 16
1028 cmp rax, AVL
1029 jb 0b
1030
1031 // Still have carries left to propagate, and they overlap the
1032 // previous iteration's final tail, so read that in and add it.
1033 movd xmm0, [rdi]
1034 paddq xmm12, xmm0
1035 call carryprop
1036 movd [rdi + 16], xmm12
1037
1038 // Back again, maybe.
1039 cmp BV, BVL
1040 jb 1b
1041
1042 // All done.
10439:
1044
1045#if ABI_SYSV
1046 popreg rbx
1047#endif
1048
1049#if ABI_WIN
1050 rstrxmm xmm6, 0
1051 rstrxmm xmm7, 16
1052 rstrxmm xmm8, 32
1053 rstrxmm xmm9, 48
1054 rstrxmm xmm10, 64
1055 rstrxmm xmm11, 80
1056 rstrxmm xmm12, 96
1057 rstrxmm xmm13, 112
1058 rstrxmm xmm14, 128
1059 rstrxmm xmm15, 144
1060
1061 stfree 160
1062 popreg r12
1063 popreg rdi
1064 popreg rbx
1065#endif
1066
1067 ret
1068
1069 // First iteration was short. Write out the carries and we're done.
1070 // (This could be folded into the main loop structure, but that would
1071 // penalize small numbers more.)
10728: call carryprop
1073 movd [rdi + 16], xmm12
1074#if ABI_SYSV
1075 popreg rbx
1076 ret
1077#endif
1078#if ABI_WIN
1079 jmp 9b
1080#endif
1081
1082#undef DV
1083#undef AV
1084#undef AVL
1085#undef BV
1086#undef BVL
1087#undef NV
1088#undef N
1089#undef MI
1090
1091ENDFUNC
1092
1093FUNC(mpxmont_redc4_amd64_avx)
1094 .arch .avx
1095 vzeroupper
1096 endprologue
1097 .arch pentium4
1098ENDFUNC
1099
1100FUNC(mpxmont_redc4_amd64_sse2)
1101 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1102 // size_t n, const mpw *mi);
1103
1104 // Establish the arguments and do initial setup.
1105 //
1106 // sysv win
1107 // inner loop dv rdi rdi*
1108 // dv limit rax rax
1109 // blocks-of-4 dv limit rsi rdx
1110 // inner loop nv rbx* rbx*
1111 // mi r8 r10
1112 // outer loop dv r10 rcx
1113 // outer loop dv limit r11 r11
1114 // nv base rdx r8
1115 // nv limit r9 r12*
1116 // n rcx r9
1117 // c rcx r9
1118
1119#if ABI_SYSV
1120# define DVL rax
1121# define DVL4 rsi
1122# define MI r8
1123# define DV r10
1124# define DVLO r11
1125# define NV rdx
1126# define NVL r9
1127# define N rcx
1128# define C ecx
1129
1130 pushreg rbx
1131 endprologue
1132
1133 mov DV, rdi
1134#endif
1135
1136#if ABI_WIN
1137# define DVL rax
1138# define DVL4 rdx
1139# define MI r10
1140# define DV rcx
1141# define DVLO r11
1142# define NV r8
1143# define NVL r12
1144# define N r9
1145# define C r9d
1146
1147 pushreg rbx
1148 pushreg rdi
1149 pushreg r12
1150 stalloc 160
1151
1152 savexmm xmm6, 0
1153 savexmm xmm7, 16
1154 savexmm xmm8, 32
1155 savexmm xmm9, 48
1156 savexmm xmm10, 64
1157 savexmm xmm11, 80
1158 savexmm xmm12, 96
1159 savexmm xmm13, 112
1160 savexmm xmm14, 128
1161 savexmm xmm15, 144
1162
1163 endprologue
1164
1165 mov rdi, DV
 1166 mov MI, [SP + 224]
1167#endif
1168
1169 // Establish the expanded operands and the blocks-of-4 dv limit.
1170 pxor xmm0, xmm0
1171 mov DVL, DVL4 // -> dv[n] = dv limit
1172 sub DVL4, DV // length of dv in bytes
1173 movdqu xmm8, [MI] // mi
1174 and DVL4, ~15 // mask off the tail end
1175 expand xmm0, xmm8, xmm9
1176 add DVL4, DV // find limit
1177
1178 // Set up the outer loop state and prepare for the first iteration.
1179 mov rbx, NV // -> X = nv[0]
1180 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1181 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1182 add DV, 16
1183 call mont4
1184 add rbx, 16
1185 add rdi, 16
1186 cmp rbx, NVL // done already?
1187 jae 8f
1188
1189 .p2align 4
1190 // Complete the first inner loop.
11915: call mla4
1192 add rbx, 16
1193 add rdi, 16
1194 cmp rbx, NVL // done yet?
1195 jb 5b
1196
1197 // Still have carries left to propagate.
11988: carryadd
1199 psllq xmm15, 16
1200 pslldq xmm15, 8
1201 paddq xmm14, xmm15
1202 call carryprop
1203 movd C, xmm12
1204 add rdi, 16
1205 cmp rdi, DVL4
1206 jae 7f
1207
1208 .p2align 4
1209 // Continue carry propagation until the end of the buffer.
12100: add [rdi], C
1211 mov C, 0 // preserves flags
1212 adcd [rdi + 4], 0
1213 adcd [rdi + 8], 0
1214 adcd [rdi + 12], 0
1215 adc C, 0
1216 add rdi, 16
1217 cmp rdi, DVL4
1218 jb 0b
1219
1220 // Deal with the tail end.
12217: add [rdi], C
1222 mov C, 0 // preserves flags
1223 add rdi, 4
1224 adc C, 0
1225 cmp rdi, DVL
1226 jb 7b
1227
1228 // All done for this iteration. Start the next. (This must have at
1229 // least one follow-on iteration, or we'd not have started this outer
1230 // loop.)
12318: mov rdi, DV // -> Z = dv[i]
1232 mov rbx, NV // -> X = nv[0]
1233 cmp rdi, DVLO // all done yet?
1234 jae 9f
1235 add DV, 16
1236 call mont4
1237 add rdi, 16
1238 add rbx, 16
1239 jmp 5b
1240
1241 // All over.
12429:
1243
1244#if ABI_SYSV
1245 popreg rbx
1246#endif
1247
1248#if ABI_WIN
1249 rstrxmm xmm6, 0
1250 rstrxmm xmm7, 16
1251 rstrxmm xmm8, 32
1252 rstrxmm xmm9, 48
1253 rstrxmm xmm10, 64
1254 rstrxmm xmm11, 80
1255 rstrxmm xmm12, 96
1256 rstrxmm xmm13, 112
1257 rstrxmm xmm14, 128
1258 rstrxmm xmm15, 144
1259
1260 stfree 160
1261 popreg r12
1262 popreg rdi
1263 popreg rbx
1264#endif
1265
1266 ret
1267
1268#undef DVL
1269#undef DVL4
1270#undef MI
1271#undef DV
1272#undef DVLO
1273#undef NV
1274#undef NVL
1275#undef N
1276#undef C
1277
1278ENDFUNC
1279
1280///--------------------------------------------------------------------------
1281/// Testing and performance measurement.
1282
1283#ifdef TEST_MUL4
1284
1285#if ABI_SYSV
1286# define ARG0 rdi
1287# define ARG1 rsi
1288# define ARG2 rdx
1289# define ARG3 rcx
1290# define ARG4 r8
1291# define ARG5 r9
1292# define ARG6 STKARG(0)
1293# define ARG7 STKARG(1)
1294# define ARG8 STKARG(2)
1295# define STKARG_OFFSET 16
1296#endif
1297#if ABI_WIN
1298# define ARG0 rcx
1299# define ARG1 rdx
1300# define ARG2 r8
1301# define ARG3 r9
1302# define ARG4 STKARG(0)
1303# define ARG5 STKARG(1)
1304# define ARG6 STKARG(2)
1305# define ARG7 STKARG(3)
1306# define ARG8 STKARG(4)
 1307# define STKARG_OFFSET 224
 1308#endif
 1309#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
1310
1311// sysv win
1312// dmul smul mmul mont dmul smul mmul mont
1313// A rax
1314// D rdx
1315// z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1316// c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1317// y r10 -- -- rdx rdx -- -- r8 r8
1318// u r11 rdx -- rcx -- r8 -- r9 --
1319// x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1320// vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1321// yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1322// n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1323// cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1324
1325.macro cysetup v, n
1326 rdtsc
1327 shl rdx, 32
1328 or rax, rdx
1329 mov [\v + 8*\n - 8], rax
1330.endm
1331
1332.macro cystore v, n
1333 rdtsc
1334 shl rdx, 32
1335 or rax, rdx
1336 sub rax, [\v + 8*\n - 8]
1337 mov [\v + 8*\n - 8], rax
1338 dec \n
1339.endm
1340
1341.macro testprologue mode
1342 pushreg rbx
1343#if ABI_SYSV
1344 endprologue
1345 .ifeqs "\mode", "dmul"
1346 mov rbx, rcx
1347 movdqu xmm8, [r8]
1348 movdqu xmm10, [r9]
1349 mov r8d, STKARG(0)
1350 mov r9, STKARG(1)
1351 mov r11, rdx
1352 mov rcx, rsi
1353 .endif
1354 .ifeqs "\mode", "smul"
1355 mov rbx, rdx
1356 movdqu xmm10, [rcx]
1357 mov rcx, rsi
1358 .endif
1359 .ifeqs "\mode", "mmul"
1360 mov rax, STKARG(0)
1361 mov rbx, r8
1362 movdqu xmm8, [r9]
1363 movdqu xmm10, [rax]
 1364 mov r8d, STKARG(1)
1365 mov r9, STKARG(2)
1366 mov r10, rdx
1367 mov r11, rcx
1368 mov rcx, rsi
1369 .endif
1370 .ifeqs "\mode", "mont"
1371 mov rbx, rcx
1372 movdqu xmm8, [r8]
 1373 mov r8d, r9d
1374 mov r9, STKARG(0)
1375 mov r10, rdx
1376 mov rcx, rsi
1377 .endif
1378#endif
1379#if ABI_WIN
1380 pushreg rdi
1381 stalloc 168
1382 savexmm xmm6, 0
1383 savexmm xmm7, 16
1384 savexmm xmm8, 32
1385 savexmm xmm9, 48
1386 savexmm xmm10, 64
1387 savexmm xmm11, 80
1388 savexmm xmm12, 96
1389 savexmm xmm13, 112
1390 savexmm xmm14, 128
1391 savexmm xmm15, 144
1392 endprologue
1393 .ifeqs "\mode", "dmul"
1394 mov r10, STKARG(0)
1395 mov r11, STKARG(1)
1396 mov rdi, rcx
1397 mov rcx, rdx
1398 mov rbx, r9
1399 movdqu xmm8, [r10]
1400 movdqu xmm10, [r11]
 1401 mov r11, r8
1402 mov r8d, STKARG(2)
1403 mov r9, STKARG(3)
1404 .endif
1405 .ifeqs "\mode", "smul"
1406 mov rdi, rcx
1407 mov rcx, rdx
1408 mov rbx, r8
1409 movdqu xmm10, [r9]
 1410 mov r8d, STKARG(0)
1411 mov r9, STKARG(1)
1412 .endif
1413 .ifeqs "\mode", "mmul"
1414 mov r10, STKARG(1)
1415 mov r11, STKARG(2)
1416 mov rdi, rcx
1417 mov rcx, rdx
1418 mov rbx, STKARG(0)
1419 movdqu xmm8, [r10]
1420 movdqu xmm10, [r11]
1421 mov r10, r8
1422 mov r11, r9
1423 mov r8d, STKARG(3)
1424 mov r9, STKARG(4)
1425 .endif
1426 .ifeqs "\mode", "mont"
1427 mov r10, STKARG(0)
1428 mov rdi, rcx
1429 mov rcx, rdx
1430 mov rbx, r9
1431 movdqu xmm8, [r10]
 1432 mov r10, r8
1433 mov r8d, STKARG(1)
1434 mov r9, STKARG(2)
1435 .endif
1436#endif
1437
1438 pxor xmm0, xmm0
1439 .ifeqs "\mode", "dmul"
1440 expand xmm0, xmm8, xmm9, xmm10, xmm11
1441 .endif
1442 .ifeqs "\mode", "smul"
1443 expand xmm0, xmm10, xmm11
1444 .endif
1445 .ifeqs "\mode", "mmul"
1446 expand xmm0, xmm8, xmm9, xmm10, xmm11
1447 .endif
1448 .ifeqs "\mode", "mont"
1449 expand xmm0, xmm8, xmm9
1450 .endif
1451.endm
1452
1453.macro testepilogue
1454#if ABI_WIN
1455 rstrxmm xmm6, 0
1456 rstrxmm xmm7, 16
1457 rstrxmm xmm8, 32
1458 rstrxmm xmm9, 48
1459 rstrxmm xmm10, 64
1460 rstrxmm xmm11, 80
1461 rstrxmm xmm12, 96
1462 rstrxmm xmm13, 112
1463 rstrxmm xmm14, 128
1464 rstrxmm xmm15, 144
1465 stfree 168
1466 popreg rdi
1467#endif
1468 popreg rbx
1469 ret
1470.endm
1471
1472.macro testldcarry
1473 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1474 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1475 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1476.endm
1477
1478.macro testtop u=nil
1479 .p2align 4
14800:
1481 cysetup r9, r8
1482 .ifnes "\u", "nil"
1483 mov rax, \u
1484 .endif
1485.endm
1486
1487.macro testtail
1488 cystore r9, r8
1489 jnz 0b
1490.endm
1491
1492.macro testcarryout
1493 movdqu [rcx + 0], xmm12
1494 movdqu [rcx + 16], xmm13
1495 movdqu [rcx + 32], xmm14
1496.endm
1497
1498FUNC(test_dmul4)
1499 testprologue dmul
1500 testldcarry
1501 testtop r11
1502 call dmul4
1503 testtail
1504 testcarryout
1505 testepilogue
1506ENDFUNC
1507
1508FUNC(test_dmla4)
1509 testprologue dmul
1510 testldcarry
1511 testtop r11
1512 call dmla4
1513 testtail
1514 testcarryout
1515 testepilogue
1516ENDFUNC
1517
1518FUNC(test_mul4)
1519 testprologue smul
1520 testldcarry
1521 testtop nil
1522 call mul4
1523 testtail
1524 testcarryout
1525 testepilogue
1526ENDFUNC
1527
1528FUNC(test_mul4zc)
1529 testprologue smul
1530 testldcarry
1531 testtop nil
1532 call mul4zc
1533 testtail
1534 testcarryout
1535 testepilogue
1536ENDFUNC
1537
1538FUNC(test_mla4)
1539 testprologue smul
1540 testldcarry
1541 testtop nil
1542 call mla4
1543 testtail
1544 testcarryout
1545 testepilogue
1546ENDFUNC
1547
1548FUNC(test_mla4zc)
1549 testprologue smul
1550 testldcarry
1551 testtop nil
1552 call mla4zc
1553 testtail
1554 testcarryout
1555 testepilogue
1556ENDFUNC
1557
1558FUNC(test_mmul4)
1559 testprologue mmul
1560 testtop r11
1561 call mmul4
1562 testtail
1563 movdqu [r10 + 0], xmm10
1564 movdqu [r10 + 16], xmm11
1565 testcarryout
1566 testepilogue
1567ENDFUNC
1568
1569FUNC(test_mmla4)
1570 testprologue mmul
1571 testtop r11
1572 call mmla4
1573 testtail
1574 movdqu [r10 + 0], xmm10
1575 movdqu [r10 + 16], xmm11
1576 testcarryout
1577 testepilogue
1578ENDFUNC
1579
1580FUNC(test_mont4)
1581 testprologue mont
1582 testtop
1583 call mont4
1584 testtail
1585 movdqu [r10 + 0], xmm10
1586 movdqu [r10 + 16], xmm11
1587 testcarryout
1588 testepilogue
1589ENDFUNC
1590
1591#endif
1592
1593///----- That's all, folks --------------------------------------------------