/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a multiple-
/// precision operand by a single-precision factor, and adds the result,
/// appropriately shifted, to the accumulated result.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// In both the plain-multiplication and Montgomery stages, then, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
///       Offset     12        8         4         0
///          0     v''_1     v'_1     v''_0     v'_0
///         16     v''_3     v'_3     v''_2     v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, a single
/// instruction will act on (say) v'_0 and v''_0, to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also
/// allow for some calculations to be performed on the above expanded form.
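///
/// As a concrete model of this expansion, here is a C sketch (for
/// exposition only; the function name is ours, and the real code does the
/// job with the `expand' macro below):
///
///        #include <stdint.h>
///
///        /* Split packed v[0..3] into eight 16-bit pieces held in 32-bit
///         * cells, matching the layout table above. */
///        static void expand_model(const uint32_t v[4], uint32_t out[2][4])
///        {
///            for (int i = 0; i < 4; i++) {
///                out[i/2][2*(i%2)]     = v[i] & 0xffff;   /* v'_i */
///                out[i/2][2*(i%2) + 1] = v[i] >> 16;      /* v''_i */
///            }
///        }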
///
/// We maintain three `carry' registers, v28--v30, accumulating intermediate
/// results; we name them `c0', `c1', and `c2'.  Each carry register holds
/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
/// The `umull' or `umlal' instruction acting on a scalar operand and an
/// operand in the expanded form above produces a result which can be added
/// directly to the appropriate carry register.
///
/// An unusual feature of this code, as compared to the other `mul4'
/// implementations, is that it makes extensive use of the ARM64
/// general-purpose registers for carry resolution and output construction.
/// As a result, a further general-purpose register (typically x15) is used
/// as an additional carry, with the carry value held in bits 16--63.
///
/// Multiplication is performed in product-scanning order, since ARM
/// processors commonly implement result forwarding for consecutive multiply-
/// and-accumulate instructions specifying the same destination.
/// Experimentally, this runs faster than an operand-scanning arrangement
/// that attempts to hide instruction latencies.
///
/// On 64-bit ARM, we have a vast supply of registers: the expanded operands
/// are kept in registers.  The packed operands are read from memory into
/// working registers v0 and v1.  The following conventional argument names
/// and locations are used throughout.
///
/// Arg  Format     Location   Notes
///
/// U    packed     [x1]
/// X    packed     [x2]       In Montgomery multiplication, X = N
/// V    expanded   v2/v3
/// Y    expanded   v4/v5      In Montgomery multiplication, Y = (A + U V) M
/// M    expanded   v6/v7      -N^{-1} (mod B^4)
/// N                          Modulus, for Montgomery multiplication
/// A    packed     [x0]       Destination/accumulator
/// C    carry      v28--v30
/// 0               v31        128-bit zero
///
/// The calculation is some variant of
///
///     A' + C' B^4 <- U V + X Y + A + C
///
/// The low-level functions fit into a fairly traditional (finely-integrated)
/// operand-scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
/// (indexed by i).
///
/// The variants are as follows.
///
/// Function    Variant                 Use             i       j
///
/// mmul4       A = C = 0               Montgomery      0       0
/// dmul4       A = 0                   Montgomery      0       +
/// mmla4       C = 0                   Montgomery      +       0
/// dmla4       exactly as shown        Montgomery      +       +
///
/// mul4zc      U = V = A = C = 0       Plain           0       0
/// mul4        U = V = A = 0           Plain           0       +
/// mla4zc      U = V = C = 0           Plain           +       0
/// mla4        U = V = 0               Plain           +       +
///
/// The `mmul4' and `mmla4' functions are also responsible for calculating
/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
/// inner loop.

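/// As a plain C model of the step A' + C' B^4 <- U V + X Y + A + C (a
/// sketch for exposition only: the names are ours, and the carry is shown
/// as five 32-bit limbs where the SIMD code packs the same value into
/// v28--v30 and x15):
///
///        #include <stdint.h>
///
///        /* Update a[] in place with the low 128 bits of the result, and
///         * replace c[] by the outgoing carry C'. */
///        static void step_model(uint32_t a[4],
///                               const uint32_t u[4], const uint32_t v[4],
///                               const uint32_t x[4], const uint32_t y[4],
///                               uint32_t c[5])
///        {
///            uint64_t w[8] = { 0 }, t = 0;
///            int i, j, k;
///
///            /* column sums, in product-scanning order */
///            for (i = 0; i < 4; i++)
///                for (j = 0; j < 4; j++) {
///                    uint64_t p = (uint64_t)u[i]*v[j];
///                    w[i + j] += (uint32_t)p; w[i + j + 1] += p >> 32;
///                    p = (uint64_t)x[i]*y[j];
///                    w[i + j] += (uint32_t)p; w[i + j + 1] += p >> 32;
///                }
///
///            /* resolve the low four columns into A' ... */
///            for (k = 0; k < 4; k++) {
///                t += w[k] + a[k] + c[k];
///                a[k] = (uint32_t)t; t >>= 32;
///            }
///
///            /* ... and the rest into the outgoing carry C' */
///            t += c[4];
///            for (k = 4; k < 8; k++) {
///                t += w[k];
///                c[k - 4] = (uint32_t)t; t >>= 32;
///            }
///            c[4] = (uint32_t)t;
///        }
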
///--------------------------------------------------------------------------
/// Macro definitions.

.macro  mulacc  z, u, v, x=nil, y=nil
        // Set Z = Z + U V + X Y, using the low halves of V and Y.  Y may be
        // `nil' to omit the second operand.  Z, V, and Y should be 128-bit
        // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
        // the multiplications produce two 64-bit elementwise products, which
        // are added elementwise to Z.

        umlal   \z\().2d, \v\().2s, \u
  .ifnes "\y", "nil"
        umlal   \z\().2d, \y\().2s, \x
  .endif
.endm

.macro  mulacc2 z, u, v, x=nil, y=nil
        // Set Z = Z + U V + X Y, using the high halves of V and Y; see
        // `mulacc'.

        umlal2  \z\().2d, \v\().4s, \u
  .ifnes "\y", "nil"
        umlal2  \z\().2d, \y\().4s, \x
  .endif
.endm

.macro  mulinit z, zinitp, u, v=nil, x=nil, y=nil
        // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
        // set Z = U V + X Y.  Operand requirements and detailed operation
        // are as for `mulacc'.

  .ifeqs "\zinitp", "t"
        mulacc  \z, \u, \v, \x, \y
  .else
        umull   \z\().2d, \v\().2s, \u
    .ifnes "\y", "nil"
        umlal   \z\().2d, \y\().2s, \x
    .endif
  .endif
.endm

.macro  mulini2 z, zinitp, u, v=nil, x=nil, y=nil
        // As `mulinit', but with the high halves of V and Y.

  .ifeqs "\zinitp", "t"
        mulacc2 \z, \u, \v, \x, \y
  .else
        umull2  \z\().2d, \v\().4s, \u
    .ifnes "\y", "nil"
        umlal2  \z\().2d, \y\().4s, \x
    .endif
  .endif
.endm

// `mulI': accumulate the B^I and b B^I terms of the polynomial product sum
// U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
// for X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2)
// + B^3 (v'_3 + b v''_3) (and similarly for Y).  The 64-bit coefficients are
// added into the low and high halves of the 128-bit register Z (if ZINITP
// is `nil' then simply set Z, as if it were initially zero).
.macro  mul0    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
.endm
.macro  mul1    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
        mulacc  \z, \u\().s[1], \v0, \x\().s[1], \y0
.endm
.macro  mul2    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
        mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0
        mulacc  \z, \u\().s[2], \v0, \x\().s[2], \y0
.endm
.macro  mul3    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
        mulacc  \z, \u\().s[1], \v1, \x\().s[1], \y1
        mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0
        mulacc  \z, \u\().s[3], \v0, \x\().s[3], \y0
.endm
.macro  mul4    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1
        mulacc  \z, \u\().s[2], \v1, \x\().s[2], \y1
        mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0
.endm
.macro  mul5    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1
        mulacc  \z, \u\().s[3], \v1, \x\().s[3], \y1
.endm
.macro  mul6    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1
.endm

// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit
// `vN' register).  Here, T0, T1, and CG are 64-bit `xN' general-purpose
// registers clobbered in the process.  Set the low 32 bits of the 64-bit
// `xN' general-purpose register ZOUT to the completed coefficient z_i,
// leaving a carry in CG.
//
// In detail, what happens is as follows.  Suppose initially that ZLO =
// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}).  Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i.  Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}.  This has
// a circuit depth of 3; I don't know how to do better.
//
// Output words are left in the low half of a 64-bit register, with rubbish
// in the high half.  Two such results can be combined using the `bfi'
// instruction.
.macro  carry0  zlo, cg=x15, t0=x16, t1=x17
        // Capture the values of carry-register ZLO and CG (if not `nil') in
        // general-purpose registers T0 and T1, suitable for use in `carry1'.
        mov     \t0, \zlo\().d[0]
        mov     \t1, \zlo\().d[1]
  .ifnes "\cg", "nil"
        add     \t0, \t0, \cg, lsr #16
  .endif
.endm
.macro  carry1  zout, cg=x15, t0=x16, t1=x17
        // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
        // rubbish in the high 32 bits), and update CG suitably to continue
        // processing with the next carry register.
  .ifnes "\zout", "nil"
        add     \zout, \t0, \t1, lsl #16
  .endif
  .ifnes "\cg", "nil"
        add     \cg, \t1, \t0, lsr #16
  .endif
.endm
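
// As a C model of one `carry0'/`carry1' pair (a sketch for exposition only;
// the function name is ours):
//
//        #include <stdint.h>
//
//        /* Given the two halves of a carry register and the incoming carry
//         * *cg (held in bits 16--63), return the completed 32-bit output
//         * word and update *cg for the next carry register. */
//        static uint32_t carry_step(uint64_t zlo, uint64_t zhi, uint64_t *cg)
//        {
//            uint64_t t0 = zlo + (*cg >> 16);            /* z'_i + carry in */
//            uint32_t z = (uint32_t)(t0 + (zhi << 16));  /* z_i = t mod B */
//            *cg = zhi + (t0 >> 16);                     /* floor(t/b) */
//            return z;
//        }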

.macro  expand  vlo, vhi, vz=v31
        // Expand the packed 128-bit operand in VLO to an expanded operand
        // in VLO and VHI, assuming that VZ is all-bits-zero.  All three are
        // `vN' 128-bit SIMD registers.
        zip2    \vhi\().8h, \vlo\().8h, \vz\().8h
        zip1    \vlo\().8h, \vlo\().8h, \vz\().8h
.endm

.macro  sprdacc a0, a1, a2, a3=nil
        // Spread the packed 128-bit operand in A0 into carry-format values
        // in A0, A1, A2, A3.  If A3 is `nil', then spread the same value
        // into A0, A1, A2 only, clobbering x16.
  .ifeqs "\a3", "nil"
        mov     w16, \a0\().s[3]
  .endif
        trn2    \a2\().2d, \a0\().2d, v31.2d
        trn2    \a1\().2s, \a0\().2s, v31.2s
  .ifeqs "\a3", "nil"
        lsl     x16, x16, #16
  .endif
        trn1    \a0\().2s, \a0\().2s, v31.2s
  .ifeqs "\a3", "nil"
        mov     \a2\().d[1], x16
  .else
        trn2    \a3\().2s, \a2\().2s, v31.2s
  .endif
        mov     \a2\().s[1], wzr
.endm

.macro  crryacc a0, a1, a2, a3, c0, c1, c2
        // Add the carry-format values A0, A1, A2 into the existing carries
        // C0, C1, C2 (leaving A3 where it is).
        add     \c0\().2d, \c0\().2d, \a0\().2d
        add     \c1\().2d, \c1\().2d, \a1\().2d
        add     \c2\().2d, \c2\().2d, \a2\().2d
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
        // On entry, x0 points to a destination, and v28--v30 and x15 hold
        // incoming carries c0--c2 and cg.  On exit, the low 128 bits of the
        // carry value are stored at [x0]; the remaining 16 bits of carry are
        // left in x10; x0 is advanced by 16; and x11--x17 are clobbered.
        endprologue

        carry0  v28
        carry1  x11
        carry0  v29
        carry1  x13
        carry0  v30
        carry1  x12
        bfi     x11, x13, #32, #32
        lsr     x14, x15, #16
        lsr     x10, x15, #48
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16
        ret
ENDFUNC
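
// A C model of `carryprop' (a sketch for exposition only, reusing the
// hypothetical carry_step() from above):
//
//        /* Flush carries c[i] = (c'_i, c''_i) and cg into four output
//         * words, returning the leftover 16 bits of carry. */
//        static uint32_t carryprop_model(uint32_t z[4],
//                                        const uint64_t c[3][2], uint64_t cg)
//        {
//            for (int i = 0; i < 3; i++)
//                z[i] = carry_step(c[i][0], c[i][1], &cg);
//            z[3] = (uint32_t)(cg >> 16);
//            return (uint32_t)(cg >> 48);
//        }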

INTFUNC(dmul4)
        // On entry, x0 points to the destination; x1 and x2 point to packed
        // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y;
        // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
        // zero.  On exit, the destination and carries are updated; x0, x1,
        // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and
        // x11--x14, x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16

        // Do the multiplication.
        mul0    v28, t,   v0, v2, v3, v1, v4, v5
        mul1    v29, t,   v0, v2, v3, v1, v4, v5
        carry0  v28
        mul2    v30, t,   v0, v2, v3, v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, nil, v0, v2, v3, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v0, v2, v3, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v0, v2, v3, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v0, v2, v3, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(dmla4)
        // On entry, x0 points to the destination/accumulator A; x1 and x2
        // point to packed operands U and X; v2/v3 and v4/v5 hold expanded
        // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
        // and cg; and v31 is zero.  On exit, the accumulator and carries
        // are updated; x0, x1, x2 are each advanced by 16; v2--v5 and
        // v8--v15 are preserved; and x11--x14, x16, x17 and the other SIMD
        // registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q24, [x0]
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16
        sprdacc v24, v25, v26, v27
        crryacc v24, v25, v26, v27, v28, v29, v30

        // Do the multiplication.
        mul0    v28, t,   v0, v2, v3, v1, v4, v5
        mul1    v29, t,   v0, v2, v3, v1, v4, v5
        carry0  v28
        mul2    v30, t,   v0, v2, v3, v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, t,   v0, v2, v3, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v0, v2, v3, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v0, v2, v3, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v0, v2, v3, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mul4)
        // On entry, x0 points to the destination; x2 points to a packed
        // operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
        // hold incoming carries c0--c2 and cg; and v31 is zero.  On exit,
        // the destination and carries are updated; x0 and x2 are each
        // advanced by 16; v4 and v5 and v8--v15 are preserved; and
        // x11--x14, x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q1, [x2], #16

        // Do the multiplication.
        mul0    v28, t,   v1, v4, v5
        mul1    v29, t,   v1, v4, v5
        carry0  v28
        mul2    v30, t,   v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, nil, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mul4zc)
        // On entry, x0 points to the destination; x2 points to a packed
        // operand X; v4/v5 holds an expanded operand Y; and v31 is zero.
        // On exit, the destination is updated; v28--v30 and x15 hold
        // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by
        // 16; v4 and v5 and v8--v15 are preserved; and x11--x14, x16, x17
        // and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q1, [x2], #16

        // Do the multiplication.
        mul0    v28, nil, v1, v4, v5
        mul1    v29, nil, v1, v4, v5
        carry0  v28, nil
        mul2    v30, nil, v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, nil, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mla4)
        // On entry, x0 points to the destination/accumulator A; x2 points
        // to a packed operand X; v4/v5 holds an expanded operand Y;
        // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
        // zero.  On exit, the accumulator and carries are updated; x0 and
        // x2 are each advanced by 16; v4 and v5 and v8--v15 are preserved;
        // and x11--x14, x16, x17 and the other SIMD registers are
        // clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q24, [x0]
        ldr     q1, [x2], #16
        sprdacc v24, v25, v26, v27
        crryacc v24, v25, v26, v27, v28, v29, v30

        // Do the multiplication.
        mul0    v28, t,   v1, v4, v5
        mul1    v29, t,   v1, v4, v5
        carry0  v28
        mul2    v30, t,   v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, t,   v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mla4zc)
        // On entry, x0 points to the destination/accumulator A; x2 points
        // to a packed operand X; v4/v5 holds an expanded operand Y; and
        // v31 is zero.  On exit, the accumulator is updated; v28--v30 and
        // x15 hold outgoing carries c0--c2 and cg; x0 and x2 are each
        // advanced by 16; v4, v5, and v8--v15 are preserved; and x11--x14,
        // x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q28, [x0]
        ldr     q1, [x2], #16
        sprdacc v28, v29, v30, v27

        // Do the multiplication.
        mul0    v28, t,   v1, v4, v5
        mul1    v29, t,   v1, v4, v5
        carry0  v28, nil
        mul2    v30, t,   v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, t,   v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mmul4)
        // On entry, x0 points to the destination; x1 points to a packed
        // operand U; x2 points to a packed operand X (the modulus); v2/v3
        // holds an expanded operand V; and v6/v7 holds an expanded operand
        // M (the Montgomery factor -N^{-1} (mod B^4)).  On exit, the
        // destination is updated (to zero); v4/v5 hold an expanded factor
        // Y = U V M (mod B^4); v28--v30 and x15 hold outgoing carries
        // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3,
        // and v8--v15 are preserved; and x11--x14, x16, x17 and the other
        // SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16

        // Calculate the low half of W = A + U V, being careful to leave
        // the carries in place.
        mul0    v28, nil, v0, v2, v3
        mul1    v29, nil, v0, v2, v3
        carry0  v28, nil
        mul2    v30, nil, v0, v2, v3
        carry1  x11
        carry0  v29
        mul3    v27, nil, v0, v2, v3
        b       mmla4_common
ENDFUNC

INTFUNC(mmla4)
        // On entry, x0 points to the destination/accumulator A; x1 points
        // to a packed operand U; x2 points to a packed operand X (the
        // modulus); v2/v3 holds an expanded operand V; and v6/v7 holds an
        // expanded operand M (the Montgomery factor -N^{-1} (mod B^4)).
        // On exit, the accumulator is updated (to zero); v4/v5 hold an
        // expanded factor Y = (A + U V) M (mod B^4); v28--v30 and x15 hold
        // outgoing carries c0--c2 and cg; x0, x1, and x2 are each advanced
        // by 16; v2, v3, v6, v7, and v8--v15 are preserved; and x11--x14,
        // x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q28, [x0]
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16
        sprdacc v28, v29, v30, v27

        // Calculate the low half of W = A + U V, being careful to leave
        // the carries in place.
        mul0    v28, t, v0, v2, v3
        mul1    v29, t, v0, v2, v3
        carry0  v28, nil
        mul2    v30, t, v0, v2, v3
        carry1  x11
        carry0  v29
        mul3    v27, t, v0, v2, v3
mmla4_common:
        carry1  x13
        carry0  v30
        carry1  x12
        carry0  v27
        carry1  x14, nil

        // Piece the result together and ship it back.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        mov     v16.d[0], x11
        mov     v16.d[1], x12

        // Calculate the low half of the Montgomery factor Y = W M.
        mul0    v18, nil, v16, v6, v7
        mul1    v19, nil, v16, v6, v7
        carry0  v18, nil
        mul2    v20, nil, v16, v6, v7
        carry1  x11
        carry0  v19
        mul3    v21, nil, v16, v6, v7
        carry1  x13
        carry0  v20
        carry1  x12
        carry0  v21
        carry1  x14, nil

        // Piece the result together, ship it back, and expand.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        mov     v4.d[0], x11
        mov     v4.d[1], x12
        expand  v4, v5

        // Build up the product X Y in the carry slots.
        mul0    v28, t, v1, v4, v5
        mul1    v29, t, v1, v4, v5
        carry0  v28, nil
        mul2    v30, t, v1, v4, v5
        carry1  nil
        carry0  v29
        mul3    v27, t, v1, v4, v5
        carry1  nil
        carry0  v30

        // And complete the calculation.
        mul4    v28, nil, v0, v2, v3, v1, v4, v5
        carry1  nil
        carry0  v27
        mul5    v29, nil, v0, v2, v3, v1, v4, v5
        carry1  nil
        mul6    v30, nil, v0, v2, v3, v1, v4, v5

        // Finish up and store the result.
        stp     xzr, xzr, [x0], #16

        // All done.
        ret
ENDFUNC

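// The key identity behind the Montgomery factor, at single-limb scale (a C
// sketch for exposition only; the names and the Newton/Hensel inversion
// helper are ours, not part of this file):
//
//        #include <assert.h>
//        #include <stdint.h>
//
//        /* m = -n^{-1} (mod 2^32), for odd n, by Newton/Hensel lifting. */
//        static uint32_t mont_mi(uint32_t n)
//        {
//            uint32_t x = n;     /* n^{-1} (mod 2^3), since n^2 = 1 (mod 8) */
//            for (int i = 0; i < 4; i++)
//                x *= 2 - n*x;   /* each step doubles the precision */
//            return -x;
//        }
//
//        int main(void)
//        {
//            uint32_t n = 0xffffffefu;   /* arbitrary odd modulus limb */
//            uint32_t a = 0x12345678u, u = 0xdeadbeefu, v = 0x0badf00du;
//            uint64_t w = a + (uint64_t)u*v;
//            uint32_t y = (uint32_t)(w*mont_mi(n));  /* Montgomery factor */
//            /* adding y n makes the low limb vanish, so dividing by B
//             * needs no masking */
//            assert((uint32_t)(w + y*(uint64_t)n) == 0);
//            return 0;
//        }
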
INTFUNC(mont4)
        // On entry, x0 points to the destination/accumulator A; x2 points
        // to a packed operand X (the modulus); and v6/v7 holds an expanded
        // operand M (the Montgomery factor -N^{-1} (mod B^4)).  On exit,
        // the accumulator is updated (to zero); v4/v5 hold an expanded
        // factor Y = A M (mod B^4); v28--v30 and x15 hold outgoing carries
        // c0--c2 and cg; x0 and x2 are each advanced by 16; v6, v7, and
        // v8--v15 are preserved; and x11--x14, x16, x17 and the other SIMD
        // registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q28, [x0]
        ldr     q1, [x2], #16

        // Calculate Y = A M (mod B^4).
        mul0    v18, nil, v28, v6, v7
        mul1    v19, nil, v28, v6, v7
        carry0  v18, nil
        mul2    v20, nil, v28, v6, v7
        carry1  x11
        carry0  v19
        mul3    v21, nil, v28, v6, v7
        carry1  x13
        carry0  v20
        sprdacc v28, v29, v30, v27
        carry1  x12
        carry0  v21
        carry1  x14, nil

        // Piece the result together, ship it back, and expand.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        mov     v4.d[0], x11
        mov     v4.d[1], x12
        expand  v4, v5

        // Calculate the actual result.  Well, the carries, at least.
        mul0    v28, t, v1, v4, v5
        mul1    v29, t, v1, v4, v5
        carry0  v28, nil
        mul2    v30, t, v1, v4, v5
        carry1  nil
        carry0  v29
        mul3    v27, t, v1, v4, v5
        carry1  nil
        carry0  v30

        // And complete the calculation.
        mul4    v28, nil, v1, v4, v5
        carry1  nil
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  nil
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        stp     xzr, xzr, [x0], #16

        // All done.
        ret
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_arm64_simd)
        // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
        //                           const mpw *bv, const mpw *bvl);

        // Establish the arguments and do initial setup.
        //
        //      inner loop dv   x0
        //      inner loop av   x2
        //      outer loop dv   x5
        //      outer loop bv   x3
        //      av base         x1
        //      inner n         x6
        //      n base          x7
        //      outer n         x4
        pushreg x29, x30
        setfp
        endprologue

        // Prepare for the first iteration.
        ldr     q4, [x3], #16           // Y = bv[0]
        movi    v31.4s, #0
        sub     x7, x2, x1              // = inner loop count base
        // x0                           // = dv for inner loop
        // x1                           // = av base
        // x3                           // = bv for outer loop
        sub     x4, x4, x3              // = outer loop count (decremented)
        sub     x6, x7, #16             // = inner loop count (decremented)
        mov     x2, x1                  // = av for inner loop
        add     x5, x0, #16             // = dv for outer loop
        expand  v4, v5                  // expand Y
        bl      mul4zc
        cbz     x6, 8f                  // all done?

        // Continue with the first iteration.
0:      sub     x6, x6, #16
        bl      mul4
        cbnz    x6, 0b

        // Write out the leftover carry.  There can be no tail here.
8:      bl      carryprop
        cbz     x4, 9f

        // Set up for the next pass.
1:      ldr     q4, [x3], #16           // Y = bv[i]
        mov     x0, x5                  // -> dv[i]
        mov     x2, x1                  // -> av[0]
        add     x5, x5, #16
        sub     x6, x7, #16             // = inner loop count (decremented)
        sub     x4, x4, #16             // outer loop count
        expand  v4, v5                  // expand Y
        bl      mla4zc
        cbz     x6, 8f

        // Continue...
0:      sub     x6, x6, #16
        bl      mla4
        cbnz    x6, 0b

        // Finish off this pass.  There was no tail on the previous pass,
        // and there can be none on this pass.
8:      bl      carryprop
        cbnz    x4, 1b

        // All over.
9:      popreg  x29, x30
        ret
ENDFUNC

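// For reference, the arithmetic this routine implements, as a plain C model
// (a sketch for exposition only, with hypothetical names; the real code
// works in blocks of four 32-bit words and writes the first pass directly
// rather than zeroing dv):
//
//        #include <stdint.h>
//
//        /* dv[0..m+n) = av[0..m) * bv[0..n), operand-scanning order. */
//        static void umul_model(uint32_t *dv, const uint32_t *av, size_t m,
//                               const uint32_t *bv, size_t n)
//        {
//            for (size_t k = 0; k < m + n; k++) dv[k] = 0;
//            for (size_t i = 0; i < n; i++) {
//                uint64_t c = 0;
//                for (size_t j = 0; j < m; j++) {
//                    c += dv[i + j] + (uint64_t)av[j]*bv[i];
//                    dv[i + j] = (uint32_t)c; c >>= 32;
//                }
//                dv[i + m] = (uint32_t)c;        // leftover carry; no tail
//            }
//        }
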
FUNC(mpxmont_mul4_arm64_simd)
        // void mpxmont_mul4_arm64_simd(mpw *dv,
        //                              const mpw *av, const mpw *bv,
        //                              const mpw *nv, size_t n,
        //                              const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //      inner loop dv   x0
        //      inner loop av   x1
        //      inner loop nv   x2
        //      nv base         x3
        //      base n          x4
        //      mi              (x5)
        //      outer loop dv   x5
        //      outer loop bv   x6
        //      av base         x7
        //      inner n         x8
        //      outer n         x9
        //      c               x10
        pushreg x29, x30
        setfp
        endprologue

        // Set up the outer loop state and prepare for the first iteration.
        ldr     q2, [x2]                // = V = bv[0]
        ldr     q6, [x5]                // = M
        movi    v31.4s, #0
        // x0                           // -> dv for inner loop
        // x1                           // -> av for inner loop
        // x3                           // -> nv base
        // x4                           // = n base
        add     x5, x0, #16             // -> dv
        add     x6, x2, #16             // -> bv
        mov     x2, x3                  // -> nv[0]
        mov     x7, x1                  // -> av base
        sub     x8, x4, #4              // = inner n (decremented)
        sub     x9, x4, #4              // = outer n (decremented)
        expand  v2, v3                  // expand V
        expand  v6, v7                  // expand M
        bl      mmul4
        cbz     x8, 8f                  // done already?

        // Complete the first inner loop.
0:      sub     x8, x8, #4
        bl      dmul4
        cbnz    x8, 0b                  // done yet?

        // Still have carries left to propagate.  Rather than store the
        // tail end in memory, keep it in x10 for later.
        bl      carryprop

        // Embark on the next iteration.  (There must be one.  If n = 1
        // then we would have bailed above, to label 8.  Similarly, the
        // subsequent iterations can fall into the inner loop immediately.)
1:      ldr     q2, [x6], #16           // = V = bv[i]
        mov     x0, x5                  // -> dv[i]
        mov     x1, x7                  // -> av[0]
        mov     x2, x3                  // -> nv[0]
        add     x5, x5, #16
        sub     x8, x4, #4
        sub     x9, x9, #4
        expand  v2, v3
        bl      mmla4

        // Complete the next inner loop.
0:      sub     x8, x8, #4
        bl      dmla4
        cbnz    x8, 0b

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that and add it.
        add     x15, x15, x10, lsl #16
        bl      carryprop

        // Back again, maybe.
        cbnz    x9, 1b

        // All done, almost.
        str     w10, [x0], #4
        popreg  x29, x30
        ret

        // First iteration was short.  Write out the carries and we're
        // done.  (This could be folded into the main loop structure, but
        // that would penalize small numbers more.)
8:      bl      carryprop
        str     w10, [x0], #4
        popreg  x29, x30
        ret
ENDFUNC

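// For reference, the arithmetic implemented here, as a plain C model (a
// sketch for exposition only: word-by-word finely-integrated Montgomery
// multiplication with hypothetical names; the real code works four words
// at a time and keeps its carries in SIMD registers):
//
//        #include <stdint.h>
//        #include <string.h>
//
//        /* dv[0..n] = av bv / 2^(32 n) (mod nv), possibly plus one extra
//         * multiple of nv; mi = -nv^{-1} (mod 2^32); w is n + 1 words of
//         * zeroed scratch. */
//        static void mont_mul_model(uint32_t *dv,
//                                   const uint32_t *av, const uint32_t *bv,
//                                   const uint32_t *nv, size_t n,
//                                   uint32_t mi, uint32_t *w)
//        {
//            for (size_t i = 0; i < n; i++) {
//                uint32_t y =    /* Montgomery factor, mod 2^32 */
//                    (uint32_t)((w[0] + (uint64_t)av[0]*bv[i])*mi);
//                uint64_t c0 = 0, c1 = 0, t;
//                for (size_t j = 0; j < n; j++) {
//                    t = w[j] + (uint64_t)av[j]*bv[i] + c0; c0 = t >> 32;
//                    t = (uint32_t)t + (uint64_t)nv[j]*y + c1; c1 = t >> 32;
//                    if (j) w[j - 1] = (uint32_t)t;
//                    /* j = 0: the low word is zero by construction of y */
//                }
//                t = w[n] + c0 + c1;
//                w[n - 1] = (uint32_t)t; w[n] = (uint32_t)(t >> 32);
//            }
//            memcpy(dv, w, (n + 1)*sizeof(uint32_t));
//        }
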
FUNC(mpxmont_redc4_arm64_simd)
        // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
        //                               size_t n, const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //      inner loop dv           x0
        //      inner loop nv           x2
        //      blocks-of-4 count       x6
        //      tail count              x7
        //      mi                      (x4)
        //      outer loop dv           x4
        //      outer loop count        x8
        //      nv base                 x5
        //      inner loop count        x1
        //      n                       x3
        //      c                       x10
        //      t0, t1                  x11, x12

        pushreg x29, x30
        setfp
        endprologue

        // Set up the outer loop state and prepare for the first iteration.
        ldr     q6, [x4]                // = M
        movi    v31.4s, #0
        // x0                           // -> dv for inner loop
        sub     x6, x1, x0              // total dv bytes
        sub     x1, x3, #4              // inner loop counter
        // x2                           // -> nv for inner loop
        // x3                           // = n
        add     x4, x0, #16             // -> dv for outer loop
        mov     x5, x2                  // -> nv base
        sub     x6, x6, x3, lsl #2      // dv carry range bytes
        sub     x8, x3, #4              // outer loop counter
        sub     x6, x6, #16             // dv steam-powered carry bytes
        expand  v6, v7                  // expand M
        and     x7, x6, #15             // dv tail length in bytes
        bic     x6, x6, #15             // dv blocks-of-four length in bytes

        bl      mont4
        cbz     x1, 8f                  // done already?

5:      sub     x1, x1, #4
        bl      mla4
        cbnz    x1, 5b                  // done yet?

        // Still have carries left to propagate.  Adding the accumulator
        // block into the carries is a little different this time, because
        // all four accumulator limbs have to be squished into the three
        // carry registers for `carryprop' to do its thing.
8:      ldr     q24, [x0]
        sprdacc v24, v25, v26
        add     v28.2d, v28.2d, v24.2d
        add     v29.2d, v29.2d, v25.2d
        add     v30.2d, v30.2d, v26.2d
        bl      carryprop
        cbz     x6, 7f

        // Propagate the first group of carries.
        ldp     x16, x17, [x0]
        sub     x1, x6, #16
        adds    x16, x16, x10
        adcs    x17, x17, xzr
        stp     x16, x17, [x0], #16
        cbz     x1, 6f

        // Continue carry propagation until the end of the buffer.
0:      ldp     x16, x17, [x0]
        sub     x1, x1, #16
        adcs    x16, x16, xzr
        adcs    x17, x17, xzr
        stp     x16, x17, [x0], #16
        cbnz    x1, 0b

        // Deal with the tail end.  Note that the actual destination
        // length won't be an exact number of blocks of four, so it's safe
        // to just drop through here.
6:      adc     w10, wzr, wzr
7:      ldr     w16, [x0]
        sub     x1, x7, #4
        adds    w16, w16, w10
        str     w16, [x0], #4
        cbz     x1, 8f
0:      ldr     w16, [x0]
        sub     x1, x1, #4
        adcs    w16, w16, wzr
        str     w16, [x0], #4
        cbnz    x1, 0b

        // All done for this iteration.  Start the next.
8:      cbz     x8, 9f
        mov     x0, x4
        add     x4, x4, #16
        sub     x1, x3, #4
        mov     x2, x5
        sub     x8, x8, #4
        sub     x6, x6, #16
        bl      mont4
        b       5b

        // All over.
9:      popreg  x29, x30
        ret
ENDFUNC

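// And a matching C model of the reduction (a sketch for exposition only,
// hypothetical names, includes as in the previous sketch; dvn counts the
// words of dv, which holds the product to be reduced, and is assumed long
// enough that the final carry fits):
//
//        /* Replace dv[0..dvn) by dv / 2^(32 n) (mod nv), possibly plus a
//         * multiple of nv; mi = -nv^{-1} (mod 2^32). */
//        static void mont_redc_model(uint32_t *dv, size_t dvn,
//                                    const uint32_t *nv, size_t n,
//                                    uint32_t mi)
//        {
//            for (size_t i = 0; i < n; i++) {
//                uint32_t y = (uint32_t)(dv[i]*(uint64_t)mi);
//                uint64_t c = 0;
//                for (size_t j = 0; j < n; j++) {
//                    c += dv[i + j] + (uint64_t)nv[j]*y;
//                    dv[i + j] = (uint32_t)c; c >>= 32;
//                }
//                /* propagate the carry to the end of the buffer */
//                for (size_t k = i + n; c && k < dvn; k++) {
//                    c += dv[k]; dv[k] = (uint32_t)c; c >>= 32;
//                }
//            }
//            memmove(dv, dv + n, (dvn - n)*sizeof(uint32_t));
//        }
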
///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

//                                dmul    smul    mmul    mont
//      z       x0                  x0      x0      x0      x0
//      c       x3                  x1      x1      x1      x1
//      y       x4                  --      --      x2      x2
//      u       x1                  x2      --      x3      --
//      x       x2                  x3      x2      x4      x3
//      vv      v2/v3               x4      --      x5      --
//      yy      v4/v5               x5      x3      x6      --
//      mm      v6/v7               --      --      --      x4
//      n       x5                  x6      x4      x7      x5
//      cyv     x6                  x7      x5      stk0    x6

#define STKARG(i) sp, #16 + i

.macro  testprologue mode
        pushreg x29, x30
        setfp
        endprologue
        movi    v31.4s, #0

  .ifeqs "\mode", "dmul"
        ldr     q2, [x4]
        zip2    v3.8h, v2.8h, v31.8h    // (v''_3, v'_3; v''_2, v'_2)
        zip1    v2.8h, v2.8h, v31.8h    // (v''_1, v'_1; v''_0, v'_0)

        ldr     q4, [x5]
        zip2    v5.8h, v4.8h, v31.8h    // (y''_3, y'_3; y''_2, y'_2)
        zip1    v4.8h, v4.8h, v31.8h    // (y''_1, y'_1; y''_0, y'_0)

        mov     x16, x1
        mov     x1, x2                  // -> u
        mov     x2, x3                  // -> x
        mov     x3, x16                 // -> c

        mov     x5, x6                  // = n
        mov     x6, x7                  // -> cyv
  .endif

  .ifeqs "\mode", "smul"
        ldr     q4, [x3]
        zip2    v5.8h, v4.8h, v31.8h    // (y''_3, y'_3; y''_2, y'_2)
        zip1    v4.8h, v4.8h, v31.8h    // (y''_1, y'_1; y''_0, y'_0)

        // x2                           // -> x
        mov     x3, x1                  // -> c
        mov     x6, x5                  // -> cyv
        mov     x5, x4                  // = n
  .endif

  .ifeqs "\mode", "mmul"
        ldr     q2, [x5]
        zip2    v3.8h, v2.8h, v31.8h    // (v''_3, v'_3; v''_2, v'_2)
        zip1    v2.8h, v2.8h, v31.8h    // (v''_1, v'_1; v''_0, v'_0)

        ldr     q6, [x6]
        zip2    v7.8h, v6.8h, v31.8h    // (m''_3, m'_3; m''_2, m'_2)
        zip1    v6.8h, v6.8h, v31.8h    // (m''_1, m'_1; m''_0, m'_0)

        mov     x16, x1
        mov     x1, x3                  // -> u
        mov     x3, x16                 // -> c

        mov     x16, x2
        mov     x2, x4                  // -> x
        mov     x4, x16                 // -> y

        mov     x5, x7                  // = n
        ldr     x6, [STKARG(0)]         // -> cyv
  .endif

  .ifeqs "\mode", "mont"
        ldr     q6, [x4]
        zip2    v7.8h, v6.8h, v31.8h    // (m''_3, m'_3; m''_2, m'_2)
        zip1    v6.8h, v6.8h, v31.8h    // (m''_1, m'_1; m''_0, m'_0)

        mov     x4, x2                  // -> y
        mov     x2, x3                  // -> x
        mov     x3, x1                  // -> c

        // x5                           // = n
        // x6                           // -> cyv
  .endif
.endm

.macro  testldcarry
        ld1     {v28.2d-v30.2d}, [x3]
        mov     x15, #0
.endm

.macro  testtop
0:      sub     x5, x5, #1
.endm

.macro  testtail
        cbnz    x5, 0b
.endm

.macro  testcarryout
        // More complicated than usual because we must mix the general-
        // purpose carry back in.
        lsr     x15, x15, #16
        mov     v0.d[0], x15
        mov     v0.d[1], xzr
        add     v28.2d, v28.2d, v0.2d
        st1     {v28.2d-v30.2d}, [x3]
.endm

.macro  testepilogue
        popreg  x29, x30
        ret
.endm

FUNC(test_dmul4)
        testprologue dmul
        testldcarry
        testtop
        bl      dmul4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_dmla4)
        testprologue dmul
        testldcarry
        testtop
        bl      dmla4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mul4)
        testprologue smul
        testldcarry
        testtop
        bl      mul4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mul4zc)
        testprologue smul
        testldcarry
        testtop
        bl      mul4zc
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mla4)
        testprologue smul
        testldcarry
        testtop
        bl      mla4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mla4zc)
        testprologue smul
        testldcarry
        testtop
        bl      mla4zc
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mmul4)
        testprologue mmul
        testtop
        bl      mmul4
        testtail
        stp     q4, q5, [x4]
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mmla4)
        testprologue mmul
        testtop
        bl      mmla4
        testtail
        stp     q4, q5, [x4]
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mont4)
        testprologue mont
        testtop
        bl      mont4
        testtail
        stp     q4, q5, [x4]
        testcarryout
        testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------