progs/perftest.c: Use from Glibc syscall numbers.
[catacomb] / math / mpx-mul4-arm-neon.S
CommitLineData
ea1b3cec
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Large SIMD-based multiplications
4///
5/// (c) 2019 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software: you can redistribute it and/or modify it
13/// under the terms of the GNU Library General Public License as published
14/// by the Free Software Foundation; either version 2 of the License, or
15/// (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful, but
18/// WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20/// Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb. If not, write to the Free Software
24/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25/// USA.
26
27///--------------------------------------------------------------------------
28/// Preliminaries.
29
30#include "config.h"
31#include "asm-common.h"
32
33 .arch armv7-a
34 .fpu neon
35
36 .text
37
38///--------------------------------------------------------------------------
39/// Theory.
40///
41/// We define a number of primitive fixed-size multipliers from which we can
42/// construct more general variable-length multipliers.
43///
44/// The basic trick is the same throughout. In an operand-scanning
45/// multiplication, the inner multiplication loop multiplies a multiple-
46/// precision operand by a single precision factor, and adds the result,
47/// appropriately shifted, to the result. A `finely integrated operand
48/// scanning' implementation of Montgomery multiplication also adds the
49/// product of a single-precision `Montgomery factor' and the modulus,
50/// calculated in the same pass. The more common `coarsely integrated
51/// operand scanning' alternates main multiplication and Montgomery passes,
52/// which requires additional carry propagation.
53///
54/// Throughout both plain-multiplication and Montgomery stages, then, one of
55/// the factors remains constant throughout the operation, so we can afford
56/// to take a little time to preprocess it. The transformation we perform is
57/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
61/// operands, as follows.
62///
981a9e5d
MW
63/// Offset 12 8 4 0
64/// 0 v''_1 v'_1 v''_0 v'_0
65/// 16 v''_3 v'_3 v''_2 v'_2
ea1b3cec
MW
66///
67/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
68/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
69/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
70/// results in 64-bit fields. The sixteen bits of headroom allows us to add
71/// many products together before we must deal with carrying; it also allows
72/// for some calculations to be performed on the above expanded form.
73///
74/// We maintain three `carry' registers, q12--q14, accumulating intermediate
75/// results; we name them `c0', `c1', and `c2'. Each carry register holds
76/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
77/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
78/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
79/// The `vmull' or `vmlal' instruction acting on a scalar operand and an
80/// operand in the expanded form above produces a result which can be added
81/// directly to the appropriate carry register.
82///
83/// Multiplication is performed in product-scanning order, since ARM
84/// processors commonly implement result forwarding for consecutive multiply-
85/// and-accumulate instructions specifying the same destination.
86/// Experimentally, this runs faster than operand-scanning in an attempt to
87/// hide instruction latencies.
88///
89/// On 32-bit ARM, we have a reasonable number of registers: the expanded
90/// operands are kept in registers. The packed operands are read from memory
91/// into working registers q0 and q1. The following conventional argument
92/// names and locations are used throughout.
93///
94/// Arg Format Location Notes
95///
96/// U packed [r1]
97/// X packed [r2] In Montgomery multiplication, X = N
98/// V expanded q2/q3
99/// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M
100/// M expanded q4/q5 -N^{-1} (mod B^4)
101/// N Modulus, for Montgomery multiplication
102/// A packed [r0] Destination/accumulator
103/// C carry q13--q15
104///
105/// The calculation is some variant of
106///
107/// A' + C' B^4 <- U V + X Y + A + C
108///
109/// The low-level functions fit into a fairly traditional (finely-integrated)
110/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
111/// (indexed by i).
112///
113/// The variants are as follows.
114///
115/// Function Variant Use i j
116///
117/// mmul4 A = C = 0 Montgomery 0 0
118/// dmul4 A = 0 Montgomery 0 +
119/// mmla4 C = 0 Montgomery + 0
120/// dmla4 exactly as shown Montgomery + +
121///
122/// mul4zc U = V = A = C = 0 Plain 0 0
123/// mul4 U = V = A = 0 Plain 0 +
124/// mla4zc U = V = C = 0 Plain + 0
125/// mla4 U = V = 0 Plain + +
126///
127/// The `mmul4' and `mmla4' functions are also responsible for calculating
128/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
129/// inner loop.
130
131///--------------------------------------------------------------------------
132/// Macro definitions.
133
134.macro mulacc z, u, v, x=nil, y=nil
135 // Set Z = Z + U V + X Y. X may be `nil' to omit the second
136 // operand. Z should be a 128-bit `qN' register; V and Y should be
137 // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]'
138 // scalars; the multiplications produce two 64-bit elementwise
139 // products, which are added elementwise to Z.
140
141 vmlal.u32 \z, \v, \u
142 .ifnes "\x", "nil"
143 vmlal.u32 \z, \y, \x
144 .endif
145.endm
146
147.macro mulinit z, zinitp, u, v, x, y
148 // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
149 // set Z = U V + X Y. Operand requirements and detailed operation
150 // are as for `mulacc'.
151
152 .ifeqs "\zinitp", "t"
153 mulacc \z, \u, \v, \x, \y
154 .else
155 vmull.u32 \z, \v, \u
156 .ifnes "\x", "nil"
157 vmlal.u32 \z, \y, \x
158 .endif
159 .endif
160.endm
161
162// `MULI': accumulate the B^I and b B^i terms of the polynomial product sum U
163// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
164// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
165// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
166// added into the low and high halves of the 128-bit register Z (if ZINIT is
167// `nil' then simply set Z, as if it were initially zero).
168#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \
169 mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0)
170#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \
171 mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \
172 mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0)
173#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \
174 mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \
175 mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \
176 mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0)
177#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \
178 mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \
179 mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \
180 mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \
181 mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0)
182#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \
183 mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \
184 mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \
185 mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0)
186#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \
187 mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \
188 mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1)
189#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \
190 mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1)
191
192// Steps in the process of propagating carry bits from ZLO to ZHI (both
193// 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register.
194// Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed
195// coefficient z_i.
196//
197// In detail, what happens is as follows. Suppose initially that ZLO =
198// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
199// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
200// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
201// a circuit depth of 4; I don't know how to do better.
202.macro _carry0 zout, zlo0, zlo1, t0, t1
203 // ZLO0 and ZLO1 are the low and high halves of a carry register.
204 // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1
205 // so as to continue processing using `_carry1'. All operands are
206 // 64-bit `dN' registers. If ZOUT is `nil' then no output is
207 // produced; if T1 is `nil' then no further processing will be
208 // possible.
209 .ifnes "\zout", "nil"
210 vshl.i64 \t0, \zlo1, #16
211 .endif
212 .ifnes "\t1", "nil"
213 vshr.u64 \t1, \zlo0, #16
214 .endif
215 .ifnes "\zout", "nil"
216 vadd.i64 \zout, \zlo0, \t0
217 .endif
218 .ifnes "\t1", "nil"
219 vadd.i64 \t1, \t1, \zlo1
220 .endif
221.endm
222.macro _carry1 u, zhi0, t1
223 // ZHI0 is the low half of a carry register, and T1 is the result of
224 // applying `_carry0' to the previous carry register. Set U to the
225 // result of propagating the carry into ZHI0.
226 vshr.u64 \t1, \t1, #16
227 vadd.i64 \u, \zhi0, \t1
228.endm
229
230// More convenient wrappers for `_carry0' and `_carry1'.
231//
232// CARRY0(ZOUT, ZLO, T)
233// Store a 32-bit output in ZOUT from carry ZLO, using T as a
234// temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit
235// `qN' registers.
236//
237// CARRY1(ZHI, T)
238// Propagate carry from T into ZHI. Both are 128-bit `qN' registers;
239// ZHI is updated.
240#define CARRY0(zout, zlo, t) \
241 CASIDE0(zout, D0(zlo), zlo, t)
242#define CARRY1(zhi, t) \
243 CASIDE1(D0(zhi), zhi, t)
244
245// Versions of `CARRY0' and `CARRY1' which don't mutate their operands.
246//
247// CASIDE0(ZOUT, U, ZLO, T)
248// As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit
249// `dN' register).
250//
251// CASIDE0E(ZOUT, U, ZLO, T)
252// As for `CASIDE0', but only calculate the output word, and no
253// carry-out.
254//
255// CASIDE1(U, ZHI, T)
256// As for `CARRY1', but write the updated low half of ZHI to U.
257#define CASIDE0(zout, u, zlo, t) \
258 _carry0 zout, u, D1(zlo), D0(t), D1(t)
259#define CASIDE0E(zout, u, zlo, t) \
260 _carry0 zout, u, D1(zlo), D0(t), nil
261#define CASIDE1(u, zhi, t) \
262 _carry1 u, D0(zhi), D1(t)
263
264// Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in
265// carry format.
266#define SPREADACC0(a0, a1, a2, a3) \
267 vmov.i32 a1, #0; \
268 vmov.i32 a2, #0; \
269 vmov.i32 a3, #0
270#define SPREADACC1(a0, a1, a2, a3) \
271 vswp D1(a0), D0(a2); \
272 vtrn.32 D0(a0), D0(a1); \
273 vtrn.32 D0(a2), D0(a3)
274
275// Add the carry-format values A0, A1, A2 into the existing carries C0, C1,
276// C2 (leaving A3 where it is).
277#define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \
278 vadd.i64 c0, c0, a0; \
279 vadd.i64 c1, c1, a1; \
280 vadd.i64 c2, c2, a2
281
282///--------------------------------------------------------------------------
283/// Primitive multipliers and related utilities.
284
285INTFUNC(carryprop)
286 // On entry, r0 points to a destination, and q13--q15 hold incoming
287 // carries c0--c2. On exit, the low 128 bits of the carry value are
288 // stored at [r0]; the remaining 16 bits of carry are left in d30; r0
289 // is advanced by 16; and q10--q14 are clobbered.
290 endprologue
291
292 CARRY0(D0(q10), q13, q12)
293 CARRY1(q14, q12)
294 CARRY0(D0(q11), q14, q12)
295 CARRY1(q15, q12)
296 CARRY0(D1(q10), q15, q12)
297 vshr.u64 D1(q11), D1(q12), #16
298 vshr.u64 D0(q15), D1(q12), #48
299 vtrn.32 q10, q11
300 vst1.32 {q10}, [r0]!
301 bx r14
302ENDFUNC
303
304INTFUNC(dmul4)
305 // On entry, r0 points to the destination; r1 and r2 point to packed
306 // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y;
307 // and q13--q15 hold incoming carries c0--c2. On exit, the
308 // destination and carries are updated; r0, r1, r2 are each advanced
309 // by 16; q2--q5 are preserved; and the other NEON registers are
310 // clobbered.
311 endprologue
312
313 // Start by loading the operand words from memory.
314 vld1.32 {q0}, [r1]!
315 vld1.32 {q1}, [r2]!
316
317 // Do the multiplication.
318 MUL0(q13, t, q0, q2, q3, q1, q4, q5)
319 MUL1(q14, t, q0, q2, q3, q1, q4, q5)
320 CARRY0(D0(q8), q13, q6)
321 MUL2(q15, t, q0, q2, q3, q1, q4, q5)
322 CARRY1(q14, q6)
323 CARRY0(D0(q9), q14, q6)
324 MUL3(q12, nil, q0, q2, q3, q1, q4, q5)
325 CARRY1(q15, q6)
326 CARRY0(D1(q8), q15, q6)
327 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
328 CARRY1(q12, q6)
329 CARRY0(D1(q9), q12, q6)
330 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
331 CARRY1(q13, q6)
332 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
333
334 // Finish up and store the result.
335 vtrn.32 q8, q9
336 vst1.32 {q8}, [r0]!
337
338 // All done.
339 bx r14
340ENDFUNC
341
342INTFUNC(dmla4)
343 // On entry, r0 points to the destination/accumulator; r1 and r2
344 // point to packed operands U and X; q2/q3 and q4/q5 hold expanded
345 // operands V and Y; and q13--q15 hold incoming carries c0--c2. On
346 // exit, the accumulator and carries are updated; r0, r1, r2 are each
347 // advanced by 16; q2--q5 are preserved; and the other NEON registers
348 // are clobbered.
349 endprologue
350
351 // Start by loading the operand words from memory.
352 vld1.32 {q9}, [r0]
353 SPREADACC0(q9, q10, q11, q12)
354 vld1.32 {q0}, [r1]!
355 vld1.32 {q1}, [r2]!
356
357 // Add the accumulator input to the incoming carries. Split the
358 // accumulator into four pieces and add the carries onto them.
359 SPREADACC1(q9, q10, q11, q12)
360 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
361
362 // Do the multiplication.
363 MUL0(q13, t, q0, q2, q3, q1, q4, q5)
364 MUL1(q14, t, q0, q2, q3, q1, q4, q5)
365 CARRY0(D0(q8), q13, q6)
366 MUL2(q15, t, q0, q2, q3, q1, q4, q5)
367 CARRY1(q14, q6)
368 CARRY0(D0(q9), q14, q6)
369 MUL3(q12, t, q0, q2, q3, q1, q4, q5)
370 CARRY1(q15, q6)
371 CARRY0(D1(q8), q15, q6)
372 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
373 CARRY1(q12, q6)
374 CARRY0(D1(q9), q12, q6)
375 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
376 CARRY1(q13, q6)
377 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
378
379 // Finish up and store the result.
380 vtrn.32 q8, q9
381 vst1.32 {q8}, [r0]!
382
383 // All done.
384 bx r14
385ENDFUNC
386
387INTFUNC(mul4)
388 // On entry, r0 points to the destination; r2 points to a packed
389 // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold
390 // incoming carries c0--c2. On exit, the destination and carries are
391 // updated; r0 and r2 are each advanced by 16; q4 and q5 are
392 // preserved; and the other NEON registers are clobbered.
393 endprologue
394
395 // Start by loading the operand words from memory.
396 vld1.32 {q1}, [r2]!
397
398 // Do the multiplication.
399 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
400 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
401 CARRY0(D0(q8), q13, q6)
402 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
403 CARRY1(q14, q6)
404 CARRY0(D0(q9), q14, q6)
405 MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
406 CARRY1(q15, q6)
407 CARRY0(D1(q8), q15, q6)
408 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
409 CARRY1(q12, q6)
410 CARRY0(D1(q9), q12, q6)
411 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
412 CARRY1(q13, q6)
413 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
414
415 // Finish up and store the result.
416 vtrn.32 q8, q9
417 vst1.32 {q8}, [r0]!
418
419 // All done.
420 bx r14
421ENDFUNC
422
423INTFUNC(mul4zc)
424 // On entry, r0 points to the destination; r2 points to a packed
425 // operand X; and q4/q5 holds an expanded operand Y. On exit, the
426 // destination is updated; q13--q15 hold outgoing carries c0--c2; r0
427 // and r2 are each advanced by 16; q4 and q5 are preserved; and the
428 // other NEON registers are clobbered.
429 endprologue
430
431 // Start by loading the operand words from memory.
432 vld1.32 {q1}, [r2]!
433
434 // Do the multiplication.
435 MUL0(q13, nil, q1, q4, q5, nil, nil, nil)
436 MUL1(q14, nil, q1, q4, q5, nil, nil, nil)
437 CARRY0(D0(q8), q13, q6)
438 MUL2(q15, nil, q1, q4, q5, nil, nil, nil)
439 CARRY1(q14, q6)
440 CARRY0(D0(q9), q14, q6)
441 MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
442 CARRY1(q15, q6)
443 CARRY0(D1(q8), q15, q6)
444 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
445 CARRY1(q12, q6)
446 CARRY0(D1(q9), q12, q6)
447 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
448 CARRY1(q13, q6)
449 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
450
451 // Finish up and store the result.
452 vtrn.32 q8, q9
453 vst1.32 {q8}, [r0]!
454
455 // All done.
456 bx r14
457ENDFUNC
458
459INTFUNC(mla4)
460 // On entry, r0 points to the destination/accumulator; r2 points to a
461 // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15
462 // hold incoming carries c0--c2. On exit, the accumulator and
463 // carries are updated; r0 and r2 are each advanced by 16; q4 and q5
464 // are preserved; and the other NEON registers are clobbered.
465 endprologue
466
467 // Start by loading the operand words from memory.
468 vld1.32 {q9}, [r0]
469 SPREADACC0(q9, q10, q11, q12)
470 vld1.32 {q1}, [r2]!
471
472 // Add the accumulator input to the incoming carries. Split the
473 // accumulator into four pieces and add the carries onto them.
474 SPREADACC1(q9, q10, q11, q12)
475 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
476
477 // Do the multiplication.
478mla4_common:
479 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
480 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
481 CARRY0(D0(q8), q13, q6)
482 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
483 CARRY1(q14, q6)
484 CARRY0(D0(q9), q14, q6)
485 MUL3(q12, t, q1, q4, q5, nil, nil, nil)
486 CARRY1(q15, q6)
487 CARRY0(D1(q8), q15, q6)
488 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
489 CARRY1(q12, q6)
490 CARRY0(D1(q9), q12, q6)
491 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
492 CARRY1(q13, q6)
493 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
494
495 // Finish up and store the result.
496 vtrn.32 q8, q9
497 vst1.32 {q8}, [r0]!
498
499 // All done.
500 bx r14
501ENDFUNC
502
503INTFUNC(mla4zc)
504 // On entry, r0 points to the destination/accumulator; r2 points to a
505 // packed operand X; and q4/q5 holds an expanded operand Y. On exit,
506 // the accumulator is updated; q13--q15 hold outgoing carries c0--c2;
507 // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and
508 // the other NEON registers are clobbered.
509 endprologue
510
511 // Start by loading the operand words from memory.
512 vld1.32 {q13}, [r0]
513 SPREADACC0(q13, q14, q15, q12)
514 vld1.32 {q1}, [r2]!
515
516 // Move the accumulator input to the incoming carry slots. Split the
517 // accumulator into four pieces.
518 SPREADACC1(q13, q14, q15, q12)
519
520 b mla4_common
521ENDFUNC
522
523INTFUNC(mmul4)
524 // On entry, r0 points to the destination; r1 points to a packed
525 // operand U; r2 points to a packed operand X (the modulus); q2/q3
526 // holds an expanded operand V; and q4/q5 holds an expanded operand M
527 // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
528 // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod
529 // B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each
530 // advanced by 16; q2 and q3 are preserved; and the other NEON
531 // registers are clobbered.
532
533 // Start by loading the operand words from memory.
534 vld1.32 {q0}, [r1]!
535
536 // Calculate the low half of W = A + U V, being careful to leave the
537 // carries in place.
538 MUL0(q13, nil, q0, q2, q3, nil, nil, nil)
539 MUL1(q14, nil, q0, q2, q3, nil, nil, nil)
540 CARRY0(D0(q6), q13, q8)
541 MUL2(q15, nil, q0, q2, q3, nil, nil, nil)
542 CASIDE1(D0(q9), q14, q8)
543 CASIDE0(D0(q7), D0(q9), q14, q8)
544 MUL3(q12, nil, q0, q2, q3, nil, nil, nil)
545 b mmla4_common
546ENDFUNC
547
548INTFUNC(mmla4)
549 // On entry, r0 points to the destination/accumulator A; r1 points to
550 // a packed operand U; r2 points to a packed operand X (the modulus);
551 // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded
552 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
553 // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
554 // = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0,
555 // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and
556 // the other NEON registers are clobbered.
557 endprologue
558
559 // Start by loading the operand words from memory.
560 vld1.32 {q13}, [r0]
561 SPREADACC0(q13, q14, q15, q12)
562 vld1.32 {q0}, [r1]!
563
564 // Move the accumulator input to the incoming carry slots. Split the
565 // accumulator into four pieces.
566 SPREADACC1(q13, q14, q15, q12)
567
568 // Calculate the low half of W = A + U V, being careful to leave the
569 // carries in place.
570 MUL0(q13, t, q0, q2, q3, nil, nil, nil)
571 MUL1(q14, t, q0, q2, q3, nil, nil, nil)
572 CARRY0(D0(q6), q13, q8)
573 MUL2(q15, t, q0, q2, q3, nil, nil, nil)
574 CASIDE1(D0(q9), q14, q8)
575 CASIDE0(D0(q7), D0(q9), q14, q8)
576 MUL3(q12, t, q0, q2, q3, nil, nil, nil)
577mmla4_common:
578 CASIDE1(D0(q9), q15, q8)
579 CASIDE0(D1(q6), D0(q9), q15, q8)
580 CASIDE1(D0(q9), q12, q8)
581 CASIDE0E(D1(q7), D0(q9), q12, q8)
582 vtrn.32 q6, q7
583
584 // Calculate the low half of the Montgomery factor Y = W M. At this
585 // point, registers are a little tight.
586 MUL0( q8, nil, q6, q4, q5, nil, nil, nil)
587 MUL1( q9, nil, q6, q4, q5, nil, nil, nil)
588 CARRY0(D0(q8), q8, q1)
589 MUL2(q10, nil, q6, q4, q5, nil, nil, nil)
590 CARRY1(q9, q1)
591 CARRY0(D0(q9), q9, q1)
592 MUL3(q11, nil, q6, q4, q5, nil, nil, nil)
593 CARRY1(q10, q1)
594 CARRY0(D1(q8), q10, q1)
595 vmov.i32 q5, #0
596 CARRY1(q11, q1)
597 CARRY0(D1(q9), q11, q1)
598 vld1.32 {q1}, [r2]!
599 vtrn.32 q8, q9
600
601 // Expand Y. We'll put it in its proper place a bit later.
602 vzip.16 q8, q5
603
604 // Build up the product X Y in the carry slots.
605 MUL0(q13, t, q1, q8, q5, nil, nil, nil)
606 MUL1(q14, t, q1, q8, q5, nil, nil, nil)
607 CARRY0(nil, q13, q9)
608 MUL2(q15, t, q1, q8, q5, nil, nil, nil)
609 CARRY1(q14, q9)
610 vmov q4, q8
611 CARRY0(nil, q14, q9)
612 MUL3(q12, t, q1, q8, q5, nil, nil, nil)
613 CARRY1(q15, q9)
614 CARRY0(nil, q15, q9)
615 vmov.u32 q6, #0
616
617 // And complete the calculation.
618 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
619 CARRY1(q12, q9)
620 CARRY0(nil, q12, q9)
621 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
622 CARRY1(q13, q9)
623 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
624
625 // Finish up and store the result.
626 vst1.32 {q6}, [r0]!
627
628 // All done.
629 bx r14
630ENDFUNC
631
632INTFUNC(mont4)
633 // On entry, r0 points to the destination/accumulator A; r2 points to
634 // a packed operand X (the modulus); and q2/q3 holds an expanded
635 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
636 // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
637 // = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2
638 // are each advanced by 16; q2 and q3 are preserved; and the other
639 // NEON registers are clobbered.
640 endprologue
641
642 // Start by loading the operand words from memory.
643 vld1.32 {q0}, [r0]
644 vld1.32 {q1}, [r2]!
645
646 // Calculate Y = A M (mod B).
647 MUL0(q8, nil, q0, q2, q3, nil, nil, nil)
648 MUL1(q9, nil, q0, q2, q3, nil, nil, nil)
649 CARRY0(D0(q4), q8, q6)
650 MUL2(q10, nil, q0, q2, q3, nil, nil, nil)
651 CARRY1(q9, q6)
652 vmov q13, q0
653 CARRY0(D0(q7), q9, q6)
654 MUL3(q11, nil, q0, q2, q3, nil, nil, nil)
655 CARRY1(q10, q6)
656 CARRY0(D1(q4), q10, q6)
657 SPREADACC0(q13, q14, q15, q12)
658 CARRY1(q11, q6)
659 CARRY0(D1(q7), q11, q6)
660 SPREADACC1(q13, q14, q15, q12)
661 vmov.i32 q5, #0
662 vtrn.32 q4, q7
663 vzip.16 q4, q5
664
665 // Calculate the actual result. Well, the carries, at least.
666 vmov.i32 q8, #0
667 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
668 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
669 CARRY0(nil, q13, q6)
670 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
671 CARRY1(q14, q6)
672 CARRY0(nil, q14, q6)
673 MUL3(q12, t, q1, q4, q5, nil, nil, nil)
674 CARRY1(q15, q6)
675 CARRY0(nil, q15, q6)
676 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
677 CARRY1(q12, q6)
678 CARRY0(nil, q12, q6)
679 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
680 CARRY1(q13, q6)
681 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
682
683 // Finish up and store the result.
684 vst1.32 {q8}, [r0]!
685
686 // All done.
687 bx r14
688ENDFUNC
689
690///--------------------------------------------------------------------------
691/// Bulk multipliers.
692
693FUNC(mpx_umul4_arm_neon)
694 // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl,
695 // const mpw *bv, const mpw *bvl);
696
697 // Establish the arguments and do initial setup.
698 //
699 // inner loop dv r0
700 // inner loop av r2
701 // outer loop dv r5
702 // outer loop bv r3
703 // av base r1
704 // av limit r12
705 // bv limit r4
706 pushreg r4, r5, r14
707 pushvfp QQ(q4, q7)
708 endprologue
709
710 // Prepare for the first iteration.
711 vld1.32 {q4}, [r3]! // = Y = bv[0]
712 vmov.i32 q5, #0
713 // r0 // = dv for inner loop
714 // r1 // = av base
715 // r3 // = bv for outer loop
716 ldr r4, [sp, #76] // = bv limit
717 mov r12, r2 // = av limit
718 mov r2, r1 // = av for inner loop
719 add r5, r0, #16 // = dv for outer loop
720 vzip.16 q4, q5 // expand Y
721 bl mul4zc
722 cmp r2, r12 // all done?
723 bhs 8f
724
725 // Continue with the first iteration.
7260: bl mul4
727 cmp r2, r12 // all done?
728 blo 0b
729
730 // Write out the leftover carry. There can be no tail here.
7318: bl carryprop
732 cmp r3, r4 // more passes to do?
733 bhs 9f
734
735 // Set up for the next pass.
7361: vld1.32 {q4}, [r3]! // = Y = bv[i]
737 vmov.i32 q5, #0
738 mov r0, r5 // -> dv[i]
739 mov r2, r1 // -> av[0]
740 add r5, r5, #16
741 vzip.16 q4, q5 // expand Y
742 bl mla4zc
743 cmp r2, r12 // done yet?
744 bhs 8f
745
746 // Continue...
7470: bl mla4
748 cmp r2, r12
749 blo 0b
750
751 // Finish off this pass. There was no tail on the previous pass, and
752 // there can be done on this pass.
7538: bl carryprop
754 cmp r3, r4
755 blo 1b
756
757 // All over.
7589: popvfp QQ(q4, q7)
759 popreg r4, r5, r14
760 bx r14
761ENDFUNC
762
763FUNC(mpxmont_mul4_arm_neon)
764 // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv,
765 // const mpw *nv, size_t n, const mpw *mi);
766
767 // Establish the arguments and do initial setup.
768 //
769 // inner loop dv r0
770 // inner loop av r1
771 // inner loop nv r2
772 // mi r5
773 // outer loop dv r6
774 // outer loop bv r7
775 // av base r8
776 // av limit r9
777 // bv limit r4
778 // nv base r3
779 // n r4
780 // c r10
781 // 0 r12
782
783 pushreg r4-r10, r14
784 pushvfp QQ(q4, q7)
785 endprologue
786
787 // Establish the expanded operands.
788 ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi
789 vld1.32 {q2}, [r2] // = V = bv[0]
790 vmov.i32 q3, #0
791 vmov.i32 q5, #0
792 vld1.32 {q4}, [r5] // = M
793
794 // Set up the outer loop state and prepare for the first iteration.
795 // r0 // -> dv for inner loop
796 // r1 // -> av for inner loop
797 add r7, r2, #16 // -> bv
798 // r3 // -> nv
799 add r6, r0, #16 // -> dv
800 mov r8, r1 // -> av
801 add r9, r1, r4, lsl #2 // -> av limit
802 add r4, r2, r4, lsl #2 // -> bv limit
803 mov r2, r3 // -> nv for inner loop
804 mov r12, #0 // = 0
805
806 vzip.16 q2, q3 // expand V
807 vzip.16 q4, q5 // expand M
808 bl mmul4
809 cmp r1, r9 // done already?
810 bhs 8f
811
812 // Complete the first inner loop.
8130: bl dmul4
814 cmp r1, r9 // done yet?
815 blo 0b
816
817 // Still have carries left to propagate. Rather than store the tail
818 // end in memory, keep it in a general-purpose register for later.
819 bl carryprop
820 vmov.32 r10, QW(q15, 0)
821
822 // Embark on the next iteration. (There must be one. If n = 1 then
823 // we would have bailed above, to label 8. Similarly, the subsequent
824 // iterations can fall into the inner loop immediately.)
8251: vld1.32 {q2}, [r7]! // = V = bv[i]
826 vld1.32 {q4}, [r5] // = M
827 vmov.i32 q3, #0
828 vmov.i32 q5, #0
829 mov r0, r6 // -> dv[i]
830 add r6, r6, #16
831 mov r1, r8 // -> av[0]
832 mov r2, r3 // -> nv[0]
833 vzip.16 q2, q3 // expand V
834 vzip.16 q4, q5 // expand M
835 bl mmla4
836
837 // Complete the next inner loop.
8380: bl dmla4
839 cmp r1, r9 // done yet?
840 blo 0b
841
842 // Still have carries left to propagate, and they overlap the
843 // previous iteration's final tail, so read that and add it.
844 cmp r7, r4
845 vmov.32 D0(q12), r10, r12
846 vadd.i64 D0(q13), D0(q13), D0(q12)
847 bl carryprop
848 vmov.32 r10, QW(q15, 0)
849
850 // Back again, maybe.
851 blo 1b
852
853 // All done, almost.
854 str r10, [r0], #4
855 popvfp QQ(q4, q7)
856 popreg r4-r10, r14
857 bx r14
858
859 // First iteration was short. Write out the carries and we're done.
860 // (This could be folded into the main loop structure, but that would
861 // penalize small numbers more.)
8628: bl carryprop
863 vst1.32 {QW(q15, 0)}, [r0]!
864 popvfp QQ(q4, q7)
865 popreg r4-r10, r14
866 bx r14
867ENDFUNC
868
869FUNC(mpxmont_redc4_arm_neon)
870 // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv,
871 // size_t n, const mpw *mi);
872
873 // Establish the arguments and do initial setup.
874 //
875 // inner loop dv r0
876 // dv limit r1
877 // inner loop nv r2
878 // blocks-of-4 dv limit r3
879 // mi (r14)
880 // outer loop dv r4
881 // outer loop dv limit r5
882 // nv base r6
883 // nv limit r7
884 // n r3
885 // c (r14)
886 // t0, t1, t2, t3 r2, r8, r9, r10
887 // 0 r12
888
889 pushreg r4-r10, r14
890 pushvfp QQ(q4, q7)
891 endprologue
892
893 // Set up the outer loop state and prepare for the first iteration.
894 ldr r14, [sp, #96] // -> mi
895 vmov.i32 q3, #0
896 sub r12, r1, r0 // total dv bytes
897 // r0 // -> dv for inner loop
898 // r1 // -> overall dv limit
899 // r2 // -> nv for inner loop
900 // r3 // = n (for now)
901 add r4, r0, #16 // -> dv for outer loop
902 add r5, r0, r3, lsl #2 // -> dv limit
903 bic r12, r12, #15 // dv blocks of 4
904 vld1.32 {q2}, [r14] // = M
905 mov r6, r2 // -> nv
906 add r7, r2, r3, lsl #2 // -> nv limit
907 add r3, r0, r12 // -> dv blocks-of-4 limit
908 vzip.16 q2, q3 // expand M
909 mov r12, #0 // = 0
910 bl mont4
911 cmp r2, r7 // done already?
912 bhs 8f
913
9145: bl mla4
915 cmp r2, r7 // done yet?
916 blo 5b
917
918 // Still have carries left to propagate. Adding the accumulator
919 // block into the carries is a little different this time, because
920 // all four accumulator limbs have to be squished into the three
921 // carry registers for `carryprop' to do its thing.
9228: vld1.32 {q9}, [r0]
923 SPREADACC0(q9, q10, q11, q12)
924 SPREADACC1(q9, q10, q11, q12)
925 vshl.u64 D0(q12), D0(q12), #16
926 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
927 vadd.u64 D1(q15), D1(q15), D0(q12)
928
929 bl carryprop
930 vmov.32 r14, QW(q15, 0)
931 cmp r0, r3
932 bhs 7f
933
934 // Propagate the first group of carries.
935 ldmia r0, {r2, r8-r10}
936 adds r2, r2, r14
937 adcs r8, r8, #0
938 adcs r9, r9, #0
939 adcs r10, r10, #0
940 stmia r0!, {r2, r8-r10}
941 teq r0, r3
942 beq 6f
943
944 // Continue carry propagation until the end of the buffer.
9450: ldmia r0, {r2, r8-r10}
946 adcs r2, r2, #0
947 adcs r8, r8, #0
948 adcs r9, r9, #0
949 adcs r10, r10, #0
950 stmia r0!, {r2, r8-r10}
951 teq r0, r3
952 bne 0b
953
954 // Deal with the tail end. Note that the actual destination length
955 // won't be an exacty number of blocks of four, so it's safe to just
956 // drop through here.
9576: adc r14, r12, #0
9587: ldr r2, [r0]
959 adds r2, r2, r14
960 str r2, [r0], #4
961 teq r0, r1
962 beq 8f
9630: ldr r2, [r0]
964 adcs r2, r2, #0
965 str r2, [r0], #4
966 teq r0, r1
967 bne 0b
968
969 // All done for this iteration. Start the next.
9708: cmp r4, r5
971 bhs 9f
972 mov r0, r4
973 add r4, r4, #16
974 mov r2, r6
975 bl mont4
976 b 5b
977
978 // All over.
9799: popvfp QQ(q4, q7)
980 popreg r4-r10, r14
981 bx r14
982ENDFUNC
983
984///--------------------------------------------------------------------------
985/// Testing and performance measurement.
986
987#ifdef TEST_MUL4
988
989// dmul smul mmul mont
990// z r0 r0 r0 r0 r0
991// c r4 r1 r1 r1 r1
992// y r3 -- -- r2 r2
993// u r1 r2 -- r3 --
994// x r2 r3 r2 stk0 r3
995// vv q2/q3 stk0 -- stk1 stk0
996// yy q4/q5 stk1 r3 stk2 --
997// n r5 stk2 stk0 stk3 stk1
998// cyv r6 stk3 stk1 stk4 stk2
999
1000#define STKARG(i) sp, #80 + 4*(i)
1001
1002.macro testprologue mode
1003 pushreg r4-r6, r14
1004 pushvfp QQ(q4, q7)
1005 endprologue
1006
1007 .ifeqs "\mode", "dmul"
1008 mov r4, r1 // -> c
1009 mov r1, r2 // -> u
1010 mov r2, r3 // -> x
1011
1012 ldr r14, [STKARG(0)] // -> vv
1013 vld1.32 {q2}, [r14]
1014 vmov.i32 q3, #0
981a9e5d 1015 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ea1b3cec
MW
1016
1017 ldr r14, [STKARG(1)] // -> yy
1018 vld1.32 {q4}, [r14]
1019 vmov.i32 q5, #0
981a9e5d 1020 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ea1b3cec
MW
1021
1022 ldr r5, [STKARG(2)] // = n
1023 ldr r6, [STKARG(3)] // -> cyv
1024 .endif
1025
1026 .ifeqs "\mode", "smul"
1027 mov r4, r1 // -> c
1028 // r2 // -> x
1029
1030 vld1.32 {q4}, [r3]
1031 vmov.i32 q5, #0
981a9e5d 1032 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ea1b3cec
MW
1033
1034 ldr r5, [STKARG(0)] // = n
1035 ldr r6, [STKARG(1)] // -> cyv
1036 .endif
1037
1038 .ifeqs "\mode", "mmul"
1039 mov r4, r1 // -> c
1040 mov r1, r3 // -> u
1041 mov r3, r2 // -> y
1042 ldr r2, [STKARG(0)] // -> x
1043
1044 ldr r14, [STKARG(1)] // -> vv
1045 vld1.32 {q2}, [r14]
1046 vmov.i32 q3, #0
981a9e5d 1047 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ea1b3cec
MW
1048
1049 ldr r14, [STKARG(2)] // -> yy
1050 vld1.32 {q4}, [r14]
1051 vmov.i32 q5, #0
981a9e5d 1052 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ea1b3cec
MW
1053
1054 ldr r5, [STKARG(3)] // = n
1055 ldr r6, [STKARG(4)] // -> cyv
1056 .endif
1057
1058 .ifeqs "\mode", "mont"
1059 mov r4, r1 // -> c
1060 mov r1, r3 // -> u
1061 mov r14, r2
1062 mov r2, r3 // -> x
1063 mov r3, r14 // -> y
1064
1065 ldr r14, [STKARG(0)] // -> vv
1066 vld1.32 {q2}, [r14]
1067 vmov.i32 q3, #0
981a9e5d 1068 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ea1b3cec
MW
1069
1070 ldr r5, [STKARG(1)] // = n
1071 ldr r6, [STKARG(2)] // -> cyv
1072 .endif
1073.endm
1074
1075.macro testldcarry
1076 vldmia r4, {QQ(q13, q15)} // c0, c1, c2
1077.endm
1078
1079.macro testtop
10800: subs r5, r5, #1
1081.endm
1082
1083.macro testtail
1084 bne 0b
1085.endm
1086
1087.macro testcarryout
1088 vstmia r4, {QQ(q13, q15)}
1089.endm
1090
1091.macro testepilogue
1092 popvfp QQ(q4, q7)
1093 popreg r4-r6, r14
1094 bx r14
1095.endm
1096
1097FUNC(test_dmul4)
1098 testprologue dmul
1099 testldcarry
1100 testtop
1101 bl dmul4
1102 testtail
1103 testcarryout
1104 testepilogue
1105ENDFUNC
1106
1107FUNC(test_dmla4)
1108 testprologue dmul
1109 testldcarry
1110 testtop
1111 bl dmla4
1112 testtail
1113 testcarryout
1114 testepilogue
1115ENDFUNC
1116
1117FUNC(test_mul4)
1118 testprologue smul
1119 testldcarry
1120 testtop
1121 bl mul4
1122 testtail
1123 testcarryout
1124 testepilogue
1125ENDFUNC
1126
1127FUNC(test_mul4zc)
1128 testprologue smul
1129 testldcarry
1130 testtop
1131 bl mul4zc
1132 testtail
1133 testcarryout
1134 testepilogue
1135ENDFUNC
1136
1137FUNC(test_mla4)
1138 testprologue smul
1139 testldcarry
1140 testtop
1141 bl mla4
1142 testtail
1143 testcarryout
1144 testepilogue
1145ENDFUNC
1146
1147FUNC(test_mla4zc)
1148 testprologue smul
1149 testldcarry
1150 testtop
1151 bl mla4zc
1152 testtail
1153 testcarryout
1154 testepilogue
1155ENDFUNC
1156
1157FUNC(test_mmul4)
1158 testprologue mmul
1159 testtop
1160 bl mmul4
1161 testtail
1162 vst1.32 {q4, q5}, [r3]
1163 testcarryout
1164 testepilogue
1165ENDFUNC
1166
1167FUNC(test_mmla4)
1168 testprologue mmul
1169 testtop
1170 bl mmla4
1171 testtail
1172 vst1.32 {q4, q5}, [r3]
1173 testcarryout
1174 testepilogue
1175ENDFUNC
1176
1177FUNC(test_mont4)
1178 testprologue mont
1179 testtop
1180 bl mont4
1181 testtail
1182 vst1.32 {q4, q5}, [r3]
1183 testcarryout
1184 testepilogue
1185ENDFUNC
1186
1187#endif
1188
1189///----- That's all, folks --------------------------------------------------