1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2019 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software: you can redistribute it and/or modify it
13 /// under the terms of the GNU Library General Public License as published
14 /// by the Free Software Foundation; either version 2 of the License, or
15 /// (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful, but
18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 /// Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb. If not, write to the Free Software
24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
27 ///--------------------------------------------------------------------------
31 #include "asm-common.h"
38 ///--------------------------------------------------------------------------
41 /// We define a number of primitive fixed-size multipliers from which we can
42 /// construct more general variable-length multipliers.
44 /// The basic trick is the same throughout. In an operand-scanning
45 /// multiplication, the inner multiplication loop multiplies a multiple-
46 /// precision operand by a single precision factor, and adds the result,
47 /// appropriately shifted, to the result. A `finely integrated operand
48 /// scanning' implementation of Montgomery multiplication also adds the
49 /// product of a single-precision `Montgomery factor' and the modulus,
50 /// calculated in the same pass. The more common `coarsely integrated
51 /// operand scanning' alternates main multiplication and Montgomery passes,
52 /// which requires additional carry propagation.
54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
55 /// the factors remains constant throughout the operation, so we can afford
56 /// to take a little time to preprocess it. The transformation we perform is
57 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
61 /// operands, as follows.
64 /// 0 v''_1 v'_1 v''_0 v'_0
65 /// 16 v''_3 v'_3 v''_2 v'_2
67 /// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
68 /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
69 /// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
/// We maintain three `carry' registers, q13--q15, accumulating intermediate
75 /// results; we name them `c0', `c1', and `c2'. Each carry register holds
76 /// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
77 /// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
78 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
79 /// The `vmull' or `vmlal' instruction acting on a scalar operand and an
80 /// operand in the expanded form above produces a result which can be added
81 /// directly to the appropriate carry register.
83 /// Multiplication is performed in product-scanning order, since ARM
84 /// processors commonly implement result forwarding for consecutive multiply-
85 /// and-accumulate instructions specifying the same destination.
86 /// Experimentally, this runs faster than operand-scanning in an attempt to
87 /// hide instruction latencies.
89 /// On 32-bit ARM, we have a reasonable number of registers: the expanded
90 /// operands are kept in registers. The packed operands are read from memory
91 /// into working registers q0 and q1. The following conventional argument
92 /// names and locations are used throughout.
94 /// Arg Format Location Notes
97 /// X packed [r2] In Montgomery multiplication, X = N
99 /// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M
100 /// M expanded q4/q5 -N^{-1} (mod B^4)
101 /// N Modulus, for Montgomery multiplication
102 /// A packed [r0] Destination/accumulator
105 /// The calculation is some variant of
107 /// A' + C' B^4 <- U V + X Y + A + C
109 /// The low-level functions fit into a fairly traditional (finely-integrated)
110 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
113 /// The variants are as follows.
115 /// Function Variant Use i j
117 /// mmul4 A = C = 0 Montgomery 0 0
118 /// dmul4 A = 0 Montgomery 0 +
119 /// mmla4 C = 0 Montgomery + 0
120 /// dmla4 exactly as shown Montgomery + +
122 /// mul4zc U = V = A = C = 0 Plain 0 0
123 /// mul4 U = V = A = 0 Plain 0 +
124 /// mla4zc U = V = C = 0 Plain + 0
125 /// mla4 U = V = 0 Plain + +
127 /// The `mmul4' and `mmla4' functions are also responsible for calculating
128 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
131 ///--------------------------------------------------------------------------
132 /// Macro definitions.
134 .macro mulacc z, u, v, x=nil, y=nil
135 // Set Z = Z + U V + X Y. X may be `nil' to omit the second
136 // operand. Z should be a 128-bit `qN' register; V and Y should be
137 // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]'
138 // scalars; the multiplications produce two 64-bit elementwise
139 // products, which are added elementwise to Z.
147 .macro mulinit z, zinitp, u, v, x, y
148 // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
149 // set Z = U V + X Y. Operand requirements and detailed operation
150 // are as for `mulacc'.
152 .ifeqs "\zinitp", "t"
153 mulacc \z, \u, \v, \x, \y
162 // `MULI': accumulate the B^I and b B^i terms of the polynomial product sum U
163 // V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
164 // x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
165 // B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
166 // added into the low and high halves of the 128-bit register Z (if ZINIT is
167 // `nil' then simply set Z, as if it were initially zero).
// Concrete instances of the `MULI' pattern described above: `MULi'
// accumulates the B^i and b B^i coefficients of U V + X Y into Z.  The
// comment on each definition lists the index pairs contributing to that
// coefficient (read off the QW/D0/D1 operand selections below).

// B^0 coefficient: u_0 v_0 (+ x_0 y_0)
#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0)

// B^1 coefficient: u_0 v_1 + u_1 v_0 (+ likewise for X, Y)
#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \
	mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0)

// B^2 coefficient: u_0 v_2 + u_1 v_1 + u_2 v_0 (+ likewise for X, Y)
#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \
	mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \
	mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0)

// B^3 coefficient: u_0 v_3 + u_1 v_2 + u_2 v_1 + u_3 v_0 (+ likewise
// for X, Y)
#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \
	mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \
	mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \
	mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0)

// B^4 coefficient: u_1 v_3 + u_2 v_2 + u_3 v_1 (+ likewise for X, Y)
#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \
	mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \
	mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0)

// B^5 coefficient: u_2 v_3 + u_3 v_2 (+ likewise for X, Y)
#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \
	mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1)

// B^6 coefficient: u_3 v_3 (+ x_3 y_3)
#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \
	mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1)
192 // Steps in the process of propagating carry bits from ZLO to ZHI (both
193 // 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register.
194 // Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed
197 // In detail, what happens is as follows. Suppose initially that ZLO =
198 // (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
199 // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
200 // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
201 // a circuit depth of 4; I don't know how to do better.
202 .macro _carry0 zout, zlo0, zlo1, t0, t1
203 // ZLO0 and ZLO1 are the low and high halves of a carry register.
204 // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1
205 // so as to continue processing using `_carry1'. All operands are
206 // 64-bit `dN' registers. If ZOUT is `nil' then no output is
207 // produced; if T1 is `nil' then no further processing will be
209 .ifnes "\zout", "nil"
210 vshl.i64 \t0, \zlo1, #16
213 vshr.u64 \t1, \zlo0, #16
215 .ifnes "\zout", "nil"
216 vadd.i64 \zout, \zlo0, \t0
219 vadd.i64 \t1, \t1, \zlo1
222 .macro _carry1 u, zhi0, t1
223 // ZHI0 is the low half of a carry register, and T1 is the result of
224 // applying `_carry0' to the previous carry register. Set U to the
225 // result of propagating the carry into ZHI0.
226 vshr.u64 \t1, \t1, #16
227 vadd.i64 \u, \zhi0, \t1
// More convenient wrappers for `_carry0' and `_carry1'.
//
// CARRY0(ZOUT, ZLO, T)
//	Store a 32-bit output in ZOUT from carry ZLO, using T as a
//	temporary.  ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit
//	`qN' registers.  T's high half is left holding the continuation
//	value for a following `CARRY1'.
//
// CARRY1(ZHI, T)
//	Propagate carry from T into ZHI.  Both are 128-bit `qN' registers;
//	the low half of ZHI is updated in place (via `_carry1').
#define CARRY0(zout, zlo, t) \
	CASIDE0(zout, D0(zlo), zlo, t)
#define CARRY1(zhi, t) \
	CASIDE1(D0(zhi), zhi, t)

// Versions of `CARRY0' and `CARRY1' which don't mutate their operands.
//
// CASIDE0(ZOUT, U, ZLO, T)
//	As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit
//	`dN' register).
//
// CASIDE0E(ZOUT, U, ZLO, T)
//	As for `CASIDE0', but only calculate the output word, and no
//	continuation value (the `nil' T1 below suppresses the follow-on
//	`_carry1' input).
//
// CASIDE1(U, ZHI, T)
//	As for `CARRY1', but write the updated low half of ZHI to U.
#define CASIDE0(zout, u, zlo, t) \
	_carry0 zout, u, D1(zlo), D0(t), D1(t)
#define CASIDE0E(zout, u, zlo, t) \
	_carry0 zout, u, D1(zlo), D0(t), nil
#define CASIDE1(u, zhi, t) \
	_carry1 u, D0(zhi), D1(t)
264 // Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in
266 #define SPREADACC0(a0, a1, a2, a3) \
270 #define SPREADACC1(a0, a1, a2, a3) \
271 vswp D1(a0), D0(a2); \
272 vtrn.32 D0(a0), D0(a1); \
273 vtrn.32 D0(a2), D0(a3)
275 // Add the carry-format values A0, A1, A2 into the existing carries C0, C1,
276 // C2 (leaving A3 where it is).
277 #define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \
278 vadd.i64 c0, c0, a0; \
279 vadd.i64 c1, c1, a1; \
282 ///--------------------------------------------------------------------------
283 /// Primitive multipliers and related utilities.
286 // On entry, r0 points to a destination, and q13--q15 hold incoming
287 // carries c0--c2. On exit, the low 128 bits of the carry value are
288 // stored at [r0]; the remaining 16 bits of carry are left in d30; r0
289 // is advanced by 16; and q10--q14 are clobbered.
292 CARRY0(D0(q10), q13, q12)
294 CARRY0(D0(q11), q14, q12)
296 CARRY0(D1(q10), q15, q12)
297 vshr.u64 D1(q11), D1(q12), #16
298 vshr.u64 D0(q15), D1(q12), #48
305 // On entry, r0 points to the destination; r1 and r2 point to packed
306 // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y;
307 // and q13--q15 hold incoming carries c0--c2. On exit, the
308 // destination and carries are updated; r0, r1, r2 are each advanced
309 // by 16; q2--q5 are preserved; and the other NEON registers are
313 // Start by loading the operand words from memory.
317 // Do the multiplication.
318 MUL0(q13, t, q0, q2, q3, q1, q4, q5)
319 MUL1(q14, t, q0, q2, q3, q1, q4, q5)
320 CARRY0(D0(q8), q13, q6)
321 MUL2(q15, t, q0, q2, q3, q1, q4, q5)
323 CARRY0(D0(q9), q14, q6)
324 MUL3(q12, nil, q0, q2, q3, q1, q4, q5)
326 CARRY0(D1(q8), q15, q6)
327 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
329 CARRY0(D1(q9), q12, q6)
330 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
332 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
334 // Finish up and store the result.
343 // On entry, r0 points to the destination/accumulator; r1 and r2
344 // point to packed operands U and X; q2/q3 and q4/q5 hold expanded
345 // operands V and Y; and q13--q15 hold incoming carries c0--c2. On
346 // exit, the accumulator and carries are updated; r0, r1, r2 are each
347 // advanced by 16; q2--q5 are preserved; and the other NEON registers
351 // Start by loading the operand words from memory.
353 SPREADACC0(q9, q10, q11, q12)
357 // Add the accumulator input to the incoming carries. Split the
358 // accumulator into four pieces and add the carries onto them.
359 SPREADACC1(q9, q10, q11, q12)
360 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
362 // Do the multiplication.
363 MUL0(q13, t, q0, q2, q3, q1, q4, q5)
364 MUL1(q14, t, q0, q2, q3, q1, q4, q5)
365 CARRY0(D0(q8), q13, q6)
366 MUL2(q15, t, q0, q2, q3, q1, q4, q5)
368 CARRY0(D0(q9), q14, q6)
369 MUL3(q12, t, q0, q2, q3, q1, q4, q5)
371 CARRY0(D1(q8), q15, q6)
372 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
374 CARRY0(D1(q9), q12, q6)
375 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
377 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
379 // Finish up and store the result.
388 // On entry, r0 points to the destination; r2 points to a packed
389 // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold
390 // incoming carries c0--c2. On exit, the destination and carries are
391 // updated; r0 and r2 are each advanced by 16; q4 and q5 are
392 // preserved; and the other NEON registers are clobbered.
395 // Start by loading the operand words from memory.
398 // Do the multiplication.
399 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
400 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
401 CARRY0(D0(q8), q13, q6)
402 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
404 CARRY0(D0(q9), q14, q6)
405 MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
407 CARRY0(D1(q8), q15, q6)
408 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
410 CARRY0(D1(q9), q12, q6)
411 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
413 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
415 // Finish up and store the result.
424 // On entry, r0 points to the destination; r2 points to a packed
425 // operand X; and q4/q5 holds an expanded operand Y. On exit, the
426 // destination is updated; q13--q15 hold outgoing carries c0--c2; r0
427 // and r2 are each advanced by 16; q4 and q5 are preserved; and the
428 // other NEON registers are clobbered.
431 // Start by loading the operand words from memory.
434 // Do the multiplication.
435 MUL0(q13, nil, q1, q4, q5, nil, nil, nil)
436 MUL1(q14, nil, q1, q4, q5, nil, nil, nil)
437 CARRY0(D0(q8), q13, q6)
438 MUL2(q15, nil, q1, q4, q5, nil, nil, nil)
440 CARRY0(D0(q9), q14, q6)
441 MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
443 CARRY0(D1(q8), q15, q6)
444 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
446 CARRY0(D1(q9), q12, q6)
447 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
449 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
451 // Finish up and store the result.
460 // On entry, r0 points to the destination/accumulator; r2 points to a
461 // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15
462 // hold incoming carries c0--c2. On exit, the accumulator and
463 // carries are updated; r0 and r2 are each advanced by 16; q4 and q5
464 // are preserved; and the other NEON registers are clobbered.
467 // Start by loading the operand words from memory.
469 SPREADACC0(q9, q10, q11, q12)
472 // Add the accumulator input to the incoming carries. Split the
473 // accumulator into four pieces and add the carries onto them.
474 SPREADACC1(q9, q10, q11, q12)
475 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
477 // Do the multiplication.
479 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
480 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
481 CARRY0(D0(q8), q13, q6)
482 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
484 CARRY0(D0(q9), q14, q6)
485 MUL3(q12, t, q1, q4, q5, nil, nil, nil)
487 CARRY0(D1(q8), q15, q6)
488 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
490 CARRY0(D1(q9), q12, q6)
491 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
493 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
495 // Finish up and store the result.
504 // On entry, r0 points to the destination/accumulator; r2 points to a
505 // packed operand X; and q4/q5 holds an expanded operand Y. On exit,
506 // the accumulator is updated; q13--q15 hold outgoing carries c0--c2;
507 // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and
508 // the other NEON registers are clobbered.
511 // Start by loading the operand words from memory.
513 SPREADACC0(q13, q14, q15, q12)
516 // Move the accumulator input to the incoming carry slots. Split the
517 // accumulator into four pieces.
518 SPREADACC1(q13, q14, q15, q12)
524 // On entry, r0 points to the destination; r1 points to a packed
525 // operand U; r2 points to a packed operand X (the modulus); q2/q3
526 // holds an expanded operand V; and q4/q5 holds an expanded operand M
527 // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
528 // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod
529 // B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each
530 // advanced by 16; q2 and q3 are preserved; and the other NEON
531 // registers are clobbered.
533 // Start by loading the operand words from memory.
536 // Calculate the low half of W = A + U V, being careful to leave the
538 MUL0(q13, nil, q0, q2, q3, nil, nil, nil)
539 MUL1(q14, nil, q0, q2, q3, nil, nil, nil)
540 CARRY0(D0(q6), q13, q8)
541 MUL2(q15, nil, q0, q2, q3, nil, nil, nil)
542 CASIDE1(D0(q9), q14, q8)
543 CASIDE0(D0(q7), D0(q9), q14, q8)
544 MUL3(q12, nil, q0, q2, q3, nil, nil, nil)
549 // On entry, r0 points to the destination/accumulator A; r1 points to
550 // a packed operand U; r2 points to a packed operand X (the modulus);
551 // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded
552 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
553 // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
554 // = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0,
555 // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and
556 // the other NEON registers are clobbered.
559 // Start by loading the operand words from memory.
561 SPREADACC0(q13, q14, q15, q12)
564 // Move the accumulator input to the incoming carry slots. Split the
565 // accumulator into four pieces.
566 SPREADACC1(q13, q14, q15, q12)
568 // Calculate the low half of W = A + U V, being careful to leave the
570 MUL0(q13, t, q0, q2, q3, nil, nil, nil)
571 MUL1(q14, t, q0, q2, q3, nil, nil, nil)
572 CARRY0(D0(q6), q13, q8)
573 MUL2(q15, t, q0, q2, q3, nil, nil, nil)
574 CASIDE1(D0(q9), q14, q8)
575 CASIDE0(D0(q7), D0(q9), q14, q8)
576 MUL3(q12, t, q0, q2, q3, nil, nil, nil)
578 CASIDE1(D0(q9), q15, q8)
579 CASIDE0(D1(q6), D0(q9), q15, q8)
580 CASIDE1(D0(q9), q12, q8)
581 CASIDE0E(D1(q7), D0(q9), q12, q8)
584 // Calculate the low half of the Montgomery factor Y = W M. At this
585 // point, registers are a little tight.
586 MUL0( q8, nil, q6, q4, q5, nil, nil, nil)
587 MUL1( q9, nil, q6, q4, q5, nil, nil, nil)
588 CARRY0(D0(q8), q8, q1)
589 MUL2(q10, nil, q6, q4, q5, nil, nil, nil)
591 CARRY0(D0(q9), q9, q1)
592 MUL3(q11, nil, q6, q4, q5, nil, nil, nil)
594 CARRY0(D1(q8), q10, q1)
597 CARRY0(D1(q9), q11, q1)
601 // Expand Y. We'll put it in its proper place a bit later.
604 // Build up the product X Y in the carry slots.
605 MUL0(q13, t, q1, q8, q5, nil, nil, nil)
606 MUL1(q14, t, q1, q8, q5, nil, nil, nil)
608 MUL2(q15, t, q1, q8, q5, nil, nil, nil)
612 MUL3(q12, t, q1, q8, q5, nil, nil, nil)
617 // And complete the calculation.
618 MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
621 MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
623 MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
625 // Finish up and store the result.
633 // On entry, r0 points to the destination/accumulator A; r2 points to
634 // a packed operand X (the modulus); and q2/q3 holds an expanded
635 // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
636 // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
637 // = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2
638 // are each advanced by 16; q2 and q3 are preserved; and the other
639 // NEON registers are clobbered.
642 // Start by loading the operand words from memory.
646 // Calculate Y = A M (mod B).
647 MUL0(q8, nil, q0, q2, q3, nil, nil, nil)
648 MUL1(q9, nil, q0, q2, q3, nil, nil, nil)
649 CARRY0(D0(q4), q8, q6)
650 MUL2(q10, nil, q0, q2, q3, nil, nil, nil)
653 CARRY0(D0(q7), q9, q6)
654 MUL3(q11, nil, q0, q2, q3, nil, nil, nil)
656 CARRY0(D1(q4), q10, q6)
657 SPREADACC0(q13, q14, q15, q12)
659 CARRY0(D1(q7), q11, q6)
660 SPREADACC1(q13, q14, q15, q12)
665 // Calculate the actual result. Well, the carries, at least.
667 MUL0(q13, t, q1, q4, q5, nil, nil, nil)
668 MUL1(q14, t, q1, q4, q5, nil, nil, nil)
670 MUL2(q15, t, q1, q4, q5, nil, nil, nil)
673 MUL3(q12, t, q1, q4, q5, nil, nil, nil)
676 MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
679 MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
681 MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
683 // Finish up and store the result.
690 ///--------------------------------------------------------------------------
691 /// Bulk multipliers.
693 FUNC(mpx_umul4_arm_neon)
694 // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl,
695 // const mpw *bv, const mpw *bvl);
697 // Establish the arguments and do initial setup.
710 // Prepare for the first iteration.
711 vld1.32 {q4}, [r3]! // = Y = bv[0]
713 // r0 // = dv for inner loop
715 // r3 // = bv for outer loop
716 ldr r4, [sp, #76] // = bv limit
717 mov r12, r2 // = av limit
718 mov r2, r1 // = av for inner loop
719 add r5, r0, #16 // = dv for outer loop
720 vzip.16 q4, q5 // expand Y
722 cmp r2, r12 // all done?
725 // Continue with the first iteration.
727 cmp r2, r12 // all done?
730 // Write out the leftover carry. There can be no tail here.
732 cmp r3, r4 // more passes to do?
735 // Set up for the next pass.
736 1: vld1.32 {q4}, [r3]! // = Y = bv[i]
738 mov r0, r5 // -> dv[i]
739 mov r2, r1 // -> av[0]
741 vzip.16 q4, q5 // expand Y
743 cmp r2, r12 // done yet?
751 // Finish off this pass. There was no tail on the previous pass, and
	// there can be none on this pass.
763 FUNC(mpxmont_mul4_arm_neon)
764 // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv,
765 // const mpw *nv, size_t n, const mpw *mi);
767 // Establish the arguments and do initial setup.
787 // Establish the expanded operands.
788 ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi
789 vld1.32 {q2}, [r2] // = V = bv[0]
792 vld1.32 {q4}, [r5] // = M
794 // Set up the outer loop state and prepare for the first iteration.
795 // r0 // -> dv for inner loop
796 // r1 // -> av for inner loop
797 add r7, r2, #16 // -> bv
799 add r6, r0, #16 // -> dv
801 add r9, r1, r4, lsl #2 // -> av limit
802 add r4, r2, r4, lsl #2 // -> bv limit
803 mov r2, r3 // -> nv for inner loop
806 vzip.16 q2, q3 // expand V
807 vzip.16 q4, q5 // expand M
809 cmp r1, r9 // done already?
812 // Complete the first inner loop.
814 cmp r1, r9 // done yet?
817 // Still have carries left to propagate. Rather than store the tail
818 // end in memory, keep it in a general-purpose register for later.
820 vmov.32 r10, QW(q15, 0)
822 // Embark on the next iteration. (There must be one. If n = 1 then
823 // we would have bailed above, to label 8. Similarly, the subsequent
824 // iterations can fall into the inner loop immediately.)
825 1: vld1.32 {q2}, [r7]! // = V = bv[i]
826 vld1.32 {q4}, [r5] // = M
829 mov r0, r6 // -> dv[i]
831 mov r1, r8 // -> av[0]
832 mov r2, r3 // -> nv[0]
833 vzip.16 q2, q3 // expand V
834 vzip.16 q4, q5 // expand M
837 // Complete the next inner loop.
839 cmp r1, r9 // done yet?
842 // Still have carries left to propagate, and they overlap the
843 // previous iteration's final tail, so read that and add it.
845 vmov.32 D0(q12), r10, r12
846 vadd.i64 D0(q13), D0(q13), D0(q12)
848 vmov.32 r10, QW(q15, 0)
850 // Back again, maybe.
859 // First iteration was short. Write out the carries and we're done.
860 // (This could be folded into the main loop structure, but that would
861 // penalize small numbers more.)
863 vst1.32 {QW(q15, 0)}, [r0]!
869 FUNC(mpxmont_redc4_arm_neon)
870 // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv,
871 // size_t n, const mpw *mi);
873 // Establish the arguments and do initial setup.
878 // blocks-of-4 dv limit r3
881 // outer loop dv limit r5
886 // t0, t1, t2, t3 r2, r8, r9, r10
893 // Set up the outer loop state and prepare for the first iteration.
894 ldr r14, [sp, #96] // -> mi
896 sub r12, r1, r0 // total dv bytes
897 // r0 // -> dv for inner loop
898 // r1 // -> overall dv limit
899 // r2 // -> nv for inner loop
900 // r3 // = n (for now)
901 add r4, r0, #16 // -> dv for outer loop
902 add r5, r0, r3, lsl #2 // -> dv limit
903 bic r12, r12, #15 // dv blocks of 4
904 vld1.32 {q2}, [r14] // = M
906 add r7, r2, r3, lsl #2 // -> nv limit
907 add r3, r0, r12 // -> dv blocks-of-4 limit
908 vzip.16 q2, q3 // expand M
911 cmp r2, r7 // done already?
915 cmp r2, r7 // done yet?
918 // Still have carries left to propagate. Adding the accumulator
919 // block into the carries is a little different this time, because
920 // all four accumulator limbs have to be squished into the three
921 // carry registers for `carryprop' to do its thing.
922 8: vld1.32 {q9}, [r0]
923 SPREADACC0(q9, q10, q11, q12)
924 SPREADACC1(q9, q10, q11, q12)
925 vshl.u64 D0(q12), D0(q12), #16
926 CARRYACC(q9, q10, q11, q12, q13, q14, q15)
927 vadd.u64 D1(q15), D1(q15), D0(q12)
930 vmov.32 r14, QW(q15, 0)
934 // Propagate the first group of carries.
935 ldmia r0, {r2, r8-r10}
940 stmia r0!, {r2, r8-r10}
944 // Continue carry propagation until the end of the buffer.
945 0: ldmia r0, {r2, r8-r10}
950 stmia r0!, {r2, r8-r10}
954 // Deal with the tail end. Note that the actual destination length
	// won't be an exact number of blocks of four, so it's safe to just
956 // drop through here.
969 // All done for this iteration. Start the next.
984 ///--------------------------------------------------------------------------
985 /// Testing and performance measurement.
989 // dmul smul mmul mont
994 // x r2 r3 r2 stk0 r3
995 // vv q2/q3 stk0 -- stk1 stk0
996 // yy q4/q5 stk1 r3 stk2 --
997 // n r5 stk2 stk0 stk3 stk1
998 // cyv r6 stk3 stk1 stk4 stk2
// Address (base, offset) of the I'th stack argument, given the register
// and stack assignments tabulated above.
#define STKARG(i) sp, #80 + 4*(i)
1002 .macro testprologue mode
1007 .ifeqs "\mode", "dmul"
1012 ldr r14, [STKARG(0)] // -> vv
1015 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
1017 ldr r14, [STKARG(1)] // -> yy
1020 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
1022 ldr r5, [STKARG(2)] // = n
1023 ldr r6, [STKARG(3)] // -> cyv
1026 .ifeqs "\mode", "smul"
1032 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
1034 ldr r5, [STKARG(0)] // = n
1035 ldr r6, [STKARG(1)] // -> cyv
1038 .ifeqs "\mode", "mmul"
1042 ldr r2, [STKARG(0)] // -> x
1044 ldr r14, [STKARG(1)] // -> vv
1047 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
1049 ldr r14, [STKARG(2)] // -> yy
1052 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
1054 ldr r5, [STKARG(3)] // = n
1055 ldr r6, [STKARG(4)] // -> cyv
1058 .ifeqs "\mode", "mont"
1065 ldr r14, [STKARG(0)] // -> vv
1068 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
1070 ldr r5, [STKARG(1)] // = n
1071 ldr r6, [STKARG(2)] // -> cyv
1076 vldmia r4, {QQ(q13, q15)} // c0, c1, c2
1088 vstmia r4, {QQ(q13, q15)}
1162 vst1.32 {q4, q5}, [r3]
1172 vst1.32 {q4, q5}, [r3]
1182 vst1.32 {q4, q5}, [r3]
1189 ///----- That's all, folks --------------------------------------------------