/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	pentium4

	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a multiple-
/// precision operand by a single-precision factor, and adds the result,
/// appropriately shifted, to the result.  A `finely integrated operand
/// scanning' implementation of Montgomery multiplication also adds the
/// product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	   12	    8	    4	    0
///	    0	v''_1	v''_0	 v'_1	 v'_0
///	   16	v''_3	v''_2	 v'_3	 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
/// many products together before we must deal with carrying; it also allows
/// for some calculations to be performed on the above expanded form.
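///
/// As a rough illustration (not part of the build, and with ad-hoc names),
/// the following C fragment models the splitting trick for a single 32-bit
/// word v_i of the preprocessed factor and a 32-bit word r of the other
/// operand: each half-product fits in 48 bits, leaving sixteen bits of
/// headroom in a 64-bit lane, and a single `pmuludq' against the expanded
/// form performs exactly this pair of multiplications in one instruction.
///
///	#include <stdint.h>
///
///	/* Multiply r by v_i = v'_i + v''_i b, where b = 2^16. */
///	static inline void mul_step(uint32_t r, uint32_t v_i,
///				    uint64_t *lo, uint64_t *hi)
///	{
///		uint32_t v_lo = v_i & 0xffff, v_hi = v_i >> 16;
///
///		*lo = (uint64_t)r*v_lo;		/* r v'_i  < 2^48 */
///		*hi = (uint64_t)r*v_hi;		/* r v''_i < 2^48 */
///
///		/* The full product r v_i is *lo + (*hi << 16). */
///	}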
/// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
/// results.  The registers' precise roles rotate during the computation; we
/// name them `c0', `c1', `c2', and `c3'.  Each carry register holds two
/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
/// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand (broadcast
/// across all lanes of its vector) and an operand in the expanded form above
/// produces a result which can be added directly to the appropriate carry
/// register.  Following a pass of four multiplications, we perform some
/// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b;
/// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and
/// cycle the carry registers around, so that c1 becomes c0, and the old
/// (implicitly) zeroed c0 becomes c3.
///
/// On 64-bit AMD64, we have a reasonable number of registers: the expanded
/// operands are kept in registers.  The packed operands are read from memory
/// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
/// multiplications; and XMM6 and XMM7 are used for combining the results.
/// The following conventional argument names and locations are used
/// throughout.
///
///	Arg	Format	    Location	Notes
///
///	U	packed	    [RAX]
///	X	packed	    [RBX]	In Montgomery multiplication, X = N
///	V	expanded    XMM8/XMM9
///	Y	expanded    XMM10/XMM11	In Montgomery multiplication, Y = (A + U V) M
///	M	expanded    (see below)	Montgomery factor, M = -N^{-1} (mod B^4)
///	N			Modulus, for Montgomery multiplication
///	A	packed	    [RDI]	Destination/accumulator
///	C	carry	    XMM12--XMM15
///
/// The calculation is some variant of
///
///	A' + C' B^4 <- U V + X Y + A + C
///
/// The low-level functions fit into a fairly traditional (finely-integrated)
/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
/// (indexed by i).
///
/// The variants are as follows.
///
///	Function	Variant			Use	    i	j
///
///	mmul4		A = C = 0, Y = M	Montgomery  0	0
///	dmul4		A = 0			Montgomery  0	+
///	mmla4		C = 0, Y = M		Montgomery  +	0
///	dmla4		exactly as shown	Montgomery  +	+
///	mont4		U = C = 0, V = M	Montgomery  any	0
///
///	mul4zc		U = V = A = C = 0	Plain	    0	0
///	mul4		U = V = A = 0		Plain	    0	+
///	mla4zc		U = V = C = 0		Plain	    +	0
///	mla4		U = V = 0		Plain	    +	+
///
/// The `mmul4' and `mmla4' functions are also responsible for calculating
/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
/// inner loop.

///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
	// of the product in registers D0, D1, D2, D3.
	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i)
  .ifnes "\d1", "nil"
	movdqa	\d1, \slo		// (s''_1, s''_0; s'_1, s'_0)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, \shi		// (s''_3, s''_2; s'_3, s'_2)
  .endif
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (0, s''_1; s''_0, s'_1)
  .endif
  .ifnes "\d2", "nil"
	movdqa	\d2, \d0		// another copy of (?, r_i; ?, r_i)
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (0, s''_3; s''_2, s'_3)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s''_1; r_i s'_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s''_3; r_i s'_3)
  .endif
  .ifnes "\d2", "nil"
	pmuludq	\d2, \shi		// (r_i s''_2; r_i s'_2)
  .endif
	pmuludq	\d0, \slo		// (r_i s''_0; r_i s'_0)
.endm

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.
Any or all of C1--C3 may be `nil' to skip // updating that register. paddq \c0, xmm0 .ifnes "\c1", "nil" paddq \c1, xmm1 .endif .ifnes "\c2", "nil" paddq \c2, xmm2 .endif .ifnes "\c3", "nil" paddq \c3, xmm3 .endif .endm .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil // Multiply R_I by the expanded operand SLO/SHI, and accumulate in // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally // contains zero, but needs clearing; in practice, we store the // product directly rather than attempting to add. On completion, // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'. .ifeqs "\z3p", "t" mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3 accum \c0, \c1, \c2 .else mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3 accum \c0, \c1, \c2, \c3 .endif .endm .macro propout d, pos, c, cc=nil // Calculate an output word from C, and store it at POS in D; // propagate carries out from C to CC in preparation for a rotation // of the carry registers. D is an XMM register; the POS is either // `lo' or `hi' according to whether the output word should be in // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) pslldq xmm3, 2 // (0; t b) paddq \c, xmm3 // (c''; c' + t b) .ifeqs "\pos", "lo" movdqa \d, \c .else punpckldq \d, \c .endif psrlq \c, 32 // floor(c/B) .ifnes "\cc", "nil" paddq \cc, \c // propagate up .endif .endm .macro endprop d, pos, c, t // On entry, C contains a carry register. On exit, the low 32 bits // of the value represented in C are written at POS in D, and the // remaining bits are left at the bottom of T. movdqa \t, \c psllq \t, 16 // (c'' b; ?) pslldq \c, 8 // (c'; 0) paddq \t, \c // (c' + c'' b; ?) psrldq \t, 8 // (0; c' + c'' b) = (0; c) .ifeqs "\pos", "lo" movdqa \d, \t .else punpckldq \d, \t .endif psrldq \t, 4 // (floor(c/B); 0) .endm .macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm .macro squash c0, c1, c2, c3, t, u, lo, hi=nil // On entry, C0, C1, C2, C3 are carry registers representing a value // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, // C3, T, and U are clobbered; and the high bits of Y are stored in // HI, if this is not `nil'. // The first step is to eliminate the `double-prime' pieces -- i.e., // the ones offset by 16 bytes from a 32-bit boundary -- by carrying // them into the 32-bit-aligned pieces above and below. But before // we can do that, we must gather them together. 
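	// (In scalar terms, and purely for illustration: a double-prime
	// piece ypp contributes ypp * 2^16 to its 32-bit position, and the
	// two pieces
	//
	//	uint64_t down = (ypp & 0xffff) << 16;	/* stays in place */
	//	uint64_t up   = ypp >> 16;		/* moves up one position */
	//
	// satisfy ypp * 2^16 = down + up * 2^32, which is exactly the split
	// that the shifts below compute.)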
movdqa \t, \c0 movdqa \u, \c1 punpcklqdq \t, \c2 // (y'_2; y'_0) punpckhqdq \c0, \c2 // (y''_2; y''_0) punpcklqdq \u, \c3 // (y'_3; y'_1) punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. movdqa \c2, \c0 movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 psrlq \c0, 16 // high parts of (y''_2; y''_0) psrlq \c1, 16 // high parts of (y''_3; y''_1) psrlq \c2, 32 // low parts of (y''_2; y''_0) psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 paddq \t, \c1 // and up: (y_2; y_0) paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. movdqa \c3, \t // (?; y_0) movdqa \lo, \t // (?, ?; ?, y^*_0) psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) movdqa \c1, \c3 // (?, ?; ?, y^*_1) psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t punpcklqdq \hi, \u // carry up .endif punpckldq \lo, \c1 // y mod B^4 .endm .macro carryadd // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14 // hold the incoming carry registers c0, c1, and c2 representing a // carry-in C. // // On exit, the carry registers, including XMM15, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. movd xmm0, [rdi + 0] // (0; a_0) movd xmm1, [rdi + 4] // (0; a_1) movd xmm2, [rdi + 8] // (0; a_2) movd xmm15, [rdi + 12] // (0; a_3) paddq xmm12, xmm0 // (c''_0; c'_0 + a_0) paddq xmm13, xmm1 // (c''_1; c'_1 + a_1) paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- /// Primitive multipliers and related utilities. INTFUNC(carryprop) // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an // expanded form. Store the low 128 bits of the represented carry to // [RDI] as a packed 128-bit value, and leave the remaining 16 bits // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and // XMM14 are clobbered. endprologue propout xmm0, lo, xmm12, xmm13 propout xmm1, lo, xmm13, xmm14 propout xmm0, hi, xmm14, nil endprop xmm1, hi, xmm14, xmm12 punpckldq xmm0, xmm1 movdqu [rdi], xmm0 ret ENDFUNC INTFUNC(dmul4) // On entry, RDI points to the destination buffer; RAX and RBX point // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the // incoming carry registers c0, c1, and c2; c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum C + U V + X Y to // [RDI], and update the carry registers with the carry out. The // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose // registers are preserved. 
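	// (Not part of the build: the `propout' step used repeatedly below
	// amounts to the following scalar computation on one carry register,
	// whose halves c_lo and c_hi represent c_lo + (c_hi << 16).  The
	// names are ad hoc, <stdint.h> is assumed, and the algorithm's
	// headroom invariants keep the sums within 64 bits.
	//
	//	static uint32_t propout_model(uint64_t c_lo, uint64_t c_hi,
	//				      uint64_t *next_lo,
	//				      uint64_t *next_hi)
	//	{
	//		uint64_t t = c_hi & 0xffffffff;	/* t = c'' mod B */
	//		uint64_t d = c_lo + (t << 16);	/* d = c' + t b */
	//
	//		*next_lo += d >> 32;	/* floor(d/B), up to c'_1 */
	//		*next_hi += c_hi >> 32;	/* floor(c''/B), up to c''_1 */
	//		return (uint32_t)d;	/* output word z = d mod B */
	//	}
	// )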
endprologue movdqu xmm4, [rax] movdqu xmm5, [rbx] mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 propout xmm7, lo, xmm13, xmm14 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 propout xmm6, hi, xmm14, xmm15 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(dmla4) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RAX and RBX point to the // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the // incoming carry registers c0, c1, and c2 representing a carry-in C; // c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum A + C + U V + X Y to // [RDI], and update the carry registers with the carry out. The // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose // registers are preserved. endprologue movdqu xmm4, [rax] movdqu xmm5, [rbx] carryadd mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 propout xmm7, lo, xmm13, xmm14 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 propout xmm6, hi, xmm14, xmm15 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mul4zc) // On entry, RDI points to the destination buffer; RBX points to a // packed operand X; and XMM10/XMM11 hold an expanded operand Y. // // On exit, we write the low 128 bits of the product X Y to [RDI], // and set the carry registers XMM12, XMM13, XMM14 to the carry out. // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. endprologue movdqu xmm5, [rbx] mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mul4) // On entry, RDI points to the destination buffer; RBX points to a // packed operand X; XMM10/XMM11 hold an expanded operand Y; and // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and // c2, representing a carry-in C; c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum C + X Y to [RDI], // and update the carry registers with the carry out. The registers // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. 
endprologue movdqu xmm5, [rbx] mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mla4zc) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RBX points to a packed operand // X; and XMM10/XMM11 points to an expanded operand Y. // // On exit, we write the low 128 bits of the sum A + X Y to [RDI], // and set the carry registers XMM12, XMM13, XMM14 to the carry out. // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. endprologue movdqu xmm5, [rbx] movd xmm12, [rdi + 0] movd xmm13, [rdi + 4] movd xmm14, [rdi + 8] movd xmm15, [rdi + 12] mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mla4) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RBX points to a packed operand // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13, // XMM14 hold the incoming carry registers c0, c1, and c2, // representing a carry-in C; c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum A + C + X Y to // [RDI], and update the carry registers with the carry out. The // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. endprologue movdqu xmm5, [rbx] carryadd mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mmul4) // On entry, RDI points to the destination buffer; RAX and RBX point // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold // the expanded operands V and M. The stack pointer must be 8 modulo // 16 (as usual for AMD64 ABIs). // // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and // XMM15 are clobbered; the general-purpose registers are preserved. movdqu xmm4, [rax] #if ABI_WIN stalloc 48 + 8 // space for the carries #endif endprologue // Calculate W = U V, and leave it in XMM7. Stash the carry pieces // for later. mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 propout xmm7, lo, xmm12, xmm13 jmp 5f ENDFUNC INTFUNC(mmla4) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RAX and RBX point to the // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the // expanded operands V and M. 
	// The stack pointer must be 8 modulo 16 (as usual for AMD64 ABIs).
	//
	// On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
	// the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
	// remaining carry in XMM12, XMM13, and XMM14.  The registers
	// XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
	// are preserved.

	movdqu	xmm4, [rax]
#if ABI_WIN
	stalloc	48 + 8			// space for the carries
#  define STKTMP(i) [SP + i]
#endif
#if ABI_SYSV
#  define STKTMP(i) [SP + i - 48 - 8]	// use red zone
#endif
	endprologue

	movd	xmm12, [rdi +  0]
	movd	xmm13, [rdi +  4]
	movd	xmm14, [rdi +  8]
	movd	xmm15, [rdi + 12]

	// Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
	// for later.
	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	propout	xmm7, lo, xmm12, xmm13

5:	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm6, lo, xmm13, xmm14

	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm7, hi, xmm14, xmm15

	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm6, hi, xmm15, xmm12

	// Prepare W, and stash carries for later.
	punpckldq xmm7, xmm6
	movdqa	STKTMP( 0), xmm12
	movdqa	STKTMP(16), xmm13
	movdqa	STKTMP(32), xmm14

	// Calculate Y = W M.  We just about have enough spare registers to
	// make this work.
	mulcore	xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6

	// Start expanding W back into the main carry registers...
	pxor	xmm15, xmm15
	movdqa	xmm12, xmm7
	movdqa	xmm14, xmm7

	mulcore	xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
	accum	xmm4, xmm5, xmm6

	punpckldq xmm12, xmm15		// (0, w_1; 0, w_0)
	punpckhdq xmm14, xmm15		// (0, w_3; 0, w_2)

	mulcore	xmm7, 2, xmm10, xmm11, xmm0, xmm1
	accum	xmm5, xmm6

	pxor	xmm2, xmm2
	movdqa	xmm13, xmm12
	movdqa	xmm15, xmm14

	mulcore	xmm7, 3, xmm10, xmm11, xmm0
	accum	xmm6

	punpckldq xmm12, xmm2		// (0, 0; 0, w_0)
	punpckldq xmm14, xmm2		// (0, 0; 0, w_2)
	punpckhdq xmm13, xmm2		// (0, 0; 0, w_1)
	punpckhdq xmm15, xmm2		// (0, 0; 0, w_3)

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

	// Expand it.
	movdqu	xmm5, [rbx]
	expand	xmm2, xmm10, xmm11

	// Finish the calculation by adding the Montgomery product.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	punpckldq xmm6, xmm7

	// Add on the carry we calculated earlier.
	paddq	xmm12, STKTMP( 0)
	paddq	xmm13, STKTMP(16)
	paddq	xmm14, STKTMP(32)

	// And, with that, we're done.
	movdqu	[rdi], xmm6
#if ABI_WIN
	stfree	56
#endif
	ret

#undef STKTMP

ENDFUNC
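/// (Illustrative only, not part of the build: the Montgomery factor computed
/// by `mmul4' and `mmla4' above, and by `mont4' below, amounts to the
/// following scalar computation, with ad-hoc names.  Given the low 128 bits
/// w of the partial sum and m = -n^{-1} (mod B^4), the product
/// y = w m mod B^4 satisfies w + n y == 0 (mod B^4), so adding n y clears
/// the bottom four 32-bit words and only the carry out of them survives.
///
///	#include <stdint.h>
///
///	static void mont_factor(uint32_t y[4], const uint32_t w[4],
///				const uint32_t m[4])
///	{
///		uint32_t acc[4] = { 0 };
///
///		for (int j = 0; j < 4; j++) {
///			uint64_t c = 0;
///			for (int i = 0; i < 4 - j; i++) {
///				uint64_t t = (uint64_t)acc[i + j] +
///					(uint64_t)w[j]*m[i] + c;
///				acc[i + j] = (uint32_t)t;  c = t >> 32;
///			}
///		}
///		for (int i = 0; i < 4; i++) y[i] = acc[i];
///	}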
INTFUNC(mont4)
	// On entry, RDI points to the destination buffer holding a packed
	// value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
	// expanded operand M.
	//
	// On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
	// 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
	// in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
	// and XMM15 are clobbered; the general-purpose registers are
	// preserved.
	endprologue

	movdqu	xmm7, [rdi]

	// Calculate Y = W M.  Avoid the standard carry registers, because
	// we're setting something else up there.
	mulcore	xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6

	// Start expanding W back into the main carry registers...
	pxor	xmm15, xmm15
	movdqa	xmm12, xmm7
	movdqa	xmm14, xmm7

	mulcore	xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
	accum	xmm4, xmm5, xmm6

	punpckldq xmm12, xmm15		// (0, w_1; 0, w_0)
	punpckhdq xmm14, xmm15		// (0, w_3; 0, w_2)

	mulcore	xmm7, 2, xmm8, xmm9, xmm0, xmm1
	accum	xmm5, xmm6

	pxor	xmm2, xmm2
	movdqa	xmm13, xmm12
	movdqa	xmm15, xmm14

	mulcore	xmm7, 3, xmm8, xmm9, xmm0
	accum	xmm6

	punpckldq xmm12, xmm2		// (0, 0; 0, w_0)
	punpckldq xmm14, xmm2		// (0, 0; 0, w_2)
	punpckhdq xmm13, xmm2		// (0, 0; 0, w_1)
	punpckhdq xmm15, xmm2		// (0, 0; 0, w_3)

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

	// Expand it.
	movdqu	xmm5, [rbx]
	expand	xmm2, xmm10, xmm11

	// Finish the calculation by adding the Montgomery product.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	punpckldq xmm6, xmm7

	// And, with that, we're done.
	movdqu	[rdi], xmm6
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_amd64_avx)
	.arch	.avx
	vzeroupper
	endprologue
	.arch	pentium4
ENDFUNC

FUNC(mpx_umul4_amd64_sse2)
	// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			     const mpw *bv, const mpw *bvl);

	// Establish the arguments and do initial setup.
	//
	//		   sysv	win
	// inner loop dv   rdi	rdi*
	// inner loop av   rbx*	rbx*
	// outer loop dv   r10	rcx
	// outer loop bv   rcx	r9
	// av base	   rsi	rdx
	// av limit	   rdx	r8
	// bv limit	   r8	r10

#if ABI_SYSV
#  define DV r10
#  define AV rsi
#  define AVL rdx
#  define BV rcx
#  define BVL r8

	pushreg	rbx
	endprologue

	mov	DV, rdi
#endif

#if ABI_WIN
#  define DV rcx
#  define AV rdx
#  define AVL r8
#  define BV r9
#  define BVL r10

	pushreg	rbx
	pushreg	rdi
	stalloc	160 + 8

	savexmm	xmm6,	 0
	savexmm	xmm7,	16
	savexmm	xmm8,	32
	savexmm	xmm9,	48
	savexmm	xmm10,	64
	savexmm	xmm11,	80
	savexmm	xmm12,	96
	savexmm	xmm13, 112
	savexmm	xmm14, 128
	savexmm	xmm15, 144

	endprologue

	mov	rdi, DV
	mov	BVL, [SP + 224]
#endif

	// Prepare for the first iteration.
	pxor	xmm0, xmm0
	movdqu	xmm10, [BV]		// bv[0]
	mov	rbx, AV
	add	DV, 16
	add	BV, 16
	expand	xmm0, xmm10, xmm11
	call	mul4zc
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	BV, BVL			// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm10, [BV]		// bv[i]
	mov	rdi, DV			// -> dv[i]
	pxor	xmm0, xmm0
	expand	xmm0, xmm10, xmm11
	mov	rbx, AV			// -> av[0]
	add	DV, 16
	add	BV, 16
	call	mla4zc
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	BV, BVL
	jb	1b

	// All over.
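	// (For the record, and not part of the build: the loop structure
	// above is the classic operand-scanning multiply, four words at a
	// time.  An ad-hoc scalar model, one word at a time and assuming
	// <stdint.h> and <stddef.h>, is
	//
	//	static void umul_model(uint32_t *dv,
	//			       const uint32_t *av, size_t na,
	//			       const uint32_t *bv, size_t nb)
	//	{
	//		for (size_t k = 0; k < na + nb; k++) dv[k] = 0;
	//		for (size_t i = 0; i < nb; i++) {
	//			uint64_t c = 0;
	//			for (size_t j = 0; j < na; j++) {
	//				uint64_t t = (uint64_t)dv[i + j] +
	//					(uint64_t)av[j]*bv[i] + c;
	//				dv[i + j] = (uint32_t)t;
	//				c = t >> 32;
	//			}
	//			dv[i + na] = (uint32_t)c;
	//		}
	//	}
	//
	// with `mul4zc'/`mul4' covering the first pass, `mla4zc'/`mla4' the
	// later passes, and `carryprop' writing out each pass's carry.)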
9: #if ABI_SYSV popreg rbx #endif #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 160 + 8 popreg rdi popreg rbx #endif ret #undef DV #undef AV #undef AVL #undef BV #undef BVL ENDFUNC FUNC(mpxmont_mul4_amd64_avx) .arch .avx vzeroupper endprologue .arch pentium4 ENDFUNC FUNC(mpxmont_mul4_amd64_sse2) // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // sysv win // inner loop dv rdi rdi* // inner loop av rax rax // inner loop nv rbx* rbx* // mi r9 r10 // outer loop dv r10 rcx // outer loop bv rdx r8 // av base rsi rdx // av limit r11 r11 // bv limit r8 r12* // nv base rcx r9 // n r8 r12* #if ABI_SYSV # define DV r10 # define AV rsi # define AVL r11 # define BV rdx # define BVL r8 # define NV rcx # define N r8 # define MI r9 pushreg rbx endprologue mov DV, rdi #endif #if ABI_WIN # define DV rcx # define AV rdx # define AVL r11 # define BV r8 # define BVL r12 # define NV r9 # define N r12 # define MI r10 pushreg rbx pushreg rdi pushreg r12 stalloc 160 savexmm xmm6, 0 savexmm xmm7, 16 savexmm xmm8, 32 savexmm xmm9, 48 savexmm xmm10, 64 savexmm xmm11, 80 savexmm xmm12, 96 savexmm xmm13, 112 savexmm xmm14, 128 savexmm xmm15, 144 endprologue mov rdi, DV mov N, [SP + 224] mov MI, [SP + 232] #endif // Establish the expanded operands. pxor xmm0, xmm0 movdqu xmm8, [BV] // bv[0] movdqu xmm10, [MI] // mi expand xmm0, xmm8, xmm9, xmm10, xmm11 // Set up the outer loop state and prepare for the first iteration. mov rax, AV // -> U = av[0] mov rbx, NV // -> X = nv[0] lea AVL, [AV + 4*N] // -> av[n/4] = av limit lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit add BV, 16 add DV, 16 call mmul4 add rdi, 16 add rax, 16 add rbx, 16 cmp rax, AVL // done already? jae 8f .p2align 4 // Complete the first inner loop. 0: call dmul4 add rdi, 16 add rax, 16 add rbx, 16 cmp rax, AVL // done yet? jb 0b // Still have carries left to propagate. call carryprop movd [rdi + 16], xmm12 .p2align 4 // Embark on the next iteration. (There must be one. If n = 1, then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) 1: pxor xmm0, xmm0 movdqu xmm8, [BV] // bv[i] movdqu xmm10, [MI] // mi mov rdi, DV // -> Z = dv[i] mov rax, AV // -> U = av[0] mov rbx, NV // -> X = nv[0] expand xmm0, xmm8, xmm9, xmm10, xmm11 add BV, 16 add DV, 16 call mmla4 add rdi, 16 add rax, 16 add rbx, 16 .p2align 4 // Complete the next inner loop. 0: call dmla4 add rdi, 16 add rax, 16 add rbx, 16 cmp rax, AVL jb 0b // Still have carries left to propagate, and they overlap the // previous iteration's final tail, so read that in and add it. movd xmm0, [rdi] paddq xmm12, xmm0 call carryprop movd [rdi + 16], xmm12 // Back again, maybe. cmp BV, BVL jb 1b // All done. 9: #if ABI_SYSV popreg rbx #endif #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 160 popreg r12 popreg rdi popreg rbx #endif ret // First iteration was short. Write out the carries and we're done. // (This could be folded into the main loop structure, but that would // penalize small numbers more.) 
8: call carryprop movd [rdi + 16], xmm12 #if ABI_SYSV popreg rbx ret #endif #if ABI_WIN jmp 9b #endif #undef DV #undef AV #undef AVL #undef BV #undef BVL #undef NV #undef N #undef MI ENDFUNC FUNC(mpxmont_redc4_amd64_avx) .arch .avx vzeroupper endprologue .arch pentium4 ENDFUNC FUNC(mpxmont_redc4_amd64_sse2) // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // sysv win // inner loop dv rdi rdi* // dv limit rax rax // blocks-of-4 dv limit rsi rdx // inner loop nv rbx* rbx* // mi r8 r10 // outer loop dv r10 rcx // outer loop dv limit r11 r11 // nv base rdx r8 // nv limit r9 r10* // n rcx r9 // c rcx r9 #if ABI_SYSV # define DVL rax # define DVL4 rsi # define MI r8 # define DV r10 # define DVLO r11 # define NV rdx # define NVL r9 # define N rcx # define C ecx pushreg rbx endprologue mov DV, rdi #endif #if ABI_WIN # define DVL rax # define DVL4 rdx # define MI r10 # define DV rcx # define DVLO r11 # define NV r8 # define NVL r10 # define N r9 # define C r9d pushreg rbx pushreg rdi stalloc 168 savexmm xmm6, 0 savexmm xmm7, 16 savexmm xmm8, 32 savexmm xmm9, 48 savexmm xmm10, 64 savexmm xmm11, 80 savexmm xmm12, 96 savexmm xmm13, 112 savexmm xmm14, 128 savexmm xmm15, 144 endprologue mov rdi, DV mov MI, [SP + 224] #endif // Establish the expanded operands and the blocks-of-4 dv limit. pxor xmm0, xmm0 mov DVL, DVL4 // -> dv[n] = dv limit sub DVL4, DV // length of dv in bytes movdqu xmm8, [MI] // mi and DVL4, ~15 // mask off the tail end expand xmm0, xmm8, xmm9 add DVL4, DV // find limit // Set up the outer loop state and prepare for the first iteration. mov rbx, NV // -> X = nv[0] lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit add DV, 16 call mont4 add rbx, 16 add rdi, 16 cmp rbx, NVL // done already? jae 8f .p2align 4 // Complete the first inner loop. 5: call mla4 add rbx, 16 add rdi, 16 cmp rbx, NVL // done yet? jb 5b // Still have carries left to propagate. 8: carryadd psllq xmm15, 16 pslldq xmm15, 8 paddq xmm14, xmm15 call carryprop movd C, xmm12 add rdi, 16 cmp rdi, DVL4 jae 7f .p2align 4 // Continue carry propagation until the end of the buffer. 0: add [rdi], C mov C, 0 // preserves flags adc dword ptr [rdi + 4], 0 adc dword ptr [rdi + 8], 0 adc dword ptr [rdi + 12], 0 adc C, 0 add rdi, 16 cmp rdi, DVL4 jb 0b // Deal with the tail end. Note that the actual destination length // won't be an exacty number of blocks of four, so it's safe to just // drop through here. 7: add [rdi], C mov C, 0 add rdi, 4 adc C, 0 cmp rdi, DVL jb 7b // All done for this iteration. Start the next. cmp DV, DVLO // all done yet? jae 9f mov rdi, DV // -> Z = dv[i] mov rbx, NV // -> X = nv[0] add DV, 16 call mont4 add rdi, 16 add rbx, 16 jmp 5b // All over. 9: #if ABI_SYSV popreg rbx #endif #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 168 popreg rdi popreg rbx #endif ret #undef DVL #undef DVL4 #undef MI #undef DV #undef DVLO #undef NV #undef NVL #undef N #undef C ENDFUNC ///-------------------------------------------------------------------------- /// Testing and performance measurement. 
#ifdef TEST_MUL4 #if ABI_SYSV # define ARG0 rdi # define ARG1 rsi # define ARG2 rdx # define ARG3 rcx # define ARG4 r8 # define ARG5 r9 # define ARG6 STKARG(0) # define ARG7 STKARG(1) # define ARG8 STKARG(2) # define STKARG_OFFSET 16 #endif #if ABI_WIN # define ARG0 rcx # define ARG1 rdx # define ARG2 r8 # define ARG3 r9 # define ARG4 STKARG(0) # define ARG5 STKARG(1) # define ARG6 STKARG(2) # define ARG7 STKARG(3) # define ARG8 STKARG(4) # define STKARG_OFFSET 224 #endif #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)] // sysv win // dmul smul mmul mont dmul smul mmul mont // A rax // D rdx // z rdi rdi rdi rdi rdi rcx rcx rcx rcx // c rcx rsi rsi rsi rsi rdx rdx rdx rdx // y r10 -- -- rdx rdx -- -- r8 r8 // u r11 rdx -- rcx -- r8 -- r9 -- // x rbx rcx rdx r8 rcx r9 r8 stk0 r9 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 -- // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2 .macro cysetup v, n rdtsc shl rdx, 32 or rax, rdx mov [\v + 8*\n - 8], rax .endm .macro cystore v, n rdtsc shl rdx, 32 or rax, rdx sub rax, [\v + 8*\n - 8] mov [\v + 8*\n - 8], rax dec \n .endm .macro testprologue mode pushreg rbx #if ABI_SYSV endprologue .ifeqs "\mode", "dmul" mov rbx, rcx movdqu xmm8, [r8] movdqu xmm10, [r9] mov r8d, STKARG(0) mov r9, STKARG(1) mov r11, rdx mov rcx, rsi .endif .ifeqs "\mode", "smul" mov rbx, rdx movdqu xmm10, [rcx] mov rcx, rsi .endif .ifeqs "\mode", "mmul" mov rax, STKARG(0) mov rbx, r8 movdqu xmm8, [r9] movdqu xmm10, [rax] mov r8d, STKARG(1) mov r9, STKARG(2) mov r10, rdx mov r11, rcx mov rcx, rsi .endif .ifeqs "\mode", "mont" mov rbx, rcx movdqu xmm8, [r8] mov r8d, r9d mov r9, STKARG(0) mov r10, rdx mov rcx, rsi .endif #endif #if ABI_WIN pushreg rdi stalloc 168 savexmm xmm6, 0 savexmm xmm7, 16 savexmm xmm8, 32 savexmm xmm9, 48 savexmm xmm10, 64 savexmm xmm11, 80 savexmm xmm12, 96 savexmm xmm13, 112 savexmm xmm14, 128 savexmm xmm15, 144 endprologue .ifeqs "\mode", "dmul" mov r10, STKARG(0) mov r11, STKARG(1) mov rdi, rcx mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] movdqu xmm10, [r11] mov r11, r8 mov r8d, STKARG(2) mov r9, STKARG(3) .endif .ifeqs "\mode", "smul" mov rdi, rcx mov rcx, rdx mov rbx, r8 movdqu xmm10, [r9] mov r8d, STKARG(0) mov r9, STKARG(1) .endif .ifeqs "\mode", "mmul" mov r10, STKARG(1) mov r11, STKARG(2) mov rdi, rcx mov rcx, rdx mov rbx, STKARG(0) movdqu xmm8, [r10] movdqu xmm10, [r11] mov r10, r8 mov r11, r9 mov r8d, STKARG(3) mov r9, STKARG(4) .endif .ifeqs "\mode", "mont" mov r10, STKARG(0) mov rdi, rcx mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] mov r10, r8 mov r8d, STKARG(1) mov r9, STKARG(2) .endif #endif pxor xmm0, xmm0 .ifeqs "\mode", "dmul" expand xmm0, xmm8, xmm9, xmm10, xmm11 .endif .ifeqs "\mode", "smul" expand xmm0, xmm10, xmm11 .endif .ifeqs "\mode", "mmul" expand xmm0, xmm8, xmm9, xmm10, xmm11 .endif .ifeqs "\mode", "mont" expand xmm0, xmm8, xmm9 .endif .endm .macro testepilogue #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 168 popreg rdi #endif popreg rbx ret .endm .macro testldcarry movdqu xmm12, [rcx + 0] // (c''_0; c'_0) movdqu xmm13, [rcx + 16] // (c''_1; c'_1) movdqu xmm14, [rcx + 32] // (c''_2; c'_2) .endm .macro testtop u=nil .p2align 4 0: cysetup r9, r8 .ifnes "\u", "nil" mov rax, \u .endif .endm .macro testtail cystore r9, r8 jnz 0b .endm .macro testcarryout movdqu [rcx + 0], xmm12 movdqu [rcx + 16], xmm13 movdqu 
[rcx + 32], xmm14 .endm FUNC(test_dmul4) testprologue dmul testldcarry testtop r11 call dmul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_dmla4) testprologue dmul testldcarry testtop r11 call dmla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4) testprologue smul testldcarry testtop nil call mul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4zc) testprologue smul testldcarry testtop nil call mul4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4) testprologue smul testldcarry testtop nil call mla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4zc) testprologue smul testldcarry testtop nil call mla4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mmul4) testprologue mmul testtop r11 call mmul4 testtail pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout testepilogue ENDFUNC FUNC(test_mmla4) testprologue mmul testtop r11 call mmla4 testtail pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout testepilogue ENDFUNC FUNC(test_mont4) testprologue mont testtop call mont4 testtail pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout testepilogue ENDFUNC #endif ///----- That's all, folks --------------------------------------------------