--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
+///
+/// Large SIMD-based multiplications
+///
+/// (c) 2016 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Prologue.
+
+ .arch pentium4
+ .text
+
+///--------------------------------------------------------------------------
+/// Theory.
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout. In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a
+/// multiple-precision operand by a single-precision factor, and adds the
+/// product, appropriately shifted, into the result.  A `finely integrated
+/// operand scanning' implementation of Montgomery multiplication also adds
+/// the product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass. The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// In both the plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it. The transformation we perform is
+/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
+/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
+/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
+/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
+/// operands, as follows.
+///
+/// Offset 0 4 8 12
+/// 0 v'_0 v'_1 v''_0 v''_1
+/// 16 v'_2 v'_3 v''_2 v''_3
+///
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
+/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
+/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
+/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
+/// results in 64-bit fields. The sixteen bits of headroom allows us to add
+/// many products together before we must deal with carrying; it also allows
+/// for some calculations to be performed on the above expanded form.
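+///
+/// For example (an illustrative bound only): each 32x16-bit partial
+/// product is less than 2^48, so a 64-bit lane can absorb roughly 2^16
+/// such products before it can overflow -- far more than the handful of
+/// additions performed between carry-propagation steps below.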
+///
+/// ...
+///
+/// We maintain four `carry' registers accumulating intermediate results.
+/// The registers' precise roles rotate during the computation; we name them
+/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
+/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
+/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
+/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
+/// lanes of its vector) and an operand in the expanded form above produces a
+/// result which can be added directly to the appropriate carry register.
+/// Following a pass of four multiplications, we perform some limited carry
+/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
+/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
+/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
+/// zeroed and becomes c3.
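+///
+/// As a rough C model of one such step, treating each half of a carry
+/// register as a 64-bit unsigned integer (the names here are invented for
+/// illustration; the real work is done by the `propout' macro below):
+///
+///	t = c0_hi % B;			/* c''_0 mod B */
+///	d = c0_lo + (t << 16);		/* c'_0 + t b */
+///	z = d % B;			/* output word */
+///	c1_lo += d >> 32;		/* add floor(d/B) to c'_1 */
+///	c1_hi += c0_hi >> 32;		/* add floor(c''_0/B) to c''_1 */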
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
+ // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
+ // of the product in registers D0, D1, D2, D3.
+ pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
+ .ifnes "\d1", "nil"
+ movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1)
+ .endif
+ .ifnes "\d3", "nil"
+ movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3)
+ .endif
+ .ifnes "\d1", "nil"
+ psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
+ .endif
+ .ifnes "\d2", "nil"
+ movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
+ .endif
+ .ifnes "\d3", "nil"
+ psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
+ .endif
+ .ifnes "\d1", "nil"
+ pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
+ .endif
+ .ifnes "\d3", "nil"
+ pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
+ .endif
+ .ifnes "\d2", "nil"
+ pmuludq \d2, \shi // (r_i s'_2, r_i s''_2)
+ .endif
+ pmuludq \d0, \slo // (r_i s'_0, r_i s''_0)
+.endm
+
+.macro accum c0, c1=nil, c2=nil, c3=nil
+ // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+ // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+ // updating that register.
+ paddq \c0, xmm0
+ .ifnes "\c1", "nil"
+ paddq \c1, xmm1
+ .endif
+ .ifnes "\c2", "nil"
+ paddq \c2, xmm2
+ .endif
+ .ifnes "\c3", "nil"
+ paddq \c3, xmm3
+ .endif
+.endm
+
+.macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
+ // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
+ // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
+ // contains zero, but needs clearing; in practice, we store the
+ // product directly rather than attempting to add. On completion,
+ // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
+ .ifeqs "\z3p", "t"
+ mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
+ accum \c0, \c1, \c2
+ .else
+ mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
+ accum \c0, \c1, \c2, \c3
+ .endif
+.endm
+
+.macro propout d, pos, c, cc=nil
+ // Calculate an output word from C, and store it at POS in D;
+ // propagate carries out from C to CC in preparation for a rotation
+ // of the carry registers. D is an XMM register; the POS is either
+ // `lo' or `hi' according to whether the output word should be in
+ // lane 0 or 1 of D; the high two lanes of D are clobbered. On
+ // completion, XMM3 is clobbered. If CC is `nil', then the
+ // contribution which would have been added to it is left in C.
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
+ psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
+ pslldq xmm3, 2 // (t b, 0)
+ paddq \c, xmm3 // (c' + t b, c'')
+ .ifeqs "\pos", "lo"
+ movdqa \d, \c
+ .else
+ punpckldq \d, \c
+ .endif
+ psrlq \c, 32 // floor(c/B)
+ .ifnes "\cc", "nil"
+ paddq \cc, \c // propagate up
+ .endif
+.endm
+
+.macro endprop d, pos, c, t
+ // On entry, C contains a carry register. On exit, the low 32 bits
+ // of the value represented in C are written at POS in D, and the
+ // remaining bits are left at the bottom of T.
+ movdqa \t, \c
+ psllq \t, 16 // (?, c'' b)
+ pslldq \c, 8 // (0, c')
+ paddq \t, \c // (?, c' + c'' b)
+ psrldq \t, 8 // c' + c'' b
+ .ifeqs "\pos", "lo"
+ movdqa \d, \t
+ .else
+ punpckldq \d, \t
+ .endif
+ psrldq \t, 4 // floor((c' + c'' b)/B)
+.endm
+
+.macro expand z, a, b, c=nil, d=nil
+ // On entry, A and C hold packed 128-bit values, and Z is zero. On
+ // exit, A:B and C:D together hold the same values in expanded
+ // form. If C is `nil', then only expand A to A:B.
+ movdqa \b, \a // (a_0, a_1, a_2, a_3)
+ .ifnes "\c", "nil"
+ movdqa \d, \c // (c_0, c_1, c_2, c_3)
+ .endif
+ punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
+ punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
+ .ifnes "\c", "nil"
+ punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
+ punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
+ .endif
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+ .ifnes "\c", "nil"
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+ .endif
+.endm
+
+.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
+ // On entry, C0, C1, C2, C3 are carry registers representing a value
+ // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
+ // C3, T, and U are clobbered; and the high bits of Y are stored in
+ // HI, if this is not `nil'.
+
+ // The first step is to eliminate the `double-prime' pieces -- i.e.,
+	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
+ // them into the 32-bit-aligned pieces above and below. But before
+ // we can do that, we must gather them together.
+ movdqa \t, \c0
+ movdqa \u, \c1
+ punpcklqdq \t, \c2 // (y'_0, y'_2)
+ punpckhqdq \c0, \c2 // (y''_0, y''_2)
+ punpcklqdq \u, \c3 // (y'_1, y'_3)
+ punpckhqdq \c1, \c3 // (y''_1, y''_3)
+
+ // Now split the double-prime pieces. The high (up to) 48 bits will
+ // go up; the low 16 bits go down.
+ movdqa \c2, \c0
+ movdqa \c3, \c1
+ psllq \c2, 48
+ psllq \c3, 48
+ psrlq \c0, 16 // high parts of (y''_0, y''_2)
+ psrlq \c1, 16 // high parts of (y''_1, y''_3)
+ psrlq \c2, 32 // low parts of (y''_0, y''_2)
+ psrlq \c3, 32 // low parts of (y''_1, y''_3)
+ .ifnes "\hi", "nil"
+ movdqa \hi, \c1
+ .endif
+ pslldq \c1, 8 // high part of (0, y''_1)
+
+ paddq \t, \c2 // propagate down
+ paddq \u, \c3
+ paddq \t, \c1 // and up: (y_0, y_2)
+ paddq \u, \c0 // (y_1, y_3)
+ .ifnes "\hi", "nil"
+ psrldq \hi, 8 // high part of (y''_3, 0)
+ .endif
+
+ // Finally extract the answer. This complicated dance is better than
+ // storing to memory and loading, because the piecemeal stores
+ // inhibit store forwarding.
+ movdqa \c3, \t // (y_0, y_1)
+ movdqa \lo, \t // (y^*_0, ?, ?, ?)
+ psrldq \t, 8 // (y_2, 0)
+ psrlq \c3, 32 // (floor(y_0/B), ?)
+ paddq \c3, \u // (y_1 + floor(y_0/B), ?)
+ movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
+ psrldq \u, 8 // (y_3, 0)
+	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2), ?)
+	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2), ?)
+	punpckldq \lo, \c3		// (y^*_0, y^*_2, ?, ?)
+	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
+	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
+ .ifnes "\hi", "nil"
+ movdqa \t, \c3
+ pxor \u, \u
+ .endif
+ punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ .ifnes "\hi", "nil"
+ psrlq \t, 32 // very high bits of y
+ paddq \hi, \t
+ punpcklqdq \hi, \u // carry up
+ .endif
+ punpckldq \lo, \c1 // y mod B^4
+.endm
+
+.macro carryadd
+ // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
+ // hold the incoming carry registers c0, c1, and c2 representing a
+ // carry-in C.
+ //
+ // On exit, the carry registers, including XMM15, are updated to hold
+	// C + A; XMM0, XMM1, and XMM2 are clobbered.  The other
+ // registers are preserved.
+ movd xmm0, [rdi + 0] // (a_0, 0)
+ movd xmm1, [rdi + 4] // (a_1, 0)
+ movd xmm2, [rdi + 8] // (a_2, 0)
+ movd xmm15, [rdi + 12] // (a_3, 0)
+ paddq xmm12, xmm0 // (c'_0 + a_0, c''_0)
+ paddq xmm13, xmm1 // (c'_1 + a_1, c''_1)
+	paddq	xmm14, xmm2		// (c'_2 + a_2, c''_2)
+.endm
+
+///--------------------------------------------------------------------------
+/// Primitive multipliers and related utilities.
+
+INTFUNC(carryprop)
+ // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
+ // expanded form. Store the low 128 bits of the represented carry to
+ // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
+ // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
+ // XMM14 are clobbered.
+ endprologue
+
+ propout xmm0, lo, xmm12, xmm13
+ propout xmm1, lo, xmm13, xmm14
+ propout xmm0, hi, xmm14, nil
+ endprop xmm1, hi, xmm14, xmm12
+ punpckldq xmm0, xmm1
+ movdqu [rdi], xmm0
+
+ ret
+
+ENDFUNC
+
+INTFUNC(dmul4)
+ // On entry, RDI points to the destination buffer; RAX and RBX point
+ // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
+ // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
+ // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum C + U V + X Y to
+ // [RDI], and update the carry registers with the carry out. The
+ // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
+ // registers are preserved.
+ endprologue
+
+ movdqu xmm4, [rax]
+ movdqu xmm5, [rbx]
+
+ mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
+ mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+ mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
+ mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
+ propout xmm7, lo, xmm13, xmm14
+
+ mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
+ mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
+ propout xmm6, hi, xmm14, xmm15
+
+ mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
+ mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+ movdqu [rdi], xmm6
+
+ ret
+
+ENDFUNC
+
+INTFUNC(dmla4)
+ // On entry, RDI points to the destination buffer, which also
+ // contains an addend A to accumulate; RAX and RBX point to the
+ // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
+ // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
+ // incoming carry registers c0, c1, and c2 representing a carry-in C;
+ // c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
+ // [RDI], and update the carry registers with the carry out. The
+ // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
+ // registers are preserved.
+ endprologue
+
+ movdqu xmm4, [rax]
+ movdqu xmm5, [rbx]
+ carryadd
+
+ mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
+ mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+ mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
+ mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
+ propout xmm7, lo, xmm13, xmm14
+
+ mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
+ mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
+ propout xmm6, hi, xmm14, xmm15
+
+ mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
+ mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+ movdqu [rdi], xmm6
+
+ ret
+
+ENDFUNC
+
+INTFUNC(mul4zc)
+ // On entry, RDI points to the destination buffer; RBX points to a
+ // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
+ //
+ // On exit, we write the low 128 bits of the product X Y to [RDI],
+ // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
+ // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
+ // general-purpose registers are preserved.
+ endprologue
+
+ movdqu xmm5, [rbx]
+
+ mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+ mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm7, lo, xmm13, xmm14
+
+ mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm6, hi, xmm14, xmm15
+
+ mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+ movdqu [rdi], xmm6
+
+ ret
+
+ENDFUNC
+
+INTFUNC(mul4)
+ // On entry, RDI points to the destination buffer; RBX points to a
+ // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
+ // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
+ // c2, representing a carry-in C; c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
+ // and update the carry registers with the carry out. The registers
+ // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
+ // general-purpose registers are preserved.
+ endprologue
+
+ movdqu xmm5, [rbx]
+
+ mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
+ propout xmm6, lo, xmm12, xmm13
+
+ mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm7, lo, xmm13, xmm14
+
+ mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm6, hi, xmm14, xmm15
+
+ mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+ movdqu [rdi], xmm6
+
+ ret
+
+ENDFUNC
+
+INTFUNC(mla4zc)
+ // On entry, RDI points to the destination buffer, which also
+ // contains an addend A to accumulate; RBX points to a packed operand
+	// X; and XMM10/XMM11 hold an expanded operand Y.
+ //
+ // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
+ // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
+ // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
+ // general-purpose registers are preserved.
+ endprologue
+
+ movdqu xmm5, [rbx]
+ movd xmm12, [rdi + 0]
+ movd xmm13, [rdi + 4]
+ movd xmm14, [rdi + 8]
+ movd xmm15, [rdi + 12]
+
+ mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+ mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm7, lo, xmm13, xmm14
+
+ mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm6, hi, xmm14, xmm15
+
+ mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+ movdqu [rdi], xmm6
+
+ ret
+
+ENDFUNC
+
+INTFUNC(mla4)
+ // On entry, RDI points to the destination buffer, which also
+ // contains an addend A to accumulate; RBX points to a packed operand
+ // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
+ // XMM14 hold the incoming carry registers c0, c1, and c2,
+ // representing a carry-in C; c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum A + C + X Y to
+ // [RDI], and update the carry registers with the carry out. The
+ // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
+ // general-purpose registers are preserved.
+ endprologue
+
+ movdqu xmm5, [rbx]
+ carryadd
+
+ mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+ mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm7, lo, xmm13, xmm14
+
+ mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm6, hi, xmm14, xmm15
+
+ mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+ movdqu [rdi], xmm6
+
+ ret
+
+ENDFUNC
+
+INTFUNC(mmul4)
+ // On entry, RDI points to the destination buffer; RAX and RBX point
+ // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
+ // the expanded operands V and M. The stack pointer must be 8 modulo 16
+ // (as usual for AMD64 ABIs).
+ //
+	// On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
+ // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
+ // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
+ // XMM15 are clobbered; the general-purpose registers are preserved.
+ movdqu xmm4, [rax]
+#if ABI_WIN
+ stalloc 48 + 8 // space for the carries
+#endif
+ endprologue
+
+ // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
+ // for later.
+ mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
+ propout xmm7, lo, xmm12, xmm13
+ jmp 5f
+
+ENDFUNC
+
+INTFUNC(mmla4)
+ // On entry, RDI points to the destination buffer, which also
+ // contains an addend A to accumulate; RAX and RBX point to the
+ // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
+ // expanded operands V and M. The stack pointer must be 8 modulo 16
+ // (as usual for AMD64 ABIs).
+ //
+	// On exit, we store Y = (A + U V) M mod B^4 in XMM10/XMM11, and write
+ // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
+ // remaining carry in XMM12, XMM13, and XMM14. The registers
+ // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
+ // are preserved.
+ movdqu xmm4, [rax]
+#if ABI_WIN
+ stalloc 48 + 8 // space for the carries
+# define STKTMP(i) [rsp + i]
+#endif
+#if ABI_SYSV
+# define STKTMP(i) [rsp + i - 48 - 8] // use red zone
+#endif
+ endprologue
+
+ movd xmm12, [rdi + 0]
+ movd xmm13, [rdi + 4]
+ movd xmm14, [rdi + 8]
+ movd xmm15, [rdi + 12]
+
+ // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
+ // for later.
+ mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
+ propout xmm7, lo, xmm12, xmm13
+
+5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm6, lo, xmm13, xmm14
+
+ mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm7, hi, xmm14, xmm15
+
+ mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm6, hi, xmm15, xmm12
+
+ // Prepare W, and stash carries for later.
+ punpckldq xmm7, xmm6
+ movdqa STKTMP( 0), xmm12
+ movdqa STKTMP(16), xmm13
+ movdqa STKTMP(32), xmm14
+
+ // Calculate Y = W M. We just about have enough spare registers to
+ // make this work.
+ mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
+
+ // Start expanding W back into the main carry registers...
+ pxor xmm15, xmm15
+ movdqa xmm12, xmm7
+ movdqa xmm14, xmm7
+
+ mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
+ accum xmm4, xmm5, xmm6
+
+ punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
+ punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
+
+ mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
+ accum xmm5, xmm6
+
+ pxor xmm2, xmm2
+ movdqa xmm13, xmm12
+ movdqa xmm15, xmm14
+
+ mulcore xmm7, 3, xmm10, xmm11, xmm0
+ accum xmm6
+
+ punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
+ punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
+ punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
+ punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
+
+ // That's lots of pieces. Now we have to assemble the answer.
+ squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
+
+ // Expand it.
+ movdqu xmm5, [rbx]
+ expand xmm2, xmm10, xmm11
+
+ // Finish the calculation by adding the Montgomery product.
+	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm7, lo, xmm13, xmm14
+
+	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm6, hi, xmm14, xmm15
+
+	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+
+	// Add on the carry we calculated earlier.
+ paddq xmm12, STKTMP( 0)
+ paddq xmm13, STKTMP(16)
+ paddq xmm14, STKTMP(32)
+
+ // And, with that, we're done.
+ movdqu [rdi], xmm6
+#if ABI_WIN
+ stfree 56
+#endif
+ ret
+
+#undef STKTMP
+
+ENDFUNC
+
+INTFUNC(mont4)
+ // On entry, RDI points to the destination buffer holding a packed
+ // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
+ // expanded operand M.
+ //
+	// On exit, we store Y = W M mod B^4 in XMM10/XMM11, and write the low
+ // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
+ // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
+ // and XMM15 are clobbered; the general-purpose registers are
+ // preserved.
+ endprologue
+
+ movdqu xmm7, [rdi]
+
+ // Calculate Y = W M. Avoid the standard carry registers, because
+ // we're setting something else up there.
+ mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
+
+ // Start expanding W back into the main carry registers...
+ pxor xmm15, xmm15
+ movdqa xmm12, xmm7
+ movdqa xmm14, xmm7
+
+ mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
+ accum xmm4, xmm5, xmm6
+
+ punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
+ punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
+
+ mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
+ accum xmm5, xmm6
+
+ pxor xmm2, xmm2
+ movdqa xmm13, xmm12
+ movdqa xmm15, xmm14
+
+ mulcore xmm7, 3, xmm8, xmm9, xmm0
+ accum xmm6
+
+ punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
+ punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
+ punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
+ punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
+
+ // That's lots of pieces. Now we have to assemble the answer.
+ squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
+
+ // Expand it.
+ movdqu xmm5, [rbx]
+ expand xmm2, xmm10, xmm11
+
+ // Finish the calculation by adding the Montgomery product.
+	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+ propout xmm6, lo, xmm12, xmm13
+
+	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+ propout xmm7, lo, xmm13, xmm14
+
+	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+ propout xmm6, hi, xmm14, xmm15
+
+	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+ propout xmm7, hi, xmm15, xmm12
+
+ punpckldq xmm6, xmm7
+
+ // And, with that, we're done.
+ movdqu [rdi], xmm6
+ ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Bulk multipliers.
+
+FUNC(mpx_umul4_amd64_sse2)
+ // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
+ // const mpw *bv, const mpw *bvl);
+
+ // Establish the arguments and do initial setup.
+ //
+ // sysv win
+ // inner loop dv rdi rdi*
+ // inner loop av rbx* rbx*
+ // outer loop dv r10 rcx
+ // outer loop bv rcx r9
+ // av base rsi rdx
+ // av limit rdx r8
+ // bv limit r8 r10
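+	//
+	// In outline, the loop structure is as follows (an informal sketch
+	// only, not code from elsewhere):
+	//
+	//	for each four-word block of bv:
+	//	  expand it;
+	//	  for each four-word block of av:
+	//	    accumulate av-block * bv-block into dv (mul4zc/mul4 on the
+	//	      first pass, mla4zc/mla4 thereafter);
+	//	  write out the leftover carry (carryprop).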
+
+#if ABI_SYSV
+# define DV r10
+# define AV rsi
+# define AVL rdx
+# define BV rcx
+# define BVL r8
+
+ pushreg rbx
+ endprologue
+
+ mov DV, rdi
+
+#endif
+
+#if ABI_WIN
+# define DV rcx
+# define AV rdx
+# define AVL r8
+# define BV r9
+# define BVL r10
+
+ pushreg rbx
+ pushreg rdi
+ stalloc 160 + 8
+
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+ savexmm xmm8, 32
+ savexmm xmm9, 48
+ savexmm xmm10, 64
+ savexmm xmm11, 80
+ savexmm xmm12, 96
+ savexmm xmm13, 112
+ savexmm xmm14, 128
+ savexmm xmm15, 144
+
+ endprologue
+
+ mov rdi, DV
+ mov BVL, [rsp + 224]
+
+#endif
+
+ // Prepare for the first iteration.
+ pxor xmm0, xmm0
+ movdqu xmm10, [BV] // bv[0]
+ mov rbx, AV
+ add DV, 16
+ add BV, 16
+ expand xmm0, xmm10, xmm11
+ call mul4zc
+ add rbx, 16
+ add rdi, 16
+ cmp rbx, AVL // all done?
+ jae 8f
+
+ .p2align 4
+ // Continue with the first iteration.
+0: call mul4
+ add rbx, 16
+ add rdi, 16
+ cmp rbx, AVL // all done?
+ jb 0b
+
+ // Write out the leftover carry. There can be no tail here.
+8: call carryprop
+ cmp BV, BVL // more passes to do?
+ jae 9f
+
+ .p2align 4
+ // Set up for the next pass.
+1: movdqu xmm10, [BV] // bv[i]
+ mov rdi, DV // -> dv[i]
+ pxor xmm0, xmm0
+ expand xmm0, xmm10, xmm11
+ mov rbx, AV // -> av[0]
+ add DV, 16
+ add BV, 16
+ call mla4zc
+ add rbx, 16
+ add rdi, 16
+ cmp rbx, AVL // done yet?
+ jae 8f
+
+ .p2align 4
+ // Continue...
+0: call mla4
+ add rbx, 16
+ add rdi, 16
+ cmp rbx, AVL
+ jb 0b
+
+ // Finish off this pass. There was no tail on the previous pass, and
+ // there can be none on this pass.
+8: call carryprop
+ cmp BV, BVL
+ jb 1b
+
+ // All over.
+9:
+
+#if ABI_SYSV
+ popreg rbx
+#endif
+
+#if ABI_WIN
+
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ rstrxmm xmm8, 32
+ rstrxmm xmm9, 48
+ rstrxmm xmm10, 64
+ rstrxmm xmm11, 80
+ rstrxmm xmm12, 96
+ rstrxmm xmm13, 112
+ rstrxmm xmm14, 128
+ rstrxmm xmm15, 144
+
+ stfree 160 + 8
+ popreg rdi
+ popreg rbx
+
+#endif
+
+ ret
+
+#undef DV
+#undef AV
+#undef AVL
+#undef BV
+#undef BVL
+
+ENDFUNC
+
+FUNC(mpxmont_mul4_amd64_sse2)
+ // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
+ // const mpw *nv, size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // sysv win
+ // inner loop dv rdi rdi*
+ // inner loop av rax rax
+ // inner loop nv rbx* rbx*
+ // mi r9 r10
+ // outer loop dv r10 rcx
+ // outer loop bv rdx r8
+ // av base rsi rdx
+ // av limit r11 r11
+ // bv limit r8 r12*
+ // nv base rcx r9
+ // n r8 r12*
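+	//
+	// In outline, the loop structure is as follows (an informal sketch
+	// only, not code from elsewhere):
+	//
+	//	for each four-word block of bv:
+	//	  compute the block's Montgomery factor y, handling av[0..3]
+	//	    and nv[0..3] as it goes (mmul4 on the first pass, mmla4
+	//	    thereafter);
+	//	  for each remaining four-word block of av and nv:
+	//	    accumulate av-block * bv-block + nv-block * y into dv
+	//	      (dmul4 on the first pass, dmla4 thereafter);
+	//	  write out the leftover carry (carryprop) and the top word.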
+
+#if ABI_SYSV
+# define DV r10
+# define AV rsi
+# define AVL r11
+# define BV rdx
+# define BVL r8
+# define NV rcx
+# define N r8
+# define MI r9
+
+ pushreg rbx
+ endprologue
+
+ mov DV, rdi
+
+#endif
+
+#if ABI_WIN
+# define DV rcx
+# define AV rdx
+# define AVL r11
+# define BV r8
+# define BVL r12
+# define NV r9
+# define N r12
+# define MI r10
+
+ pushreg rbx
+ pushreg rdi
+ pushreg r12
+ stalloc 160
+
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+ savexmm xmm8, 32
+ savexmm xmm9, 48
+ savexmm xmm10, 64
+ savexmm xmm11, 80
+ savexmm xmm12, 96
+ savexmm xmm13, 112
+ savexmm xmm14, 128
+ savexmm xmm15, 144
+
+ endprologue
+
+ mov rdi, DV
+ mov N, [rsp + 224]
+ mov MI, [rsp + 232]
+
+#endif
+
+ // Establish the expanded operands.
+ pxor xmm0, xmm0
+ movdqu xmm8, [BV] // bv[0]
+ movdqu xmm10, [MI] // mi
+ expand xmm0, xmm8, xmm9, xmm10, xmm11
+
+ // Set up the outer loop state and prepare for the first iteration.
+ mov rax, AV // -> U = av[0]
+ mov rbx, NV // -> X = nv[0]
+ lea AVL, [AV + 4*N] // -> av[n/4] = av limit
+ lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
+ add BV, 16
+ add DV, 16
+ call mmul4
+ add rdi, 16
+ add rax, 16
+ add rbx, 16
+ cmp rax, AVL // done already?
+ jae 8f
+
+ .p2align 4
+ // Complete the first inner loop.
+0: call dmul4
+ add rdi, 16
+ add rax, 16
+ add rbx, 16
+ cmp rax, AVL // done yet?
+ jb 0b
+
+ // Still have carries left to propagate.
+ call carryprop
+ movd [rdi + 16], xmm12
+
+ .p2align 4
+ // Embark on the next iteration. (There must be one. If n = 1, then
+ // we would have bailed above, to label 8. Similarly, the subsequent
+ // iterations can fall into the inner loop immediately.)
+1: pxor xmm0, xmm0
+ movdqu xmm8, [BV] // bv[i]
+ movdqu xmm10, [MI] // mi
+ mov rdi, DV // -> Z = dv[i]
+ mov rax, AV // -> U = av[0]
+ mov rbx, NV // -> X = nv[0]
+ expand xmm0, xmm8, xmm9, xmm10, xmm11
+ add BV, 16
+ add DV, 16
+ call mmla4
+ add rdi, 16
+ add rax, 16
+ add rbx, 16
+
+ .p2align 4
+ // Complete the next inner loop.
+0: call dmla4
+ add rdi, 16
+ add rax, 16
+ add rbx, 16
+ cmp rax, AVL
+ jb 0b
+
+ // Still have carries left to propagate, and they overlap the
+ // previous iteration's final tail, so read that in and add it.
+ movd xmm0, [rdi]
+ paddq xmm12, xmm0
+ call carryprop
+ movd [rdi + 16], xmm12
+
+ // Back again, maybe.
+ cmp BV, BVL
+ jb 1b
+
+ // All done.
+9:
+
+#if ABI_SYSV
+ popreg rbx
+#endif
+
+#if ABI_WIN
+
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ rstrxmm xmm8, 32
+ rstrxmm xmm9, 48
+ rstrxmm xmm10, 64
+ rstrxmm xmm11, 80
+ rstrxmm xmm12, 96
+ rstrxmm xmm13, 112
+ rstrxmm xmm14, 128
+ rstrxmm xmm15, 144
+
+ stfree 160
+ popreg r12
+ popreg rdi
+ popreg rbx
+
+#endif
+
+ ret
+
+ // First iteration was short. Write out the carries and we're done.
+ // (This could be folded into the main loop structure, but that would
+ // penalize small numbers more.)
+8: call carryprop
+ movd [rdi + 16], xmm12
+#if ABI_SYSV
+ popreg rbx
+ ret
+#endif
+#if ABI_WIN
+ jmp 9b
+#endif
+
+#undef DV
+#undef AV
+#undef AVL
+#undef BV
+#undef BVL
+#undef NV
+#undef N
+#undef MI
+
+ENDFUNC
+
+FUNC(mpxmont_redc4_amd64_sse2)
+ // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
+ // size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // sysv win
+ // inner loop dv rdi rdi*
+ // dv limit rax rax
+ // blocks-of-4 dv limit rsi rdx
+ // inner loop nv rbx* rbx*
+ // mi r8 r10
+ // outer loop dv r10 rcx
+ // outer loop dv limit r11 r11
+ // nv base rdx r8
+ // nv limit r9 r12*
+ // n rcx r9
+ // c rcx r9
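+	//
+	// In outline, the loop structure is as follows (an informal sketch
+	// only, not code from elsewhere):
+	//
+	//	for each four-word block of the low half of dv:
+	//	  compute its Montgomery factor y, handling nv[0..3] as it
+	//	    goes (mont4);
+	//	  for each remaining four-word block of nv:
+	//	    accumulate nv-block * y into dv (mla4);
+	//	  propagate the resulting carry along the rest of dv.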
+
+#if ABI_SYSV
+
+# define DVL rax
+# define DVL4 rsi
+# define MI r8
+# define DV r10
+# define DVLO r11
+# define NV rdx
+# define NVL r9
+# define N rcx
+# define C ecx
+
+ pushreg rbx
+ endprologue
+
+ mov DV, rdi
+
+#endif
+
+#if ABI_WIN
+
+# define DVL rax
+# define DVL4 rdx
+# define MI r10
+# define DV rcx
+# define DVLO r11
+# define NV r8
+# define NVL r12
+# define N r9
+# define C r9d
+
+ pushreg rbx
+ pushreg rdi
+ pushreg r12
+ stalloc 160
+
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+ savexmm xmm8, 32
+ savexmm xmm9, 48
+ savexmm xmm10, 64
+ savexmm xmm11, 80
+ savexmm xmm12, 96
+ savexmm xmm13, 112
+ savexmm xmm14, 128
+ savexmm xmm15, 144
+
+ endprologue
+
+ mov rdi, DV
+ mov MI, [rsp + 224]
+
+#endif
+
+ // Establish the expanded operands and the blocks-of-4 dv limit.
+ pxor xmm0, xmm0
+ mov DVL, DVL4 // -> dv[n] = dv limit
+ sub DVL4, DV // length of dv in bytes
+ movdqu xmm8, [MI] // mi
+ and DVL4, ~15 // mask off the tail end
+ expand xmm0, xmm8, xmm9
+ add DVL4, DV // find limit
+
+ // Set up the outer loop state and prepare for the first iteration.
+ mov rbx, NV // -> X = nv[0]
+ lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
+ lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
+ add DV, 16
+ call mont4
+ add rbx, 16
+ add rdi, 16
+ cmp rbx, NVL // done already?
+ jae 8f
+
+ .p2align 4
+ // Complete the first inner loop.
+5: call mla4
+ add rbx, 16
+ add rdi, 16
+ cmp rbx, NVL // done yet?
+ jb 5b
+
+ // Still have carries left to propagate.
+8: carryadd
+ psllq xmm15, 16
+ pslldq xmm15, 8
+ paddq xmm14, xmm15
+ call carryprop
+ movd C, xmm12
+ add rdi, 16
+ cmp rdi, DVL4
+ jae 7f
+
+ .p2align 4
+ // Continue carry propagation until the end of the buffer.
+0: add [rdi], C
+ mov C, 0 // preserves flags
+ adcd [rdi + 4], 0
+ adcd [rdi + 8], 0
+ adcd [rdi + 12], 0
+ adc C, 0
+ add rdi, 16
+ cmp rdi, DVL4
+ jb 0b
+
+ // Deal with the tail end.
+7: add [rdi], C
+ mov C, 0 // preserves flags
+	adc	C, 0			// collect the carry before...
+	add	rdi, 4			// ...this add clobbers CF
+ cmp rdi, DVL
+ jb 7b
+
+ // All done for this iteration. Start the next. (This must have at
+ // least one follow-on iteration, or we'd not have started this outer
+ // loop.)
+8: mov rdi, DV // -> Z = dv[i]
+ mov rbx, NV // -> X = nv[0]
+ cmp rdi, DVLO // all done yet?
+ jae 9f
+ add DV, 16
+ call mont4
+ add rdi, 16
+ add rbx, 16
+ jmp 5b
+
+ // All over.
+9:
+
+#if ABI_SYSV
+ popreg rbx
+#endif
+
+#if ABI_WIN
+
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ rstrxmm xmm8, 32
+ rstrxmm xmm9, 48
+ rstrxmm xmm10, 64
+ rstrxmm xmm11, 80
+ rstrxmm xmm12, 96
+ rstrxmm xmm13, 112
+ rstrxmm xmm14, 128
+ rstrxmm xmm15, 144
+
+ stfree 160
+ popreg r12
+ popreg rdi
+ popreg rbx
+
+#endif
+
+ ret
+
+#undef DVL
+#undef DVL4
+#undef MI
+#undef DV
+#undef DVLO
+#undef NV
+#undef NVL
+#undef N
+#undef C
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Testing and performance measurement.
+
+#ifdef TEST_MUL4
+
+#if ABI_SYSV
+# define ARG0 rdi
+# define ARG1 rsi
+# define ARG2 rdx
+# define ARG3 rcx
+# define ARG4 r8
+# define ARG5 r9
+# define ARG6 STKARG(0)
+# define ARG7 STKARG(1)
+# define ARG8 STKARG(2)
+# define STKARG_OFFSET 16
+#endif
+#if ABI_WIN
+# define ARG0 rcx
+# define ARG1 rdx
+# define ARG2 r8
+# define ARG3 r9
+# define ARG4 STKARG(0)
+# define ARG5 STKARG(1)
+# define ARG6 STKARG(2)
+# define ARG7 STKARG(3)
+# define ARG8 STKARG(4)
+# define STKARG_OFFSET 224
+#endif
+#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+
+// sysv win
+// dmul smul mmul mont dmul smul mmul mont
+// A rax
+// D rdx
+// z rdi rdi rdi rdi rdi rcx rcx rcx rcx
+// c rcx rsi rsi rsi rsi rdx rdx rdx rdx
+// y r10 -- -- rdx rdx -- -- r8 r8
+// u r11 rdx -- rcx -- r8 -- r9 --
+// x rbx rcx rdx r8 rcx r9 r8 stk0 r9
+// vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
+// yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
+// n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
+// cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
+
+.macro cysetup v, n
+ rdtsc
+ shl rdx, 32
+ or rax, rdx
+ mov [\v + 8*\n - 8], rax
+.endm
+
+.macro cystore v, n
+ rdtsc
+ shl rdx, 32
+ or rax, rdx
+ sub rax, [\v + 8*\n - 8]
+ mov [\v + 8*\n - 8], rax
+ dec \n
+.endm
+
+.macro testprologue mode
+ pushreg rbx
+#if ABI_SYSV
+ endprologue
+ .ifeqs "\mode", "dmul"
+ mov rbx, rcx
+ movdqu xmm8, [r8]
+ movdqu xmm10, [r9]
+ mov r8d, STKARG(0)
+ mov r9, STKARG(1)
+ mov r11, rdx
+ mov rcx, rsi
+ .endif
+ .ifeqs "\mode", "smul"
+ mov rbx, rdx
+ movdqu xmm10, [rcx]
+ mov rcx, rsi
+ .endif
+ .ifeqs "\mode", "mmul"
+ mov rax, STKARG(0)
+ mov rbx, r8
+ movdqu xmm8, [r9]
+ movdqu xmm10, [rax]
+ mov r8, STKARG(1)
+ mov r9, STKARG(2)
+ mov r10, rdx
+ mov r11, rcx
+ mov rcx, rsi
+ .endif
+ .ifeqs "\mode", "mont"
+ mov rbx, rcx
+ movdqu xmm8, [r8]
+ mov r8, r9
+ mov r9, STKARG(0)
+ mov r10, rdx
+ mov rcx, rsi
+ .endif
+#endif
+#if ABI_WIN
+ pushreg rdi
+ stalloc 168
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+ savexmm xmm8, 32
+ savexmm xmm9, 48
+ savexmm xmm10, 64
+ savexmm xmm11, 80
+ savexmm xmm12, 96
+ savexmm xmm13, 112
+ savexmm xmm14, 128
+ savexmm xmm15, 144
+ endprologue
+ .ifeqs "\mode", "dmul"
+ mov r10, STKARG(0)
+ mov r11, STKARG(1)
+ mov rdi, rcx
+ mov rcx, rdx
+ mov rbx, r9
+ movdqu xmm8, [r10]
+ movdqu xmm10, [r11]
+	  mov	r11, r8			// stash u before loading n over it
+	  mov	r8, STKARG(2)
+	  mov	r9, STKARG(3)
+ .endif
+ .ifeqs "\mode", "smul"
+ mov rdi, rcx
+ mov rcx, rdx
+ mov rbx, r8
+ movdqu xmm10, [r9]
+ mov r8, STKARG(0)
+ mov r9, STKARG(1)
+ .endif
+ .ifeqs "\mode", "mmul"
+ mov r10, STKARG(1)
+ mov r11, STKARG(2)
+ mov rdi, rcx
+ mov rcx, rdx
+ mov rbx, STKARG(0)
+ movdqu xmm8, [r10]
+ movdqu xmm10, [r11]
+	  mov	r10, r8			// stash y and u before loading
+	  mov	r11, r9			//   n and cyv over them
+	  mov	r8, STKARG(3)
+	  mov	r9, STKARG(4)
+ .endif
+ .ifeqs "\mode", "mont"
+ mov r10, STKARG(0)
+ mov rdi, rcx
+ mov rcx, rdx
+ mov rbx, r9
+ movdqu xmm8, [r10]
+	  mov	r10, r8			// stash y before loading n over it
+	  mov	r8, STKARG(1)
+	  mov	r9, STKARG(2)
+ .endif
+#endif
+
+ pxor xmm0, xmm0
+ .ifeqs "\mode", "dmul"
+ expand xmm0, xmm8, xmm9, xmm10, xmm11
+ .endif
+ .ifeqs "\mode", "smul"
+ expand xmm0, xmm10, xmm11
+ .endif
+ .ifeqs "\mode", "mmul"
+ expand xmm0, xmm8, xmm9, xmm10, xmm11
+ .endif
+ .ifeqs "\mode", "mont"
+ expand xmm0, xmm8, xmm9
+ .endif
+.endm
+
+.macro testepilogue
+#if ABI_WIN
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ rstrxmm xmm8, 32
+ rstrxmm xmm9, 48
+ rstrxmm xmm10, 64
+ rstrxmm xmm11, 80
+ rstrxmm xmm12, 96
+ rstrxmm xmm13, 112
+ rstrxmm xmm14, 128
+ rstrxmm xmm15, 144
+ stfree 168
+ popreg rdi
+#endif
+ popreg rbx
+ ret
+.endm
+
+.macro testldcarry
+ movdqu xmm12, [rcx + 0] // (c'_0, c''_0)
+ movdqu xmm13, [rcx + 16] // (c'_1, c''_1)
+ movdqu xmm14, [rcx + 32] // (c'_2, c''_2)
+.endm
+
+.macro testtop u=nil
+ .p2align 4
+0:
+ cysetup r9, r8
+ .ifnes "\u", "nil"
+ mov rax, \u
+ .endif
+.endm
+
+.macro testtail
+ cystore r9, r8
+ jnz 0b
+.endm
+
+.macro testcarryout
+ movdqu [rcx + 0], xmm12
+ movdqu [rcx + 16], xmm13
+ movdqu [rcx + 32], xmm14
+.endm
+
+FUNC(test_dmul4)
+ testprologue dmul
+ testldcarry
+ testtop r11
+ call dmul4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_dmla4)
+ testprologue dmul
+ testldcarry
+ testtop r11
+ call dmla4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mul4)
+ testprologue smul
+ testldcarry
+ testtop nil
+ call mul4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mla4)
+ testprologue smul
+ testldcarry
+ testtop nil
+ call mla4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmul4)
+ testprologue mmul
+ testtop r11
+ call mmul4
+ testtail
+ movdqu [r10 + 0], xmm10
+ movdqu [r10 + 16], xmm11
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmla4)
+ testprologue mmul
+ testtop r11
+ call mmla4
+ testtail
+ movdqu [r10 + 0], xmm10
+ movdqu [r10 + 16], xmm11
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mont4)
+ testprologue mont
+ testtop
+ call mont4
+ testtail
+ movdqu [r10 + 0], xmm10
+ movdqu [r10 + 16], xmm11
+ testcarryout
+ testepilogue
+ENDFUNC
+
+#endif
+
+///----- That's all, folks --------------------------------------------------