/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	pentium4

	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the result.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///   Offset      0       4       8      12
///      0      v'_0    v'_1   v''_0   v''_1
///     16      v'_2    v'_3   v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
/// many products together before we must deal with carrying; it also allows
/// for some calculations to be performed on the above expanded form.
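///
/// As a concrete illustration, here is a small C sketch (informative only;
/// it is not assembled into the library) of the expansion just described.
/// The helper name `expand_ref' and the use of bare uint32_t words are our
/// own; the real code does this with punpcklwd/punpckhwd and pshufd in the
/// `expand' macro below.
///
///	#include <stdint.h>
///
///	/* Expand v[0..3] into the two SSE images described above:
///	 * lo = (v'_0, v'_1; v''_0, v''_1), hi = (v'_2, v'_3; v''_2, v''_3),
///	 * one 16-bit piece per 32-bit lane. */
///	static void expand_ref(uint32_t lo[4], uint32_t hi[4],
///			       const uint32_t v[4])
///	{
///		lo[0] = v[0] & 0xffff; lo[1] = v[1] & 0xffff;
///		lo[2] = v[0] >> 16;    lo[3] = v[1] >> 16;
///		hi[0] = v[2] & 0xffff; hi[1] = v[3] & 0xffff;
///		hi[2] = v[2] >> 16;    hi[3] = v[3] >> 16;
///	}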
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.

///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0; 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3; ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?; r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2; r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0; r_i s''_0)
.endm

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm
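
/// As an informative aside (not assembled), the limited carry propagation
/// performed by `propout' can be modelled in scalar C as follows.  The name
/// `propout_ref' and the flat uint64_t representation of the two halves of a
/// carry register are our own illustrative choices; the sixteen bits of
/// headroom discussed under `Theory' keep the intermediate sums comfortably
/// below 2^64.
///
///	#include <stdint.h>
///
///	/* A carry register holds c = lo + hi*2^16, lo and hi being 64-bit
///	 * accumulators.  Emit the output word z = d mod B and add the
///	 * carry-out into the next register (*nlo; *nhi). */
///	static uint32_t propout_ref(uint64_t lo, uint64_t hi,
///				    uint64_t *nlo, uint64_t *nhi)
///	{
///		uint64_t t = hi & 0xffffffff;	/* t = c'' mod B */
///		uint64_t d = lo + (t << 16);	/* d = c' + t b */
///		*nlo += d >> 32;		/* floor(d/B) */
///		*nhi += hi >> 32;		/* floor(c''/B) */
///		return (uint32_t)d;		/* z = d mod B */
///	}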

.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?; c'' b)
	pslldq	\c, 8			// (0; c')
	paddq	\t, \c			// (?; c' + c'' b)
	psrldq	\t, 8			// (c' + c'' b; 0) = (c; 0)
	movd	\d, \t
	psrldq	\t, 4			// (floor(c/B); 0)
.endm

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1; c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0; a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0; y'_2)
	punpckhqdq \c0, \c2		// (y''_0; y''_2)
	punpcklqdq \u, \c3		// (y'_1; y'_3)
	punpckhqdq \c1, \c3		// (y''_1; y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0; y''_2)
	psrlq	\c1, 16			// high parts of (y''_1; y''_3)
	psrlq	\c2, 32			// low parts of (y''_0; y''_2)
	psrlq	\c3, 32			// low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0; y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0; y_2)
	paddq	\u, \c0			// (y_1; y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3; 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0; ?)
	movdqa	\lo, \t			// (y^*_0, ?; ?, ?)
	psrldq	\t, 8			// (y_2; 0)
	psrlq	\c3, 32			// (floor(y_0/B); ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B); ?)
	movdqa	\c1, \c3		// (y^*_1, ?; ?, ?)
	psrldq	\u, 8			// (y_3; 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm

.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
	// registers are preserved.
	movd	xmm0, [edi + 0]		// (a_0; 0)
	movd	xmm1, [edi + 4]		// (a_1; 0)
	movd	xmm2, [edi + 8]		// (a_2; 0)
	movd	xmm7, [edi + 12]	// (a_3; 0)
	paddq	xmm4, xmm0		// (c'_0 + a_0; c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1; c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2; c''_2 + a_3 b)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
	endprologue

	propout	[edi + 0], xmm4, xmm5
	propout	[edi + 4], xmm5, xmm6
	propout	[edi + 8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4

	ret
ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
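
/// A scalar picture of the arithmetic may help here.  The following C sketch
/// (informative only; the name `mac4x4_ref' and the nine-word accumulator
/// are our own) adds one 4x4-word product into an accumulator, with the
/// ninth word standing in for the carry-out which the real routines keep in
/// the XMM carry registers.  A dmul4 call then amounts to two such
/// accumulations, u v and x y, over the same window, plus the incoming
/// carry; dmla4 additionally starts from the addend found at [EDI].
///
///	#include <stdint.h>
///
///	/* Add the 4x4-word product x*y into the 9-word accumulator acc;
///	 * acc[8] collects the carry out of the low eight words. */
///	static void mac4x4_ref(uint32_t acc[9], const uint32_t x[4],
///			       const uint32_t y[4])
///	{
///		for (int i = 0; i < 4; i++) {
///			uint64_t c = 0;
///			for (int j = 0; j < 4; j++) {
///				uint64_t t = (uint64_t)x[j]*y[i] +
///					acc[i + j] + c;
///				acc[i + j] = (uint32_t)t; c = t >> 32;
///			}
///			for (int k = i + 4; c && k < 9; k++) {
///				uint64_t t = (uint64_t)acc[k] + c;
///				acc[k] = (uint32_t)t; c = t >> 32;
///			}
///		}
///	}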

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulcore	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	jmp	5f
ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

5:	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[esp + 0], xmm4
	movdqa	[esp + 16], xmm5
	movdqa	[esp + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [esp + 0]
	paddq	xmm5, [esp + 16]
	paddq	xmm6, [esp + 32]

	// And, with that, we're done.
	stfree	48 + 12
	ret
ENDFUNC
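
/// The Montgomery primitives here (mmul4 and mmla4 above, mont4 below) work
/// four words at a time, but the underlying step is easiest to see one word
/// at a time.  The following C sketch is informative only; the name
/// `mont_step_ref' and the single-word granularity are ours, and it assumes
/// the usual convention that the stored factor is m = -n^{-1} mod b.
///
///	#include <stdint.h>
///
///	/* With b = 2^32 and m = -n^{-1} mod b, choose y = w m mod b; then
///	 * w + n y is divisible by b, so it can be shifted down one word
///	 * without changing its residue mod n.  Return (w + n y)/b. */
///	static uint64_t mont_step_ref(uint32_t w, uint32_t n, uint32_t m)
///	{
///		uint32_t y = (uint32_t)((uint64_t)w*m);
///		uint64_t s = w + (uint64_t)n*y;	/* low word is now zero */
///		return s >> 32;
///	}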

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	endprologue

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	avl
	//	ebp + 32	bv
	//	ebp + 36	bvl
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	expanded Y (32 bytes)
	//	esp + 32	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 32
	endprologue

	// Prepare for the first iteration.
	mov	esi, [ebp + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [ebp + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	mov	eax, [ebp + 28]		// -> av[m] = av limit
	mov	edx, esp		// -> expanded Y = bv[0]
	movdqa	[esp + 0], xmm0		// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [ebp + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [ebp + 36]
	jb	1b

	// All over.
9:	dropfp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
ENDFUNC
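
/// For reference, the whole of mpx_umul4_x86_sse2 computes an ordinary long
/// multiplication; an informative word-at-a-time C equivalent is sketched
/// below.  The name `umul_ref' is ours, and the sketch ignores the real
/// routine's blocking: the assembler code consumes whole four-word blocks of
/// both operands per step, and its callers are expected to arrange suitably
/// sized and padded buffers.
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	/* dv[0..m+n) := av[0..m) * bv[0..n), schoolbook, 32-bit words. */
///	static void umul_ref(uint32_t *dv, const uint32_t *av, size_t m,
///			     const uint32_t *bv, size_t n)
///	{
///		for (size_t k = 0; k < m + n; k++) dv[k] = 0;
///		for (size_t i = 0; i < n; i++) {
///			uint64_t c = 0;
///			for (size_t j = 0; j < m; j++) {
///				uint64_t t = (uint64_t)av[j]*bv[i] +
///					dv[i + j] + c;
///				dv[i + j] = (uint32_t)t; c = t >> 32;
///			}
///			dv[i + m] = (uint32_t)c;
///		}
///	}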

FUNC(mpxmont_mul4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	bv
	//	ebp + 32	nv
	//	ebp + 36	n (nonzero multiple of 4)
	//	ebp + 40	mi
	//
	// Locals are relative to ESP, which is 16-byte aligned, as follows.
	//
	//	esp +   0	expanded V (32 bytes)
	//	esp +  32	expanded M (32 bytes)
	//	esp +  64	expanded Y (32 bytes)
	//	esp +  96	outer loop dv
	//	esp + 100	outer loop bv
	//	esp + 104	av limit (mostly in ESI)
	//	esp + 108	bv limit
	//	esp + 112	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 112
	endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [ebp + 28]		// -> bv
	mov	edx, [ebp + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[esp + 0], xmm0		// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	movdqa	[esp + 32], xmm2	// mi expanded low
	movdqa	[esp + 48], xmm3	// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [ebp + 36]		// n
	mov	eax, [ebp + 24]		// -> U = av[0]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	mov	[esp + 100], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[esp + 96], edi
	mov	[esp + 104], edx
	mov	[esp + 108], ecx
	lea	ecx, [esp + 0]		// -> expanded V = bv[0]
	lea	esi, [esp + 32]		// -> expanded M = mi
	lea	edx, [esp + 64]		// -> space for Y
	call	mmul4
	mov	esi, [esp + 104]	// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[esp + 96], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [esp + 100]	// -> bv[i - 1]
	mov	edi, [esp + 96]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	mov	[esp + 100], eax
	cmp	eax, [esp + 108]	// done yet?
	jae	9f
	movdqu	xmm0, [eax]		// bv[i]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	lea	esi, [esp + 32]		// -> expanded M = mi
	mov	eax, [ebp + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mmla4
	mov	esi, [esp + 104]	// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[esp + 96], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
ENDFUNC
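
/// Again for reference, the following informative C sketch is a textbook
/// coarsely-integrated operand-scanning Montgomery multiplication over
/// 32-bit words -- the scalar analogue of the four-word-per-pass structure
/// above.  The name `monty_mul_ref', the single-word factor
/// m0 = -nv[0]^{-1} mod 2^32 and the C99 variable-length temporary are our
/// own simplifications: the real routine takes a four-word factor mi, and
/// leaves tail carries and the final adjustment to its caller.
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	/* dv must have n + 1 words; the result is congruent to
///	 * av bv 2^{-32 n} mod nv and may need one final subtraction of nv,
///	 * as usual. */
///	static void monty_mul_ref(uint32_t *dv, const uint32_t *av,
///				  const uint32_t *bv, const uint32_t *nv,
///				  size_t n, uint32_t m0)
///	{
///		uint32_t t[n + 2];
///		size_t i, j;
///		uint64_t s, c;
///
///		for (i = 0; i < n + 2; i++) t[i] = 0;
///		for (i = 0; i < n; i++) {
///			/* t += av bv[i] */
///			for (c = 0, j = 0; j < n; j++) {
///				s = (uint64_t)av[j]*bv[i] + t[j] + c;
///				t[j] = (uint32_t)s; c = s >> 32;
///			}
///			s = (uint64_t)t[n] + c; t[n] = (uint32_t)s;
///			t[n + 1] += (uint32_t)(s >> 32);
///
///			/* pick y so the bottom word of t + nv y vanishes */
///			uint32_t y = (uint32_t)((uint64_t)t[0]*m0);
///			for (c = 0, j = 0; j < n; j++) {
///				s = (uint64_t)nv[j]*y + t[j] + c;
///				t[j] = (uint32_t)s; c = s >> 32;
///			}
///			s = (uint64_t)t[n] + c; t[n] = (uint32_t)s;
///			t[n + 1] += (uint32_t)(s >> 32);
///
///			/* t[0] is now zero: shift everything down a word */
///			for (j = 0; j <= n; j++) t[j] = t[j + 1];
///			t[n + 1] = 0;
///		}
///		for (i = 0; i <= n; i++) dv[i] = t[i];
///	}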

FUNC(mpxmont_redc4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	dvl
	//	ebp + 28	nv
	//	ebp + 32	n (nonzero multiple of 4)
	//	ebp + 36	mi
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	outer loop dv
	//	esp +  4	outer dv limit
	//	esp +  8	blocks-of-4 dv limit
	//	esp + 12	expanded M (32 bytes)
	//	esp + 44	expanded Y (32 bytes)
	//	esp + 76	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 76
	endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	mov	[esp + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [ebp + 32]		// n
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[esp + 0], edi
	mov	[esp + 4], edx
	lea	esi, [esp + 12]		// -> expanded M = mi
	lea	edx, [esp + 44]		// -> space for Y
	call	mont4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [esp + 8]		// -> dv blocks limit
	mov	edx, [ebp + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [esp + 0]		// -> dv[i - 1]
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [esp + 44]		// -> space for Y
	lea	esi, [esp + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [esp + 4]		// all done yet?
	jae	9f
	mov	[esp + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
ENDFUNC
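
/// Similarly, an informative scalar model of the Montgomery reduction
/// performed by mpxmont_redc4_x86_sse2.  The name `monty_redc_ref' and the
/// single-word factor m0 = -nv[0]^{-1} mod 2^32 are ours; the real code uses
/// a four-word mi and works in four-word blocks.
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	/* Reduce the dvl-word value dv (dvl >= 2 n) by R = 2^{32 n} modulo
///	 * nv.  Afterwards dv[n..dvl) is congruent to the original value
///	 * times R^{-1} mod nv; the caller shifts it down and makes the final
///	 * conditional subtraction, and must make dv long enough to absorb
///	 * the carries. */
///	static void monty_redc_ref(uint32_t *dv, size_t dvl,
///				   const uint32_t *nv, size_t n, uint32_t m0)
///	{
///		for (size_t i = 0; i < n; i++) {
///			uint32_t y = (uint32_t)((uint64_t)dv[i]*m0);
///			uint64_t c = 0;
///			for (size_t j = 0; j < n; j++) {
///				uint64_t s = (uint64_t)nv[j]*y +
///					dv[i + j] + c;
///				dv[i + j] = (uint32_t)s; c = s >> 32;
///			}
///			for (size_t k = i + n; c && k < dvl; k++) {
///				uint64_t s = (uint64_t)dv[k] + c;
///				dv[k] = (uint32_t)s; c = s >> 32;
///			}
///		}
///	}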

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm

.macro	testprologue	n
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 3*32 + 4*4
	endprologue
	mov	eax, \n
	mov	[esp + 104], eax
	// vars:
	//	esp +   0 = v expanded
	//	esp +  32 = y expanded
	//	esp +  64 = ? expanded
	//	esp +  96 = cycles
	//	esp + 104 = count
.endm

.macro	testepilogue
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
.endm

.macro	testldcarry	c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx + 0]		// (c'_0; c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1; c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2; c''_2)
.endm

.macro	testexpand	v=nil, y=nil
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 0], xmm0
	movdqa	[esp + 16], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[esp + 32], xmm2
	movdqa	[esp + 48], xmm3
  .endif
.endm

.macro	testtop	u=nil, x=nil, mode=nil
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [esp + 0]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [esp + 32]
  .endif
	cysetup	esp + 96
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [esp + 64]
  .else
	lea	edx, [esp + 32]
  .endif
.endm

.macro	testtail	cyv
	cystore	esp + 96, \cyv, esp + 104
	jnz	0b
.endm

.macro	testcarryout	c
	mov	ecx, \c
	movdqu	[ecx + 0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

FUNC(test_dmul4)
	testprologue [ebp + 44]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmul4
	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue [ebp + 44]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmla4
	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue [ebp + 36]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mul4
	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue [ebp + 36]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mla4
	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue [ebp + 48]
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmul4
	testtail [ebp + 52]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue [ebp + 48]
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmla4
	testtail [ebp + 52]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue [ebp + 40]
	testexpand nil, [ebp + 36]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 32], mont
	call	mont4
	testtail [ebp + 44]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------