X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/ac82eac807a9818e676c316f8afeab5cff2572cd..9599917f4a0aa31e2dafd15a7f0e4993bdedf715:/math/mpx-mul4-x86-sse2.S

diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 9d664b44..b1072ff2 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -40,11 +40,11 @@
 /// construct more general variable-length multipliers.
 ///
 /// The basic trick is the same throughout.  In an operand-scanning
-/// multiplication, the inner multiplication loop multiplies a
-/// multiple-precision operand by a single precision factor, and adds the
-/// result, appropriately shifted, to the result.  A `finely integrated
-/// operand scanning' implementation of Montgomery multiplication also adds
-/// the product of a single-precision `Montgomery factor' and the modulus,
+/// multiplication, the inner multiplication loop multiplies a multiple-
+/// precision operand by a single-precision factor, and adds the product,
+/// appropriately shifted, to the result.  A `finely integrated operand
+/// scanning' implementation of Montgomery multiplication also adds the
+/// product of a single-precision `Montgomery factor' and the modulus,
 /// calculated in the same pass.  The more common `coarsely integrated
 /// operand scanning' alternates main multiplication and Montgomery passes,
 /// which requires additional carry propagation.
@@ -70,23 +70,64 @@
 /// many products together before we must deal with carrying; it also allows
 /// for some calculations to be performed on the above expanded form.
 ///
+/// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate
+/// results.  The registers' precise roles rotate during the computation; we
+/// name them `c0', `c1', `c2', and `c3'.  Each carry register holds two
+/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
+/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
+/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
+/// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand (broadcast
+/// across all lanes of its vector) and an operand in the expanded form above
+/// produces a result which can be added directly to the appropriate carry
+/// register.  Following a pass of four multiplications, we perform some
+/// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b;
+/// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and
+/// cycle the carry registers around, so that c1 becomes c0, and the old
+/// (implicitly) zeroed c0 becomes c3.
+///
 /// On 32-bit x86, we are register starved: the expanded operands are kept in
-/// memory, typically in warm L1 cache.
+/// memory, typically in warm L1 cache.  The packed operands are read from
+/// memory into working registers XMM0--XMM3 and processed immediately.
+/// The following conventional argument names and locations are used
+/// throughout.
+///
+///	Arg	Format	Location	Notes
+///
+///	U	packed	[EAX]
+///	X	packed	[EBX]		In Montgomery multiplication, X = N
+///	V	expanded [ECX]
+///	Y	expanded [EDX]		In Montgomery multiplication, Y = (A + U V) M
+///	M	expanded [ESI]		-N^{-1} (mod B^4)
+///	N			Modulus, for Montgomery multiplication
+///	A	packed	[EDI]		Destination/accumulator
+///	C	carry	XMM4--XMM7
+///
+/// The calculation is some variant of
+///
+///	A' + C' B^4 <- U V + X Y + A + C
+///
+/// The low-level functions fit into a fairly traditional (finely-integrated)
+/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
+/// (indexed by i).
+///
+/// The variants are as follows.
+///
+///	Function	Variant			Use		i	j
 ///
-/// We maintain four `carry' registers accumulating intermediate results.
-/// The registers' precise roles rotate during the computation; we name them
-/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
-/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
-/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
-/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
-/// `pmuluqd' instruction acting on a scalar operand (broadcast across all
-/// lanes of its vector) and an operand in the expanded form above produces a
-/// result which can be added directly to the appropriate carry register.
-/// Following a pass of four multiplications, we perform some limited carry
-/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
-/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
-/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
-/// zeroed becomes c3.
+///	mmul4	A = C = 0		Montgomery	0	0
+///	dmul4	A = 0			Montgomery	0	+
+///	mmla4	C = 0			Montgomery	+	0
+///	dmla4	exactly as shown	Montgomery	+	+
+///	mont4	U = V = C = 0		Montgomery	any	0
+///
+///	mul4zc	U = V = A = C = 0	Plain		0	0
+///	mul4	U = V = A = 0		Plain		0	+
+///	mla4zc	U = V = C = 0		Plain		+	0
+///	mla4	U = V = 0		Plain		+	+
+///
+/// The `mmul4' and `mmla4' functions are also responsible for calculating
+/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
+/// inner loop.

 ///--------------------------------------------------------------------------
 /// Macro definitions.
@@ -316,7 +357,6 @@ INTFUNC(carryprop)
 	propout	[edi + 8], xmm6, nil
 	endprop	[edi + 12], xmm6, xmm4
 	ret
-
 ENDFUNC

 INTFUNC(dmul4)
@@ -348,7 +388,6 @@ INTFUNC(dmul4)
 	propout	[edi + 12], xmm7, xmm4

 	ret
-
 ENDFUNC

 INTFUNC(dmla4)
@@ -384,7 +423,6 @@ INTFUNC(dmla4)
 	propout	[edi + 12], xmm7, xmm4

 	ret
-
 ENDFUNC

 INTFUNC(mul4zc)
@@ -410,7 +448,6 @@ INTFUNC(mul4zc)
 	propout	[edi + 12], xmm7, xmm4

 	ret
-
 ENDFUNC

 INTFUNC(mul4)
@@ -438,7 +475,6 @@ INTFUNC(mul4)
 	propout	[edi + 12], xmm7, xmm4

 	ret
-
 ENDFUNC

 INTFUNC(mla4zc)
@@ -470,7 +506,6 @@ INTFUNC(mla4zc)
 	propout	[edi + 12], xmm7, xmm4

 	ret
-
 ENDFUNC

 INTFUNC(mla4)
@@ -501,7 +536,6 @@ INTFUNC(mla4)
 	propout	[edi + 12], xmm7, xmm4

 	ret
-
 ENDFUNC

 INTFUNC(mmul4)
@@ -523,7 +557,6 @@ INTFUNC(mmul4)
 	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi + 0], xmm4, xmm5
 	jmp	5f
-
 ENDFUNC

 INTFUNC(mmla4)
@@ -561,9 +594,9 @@ INTFUNC(mmla4)
 	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 	propout	[edi + 12], xmm7, xmm4

-	movdqa	[esp + 0], xmm4
-	movdqa	[esp + 16], xmm5
-	movdqa	[esp + 32], xmm6
+	movdqa	[SP + 0], xmm4
+	movdqa	[SP + 16], xmm5
+	movdqa	[SP + 32], xmm6

 	// Calculate Y = W M.
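 	// (At this point W = (A + U V) mod B^4 is the low window just
 	// written out at [edi], and M = -N^{-1} (mod B^4), so the Y
 	// computed below satisfies W + Y N == 0 (mod B^4).)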
 	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7
@@ -606,14 +639,13 @@ INTFUNC(mmla4)
 	propout	[edi + 12], xmm7, xmm4

 	// Add on the carry we calculated earlier.
-	paddq	xmm4, [esp + 0]
-	paddq	xmm5, [esp + 16]
-	paddq	xmm6, [esp + 32]
+	paddq	xmm4, [SP + 0]
+	paddq	xmm5, [SP + 16]
+	paddq	xmm6, [SP + 32]

 	// And, with that, we're done.
 	stfree	48 + 12
 	ret
-
 ENDFUNC

 INTFUNC(mont4)
@@ -670,7 +702,6 @@ INTFUNC(mont4)

 	// And, with that, we're done.
 	ret
-
 ENDFUNC

 ///--------------------------------------------------------------------------
@@ -688,40 +719,40 @@ FUNC(mpx_umul4_x86_sse2)
 	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 	//			   const mpw *bv, const mpw *bvl);

-	// Build a stack frame.  Arguments will be relative to EBP, as
+	// Build a stack frame.  Arguments will be relative to BP, as
 	// follows.
 	//
-	//	ebp + 20	dv
-	//	ebp + 24	av
-	//	ebp + 28	avl
-	//	ebp + 32	bv
-	//	ebp + 36	bvl
+	//	BP + 20	dv
+	//	BP + 24	av
+	//	BP + 28	avl
+	//	BP + 32	bv
+	//	BP + 36	bvl
 	//
-	// Locals are relative to ESP, as follows.
+	// Locals are relative to SP, as follows.
 	//
-	//	esp +  0	expanded Y (32 bytes)
-	//	esp + 32	(top of locals)
-	pushreg	ebp
+	//	SP +  0	expanded Y (32 bytes)
+	//	SP + 32	(top of locals)
+	pushreg	BP
 	pushreg	ebx
 	pushreg	esi
 	pushreg	edi
 	setfp
-	and	esp, ~15
-	sub	esp, 32
+	stalloc	32
+	and	SP, ~15
 	endprologue

 	// Prepare for the first iteration.
-	mov	esi, [ebp + 32]		// -> bv[0]
+	mov	esi, [BP + 32]		// -> bv[0]
 	pxor	xmm7, xmm7
 	movdqu	xmm0, [esi]		// bv[0]
-	mov	edi, [ebp + 20]		// -> dv[0]
+	mov	edi, [BP + 20]		// -> dv[0]
 	mov	ecx, edi		// outer loop dv cursor
 	expand	xmm7, xmm0, xmm1
-	mov	ebx, [ebp + 24]		// -> av[0]
-	mov	eax, [ebp + 28]		// -> av[m] = av limit
-	mov	edx, esp		// -> expanded Y = bv[0]
-	movdqa	[esp + 0], xmm0		// bv[0] expanded low
-	movdqa	[esp + 16], xmm1	// bv[0] expanded high
+	mov	ebx, [BP + 24]		// -> av[0]
+	mov	eax, [BP + 28]		// -> av[m] = av limit
+	mov	edx, SP			// -> expanded Y = bv[0]
+	movdqa	[SP + 0], xmm0		// bv[0] expanded low
+	movdqa	[SP + 16], xmm1		// bv[0] expanded high
 	call	mul4zc
 	add	ebx, 16
 	add	edi, 16
@@ -740,7 +771,7 @@ FUNC(mpx_umul4_x86_sse2)

 	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
-	cmp	esi, [ebp + 36]		// more passes to do?
+	cmp	esi, [BP + 36]		// more passes to do?
 	jae	9f

 	.p2align 4

 	// Embark on the next iteration.
1:	movdqu	xmm0, [esi]		// bv[i]
 	mov	edi, ecx		// -> dv[i]
 	pxor	xmm7, xmm7
 	expand	xmm7, xmm0, xmm1
-	mov	ebx, [ebp + 24]		// -> av[0]
-	movdqa	[esp + 0], xmm0		// bv[i] expanded low
-	movdqa	[esp + 16], xmm1	// bv[i] expanded high
+	mov	ebx, [BP + 24]		// -> av[0]
+	movdqa	[SP + 0], xmm0		// bv[i] expanded low
+	movdqa	[SP + 16], xmm1		// bv[i] expanded high
 	call	mla4zc
 	add	edi, 16
 	add	ebx, 16
@@ -771,7 +802,7 @@ FUNC(mpx_umul4_x86_sse2)

 	// Finish off this pass.  There was no tail on the previous pass, and
 	// there can be none on this pass.
8:	call	carryprop
-	cmp	esi, [ebp + 36]
+	cmp	esi, [BP + 36]
 	jb	1b

 	// All over.
9:	pop	edi
 	pop	esi
 	pop	ebx
-	pop	ebp
+	pop	BP
 	ret
-
 ENDFUNC

 FUNC(mpxmont_mul4_x86_avx)
@@ -796,69 +826,69 @@ FUNC(mpxmont_mul4_x86_sse2)
 	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 	//				const mpw *nv, size_t n, const mpw *mi);

-	// Build a stack frame.  Arguments will be relative to EBP, as
+	// Build a stack frame.  Arguments will be relative to BP, as
 	// follows.
 	//
-	//	ebp + 20	dv
-	//	ebp + 24	av
-	//	ebp + 28	bv
-	//	ebp + 32	nv
-	//	ebp + 36	n (nonzero multiple of 4)
-	//	ebp + 40	mi
+	//	BP + 20	dv
+	//	BP + 24	av
+	//	BP + 28	bv
+	//	BP + 32	nv
+	//	BP + 36	n (nonzero multiple of 4)
+	//	BP + 40	mi
 	//
-	// Locals are relative to ESP, which 16-byte aligned, as follows.
+	// Locals are relative to SP, which is 16-byte aligned, as follows.
 	//
-	//	esp +   0	expanded V (32 bytes)
-	//	esp +  32	expanded M (32 bytes)
-	//	esp +  64	expanded Y (32 bytes)
-	//	esp +  96	outer loop dv
-	//	esp + 100	outer loop bv
-	//	esp + 104	av limit (mostly in ESI)
-	//	esp + 108	bv limit
-	//	esp + 112	(top of locals)
-	pushreg	ebp
+	//	SP +   0	expanded V (32 bytes)
+	//	SP +  32	expanded M (32 bytes)
+	//	SP +  64	expanded Y (32 bytes)
+	//	SP +  96	outer loop dv
+	//	SP + 100	outer loop bv
+	//	SP + 104	av limit (mostly in ESI)
+	//	SP + 108	bv limit
+	//	SP + 112	(top of locals)
+	pushreg	BP
 	pushreg	ebx
 	pushreg	esi
 	pushreg	edi
 	setfp
-	and	esp, ~15
-	sub	esp, 112
+	stalloc	112
+	and	SP, ~15
 	endprologue

 	// Establish the expanded operands.
 	pxor	xmm7, xmm7
-	mov	ecx, [ebp + 28]		// -> bv
-	mov	edx, [ebp + 40]		// -> mi
+	mov	ecx, [BP + 28]		// -> bv
+	mov	edx, [BP + 40]		// -> mi
 	movdqu	xmm0, [ecx]		// bv[0]
 	movdqu	xmm2, [edx]		// mi
 	expand	xmm7, xmm0, xmm1, xmm2, xmm3
-	movdqa	[esp + 0], xmm0		// bv[0] expanded low
-	movdqa	[esp + 16], xmm1	// bv[0] expanded high
-	movdqa	[esp + 32], xmm2	// mi expanded low
-	movdqa	[esp + 48], xmm3	// mi expanded high
+	movdqa	[SP + 0], xmm0		// bv[0] expanded low
+	movdqa	[SP + 16], xmm1		// bv[0] expanded high
+	movdqa	[SP + 32], xmm2		// mi expanded low
+	movdqa	[SP + 48], xmm3		// mi expanded high

 	// Set up the outer loop state and prepare for the first iteration.
-	mov	edx, [ebp + 36]		// n
-	mov	eax, [ebp + 24]		// -> U = av[0]
-	mov	ebx, [ebp + 32]		// -> X = nv[0]
-	mov	edi, [ebp + 20]		// -> Z = dv[0]
-	mov	[esp + 100], ecx
+	mov	edx, [BP + 36]		// n
+	mov	eax, [BP + 24]		// -> U = av[0]
+	mov	ebx, [BP + 32]		// -> X = nv[0]
+	mov	edi, [BP + 20]		// -> Z = dv[0]
+	mov	[SP + 100], ecx
 	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
 	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
-	mov	[esp + 96], edi
-	mov	[esp + 104], edx
-	mov	[esp + 108], ecx
-	lea	ecx, [esp + 0]		// -> expanded V = bv[0]
-	lea	esi, [esp + 32]		// -> expanded M = mi
-	lea	edx, [esp + 64]		// -> space for Y
+	mov	[SP + 96], edi
+	mov	[SP + 104], edx
+	mov	[SP + 108], ecx
+	lea	ecx, [SP + 0]		// -> expanded V = bv[0]
+	lea	esi, [SP + 32]		// -> expanded M = mi
+	lea	edx, [SP + 64]		// -> space for Y
 	call	mmul4
-	mov	esi, [esp + 104]	// recover av limit
+	mov	esi, [SP + 104]		// recover av limit
 	add	edi, 16
 	add	eax, 16
 	add	ebx, 16
 	cmp	eax, esi		// done already?
 	jae	8f
-	mov	[esp + 96], edi
+	mov	[SP + 96], edi

 	.p2align 4

 	// Complete the first inner loop.
@@ -877,26 +907,26 @@ FUNC(mpxmont_mul4_x86_sse2)
 	// Embark on the next iteration.  (There must be one.  If n = 1, then
 	// we would have bailed above, to label 8.  Similarly, the subsequent
 	// iterations can fall into the inner loop immediately.)
-1:	mov	eax, [esp + 100]	// -> bv[i - 1]
-	mov	edi, [esp + 96]		// -> Z = dv[i]
+1:	mov	eax, [SP + 100]		// -> bv[i - 1]
+	mov	edi, [SP + 96]		// -> Z = dv[i]
 	add	eax, 16			// -> bv[i]
 	pxor	xmm7, xmm7
-	mov	[esp + 100], eax
-	cmp	eax, [esp + 108]	// done yet?
+	mov	[SP + 100], eax
+	cmp	eax, [SP + 108]		// done yet?
 	jae	9f
 	movdqu	xmm0, [eax]		// bv[i]
-	mov	ebx, [ebp + 32]		// -> X = nv[0]
-	lea	esi, [esp + 32]		// -> expanded M = mi
-	mov	eax, [ebp + 24]		// -> U = av[0]
+	mov	ebx, [BP + 32]		// -> X = nv[0]
+	lea	esi, [SP + 32]		// -> expanded M = mi
+	mov	eax, [BP + 24]		// -> U = av[0]
 	expand	xmm7, xmm0, xmm1
-	movdqa	[esp + 0], xmm0		// bv[i] expanded low
-	movdqa	[esp + 16], xmm1	// bv[i] expanded high
+	movdqa	[SP + 0], xmm0		// bv[i] expanded low
+	movdqa	[SP + 16], xmm1		// bv[i] expanded high
 	call	mmla4
-	mov	esi, [esp + 104]	// recover av limit
+	mov	esi, [SP + 104]		// recover av limit
 	add	edi, 16
 	add	eax, 16
 	add	ebx, 16
-	mov	[esp + 96], edi
+	mov	[SP + 96], edi

 	.p2align 4

 	// Complete the next inner loop.
@@ -928,9 +958,8 @@ FUNC(mpxmont_mul4_x86_sse2)
 	popreg	edi
 	popreg	esi
 	popreg	ebx
-	popreg	ebp
+	popreg	BP
 	ret
-
 ENDFUNC

 FUNC(mpxmont_redc4_x86_avx)
@@ -945,55 +974,55 @@ FUNC(mpxmont_redc4_x86_sse2)
 	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 	//				size_t n, const mpw *mi);

-	// Build a stack frame.  Arguments will be relative to EBP, as
+	// Build a stack frame.  Arguments will be relative to BP, as
 	// follows.
 	//
-	//	ebp + 20	dv
-	//	ebp + 24	dvl
-	//	ebp + 28	nv
-	//	ebp + 32	n (nonzero multiple of 4)
-	//	ebp + 36	mi
+	//	BP + 20	dv
+	//	BP + 24	dvl
+	//	BP + 28	nv
+	//	BP + 32	n (nonzero multiple of 4)
+	//	BP + 36	mi
 	//
-	// Locals are relative to ESP, as follows.
+	// Locals are relative to SP, as follows.
 	//
-	//	esp +  0	outer loop dv
-	//	esp +  4	outer dv limit
-	//	esp +  8	blocks-of-4 dv limit
-	//	esp + 12	expanded M (32 bytes)
-	//	esp + 44	expanded Y (32 bytes)
-	//	esp + 76	(top of locals)
-	pushreg	ebp
+	//	SP +  0	outer loop dv
+	//	SP +  4	outer dv limit
+	//	SP +  8	blocks-of-4 dv limit
+	//	SP + 12	expanded M (32 bytes)
+	//	SP + 44	expanded Y (32 bytes)
+	//	SP + 76	(top of locals)
+	pushreg	BP
 	pushreg	ebx
 	pushreg	esi
 	pushreg	edi
 	setfp
-	and	esp, ~15
-	sub	esp, 76
+	and	SP, ~15
+	stalloc	76
 	endprologue

 	// Establish the expanded operands and the blocks-of-4 dv limit.
-	mov	edi, [ebp + 20]		// -> Z = dv[0]
+	mov	edi, [BP + 20]		// -> Z = dv[0]
 	pxor	xmm7, xmm7
-	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
+	mov	eax, [BP + 24]		// -> dv[n] = dv limit
 	sub	eax, edi		// length of dv in bytes
-	mov	edx, [ebp + 36]		// -> mi
+	mov	edx, [BP + 36]		// -> mi
 	movdqu	xmm0, [edx]		// mi
 	and	eax, ~15		// mask off the tail end
 	expand	xmm7, xmm0, xmm1
 	add	eax, edi		// find limit
-	movdqa	[esp + 12], xmm0	// mi expanded low
-	movdqa	[esp + 28], xmm1	// mi expanded high
-	mov	[esp + 8], eax
+	movdqa	[SP + 12], xmm0		// mi expanded low
+	movdqa	[SP + 28], xmm1		// mi expanded high
+	mov	[SP + 8], eax

 	// Set up the outer loop state and prepare for the first iteration.
-	mov	ecx, [ebp + 32]		// n
-	mov	ebx, [ebp + 28]		// -> X = nv[0]
+	mov	ecx, [BP + 32]		// n
+	mov	ebx, [BP + 28]		// -> X = nv[0]
 	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
 	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
-	mov	[esp + 0], edi
-	mov	[esp + 4], edx
-	lea	esi, [esp + 12]		// -> expanded M = mi
-	lea	edx, [esp + 44]		// -> space for Y
+	mov	[SP + 0], edi
+	mov	[SP + 4], edx
+	lea	esi, [SP + 12]		// -> expanded M = mi
+	lea	edx, [SP + 44]		// -> space for Y
 	call	mont4
 	add	ebx, 16
 	add	edi, 16
@@ -1010,8 +1039,8 @@ FUNC(mpxmont_redc4_x86_sse2)

 	// Still have carries left to propagate.
8:	carryadd
-	mov	esi, [esp + 8]		// -> dv blocks limit
-	mov	edx, [ebp + 24]		// dv limit
+	mov	esi, [SP + 8]		// -> dv blocks limit
+	mov	edx, [BP + 24]		// dv limit
 	psllq	xmm7, 16
 	pslldq	xmm7, 8
 	paddq	xmm6, xmm7
@@ -1044,14 +1073,14 @@ FUNC(mpxmont_redc4_x86_sse2)

 	// All done for this iteration.  Start the next.  (This must have at
 	// least one follow-on iteration, or we'd not have started this outer
 	// loop.)
-8:	mov	edi, [esp + 0]		// -> dv[i - 1]
-	mov	ebx, [ebp + 28]		// -> X = nv[0]
-	lea	edx, [esp + 44]		// -> space for Y
-	lea	esi, [esp + 12]		// -> expanded M = mi
+8:	mov	edi, [SP + 0]		// -> dv[i - 1]
+	mov	ebx, [BP + 28]		// -> X = nv[0]
+	lea	edx, [SP + 44]		// -> space for Y
+	lea	esi, [SP + 12]		// -> expanded M = mi
 	add	edi, 16			// -> Z = dv[i]
-	cmp	edi, [esp + 4]		// all done yet?
+	cmp	edi, [SP + 4]		// all done yet?
 	jae	9f
-	mov	[esp + 0], edi
+	mov	[SP + 0], edi
 	call	mont4
 	add	edi, 16
 	add	ebx, 16
@@ -1062,9 +1091,8 @@ FUNC(mpxmont_redc4_x86_sse2)
 	popreg	edi
 	popreg	esi
 	popreg	ebx
-	popreg	ebp
+	popreg	BP
 	ret
-
 ENDFUNC

 ///--------------------------------------------------------------------------
@@ -1091,22 +1119,22 @@ ENDFUNC
 .endm

 .macro	testprologue n
-	pushreg	ebp
+	pushreg	BP
 	pushreg	ebx
 	pushreg	esi
 	pushreg	edi
 	setfp
-	and	esp, ~15
-	sub	esp, 3*32 + 4*4
+	stalloc	3*32 + 4*4
+	and	SP, ~15
 	endprologue
 	mov	eax, \n
-	mov	[esp + 104], eax
+	mov	[SP + 104], eax
 	// vars:
-	//	esp +   0 = v expanded
-	//	esp +  32 = y expanded
-	//	esp +  64 = ? expanded
-	//	esp +  96 = cycles
-	//	esp + 104 = count
+	//	SP +   0 = v expanded
+	//	SP +  32 = y expanded
+	//	SP +  64 = ? expanded
+	//	SP +  96 = cycles
+	//	SP + 104 = count
 .endm

 .macro	testepilogue
@@ -1114,7 +1142,7 @@ ENDFUNC
 	popreg	edi
 	popreg	esi
 	popreg	ebx
-	popreg	ebp
+	popreg	BP
 	ret
 .endm

@@ -1131,15 +1159,15 @@ ENDFUNC
 	mov	ecx, \v
 	movdqu	xmm0, [ecx]
 	expand	xmm7, xmm0, xmm1
-	movdqa	[esp + 0], xmm0
-	movdqa	[esp + 16], xmm1
+	movdqa	[SP + 0], xmm0
+	movdqa	[SP + 16], xmm1
 .endif
 .ifnes "\y", "nil"
 	mov	edx, \y
 	movdqu	xmm2, [edx]
 	expand	xmm7, xmm2, xmm3
-	movdqa	[esp + 32], xmm2
-	movdqa	[esp + 48], xmm3
+	movdqa	[SP + 32], xmm2
+	movdqa	[SP + 48], xmm3
 .endif
 .endm

@@ -1147,25 +1175,25 @@ ENDFUNC
 	.p2align 4
0:
 .ifnes "\u", "nil"
-	lea	ecx, [esp + 0]
+	lea	ecx, [SP + 0]
 .endif
 	mov	ebx, \x
 .ifeqs "\mode", "mont"
-	lea	esi, [esp + 32]
+	lea	esi, [SP + 32]
 .endif
-	cysetup	esp + 96
+	cysetup	SP + 96
 .ifnes "\u", "nil"
 	mov	eax, \u
 .endif
 .ifeqs "\mode", "mont"
-	lea	edx, [esp + 64]
+	lea	edx, [SP + 64]
 .else
-	lea	edx, [esp + 32]
+	lea	edx, [SP + 32]
 .endif
 .endm

 .macro	testtail cyv
-	cystore	esp + 96, \cyv, esp + 104
+	cystore	SP + 96, \cyv, SP + 104
 	jnz	0b
 .endm

@@ -1177,98 +1205,122 @@ ENDFUNC
 .endm

 FUNC(test_dmul4)
-	testprologue [ebp + 44]
-	testldcarry [ebp + 24]
-	testexpand [ebp + 36], [ebp + 40]
-	mov	edi, [ebp + 20]
-	testtop	[ebp + 28], [ebp + 32]
+	testprologue [BP + 44]
+	testldcarry [BP + 24]
+	testexpand [BP + 36], [BP + 40]
+	mov	edi, [BP + 20]
+	testtop	[BP + 28], [BP + 32]
 	call	dmul4
-	testtail [ebp + 48]
-	testcarryout [ebp + 24]
+	testtail [BP + 48]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC

 FUNC(test_dmla4)
-	testprologue [ebp + 44]
-	testldcarry [ebp + 24]
-	testexpand [ebp + 36], [ebp + 40]
-	mov	edi, [ebp + 20]
-	testtop	[ebp + 28], [ebp + 32]
+	testprologue [BP + 44]
+	testldcarry [BP + 24]
+	testexpand [BP + 36], [BP + 40]
+	mov	edi, [BP + 20]
+	testtop	[BP + 28], [BP + 32]
 	call	dmla4
-	testtail [ebp + 48]
-	testcarryout [ebp + 24]
+	testtail [BP + 48]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC

 FUNC(test_mul4)
-	testprologue [ebp + 36]
-	testldcarry [ebp + 24]
-	testexpand nil, [ebp + 32]
-	mov	edi, [ebp + 20]
-	testtop	nil, [ebp + 28]
+	testprologue [BP + 36]
+	testldcarry [BP + 24]
+	testexpand nil, [BP + 32]
+	mov	edi, [BP + 20]
+	testtop	nil, [BP + 28]
 	call	mul4
-	testtail [ebp + 40]
-	testcarryout [ebp + 24]
+	testtail [BP + 40]
+	testcarryout [BP + 24]
+	testepilogue
+ENDFUNC
+
+FUNC(test_mul4zc)
+	testprologue [BP + 36]
+	testldcarry [BP + 24]
+	testexpand nil, [BP + 32]
+	mov	edi, [BP + 20]
+	testtop	nil, [BP + 28]
+	call	mul4zc
+	testtail [BP + 40]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC

 FUNC(test_mla4)
-	testprologue [ebp + 36]
-	testldcarry [ebp + 24]
-	testexpand nil, [ebp + 32]
-	mov	edi, [ebp + 20]
-	testtop	nil, [ebp + 28]
+	testprologue [BP + 36]
+	testldcarry [BP + 24]
+	testexpand nil, [BP + 32]
+	mov	edi, [BP + 20]
+	testtop	nil, [BP + 28]
 	call	mla4
-	testtail [ebp + 40]
-	testcarryout [ebp + 24]
+	testtail [BP + 40]
+	testcarryout [BP + 24]
+	testepilogue
+ENDFUNC
+
+FUNC(test_mla4zc)
+	testprologue [BP + 36]
+	testldcarry [BP + 24]
+	testexpand nil, [BP + 32]
+	mov	edi, [BP + 20]
+	testtop	nil, [BP + 28]
+	call	mla4zc
+	testtail [BP + 40]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC

 FUNC(test_mmul4)
-	testprologue [ebp + 48]
-	testexpand [ebp + 40], [ebp + 44]
-	mov	edi, [ebp + 20]
-	testtop	[ebp + 32], [ebp + 36], mont
+	testprologue [BP + 48]
+	testexpand [BP + 40], [BP + 44]
+	mov	edi, [BP + 20]
+	testtop	[BP + 32], [BP + 36], mont
 	call	mmul4
-	testtail [ebp + 52]
-	mov	edi, [ebp + 28]
-	movdqa	xmm0, [esp + 64]
-	movdqa	xmm1, [esp + 80]
+	testtail [BP + 52]
+	mov	edi, [BP + 28]
+	movdqa	xmm0, [SP + 64]
+	movdqa	xmm1, [SP + 80]
 	movdqu	[edi], xmm0
 	movdqu	[edi + 16], xmm1
-	testcarryout [ebp + 24]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC

 FUNC(test_mmla4)
-	testprologue [ebp + 48]
-	testexpand [ebp + 40], [ebp + 44]
-	mov	edi, [ebp + 20]
-	testtop	[ebp + 32], [ebp + 36], mont
+	testprologue [BP + 48]
+	testexpand [BP + 40], [BP + 44]
+	mov	edi, [BP + 20]
+	testtop	[BP + 32], [BP + 36], mont
 	call	mmla4
-	testtail [ebp + 52]
-	mov	edi, [ebp + 28]
-	movdqa	xmm0, [esp + 64]
-	movdqa	xmm1, [esp + 80]
+	testtail [BP + 52]
+	mov	edi, [BP + 28]
+	movdqa	xmm0, [SP + 64]
+	movdqa	xmm1, [SP + 80]
 	movdqu	[edi], xmm0
 	movdqu	[edi + 16], xmm1
-	testcarryout [ebp + 24]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC

 FUNC(test_mont4)
-	testprologue [ebp + 40]
-	testexpand nil, [ebp + 36]
-	mov	edi, [ebp + 20]
-	testtop	nil, [ebp + 32], mont
+	testprologue [BP + 40]
+	testexpand nil, [BP + 36]
+	mov	edi, [BP + 20]
+	testtop	nil, [BP + 32], mont
 	call	mont4
-	testtail [ebp + 44]
-	mov	edi, [ebp + 28]
-	movdqa	xmm0, [esp + 64]
-	movdqa	xmm1, [esp + 80]
+	testtail [BP + 44]
+	mov	edi, [BP + 28]
+	movdqa	xmm0, [SP + 64]
+	movdqa	xmm1, [SP + 80]
 	movdqu	[edi], xmm0
 	movdqu	[edi + 16], xmm1
-	testcarryout [ebp + 24]
+	testcarryout [BP + 24]
 	testepilogue
 ENDFUNC
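The carry representation and the limited propagation step described in the
file's header comments can be modelled in a few lines of plain C.  The sketch
below is illustrative only: it is not part of the patch, and the names
`carry', `mulacc_model' and `propout_model' are invented for the purpose,
loosely mirroring the `mulacc' and `propout' macros.  Here b = 2^16 and
B = 2^32; each carry is a pair (c', c'') with value c' + c'' b; one
`pmuludq'-style step accumulates the partial products u x' and u x'' into the
two halves; and propagation computes t = c''_0 mod B and d = c'_0 + t b,
emits z = d mod B, and carries (floor(d/B), floor(c''_0/B)) into c1.

	#include <stdint.h>

	/* One carry register: value is lo + hi*2^16, matching the halves
	 * c'_i (low) and c''_i (high) of XMM4--XMM7. */
	struct carry { uint64_t lo, hi; };

	/* Accumulate u*x into c the way `pmuludq' does against an operand
	 * in expanded form: the low 16-bit half x' of x feeds c', and the
	 * high half x'' feeds c''.  Each product is at most
	 * (2^32 - 1)(2^16 - 1), so many of them can be summed before the
	 * 64-bit halves overflow. */
	static void mulacc_model(struct carry *c, uint32_t u, uint32_t x)
	{
		c->lo += (uint64_t)u*(x & 0xffff);	/* u x' */
		c->hi += (uint64_t)u*(x >> 16);		/* u x'' */
	}

	/* Limited carry propagation: emit an output word from c0, push the
	 * surplus into c1, and zero c0 ready for it to become the new c3
	 * when the caller cycles the carry registers. */
	static uint32_t propout_model(struct carry *c0, struct carry *c1)
	{
		uint64_t t = c0->hi & 0xffffffff;	/* t = c''_0 mod B */
		uint64_t d = c0->lo + (t << 16);	/* d = c'_0 + t b */

		c1->lo += d >> 32;		/* floor(d/B) into c'_1 */
		c1->hi += c0->hi >> 32;		/* floor(c''_0/B) into c''_1 */
		c0->lo = c0->hi = 0;		/* implicitly zeroed */
		return ((uint32_t)d);		/* z = d mod B */
	}

Expanding the definitions confirms the bookkeeping: c'_0 + c''_0 b =
z + B (floor(d/B) + b floor(c''_0/B)), so the emitted word and the value
carried into c1 together account for c0 exactly.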