X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/1a517bb3785891ff6940c73af7c5a136d0250ebf..0923a413958b0e778a3f059c76355ab58e5be414:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 8f69a559..a6613ed0 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -315,6 +315,8 @@ INTFUNC(carryprop) // form. Store the low 128 bits of the represented carry to [EDI] as // a packed 128-bit value, and leave the remaining 16 bits in the low // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered. + endprologue + propout [edi + 0], xmm4, xmm5 propout [edi + 4], xmm5, xmm6 propout [edi + 8], xmm6, nil @@ -333,6 +335,8 @@ INTFUNC(dmul4) // [EDI], and update the carry registers with the carry out. The // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil propout [edi + 0], xmm4, xmm5 @@ -365,6 +369,8 @@ INTFUNC(dmla4) // [EDI], and update the carry registers with the carry out. The // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + carryadd mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil @@ -395,6 +401,8 @@ INTFUNC(mul4zc) // and set the carry registers XMM4, XMM5, XMM6 to the carry out. // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 @@ -421,6 +429,8 @@ INTFUNC(mul4) // and update the carry registers with the carry out. The registers // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t propout [edi + 0], xmm4, xmm5 @@ -446,6 +456,8 @@ INTFUNC(mla4zc) // and set the carry registers XMM4, XMM5, XMM6 to the carry out. // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + movd xmm4, [edi + 0] movd xmm5, [edi + 4] movd xmm6, [edi + 8] @@ -478,6 +490,8 @@ INTFUNC(mla4) // [EDI], and update the carry registers with the carry out. The // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + carryadd mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil @@ -508,7 +522,8 @@ INTFUNC(mmul4) // of the sum U V + N Y to [EDI], leaving the remaining carry in // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and // XMM7 are clobbered; the general-purpose registers are preserved. - sub esp, 64 // space for the carries + stalloc 64 // space for the carries + endprologue // Calculate W = U V, and leave it in the destination. Stash the // carry pieces for later. @@ -532,7 +547,9 @@ INTFUNC(mmla4) // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, // XMM3, and XMM7 are clobbered; the general-purpose registers are // preserved. - sub esp, 64 // space for the carries + stalloc 64 // space for the carries + endprologue + movd xmm4, [edi + 0] movd xmm5, [edi + 4] movd xmm6, [edi + 8] @@ -599,7 +616,7 @@ INTFUNC(mmla4) paddq xmm6, [esp + 32] // And, with that, we're done. - add esp, 64 + stfree 64 ret ENDFUNC @@ -614,6 +631,7 @@ INTFUNC(mont4) // of the sum W + N Y to [EDI], leaving the remaining carry in // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and // XMM7 are clobbered; the general-purpose registers are preserved. + endprologue // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 @@ -680,13 +698,14 @@ FUNC(mpx_umul4_x86_sse2) // // esp + 0 expanded Y (32 bytes) // esp + 32 (top of locals) - push ebp - push ebx - push esi - push edi - mov ebp, esp + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 sub esp, 32 + endprologue // Prepare for the first iteration. mov esi, [ebp + 32] // -> bv[0] @@ -753,7 +772,7 @@ FUNC(mpx_umul4_x86_sse2) jb 1b // All over. -9: mov esp, ebp +9: dropfp pop edi pop esi pop ebx @@ -787,13 +806,14 @@ FUNC(mpxmont_mul4_x86_sse2) // esp + 108 bv limit // esp + 112 (gap) // esp + 124 (top of locals) - push ebp - push ebx - push esi - push edi - mov ebp, esp + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 sub esp, 124 + endprologue // Establish the expanded operands. pxor xmm7, xmm7 @@ -894,11 +914,11 @@ FUNC(mpxmont_mul4_x86_sse2) movd [edi + 16], xmm4 // All done. -9: mov esp, ebp - pop edi - pop esi - pop ebx - pop ebp +9: dropfp + popreg edi + popreg esi + popreg ebx + popreg ebp ret ENDFUNC @@ -924,13 +944,14 @@ FUNC(mpxmont_redc4_x86_sse2) // esp + 12 expanded M (32 bytes) // esp + 44 expanded Y (32 bytes) // esp + 76 (top of locals) - push ebp - push ebx - push esi - push edi - mov ebp, esp + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 sub esp, 76 + endprologue // Establish the expanded operands and the blocks-of-4 dv limit. mov edi, [ebp + 20] // -> Z = dv[0] @@ -1019,11 +1040,11 @@ FUNC(mpxmont_redc4_x86_sse2) jmp 5b // All over. -9: mov esp, ebp - pop edi - pop esi - pop ebx - pop ebp +9: dropfp + popreg edi + popreg esi + popreg ebx + popreg ebp ret ENDFUNC @@ -1052,13 +1073,14 @@ ENDFUNC .endm .macro testprologue - push ebp - push ebx - push esi - push edi - mov ebp, esp + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 sub esp, 3*32 + 12 + endprologue // vars: // esp + 0 = cycles // esp + 12 = v expanded @@ -1067,11 +1089,11 @@ ENDFUNC .endm .macro testepilogue - mov esp, ebp - pop edi - pop esi - pop ebx - pop ebp + dropfp + popreg edi + popreg esi + popreg ebx + popreg ebp ret .endm @@ -1133,8 +1155,7 @@ ENDFUNC movdqu [ecx + 32], xmm6 .endm - .globl test_dmul4 -test_dmul4: +FUNC(test_dmul4) testprologue testldcarry [ebp + 24] testexpand [ebp + 36], [ebp + 40] @@ -1144,9 +1165,9 @@ test_dmul4: testtail [ebp + 48], [ebp + 44] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_dmla4 -test_dmla4: +FUNC(test_dmla4) testprologue testldcarry [ebp + 24] testexpand [ebp + 36], [ebp + 40] @@ -1156,9 +1177,9 @@ test_dmla4: testtail [ebp + 48], [ebp + 44] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mul4 -test_mul4: +FUNC(test_mul4) testprologue testldcarry [ebp + 24] testexpand nil, [ebp + 32] @@ -1168,9 +1189,9 @@ test_mul4: testtail [ebp + 40], [ebp + 36] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mla4 -test_mla4: +FUNC(test_mla4) testprologue testldcarry [ebp + 24] testexpand nil, [ebp + 32] @@ -1180,9 +1201,9 @@ test_mla4: testtail [ebp + 40], [ebp + 36] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mmul4 -test_mmul4: +FUNC(test_mmul4) testprologue testexpand [ebp + 40], [ebp + 44] mov edi, [ebp + 20] @@ -1196,9 +1217,9 @@ test_mmul4: movdqu [edi + 16], xmm1 testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mmla4 -test_mmla4: +FUNC(test_mmla4) testprologue testexpand [ebp + 40], [ebp + 44] mov edi, [ebp + 20] @@ -1212,9 +1233,9 @@ test_mmla4: movdqu [edi + 16], xmm1 testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mont4 -test_mont4: +FUNC(test_mont4) testprologue testexpand nil, [ebp + 36] mov edi, [ebp + 20] @@ -1228,6 +1249,7 @@ test_mont4: movdqu [edi + 16], xmm1 testcarryout [ebp + 24] testepilogue +ENDFUNC #endif