From 6ecc0b8facfd2f1f13abc03b0f2013112af3430b Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Wed, 4 Jan 2017 01:35:50 +0000
Subject: [PATCH] math/mpx-mul4-x86-sse2.S: Make stack alignment more standard.

This actually slightly reduces the amount of stack needed, but I don't
quite understand why.  There's a knock-on rearrangement of the stack
frame in the test wrappers and C-interface subroutines.  There's also a
slightly sneaky introduction of space for a later change.  But there
shouldn't be any externally observable difference.
---
 math/mpx-mul4-x86-sse2.S | 115 +++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 59 deletions(-)

diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 2406111f..14052fd0 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -511,14 +511,13 @@ INTFUNC(mmul4)
 	// to the packed operands U and N; ECX and ESI point to the expanded
 	// operands V and M; and EDX points to a place to store an expanded
 	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
-	// must be 16-byte aligned.  (This is not the usual convention, which
-	// requires alignment before the call.)
+	// must be 12 modulo 16, as is usual for modern x86 ABIs.
 	//
 	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 	// of the sum U V + N Y to [EDI], leaving the remaining carry in
 	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 	// XMM7 are clobbered; the general-purpose registers are preserved.
-	stalloc	48			// space for the carries
+	stalloc	48 + 12			// space for the carries
   endprologue
 
 	// Calculate W = U V, and leave it in the destination.  Stash the
@@ -535,15 +534,14 @@ INTFUNC(mmla4)
 	// packed operands U and N; ECX and ESI point to the expanded
 	// operands V and M; and EDX points to a place to store an expanded
 	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
-	// must be 16-byte aligned.  (This is not the usual convention, which
-	// requires alignment before the call.)
+	// must be 12 modulo 16, as is usual for modern x86 ABIs.
 	//
 	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
 	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 	// XMM3, and XMM7 are clobbered; the general-purpose registers are
 	// preserved.
-	stalloc	48			// space for the carries
+	stalloc	48 + 12			// space for the carries
   endprologue
 
 	movd	xmm4, [edi +  0]
@@ -615,7 +613,7 @@ INTFUNC(mmla4)
 	paddq	xmm6, [esp + 32]
 
 	// And, with that, we're done.
-	stfree	48
+	stfree	48 + 12
 	ret
 
 ENDFUNC
@@ -794,24 +792,23 @@ FUNC(mpxmont_mul4_x86_sse2)
 	//	ebp + 36	n (nonzero multiple of 4)
 	//	ebp + 40	mi
 	//
-	// Locals are relative to ESP, which is 4 mod 16, as follows.
+	// Locals are relative to ESP, which is 16-byte aligned, as follows.
 	//
-	//	esp +   0	outer loop dv
-	//	esp +   4	outer loop bv
-	//	esp +   8	av limit (mostly in ESI)
-	//	esp +  12	expanded V (32 bytes)
-	//	esp +  44	expanded M (32 bytes)
-	//	esp +  76	expanded Y (32 bytes)
+	//	esp +   0	expanded V (32 bytes)
+	//	esp +  32	expanded M (32 bytes)
+	//	esp +  64	expanded Y (32 bytes)
+	//	esp +  96	outer loop dv
+	//	esp + 100	outer loop bv
+	//	esp + 104	av limit (mostly in ESI)
 	//	esp + 108	bv limit
-	//	esp + 112	(gap)
-	//	esp + 124	(top of locals)
+	//	esp + 112	(top of locals)
 	pushreg	ebp
 	pushreg	ebx
 	pushreg	esi
 	pushreg	edi
 	setfp	ebp
 	and	esp, ~15
-	sub	esp, 124
+	sub	esp, 112
   endprologue
 
 	// Establish the expanded operands.
@@ -821,33 +818,33 @@ FUNC(mpxmont_mul4_x86_sse2)
 	movdqu	xmm0, [ecx]		// bv[0]
 	movdqu	xmm2, [edx]		// mi
 	expand	xmm7, xmm0, xmm1, xmm2, xmm3
-	movdqa	[esp + 12], xmm0	// bv[0] expanded low
-	movdqa	[esp + 28], xmm1	// bv[0] expanded high
-	movdqa	[esp + 44], xmm2	// mi expanded low
-	movdqa	[esp + 60], xmm3	// mi expanded high
+	movdqa	[esp + 0], xmm0		// bv[0] expanded low
+	movdqa	[esp + 16], xmm1	// bv[0] expanded high
+	movdqa	[esp + 32], xmm2	// mi expanded low
+	movdqa	[esp + 48], xmm3	// mi expanded high
 
 	// Set up the outer loop state and prepare for the first iteration.
 	mov	edx, [ebp + 36]		// n
 	mov	eax, [ebp + 24]		// -> U = av[0]
 	mov	ebx, [ebp + 32]		// -> X = nv[0]
 	mov	edi, [ebp + 20]		// -> Z = dv[0]
-	mov	[esp + 4], ecx
+	mov	[esp + 100], ecx
 	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
 	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
-	mov	[esp + 0], edi
+	mov	[esp + 96], edi
+	mov	[esp + 104], edx
 	mov	[esp + 108], ecx
-	mov	[esp + 8], edx
-	lea	ecx, [esp + 12]		// -> expanded V = bv[0]
-	lea	esi, [esp + 44]		// -> expanded M = mi
-	lea	edx, [esp + 76]		// -> space for Y
+	lea	ecx, [esp + 0]		// -> expanded V = bv[0]
+	lea	esi, [esp + 32]		// -> expanded M = mi
+	lea	edx, [esp + 64]		// -> space for Y
 	call	mmul4
-	mov	esi, [esp + 8]		// recover av limit
+	mov	esi, [esp + 104]	// recover av limit
 	add	edi, 16
 	add	eax, 16
 	add	ebx, 16
 	cmp	eax, esi		// done already?
 	jae	8f
-	mov	[esp + 0], edi
+	mov	[esp + 96], edi
 
 	.p2align 4
 	// Complete the first inner loop.
@@ -866,26 +863,26 @@ FUNC(mpxmont_mul4_x86_sse2)
 	// Embark on the next iteration.  (There must be one.  If n = 1, then
 	// we would have bailed above, to label 8.  Similarly, the subsequent
 	// iterations can fall into the inner loop immediately.)
-1:	mov	eax, [esp + 4]		// -> bv[i - 1]
-	mov	edi, [esp + 0]		// -> Z = dv[i]
+1:	mov	eax, [esp + 100]	// -> bv[i - 1]
+	mov	edi, [esp + 96]		// -> Z = dv[i]
 	add	eax, 16			// -> bv[i]
 	pxor	xmm7, xmm7
-	movdqu	xmm0, [eax]		// bv[i]
-	mov	[esp + 4], eax
+	mov	[esp + 100], eax
 	cmp	eax, [esp + 108]	// done yet?
 	jae	9f
+	movdqu	xmm0, [eax]		// bv[i]
 	mov	ebx, [ebp + 32]		// -> X = nv[0]
-	lea	esi, [esp + 44]		// -> expanded M = mi
+	lea	esi, [esp + 32]		// -> expanded M = mi
 	mov	eax, [ebp + 24]		// -> U = av[0]
 	expand	xmm7, xmm0, xmm1
-	movdqa	[esp + 12], xmm0	// bv[i] expanded low
-	movdqa	[esp + 28], xmm1	// bv[i] expanded high
+	movdqa	[esp + 0], xmm0		// bv[i] expanded low
+	movdqa	[esp + 16], xmm1	// bv[i] expanded high
 	call	mmla4
-	mov	esi, [esp + 8]		// recover av limit
+	mov	esi, [esp + 104]	// recover av limit
 	add	edi, 16
 	add	eax, 16
 	add	ebx, 16
-	mov	[esp + 0], edi
+	mov	[esp + 96], edi
 
 	.p2align 4
 	// Complete the next inner loop.
@@ -1078,13 +1075,13 @@ ENDFUNC
 	pushreg	edi
 	setfp	ebp
 	and	esp, ~15
-	sub	esp, 3*32 + 12
+	sub	esp, 3*32 + 4*4
   endprologue
 	// vars:
-	//	esp +  0 = cycles
-	//	esp + 12 = v expanded
-	//	esp + 44 = y expanded
-	//	esp + 72 = ? expanded
+	//	esp +  0 = v expanded
+	//	esp + 32 = y expanded
+	//	esp + 64 = ? expanded
+	//	esp + 96 = cycles
 .endm
 
 .macro	testepilogue
@@ -1109,15 +1106,15 @@ ENDFUNC
 	mov	ecx, \v
 	movdqu	xmm0, [ecx]
 	expand	xmm7, xmm0, xmm1
-	movdqa	[esp + 12], xmm0
-	movdqa	[esp + 28], xmm1
+	movdqa	[esp + 0], xmm0
+	movdqa	[esp + 16], xmm1
   .endif
   .ifnes "\y", "nil"
 	mov	edx, \y
 	movdqu	xmm2, [edx]
 	expand	xmm7, xmm2, xmm3
-	movdqa	[esp + 44], xmm2
-	movdqa	[esp + 60], xmm3
+	movdqa	[esp + 32], xmm2
+	movdqa	[esp + 48], xmm3
   .endif
 .endm
 
@@ -1125,25 +1122,25 @@ ENDFUNC
 	.p2align 4
 0:
   .ifnes "\u", "nil"
-	lea	ecx, [esp + 12]
+	lea	ecx, [esp + 0]
   .endif
 	mov	ebx, \x
   .ifeqs "\mode", "mont"
-	lea	esi, [esp + 44]
+	lea	esi, [esp + 32]
   .endif
-	cysetup	esp + 0
+	cysetup	esp + 96
   .ifnes "\u", "nil"
 	mov	eax, \u
   .endif
   .ifeqs "\mode", "mont"
-	lea	edx, [esp + 76]
+	lea	edx, [esp + 64]
   .else
-	lea	edx, [esp + 44]
+	lea	edx, [esp + 32]
   .endif
 .endm
 
.macro	testtail cyv, n
-	cystore	esp + 0, \cyv, \n
+	cystore	esp + 96, \cyv, \n
 	jnz	0b
 .endm
 
@@ -1210,8 +1207,8 @@ FUNC(test_mmul4)
 	call	mmul4
 	testtail [ebp + 52], [ebp + 48]
 	mov	edi, [ebp + 28]
-	movdqa	xmm0, [esp + 76]
-	movdqa	xmm1, [esp + 92]
+	movdqa	xmm0, [esp + 64]
+	movdqa	xmm1, [esp + 80]
 	movdqu	[edi], xmm0
 	movdqu	[edi + 16], xmm1
 	testcarryout [ebp + 24]
@@ -1226,8 +1223,8 @@ FUNC(test_mmla4)
 	call	mmla4
 	testtail [ebp + 52], [ebp + 48]
 	mov	edi, [ebp + 28]
-	movdqa	xmm0, [esp + 76]
-	movdqa	xmm1, [esp + 92]
+	movdqa	xmm0, [esp + 64]
+	movdqa	xmm1, [esp + 80]
 	movdqu	[edi], xmm0
 	movdqu	[edi + 16], xmm1
 	testcarryout [ebp + 24]
@@ -1242,8 +1239,8 @@ FUNC(test_mont4)
 	call	mont4
 	testtail [ebp + 44], [ebp + 40]
 	mov	edi, [ebp + 28]
-	movdqa	xmm0, [esp + 76]
-	movdqa	xmm1, [esp + 92]
+	movdqa	xmm0, [esp + 64]
+	movdqa	xmm1, [esp + 80]
 	movdqu	[edi], xmm0
 	movdqu	[edi + 16], xmm1
 	testcarryout [ebp + 24]
-- 
2.11.0
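
A note on the alignment arithmetic, since the "48 + 12" in mmul4 and
mmla4 may look odd: under the convention this patch adopts, ESP is
16-byte aligned at each CALL, and therefore 12 modulo 16 on entry once
the return address has been pushed.  Allocating 48 + 12 bytes restores
16-byte alignment, which the aligned XMM accesses to the carry area,
such as "paddq xmm6, [esp + 32]", rely on; the extra 12 bytes are
presumably also the "slightly sneaky" space the commit message
mentions.  A minimal sketch of that arithmetic in C (an editor's
illustration, not part of the patch):

#include <assert.h>
#include <stdio.h>

int main(void)
{
    unsigned esp = 16 - 4;          /* ESP mod 16 on entry: aligned at the
                                     * CALL, minus 4 for the return address */
    esp = (esp - (48 + 12)) % 16;   /* effect of "stalloc 48 + 12" */
    assert(esp == 0);               /* carry area at [esp + 0] is aligned */
    printf("ESP mod 16 after stalloc = %u\n", esp);
    return 0;
}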
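
The rearranged locals of mpxmont_mul4_x86_sse2 are also easy to
sanity-check: the three 32-byte expanded operands now come first, so
they stay 16-byte aligned once "and esp, ~15" has aligned the frame,
and the four word-sized loop variables follow, for 112 bytes in all,
matching "sub esp, 112".  Modelled as a C struct (an editor's sketch
with invented field names; the pointers are stored as 4-byte words, as
on i386):

#include <stddef.h>
#include <stdint.h>

struct frame {
    unsigned char v[32];        /* esp +   0: expanded V */
    unsigned char m[32];        /* esp +  32: expanded M */
    unsigned char y[32];        /* esp +  64: expanded Y */
    uint32_t dv;                /* esp +  96: outer loop dv */
    uint32_t bv;                /* esp + 100: outer loop bv */
    uint32_t avl;               /* esp + 104: av limit (mostly in ESI) */
    uint32_t bvl;               /* esp + 108: bv limit */
};                              /* esp + 112: top of locals */

_Static_assert(offsetof(struct frame, m) == 32, "expanded M offset");
_Static_assert(offsetof(struct frame, y) == 64, "expanded Y offset");
_Static_assert(offsetof(struct frame, dv) == 96, "outer loop dv offset");
_Static_assert(offsetof(struct frame, bvl) == 108, "bv limit offset");
_Static_assert(sizeof(struct frame) == 112, "matches sub esp, 112");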
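
Finally, for context on what mmul4 and mmla4 compute, going by their
entry comments: each performs one outer-loop step of operand-scanning
Montgomery multiplication, four 32-bit digits at a time, with
M = -N^-1 mod B kept in expanded form.  The scalar sketch below (an
editor's analogue: one digit at a time, radix 2^32, invented names,
not Catacomb's API) shows the shape of such a step: fold one digit v
of V into the running total Z, pick y so the total becomes divisible
by the radix, add y N, and shift down a digit.  mmul4 corresponds to
the first iteration, which also writes Y out for reuse; mmla4 to the
later ones, which additionally accumulate the existing contents of the
destination.

/* An editor's sketch, not Catacomb code: one step of Montgomery
 * multiplication, Z <- (Z + v U + y N) / 2^32, where
 * y = (z[0] + v u[0]) mi mod 2^32 and mi = -N^-1 mod 2^32, so the
 * sum is exactly divisible by the radix.  Returns the final carry. */
#include <stddef.h>
#include <stdint.h>

static uint32_t montgomery_step(uint32_t *z, const uint32_t *u,
                                const uint32_t *n, size_t len,
                                uint32_t v, uint32_t mi)
{
    uint64_t c = 0, d = 0, t;
    uint32_t y = (uint32_t)((z[0] + (uint64_t)v*u[0])*mi);

    for (size_t i = 0; i < len; i++) {
        c += z[i] + (uint64_t)v*u[i];        /* accumulate v U */
        d += (uint32_t)c + (uint64_t)y*n[i]; /* fold in y N */
        if (i) z[i - 1] = (uint32_t)d;       /* digit 0 is zero: shift down */
        c >>= 32; d >>= 32;                  /* separate carry chains avoid */
    }                                        /*   64-bit overflow */
    t = c + d;                               /* top digit of the sum */
    z[len - 1] = (uint32_t)t;
    return (uint32_t)(t >> 32);              /* carry for the caller */
}

Iterating this over the digits of V and finishing with one conditional
subtraction of N yields the Montgomery product U V R^-1 mod N, which
is the job of the surrounding mpxmont_mul4_x86_sse2.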