// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- stalloc 48 // space for the carries
+ stalloc 48 + 12 // space for the carries
endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- stalloc 48 // space for the carries
+ stalloc 48 + 12 // space for the carries
endprologue
movd xmm4, [edi + 0]
paddq xmm6, [esp + 32]
// And, with that, we're done.
- stfree 48
+ stfree 48 + 12
ret
ENDFUNC
// ebp + 36 n (nonzero multiple of 4)
// ebp + 40 mi
//
- // Locals are relative to ESP, which is 4 mod 16, as follows.
+	// Locals are relative to ESP, which is 16-byte aligned, as follows.
//
- // esp + 0 outer loop dv
- // esp + 4 outer loop bv
- // esp + 8 av limit (mostly in ESI)
- // esp + 12 expanded V (32 bytes)
- // esp + 44 expanded M (32 bytes)
- // esp + 76 expanded Y (32 bytes)
+ // esp + 0 expanded V (32 bytes)
+ // esp + 32 expanded M (32 bytes)
+ // esp + 64 expanded Y (32 bytes)
+ // esp + 96 outer loop dv
+ // esp + 100 outer loop bv
+ // esp + 104 av limit (mostly in ESI)
// esp + 108 bv limit
- // esp + 112 (gap)
- // esp + 124 (top of locals)
+ // esp + 112 (top of locals)
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
- sub esp, 124
+ sub esp, 112
endprologue
// Establish the expanded operands.
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
expand xmm7, xmm0, xmm1, xmm2, xmm3
- movdqa [esp + 12], xmm0 // bv[0] expanded low
- movdqa [esp + 28], xmm1 // bv[0] expanded high
- movdqa [esp + 44], xmm2 // mi expanded low
- movdqa [esp + 60], xmm3 // mi expanded high
+ movdqa [esp + 0], xmm0 // bv[0] expanded low
+ movdqa [esp + 16], xmm1 // bv[0] expanded high
+ movdqa [esp + 32], xmm2 // mi expanded low
+ movdqa [esp + 48], xmm3 // mi expanded high
// Set up the outer loop state and prepare for the first iteration.
mov edx, [ebp + 36] // n
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
mov edi, [ebp + 20] // -> Z = dv[0]
- mov [esp + 4], ecx
+ mov [esp + 100], ecx
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
- mov [esp + 0], edi
+ mov [esp + 96], edi
+ mov [esp + 104], edx
mov [esp + 108], ecx
- mov [esp + 8], edx
- lea ecx, [esp + 12] // -> expanded V = bv[0]
- lea esi, [esp + 44] // -> expanded M = mi
- lea edx, [esp + 76] // -> space for Y
+ lea ecx, [esp + 0] // -> expanded V = bv[0]
+ lea esi, [esp + 32] // -> expanded M = mi
+ lea edx, [esp + 64] // -> space for Y
call mmul4
- mov esi, [esp + 8] // recover av limit
+ mov esi, [esp + 104] // recover av limit
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
- mov [esp + 0], edi
+ mov [esp + 96], edi
.p2align 4
// Complete the first inner loop.
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
-1: mov eax, [esp + 4] // -> bv[i - 1]
- mov edi, [esp + 0] // -> Z = dv[i]
+1: mov eax, [esp + 100] // -> bv[i - 1]
+ mov edi, [esp + 96] // -> Z = dv[i]
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
- movdqu xmm0, [eax] // bv[i]
- mov [esp + 4], eax
+ mov [esp + 100], eax
cmp eax, [esp + 108] // done yet?
jae 9f
+ movdqu xmm0, [eax] // bv[i]
mov ebx, [ebp + 32] // -> X = nv[0]
- lea esi, [esp + 44] // -> expanded M = mi
+ lea esi, [esp + 32] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
expand xmm7, xmm0, xmm1
- movdqa [esp + 12], xmm0 // bv[i] expanded low
- movdqa [esp + 28], xmm1 // bv[i] expanded high
+ movdqa [esp + 0], xmm0 // bv[i] expanded low
+ movdqa [esp + 16], xmm1 // bv[i] expanded high
call mmla4
- mov esi, [esp + 8] // recover av limit
+ mov esi, [esp + 104] // recover av limit
add edi, 16
add eax, 16
add ebx, 16
- mov [esp + 0], edi
+ mov [esp + 96], edi
.p2align 4
// Complete the next inner loop.
mov [ebx + ecx*8 + 4], edx
.endm
-.macro testprologue
+.macro testprologue n
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
- sub esp, 3*32 + 12
+ sub esp, 3*32 + 4*4
endprologue
+ mov eax, \n
+ mov [esp + 104], eax
// vars:
- // esp + 0 = cycles
- // esp + 12 = v expanded
- // esp + 44 = y expanded
- // esp + 72 = ? expanded
+ // esp + 0 = v expanded
+ // esp + 32 = y expanded
+ // esp + 64 = ? expanded
+ // esp + 96 = cycles
+ // esp + 104 = count
.endm
.macro testepilogue
mov ecx, \v
movdqu xmm0, [ecx]
expand xmm7, xmm0, xmm1
- movdqa [esp + 12], xmm0
- movdqa [esp + 28], xmm1
+ movdqa [esp + 0], xmm0
+ movdqa [esp + 16], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
expand xmm7, xmm2, xmm3
- movdqa [esp + 44], xmm2
- movdqa [esp + 60], xmm3
+ movdqa [esp + 32], xmm2
+ movdqa [esp + 48], xmm3
.endif
.endm
.p2align 4
0:
.ifnes "\u", "nil"
- lea ecx, [esp + 12]
+ lea ecx, [esp + 0]
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
- lea esi, [esp + 44]
+ lea esi, [esp + 32]
.endif
- cysetup esp + 0
+ cysetup esp + 96
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
- lea edx, [esp + 76]
+ lea edx, [esp + 64]
.else
- lea edx, [esp + 44]
+ lea edx, [esp + 32]
.endif
.endm
-.macro testtail cyv, n
- cystore esp + 0, \cyv, \n
+.macro testtail cyv
+ cystore esp + 96, \cyv, esp + 104
jnz 0b
.endm
.endm
FUNC(test_dmul4)
	// Test harness entry for dmul4, driven from C.
	// Interface change in this revision: the operand count [ebp + 44]
	// is now handed to testprologue (which stashes it in the frame at
	// esp + 104) instead of being passed as a second testtail operand.
-	testprologue
+	testprologue [ebp + 44]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	// NOTE(review): [ebp + 20] is presumably the destination vector
	// pointer wanted in EDI by dmul4 -- confirm against the C caller.
	mov edi, [ebp + 20]
	testtop [ebp + 28], [ebp + 32]
	call dmul4
	// testtail now reads the iteration count back from esp + 104.
-	testtail [ebp + 48], [ebp + 44]
+	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
FUNC(test_dmla4)
	// Test harness entry for dmla4 (same argument layout as
	// test_dmul4).  The operand count [ebp + 44] now goes to
	// testprologue, which stores it at esp + 104 for testtail to use.
-	testprologue
+	testprologue [ebp + 44]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	// NOTE(review): presumably -> destination vector for dmla4; verify.
	mov edi, [ebp + 20]
	testtop [ebp + 28], [ebp + 32]
	call dmla4
	// testtail now takes only the carry-vector pointer; the count
	// comes from the frame slot established by testprologue.
-	testtail [ebp + 48], [ebp + 44]
+	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
FUNC(test_mul4)
	// Test harness entry for mul4.  This variant has no packed-U
	// operand, hence the `nil' arguments to testexpand and testtop.
	// The count [ebp + 36] now travels via testprologue (stored at
	// esp + 104) rather than as a testtail operand.
-	testprologue
+	testprologue [ebp + 36]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	// NOTE(review): presumably -> destination vector; confirm caller.
	mov edi, [ebp + 20]
	testtop nil, [ebp + 28]
	call mul4
	// Count now fetched from esp + 104 inside testtail.
-	testtail [ebp + 40], [ebp + 36]
+	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
FUNC(test_mla4)
	// Test harness entry for mla4 (same argument layout as
	// test_mul4).  The count [ebp + 36] is now a testprologue
	// argument, stashed at esp + 104 for testtail.
-	testprologue
+	testprologue [ebp + 36]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	// NOTE(review): presumably -> destination vector; confirm caller.
	mov edi, [ebp + 20]
	testtop nil, [ebp + 28]
	call mla4
	// testtail now takes only the carry-vector pointer.
-	testtail [ebp + 40], [ebp + 36]
+	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC
FUNC(test_mmul4)
	// Test harness entry for mmul4 (Montgomery multiply step).
	// The count [ebp + 48] now goes to testprologue (stored at
	// esp + 104); no incoming carries are loaded for this variant.
-	testprologue
+	testprologue [ebp + 48]
	testexpand [ebp + 40], [ebp + 44]
	// NOTE(review): presumably -> destination vector; confirm caller.
	mov edi, [ebp + 20]
	testtop [ebp + 32], [ebp + 36], mont
	call mmul4
	// testtail now takes only the carry-vector pointer.
-	testtail [ebp + 52], [ebp + 48]
+	testtail [ebp + 52]
	// Copy the expanded result Y out to the caller's buffer.  Under
	// the revised local layout Y lives at esp + 64 (32 bytes),
	// matching the `lea edx, [esp + 64]' set up in testtop's mont
	// path; the old offsets 76/92 belonged to the previous layout.
	mov edi, [ebp + 28]
-	movdqa xmm0, [esp + 76]
-	movdqa xmm1, [esp + 92]
+	movdqa xmm0, [esp + 64]
+	movdqa xmm1, [esp + 80]
	movdqu [edi], xmm0
	movdqu [edi + 16], xmm1
	testcarryout [ebp + 24]
ENDFUNC
FUNC(test_mmla4)
	// Test harness entry for mmla4 (Montgomery multiply-accumulate
	// step); argument layout matches test_mmul4.  The count
	// [ebp + 48] now goes to testprologue (stored at esp + 104).
-	testprologue
+	testprologue [ebp + 48]
	testexpand [ebp + 40], [ebp + 44]
	// NOTE(review): presumably -> destination vector; confirm caller.
	mov edi, [ebp + 20]
	testtop [ebp + 32], [ebp + 36], mont
	call mmla4
	// testtail now takes only the carry-vector pointer.
-	testtail [ebp + 52], [ebp + 48]
+	testtail [ebp + 52]
	// Copy the expanded result Y (now at esp + 64, 32 bytes, per the
	// revised frame layout) out to the caller's buffer.
	mov edi, [ebp + 28]
-	movdqa xmm0, [esp + 76]
-	movdqa xmm1, [esp + 92]
+	movdqa xmm0, [esp + 64]
+	movdqa xmm1, [esp + 80]
	movdqu [edi], xmm0
	movdqu [edi + 16], xmm1
	testcarryout [ebp + 24]
ENDFUNC
FUNC(test_mont4)
- testprologue
+ testprologue [ebp + 40]
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
testtop nil, [ebp + 32], mont
call mont4
- testtail [ebp + 44], [ebp + 40]
+ testtail [ebp + 44]
mov edi, [ebp + 28]
- movdqa xmm0, [esp + 76]
- movdqa xmm1, [esp + 92]
+ movdqa xmm0, [esp + 64]
+ movdqa xmm1, [esp + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]