Also, prefer aligning the stack pointer after allocating stack space (`stalloc' first, then `and SP, ~15'), except where that won't work.
pushreg esi
pushreg edi
setfp
+ stalloc 32
and SP, ~15
- sub SP, 32
endprologue
// Prepare for the first iteration.
pushreg esi
pushreg edi
setfp
+ stalloc 112
and SP, ~15
- sub SP, 112
endprologue
// Establish the expanded operands.
pushreg edi
setfp
and SP, ~15
- sub SP, 76
+ stalloc 76
endprologue
// Establish the expanded operands and the blocks-of-4 dv limit.
pushreg esi
pushreg edi
setfp
+ stalloc 3*32 + 4*4
and SP, ~15
- sub SP, 3*32 + 4*4
endprologue
mov eax, \n
mov [SP + 104], eax
pushreg BP
setfp
- sub SP, 16
+ stalloc 16
mov IN, [BP + 12]
mov OUT, [BP + 16]
and SP, ~15
setfp
mov A, [SP + 8]
mov K, [SP + 12]
+ stalloc 16
and SP, ~15
- sub SP, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
setfp
mov A, [SP + 8]
mov K, [SP + 12]
- and SP, ~15
+ stalloc 16
ldgot ecx
- sub SP, 16
+ and SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
pushreg BP
setfp
- sub SP, 32
+ stalloc 32
mov IN, [BP + 12]
mov OUT, [BP + 16]
and SP, ~15