The Windows code doesn't need to spill r12, because we don't need the
`mi' register after we've loaded and expanded the Montgomery factor.
This doesn't save any stack space because we need 16-byte alignment, but
it does avoid saving and restoring the register.
// outer loop dv r10 rcx
// outer loop dv limit r11 r11
// nv base rdx r8
// outer loop dv r10 rcx
// outer loop dv limit r11 r11
// nv base rdx r8
# define DV rcx
# define DVLO r11
# define NV r8
# define DV rcx
# define DVLO r11
# define NV r8
# define N r9
# define C r9d
pushreg rbx
pushreg rdi
# define N r9
# define C r9d
pushreg rbx
pushreg rdi
- pushreg r12
- stalloc 160
savexmm xmm6, 0
savexmm xmm7, 16
savexmm xmm6, 0
savexmm xmm7, 16
rstrxmm xmm14, 128
rstrxmm xmm15, 144
rstrxmm xmm14, 128
rstrxmm xmm15, 144
- stfree 160
- popreg r12
popreg rdi
popreg rbx
#endif
popreg rdi
popreg rbx
#endif