// outer loop dv r10 rcx
// outer loop dv limit r11 r11
// nv base rdx r8
- // nv limit r9 r12*
+ // nv limit r9 r10*
// n rcx r9
// c rcx r9
# define DV rcx
# define DVLO r11
# define NV r8
-# define NVL r12
+# define NVL r10
# define N r9
# define C r9d
pushreg rbx
pushreg rdi
- pushreg r12
- stalloc 160
+ stalloc 168 // NOTE(review): 8 bytes larger now that r12 is no longer pushed -- presumably padding so rsp alignment is unchanged; confirm against pushreg/stalloc
savexmm xmm6, 0
savexmm xmm7, 16
jb 7b
// All done for this iteration. Start the next.
-8: mov rdi, DV // -> Z = dv[i]
- mov rbx, NV // -> X = nv[0]
- cmp rdi, DVLO // all done yet?
+ cmp DV, DVLO // all done yet?
jae 9f
+ mov rdi, DV // -> Z = dv[i]
+ mov rbx, NV // -> X = nv[0]
add DV, 16
call mont4
add rdi, 16
rstrxmm xmm14, 128
rstrxmm xmm15, 144
- stfree 160
- popreg r12
+ stfree 168 // NOTE(review): size must mirror the prologue's stalloc; r12 is no longer pushed, so there is no matching popreg -- confirm against the prologue
popreg rdi
popreg rbx
#endif
testtop r11
call mmul4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) // reorder the 32-bit lanes of xmm10 in place before the store; NOTE(review): exact permutation depends on SHUF's argument order (defined elsewhere) -- confirm
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) // same lane reordering applied to xmm11
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop r11
call mmla4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) // reorder the 32-bit lanes of xmm10 in place before the store; NOTE(review): exact permutation depends on SHUF's argument order (defined elsewhere) -- confirm
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) // same lane reordering applied to xmm11
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop
call mont4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) // reorder the 32-bit lanes of xmm10 in place before the store; NOTE(review): exact permutation depends on SHUF's argument order (defined elsewhere) -- confirm
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) // same lane reordering applied to xmm11
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout