X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/9599917f4a0aa31e2dafd15a7f0e4993bdedf715..bd6d65e32b835551677456bf286d09ced6859882:/math/mpx-mul4-amd64-sse2.S diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index 0f2dbd32..5a748c60 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -1155,7 +1155,7 @@ FUNC(mpxmont_redc4_amd64_sse2) // outer loop dv r10 rcx // outer loop dv limit r11 r11 // nv base rdx r8 - // nv limit r9 r12* + // nv limit r9 r10* // n rcx r9 // c rcx r9 @@ -1183,14 +1183,13 @@ FUNC(mpxmont_redc4_amd64_sse2) # define DV rcx # define DVLO r11 # define NV r8 -# define NVL r12 +# define NVL r10 # define N r9 # define C r9d pushreg rbx pushreg rdi - pushreg r12 - stalloc 160 + stalloc 168 savexmm xmm6, 0 savexmm xmm7, 16 @@ -1252,29 +1251,29 @@ FUNC(mpxmont_redc4_amd64_sse2) // Continue carry propagation until the end of the buffer. 0: add [rdi], C mov C, 0 // preserves flags - adcd [rdi + 4], 0 - adcd [rdi + 8], 0 - adcd [rdi + 12], 0 + adc dword ptr [rdi + 4], 0 + adc dword ptr [rdi + 8], 0 + adc dword ptr [rdi + 12], 0 adc C, 0 add rdi, 16 cmp rdi, DVL4 jb 0b - // Deal with the tail end. + // Deal with the tail end. Note that the actual destination length + // won't be an exact number of blocks of four, so it's safe to just + // drop through here. 7: add [rdi], C - mov C, 0 // preserves flags + mov C, 0 add rdi, 4 adc C, 0 cmp rdi, DVL jb 7b - // All done for this iteration. Start the next. (This must have at - // least one follow-on iteration, or we'd not have started this outer - // loop.) -8: mov rdi, DV // -> Z = dv[i] - mov rbx, NV // -> X = nv[0] - cmp rdi, DVLO // all done yet? 
jae 9f + mov rdi, DV // -> Z = dv[i] + mov rbx, NV // -> X = nv[0] add DV, 16 call mont4 add rdi, 16 @@ -1300,8 +1299,7 @@ FUNC(mpxmont_redc4_amd64_sse2) rstrxmm xmm14, 128 rstrxmm xmm15, 144 - stfree 160 - popreg r12 + stfree 168 popreg rdi popreg rbx #endif @@ -1603,6 +1601,8 @@ FUNC(test_mmul4) testtop r11 call mmul4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1614,6 +1614,8 @@ FUNC(test_mmla4) testtop r11 call mmla4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1625,6 +1627,8 @@ FUNC(test_mont4) testtop call mont4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout