X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/f79c8756d054b2979f79d8277affb988c4f39c49..bd6d65e32b835551677456bf286d09ced6859882:/math/mpx-mul4-amd64-sse2.S diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index da3e6d61..5a748c60 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -1251,9 +1251,9 @@ FUNC(mpxmont_redc4_amd64_sse2) // Continue carry propagation until the end of the buffer. 0: add [rdi], C mov C, 0 // preserves flags - adcd [rdi + 4], 0 - adcd [rdi + 8], 0 - adcd [rdi + 12], 0 + adc dword ptr [rdi + 4], 0 + adc dword ptr [rdi + 8], 0 + adc dword ptr [rdi + 12], 0 adc C, 0 add rdi, 16 cmp rdi, DVL4 @@ -1270,10 +1270,10 @@ FUNC(mpxmont_redc4_amd64_sse2) jb 7b // All done for this iteration. Start the next. -8: mov rdi, DV // -> Z = dv[i] - mov rbx, NV // -> X = nv[0] - cmp rdi, DVLO // all done yet? + cmp DV, DVLO // all done yet? jae 9f + mov rdi, DV // -> Z = dv[i] + mov rbx, NV // -> X = nv[0] add DV, 16 call mont4 add rdi, 16 @@ -1601,6 +1601,8 @@ FUNC(test_mmul4) testtop r11 call mmul4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1612,6 +1614,8 @@ FUNC(test_mmla4) testtop r11 call mmla4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1623,6 +1627,8 @@ FUNC(test_mont4) testtop call mont4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout