X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/92edc356a312bc64abca0c30bc03d4b6676f3d39..bd6d65e32b835551677456bf286d09ced6859882:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 9e1d4782..916adef9 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -1054,15 +1054,17 @@ FUNC(mpxmont_redc4_x86_sse2) // Continue carry propagation until the end of the buffer. 0: add [edi], eax mov eax, 0 // preserves flags - adcd [edi + 4], 0 - adcd [edi + 8], 0 - adcd [edi + 12], 0 + adc dword ptr [edi + 4], 0 + adc dword ptr [edi + 8], 0 + adc dword ptr [edi + 12], 0 adc eax, 0 add edi, 16 cmp edi, esi jb 0b - // Deal with the tail end. + // Deal with the tail end. Note that the actual destination length + // won't be an exact number of blocks of four, so it's safe to just + // drop through here. 7: add [edi], eax mov eax, 0 add edi, 4 @@ -1070,9 +1072,7 @@ FUNC(mpxmont_redc4_x86_sse2) cmp edi, edx jb 7b - // All done for this iteration. Start the next. (This must have at - // least one follow-on iteration, or we'd not have started this outer - // loop.) + // All done for this iteration. Start the next. 8: mov edi, [SP + 0] // -> dv[i - 1] mov ebx, [BP + 28] // -> X = nv[0] lea edx, [SP + 44] // -> space for Y @@ -1286,6 +1286,8 @@ FUNC(test_mmul4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) + pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1302,6 +1304,8 @@ FUNC(test_mmla4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) + pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1318,6 +1322,8 @@ FUNC(test_mont4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) + pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24]