From 0b38b8b77ca6eef0006b1051f86df440fe5ab8dd Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Tue, 5 Nov 2019 11:13:03 +0000 Subject: [PATCH] math/mpx-mul4-*.S: Output expanded Montgomery factor in a sensible order. The current order is (y'_0, y'_1; y''_0, y''_1), (y'_2, y'_3; y''_2, y''_3), but while this makes sense in the context of SSE2, it's not really very satisfactory as a common currency. (In particular, if we want to resolve the expanded factor into a value then we'll have to do it by steam because the limb placements are irregular.) Instead, fix the ordering in the test stubs so that the pieces come out as (y'_0, y''_0; y'_1, y''_1), (y'_2, y''_2; y'_3, y''_3), which is generally much better to work with outside of SSE2. Of course, this only affects testing, not the actual code, so performance is unchanged. --- math/mpx-mul4-amd64-sse2.S | 6 ++++++ math/mpx-mul4-x86-sse2.S | 6 ++++++ math/t/mpx-mul4 | 6 +++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index 1c205a73..1c344f40 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -1601,6 +1601,8 @@ FUNC(test_mmul4) testtop r11 call mmul4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1612,6 +1614,8 @@ FUNC(test_mmla4) testtop r11 call mmla4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1623,6 +1627,8 @@ FUNC(test_mont4) testtop call mont4 testtail + pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) + pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 7e7173fc..cdbdfaf7 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -1286,6 +1286,8 @@ FUNC(test_mmul4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) + pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1302,6 +1304,8 @@ FUNC(test_mmla4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) + pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1318,6 +1322,8 @@ FUNC(test_mont4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) + pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] diff --git a/math/t/mpx-mul4 b/math/t/mpx-mul4 index f2337b8d..1d8a741e 100644 --- a/math/t/mpx-mul4 +++ b/math/t/mpx-mul4 @@ -59,7 +59,7 @@ mmul4 { d3d2d1d0d7d6d5d4dbdad9d8dfdedddc # v 546f97b132b6ca1d10d519b5ca6ab8a9 # m 00000000000000000000000000000000 # zz - 00006c00000012ad00009a8d0000630c0000f0840000979d000077a400000caa # yy + 00006c0000009a8d000012ad0000630c0000f084000077a40000979d00000caa # yy 0003126be83bdbf40002a05c4867918e000259dfe01b01770001b7e463bf6b7a00011339f770da470000bdab9990cf26; # cc } @@ -70,7 +70,7 @@ mmla4 { d3d2d1d0d7d6d5d4dbdad9d8dfdedddc # v 546f97b132b6ca1d10d519b5ca6ab8a9 # m 00000000000000000000000000000000 # zz - 000016b00000d85500000b390000507000008de20000754b000057700000c5db # yy + 000016b000000b390000d8550000507000008de2000057700000754b0000c5db # yy 000338658ad352110002f9fbc6cd85d5000205e99c5e20d300021acac7b997550000fdb10c111c11000131df2708bb59; # cc } @@ -79,6 +79,6 @@ mont4 { acadaeafa8a9aaaba4a5a6a7a0a1a2a3 # n 546f97b132b6ca1d10d519b5ca6ab8a9 # m 00000000000000000000000000000000 # zz - 0000aab00000c5a7000070ab0000ed6400009d5d0000ddad0000dfcb0000b930 # yy + 0000aab0000070ab0000c5a70000ed6400009d5d0000dfcb0000ddad0000b930 # yy 0001734705fa761d00019ee57a6290e40000f14fc045d61200010386c155e29100008b1816a19f2700007432ecd64990; # cc } -- 2.11.0