The current order is (y'_0, y'_1; y''_0, y''_1), (y'_2, y'_3; y''_2,
y''_3), but while this makes sense in the context of SSE2, it's not
really very satisfactory as a common currency. (In particular, if we
want to resolve the expanded factor into a value then we'll have to do
it by hand, because the limb placements are irregular.)
Instead, fix the ordering in the test stubs so that the pieces come out
as (y'_0, y''_0; y'_1, y''_1), (y'_2, y''_2; y'_3, y''_3), which is
generally much better to work with outside of SSE2.
Of course, this only affects testing, not the actual code, so
performance is unchanged.
testtop r11
call mmul4
testtail
testtop r11
call mmul4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop r11
call mmla4
testtail
testtop r11
call mmla4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop
call mont4
testtail
testtop
call mont4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
+ pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
+ pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
+ pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
+ pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
+ pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
+ pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
d3d2d1d0d7d6d5d4dbdad9d8dfdedddc # v
546f97b132b6ca1d10d519b5ca6ab8a9 # m
00000000000000000000000000000000 # zz
d3d2d1d0d7d6d5d4dbdad9d8dfdedddc # v
546f97b132b6ca1d10d519b5ca6ab8a9 # m
00000000000000000000000000000000 # zz
- 00006c00000012ad00009a8d0000630c0000f0840000979d000077a400000caa # yy
+ 00006c0000009a8d000012ad0000630c0000f084000077a40000979d00000caa # yy
0003126be83bdbf40002a05c4867918e000259dfe01b01770001b7e463bf6b7a00011339f770da470000bdab9990cf26; # cc
}
0003126be83bdbf40002a05c4867918e000259dfe01b01770001b7e463bf6b7a00011339f770da470000bdab9990cf26; # cc
}
d3d2d1d0d7d6d5d4dbdad9d8dfdedddc # v
546f97b132b6ca1d10d519b5ca6ab8a9 # m
00000000000000000000000000000000 # zz
d3d2d1d0d7d6d5d4dbdad9d8dfdedddc # v
546f97b132b6ca1d10d519b5ca6ab8a9 # m
00000000000000000000000000000000 # zz
- 000016b00000d85500000b390000507000008de20000754b000057700000c5db # yy
+ 000016b000000b390000d8550000507000008de2000057700000754b0000c5db # yy
000338658ad352110002f9fbc6cd85d5000205e99c5e20d300021acac7b997550000fdb10c111c11000131df2708bb59; # cc
}
000338658ad352110002f9fbc6cd85d5000205e99c5e20d300021acac7b997550000fdb10c111c11000131df2708bb59; # cc
}
acadaeafa8a9aaaba4a5a6a7a0a1a2a3 # n
546f97b132b6ca1d10d519b5ca6ab8a9 # m
00000000000000000000000000000000 # zz
acadaeafa8a9aaaba4a5a6a7a0a1a2a3 # n
546f97b132b6ca1d10d519b5ca6ab8a9 # m
00000000000000000000000000000000 # zz
- 0000aab00000c5a7000070ab0000ed6400009d5d0000ddad0000dfcb0000b930 # yy
+ 0000aab0000070ab0000c5a70000ed6400009d5d0000dfcb0000ddad0000b930 # yy
0001734705fa761d00019ee57a6290e40000f14fc045d61200010386c155e29100008b1816a19f2700007432ecd64990; # cc
}
0001734705fa761d00019ee57a6290e40000f14fc045d61200010386c155e29100008b1816a19f2700007432ecd64990; # cc
}