From: Mark Wooding Date: Sat, 24 Nov 2018 21:53:58 +0000 (+0000) Subject: Merge branch '2.4.x' X-Git-Tag: 2.5.0~21 X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/925ff94a516478164fdd01d53332637455e0074d?hp=-c Merge branch '2.4.x' * 2.4.x: progs/cc-progress.c: Use `fstat' to discover the file size. math/mpx-mul4-amd64-sse2.S: Always collect iteration count as 32 bits. math/mpx-mul4-amd64-sse2.S: Fix stack-argument offset for 64-bit Windows. symm/salsa20-x86ish-sse2.S: Fix typo in 64-bit Windows code. symm/desx.c, symm/desx.h (desx_init): Fix documentation. symm/t/rijndael256: Add tests for small key sizes. progs/cc-kem.c (getkem): Parse the `kdf' spec after bulk crypto. progs/..., symm/...: Fix 32-bit right-shift idiom. --- 925ff94a516478164fdd01d53332637455e0074d diff --combined math/mpx-mul4-amd64-sse2.S index 64460ca9,9146a63f..29939c1c --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@@ -96,32 -96,32 +96,32 @@@ .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces // of the product in registers D0, D1, D2, D3. - pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?) + pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?) .ifnes "\d1", "nil" - movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1) + movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1) .endif .ifnes "\d3", "nil" - movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3) + movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3) .endif .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) .endif .ifnes "\d2", "nil" - movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) + pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) + pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) .endif .ifnes "\d2", "nil" - pmuludq \d2, \shi // (r_i s'_2, r_i s''_2) + pmuludq \d2, \shi // (r_i s'_2; r_i s''_2) .endif - pmuludq \d0, \slo // (r_i s'_0, r_i s''_0) + pmuludq \d0, \slo // (r_i s'_0; r_i s''_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@@ -163,10 -163,10 +163,10 @@@ // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. - pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) - pslldq xmm3, 2 // (t b, 0) - paddq \c, xmm3 // (c' + t b, c'') + pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0) + pslldq xmm3, 2 // (t b; 0) + paddq \c, xmm3 // (c' + t b; c'') .ifeqs "\pos", "lo" movdqa \d, \c .else @@@ -183,37 -183,37 +183,37 @@@ // of the value represented in C are written at POS in D, and the // remaining bits are left at the bottom of T. 
movdqa \t, \c - psllq \t, 16 // (?, c'' b) - pslldq \c, 8 // (0, c') - paddq \t, \c // (?, c' + c'' b) - psrldq \t, 8 // c' + c'' b + psllq \t, 16 // (?; c'' b) + pslldq \c, 8 // (0; c') + paddq \t, \c // (?; c' + c'' b) + psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) .ifeqs "\pos", "lo" movdqa \d, \t .else punpckldq \d, \t .endif - psrldq \t, 4 // floor((c' + c'' b)/B) + psrldq \t, 4 // (floor(c/B); 0) .endm .macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. - movdqa \b, \a // (a_0, a_1, a_2, a_3) + movdqa \b, \a // (a_0, a_1; a_2, a_3) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1, c_2, c_3) + movdqa \d, \c // (c_0, c_1; c_2, c_3) .endif - punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) .endif - pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1) - pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3) + pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) + pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1) - pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3) + pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) + pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) .endif .endm @@@ -229,10 -229,10 +229,10 @@@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0, y'_2) - punpckhqdq \c0, \c2 // (y''_0, y''_2) - punpcklqdq \u, \c3 // (y'_1, y'_3) - punpckhqdq \c1, \c3 // (y''_1, y''_3) + punpcklqdq \t, \c2 // (y'_0; y'_2) + punpckhqdq \c0, \c2 // (y''_0; y''_2) + punpcklqdq \u, \c3 // (y'_1; y'_3) + punpckhqdq \c1, \c3 // (y''_1; y''_3) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. @@@ -240,43 -240,43 +240,43 @@@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0, y''_2) - psrlq \c1, 16 // high parts of (y''_1, y''_3) - psrlq \c2, 32 // low parts of (y''_0, y''_2) - psrlq \c3, 32 // low parts of (y''_1, y''_3) + psrlq \c0, 16 // high parts of (y''_0; y''_2) + psrlq \c1, 16 // high parts of (y''_1; y''_3) + psrlq \c2, 32 // low parts of (y''_0; y''_2) + psrlq \c3, 32 // low parts of (y''_1; y''_3) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0, y''_1) + pslldq \c1, 8 // high part of (0; y''_1) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0, y_2) - paddq \u, \c0 // (y_1, y_3) + paddq \t, \c1 // and up: (y_0; y_2) + paddq \u, \c0 // (y_1; y_3) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3, 0) + psrldq \hi, 8 // high part of (y''_3; 0) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0, y_1) - movdqa \lo, \t // (y^*_0, ?, ?, ?) - psrldq \t, 8 // (y_2, 0) - psrlq \c3, 32 // (floor(y_0/B), ?) - paddq \c3, \u // (y_1 + floor(y_0/B), ?) - movdqa \c1, \c3 // (y^*_1, ?, ?, ?) 
- psrldq \u, 8 // (y_3, 0) - psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) - paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?) - psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) - paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + movdqa \c3, \t // (y_0; ?) + movdqa \lo, \t // (y^*_0, ?; ?, ?) + psrldq \t, 8 // (y_2; 0) + psrlq \c3, 32 // (floor(y_0/B); ?) + paddq \c3, \u // (y_1 + floor(y_0/B); ?) + movdqa \c1, \c3 // (y^*_1, ?; ?, ?) + psrldq \u, 8 // (y_3; 0) + psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) + paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) + punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) + paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) + punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@@ -293,13 -293,13 +293,13 @@@ // On exit, the carry registers, including XMM15, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. - movd xmm0, [rdi + 0] // (a_0, 0) - movd xmm1, [rdi + 4] // (a_1, 0) - movd xmm2, [rdi + 8] // (a_2, 0) - movd xmm15, [rdi + 12] // (a_3, 0) - paddq xmm12, xmm0 // (c'_0 + a_0, c''_0) - paddq xmm13, xmm1 // (c'_1 + a_1, c''_1) - paddq xmm14, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) + movd xmm0, [rdi + 0] // (a_0; 0) + movd xmm1, [rdi + 4] // (a_1; 0) + movd xmm2, [rdi + 8] // (a_2; 0) + movd xmm15, [rdi + 12] // (a_3; 0) + paddq xmm12, xmm0 // (c'_0 + a_0; c''_0) + paddq xmm13, xmm1 // (c'_1 + a_1; c''_1) + paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) .endm ///-------------------------------------------------------------------------- @@@ -621,8 -621,8 +621,8 @@@ INTFUNC(mmla4 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0) + punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) + punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1 accum xmm5, xmm6 @@@ -634,10 -634,10 +634,10 @@@ mulcore xmm7, 3, xmm10, xmm11, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0, 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0, 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0) + punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) + punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) + punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) + punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) // That's lots of pieces. Now we have to assemble the answer. squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@@ -703,8 -703,8 +703,8 @@@ INTFUNC(mont4 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0) + punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) + punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1 accum xmm5, xmm6 @@@ -716,10 -716,10 +716,10 @@@ mulcore xmm7, 3, xmm8, xmm9, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0, 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0, 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0) + punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) + punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) + punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) + punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) // That's lots of pieces. Now we have to assemble the answer. 
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@@ -752,13 -752,6 +752,13 @@@ ENDFUN ///-------------------------------------------------------------------------- /// Bulk multipliers. +FUNC(mpx_umul4_amd64_avx) + .arch .avx + vzeroupper + endprologue + .arch pentium4 +ENDFUNC + FUNC(mpx_umul4_amd64_sse2) // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl, // const mpw *bv, const mpw *bvl); @@@ -908,13 -901,6 +908,13 @@@ ENDFUNC +FUNC(mpxmont_mul4_amd64_avx) + .arch .avx + vzeroupper + endprologue + .arch pentium4 +ENDFUNC + FUNC(mpxmont_mul4_amd64_sse2) // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); @@@ -1109,13 -1095,6 +1109,13 @@@ ENDFUNC +FUNC(mpxmont_redc4_amd64_avx) + .arch .avx + vzeroupper + endprologue + .arch pentium4 +ENDFUNC + FUNC(mpxmont_redc4_amd64_sse2) // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); @@@ -1329,7 -1308,7 +1329,7 @@@ ENDFUN # define ARG6 STKARG(2) # define ARG7 STKARG(3) # define ARG8 STKARG(4) - # define STKARG_OFFSET 40 + # define STKARG_OFFSET 224 #endif #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] @@@ -1386,7 -1365,7 +1386,7 @@@ mov rbx, r8 movdqu xmm8, [r9] movdqu xmm10, [rax] - mov r8, STKARG(1) + mov r8d, STKARG(1) mov r9, STKARG(2) mov r10, rdx mov r11, rcx @@@ -1395,7 -1374,7 +1395,7 @@@ .ifeqs "\mode", "mont" mov rbx, rcx movdqu xmm8, [r8] - mov r8, r9 + mov r8d, r9d mov r9, STKARG(0) mov r10, rdx mov rcx, rsi @@@ -1423,16 -1402,16 +1423,16 @@@ mov rbx, r9 movdqu xmm8, [r10] movdqu xmm10, [r11] - mov r8, STKARG(2) - mov r9, STKARG(3) mov r11, r8 + mov r8d, STKARG(2) + mov r9, STKARG(3) .endif .ifeqs "\mode", "smul" mov rdi, rcx mov rcx, rdx mov rbx, r8 movdqu xmm10, [r9] - mov r8, STKARG(0) + mov r8d, STKARG(0) mov r9, STKARG(1) .endif .ifeqs "\mode", "mmul" @@@ -1443,10 -1422,10 +1443,10 @@@ mov rbx, STKARG(0) movdqu xmm8, [r10] movdqu xmm10, [r11] - mov r8, STKARG(3) - mov r9, STKARG(4) mov r10, r8 mov r11, r9 + mov r8d, STKARG(3) + mov r9, STKARG(4) .endif .ifeqs "\mode", "mont" mov r10, STKARG(0) @@@ -1454,9 -1433,9 +1454,9 @@@ mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] - mov r8, STKARG(1) - mov r9, STKARG(2) mov r10, r8 + mov r8d, STKARG(1) + mov r9, STKARG(2) .endif #endif @@@ -1495,9 -1474,9 +1495,9 @@@ .endm .macro testldcarry - movdqu xmm12, [rcx + 0] // (c'_0, c''_0) - movdqu xmm13, [rcx + 16] // (c'_1, c''_1) - movdqu xmm14, [rcx + 32] // (c'_2, c''_2) + movdqu xmm12, [rcx + 0] // (c'_0; c''_0) + movdqu xmm13, [rcx + 16] // (c'_1; c''_1) + movdqu xmm14, [rcx + 32] // (c'_2; c''_2) .endm .macro testtop u=nil diff --combined symm/blkc.h index e94e932b,ff631f09..e0837521 --- a/symm/blkc.h +++ b/symm/blkc.h @@@ -109,7 -109,7 +109,7 @@@ #define BLKC_SHOW(PRE, tag, w) do { \ fputs(tag ": ", stdout); \ - BLKC_SKEL_X(PRE, BLKC_W(w);, printf("%08x ", *_w++);); \ + BLKC_SKEL_X(PRE, const BLKC_W(w);, printf("%08x ", *_w++);); \ fputc('\n', stdout); \ } while (0) @@@ -174,7 -174,7 +174,7 @@@ unsigned _i; BLKC_W(w); unsigned long _x = x; \ for (_i = 0; _i < PRE##_BLKSZ / 4; _i++) { \ *_w++ = U32(_x); \ - _x = ((_x & ~MASK32) >> 16) >> 16; \ + _x = ((_x & ~(unsigned long)MASK32) >> 16) >> 16; \ } \ } while (0) @@@ -182,7 -182,7 +182,7 @@@ unsigned _i; BLKC_W(w); unsigned long _x = x; _w += PRE##_BLKSZ / 4; \ for (_i = 0; _i < PRE##_BLKSZ / 4; _i++) { \ *--_w = U32(_x); \ - _x = ((_x & ~MASK32) >> 16) >> 16; \ + _x = ((_x & ~(unsigned long)MASK32) >> 16) >> 16; \ } \ } while (0) diff --combined 
symm/salsa20-x86ish-sse2.S index ad4e322b,7d8e2e38..06ba3d2c --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@@ -33,17 -33,9 +33,17 @@@ ///-------------------------------------------------------------------------- /// Main code. - .arch pentium4 .text +FUNC(salsa20_core_x86ish_avx) + .arch .avx + vzeroupper + endprologue + // drop through... +ENDFUNC + + .arch pentium4 + FUNC(salsa20_core_x86ish_sse2) // Initial setup. @@@ -180,7 -172,7 +180,7 @@@ // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@@ -189,9 -181,9 +189,9 @@@ // a ^= (d + c) <<< 18 movdqa xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) + pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@@ -235,7 -227,7 +235,7 @@@ // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) + pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@@ -244,9 -236,9 +244,9 @@@ // a ^= (d + c) <<< 18 movdqa xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) + pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@@ -270,9 -262,9 +270,9 @@@ // input. This can be done by juggling values in registers, with the // following fancy footwork: some row rotations, a transpose, and // some more rotations. - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13 - pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12 + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14 + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13 + pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12 movdqa xmm4, xmm0 movdqa xmm5, xmm3 @@@ -288,9 -280,9 +288,9 @@@ punpckhdq xmm1, xmm3 // 5, 6, 7, 4 punpckhdq xmm2, xmm5 // 15, 12, 13, 14 - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7 - pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11 - pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15 + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7 + pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11 + pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15 // Finally we have to write out the result. movdqu [OUT + 0], xmm0 @@@ -305,7 -297,7 +305,7 @@@ #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0 - rsrrxmm xmm7, 16 + rstrxmm xmm7, 16 stfree 64 + 8 #endif
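
A note on the `blkc.h' hunks above, with an illustrative sketch.  The old
idiom `(_x & ~MASK32) >> 16 >> 16' computes `~MASK32' in the type of the
MASK32 constant itself; if that constant is 32-bit unsigned, its complement
is zero, and the mask silently discards the top half of a 64-bit `unsigned
long' before the shifts even happen.  Casting to `unsigned long' first keeps
those bits, while shifting twice by 16 (rather than once by 32) stays
well-defined on platforms where `unsigned long' is only 32 bits wide.  The
program below is a minimal sketch, not taken from the project: it defines
its own MASK32 as 0xffffffffu rather than relying on catacomb's or mLib's
headers.

	/* sketch.c -- illustrative only; MASK32 is defined locally here,
	 * on the assumption that the real constant is 32-bit unsigned. */
	#include <stdio.h>

	#define MASK32 0xffffffffu	/* 32-bit all-ones constant */

	int main(void)
	{
	  unsigned long x = 0x0123456789abcdefUL; /* interesting only on LP64 */

	  /* Old idiom: `~MASK32' is evaluated as a 32-bit unsigned int,
	   * giving 0, so the high half of x is lost before the shifts. */
	  unsigned long broken = ((x & ~MASK32) >> 16) >> 16;

	  /* Fixed idiom: widen MASK32 before complementing, so the mask
	   * keeps bits 32 and up; the double 16-bit shift avoids an
	   * undefined `>> 32' when unsigned long is 32 bits wide. */
	  unsigned long fixed = ((x & ~(unsigned long)MASK32) >> 16) >> 16;

	  printf("broken = %#lx\n", broken);	/* prints 0 */
	  printf("fixed  = %#lx\n", fixed);	/* prints 0x1234567 on LP64 */
	  return 0;
	}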
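
The wholesale SHUF(...) rewrites in the `pshufd' immediates also deserve a
gloss.  Judging purely from the pattern of the changes (every call has its
arguments reversed, e.g. SHUF(3, 1, 2, 0) becoming SHUF(0, 2, 1, 3)), the
macro's convention appears to have moved from Intel's `_MM_SHUFFLE' style,
most significant selector first, to little-endian lane order, lane 0 first;
the semicolon in comments such as `(a'_0, a'_1; a''_0, a''_1)' likewise
seems to mark the boundary between the low and high 64-bit halves of the
register.  Either way, both spellings encode the same immediate byte, which
the sketch below checks.  SHUF_OLD and SHUF_NEW are stand-ins written for
this note under the assumption just stated, not catacomb's own definitions.

	/* shuf-check.c -- hypothetical stand-in macros, for illustration. */
	#include <assert.h>

	/* Old convention: selectors given most significant first, as in
	 * Intel's _MM_SHUFFLE(d, c, b, a). */
	#define SHUF_OLD(d, c, b, a) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

	/* New convention (as inferred): selectors in little-endian lane
	 * order, lane 0 first; same immediate, arguments reversed. */
	#define SHUF_NEW(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

	int main(void)
	{
	  /* Pairs rewritten in the diff above encode identical immediates. */
	  assert(SHUF_OLD(3, 1, 2, 0) == SHUF_NEW(0, 2, 1, 3));
	  assert(SHUF_OLD(2, 1, 0, 3) == SHUF_NEW(3, 0, 1, 2));
	  assert(SHUF_OLD(2, 3, 3, 3) == SHUF_NEW(3, 3, 3, 2));
	  return 0;
	}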