From a117c06f5ee62cbe7812769703eada01843f76ca Mon Sep 17 00:00:00 2001
From: Mark Wooding <mdw@distorted.org.uk>
Date: Mon, 12 Nov 2018 11:03:05 +0000
Subject: [PATCH] base/asm-common.h: Reverse the order of `SHUF' arguments.

The original idea was this: since one can change one's view of how the
bits in an XMM register are divided into lanes on a per-instruction
basis, it would make more sense if I took a single consistent view of
how the bits are arranged, with the least significant on the right and
the most significant on the left.  Therefore, I listed the shuffle
indices from left to right, counting from right to left.

This, I now realise, was a mistake.  The thing which finally made this
clear to me was that it makes the order of indices in the `SHUF' macro
inconsistent with the order of bytes in a table for the SSSE3 `pshufb'
instruction, and I can't do anything about that.

So: change the order of the arguments, and track down all uses of this
macro to fix them.  Sorry about that.

To verify that I got them all:

	for i in $(git grep -l SHUF); do
	  git blame -- $i | grep SHUF
	done | less
---
 base/asm-common.h            | 10 +++++-----
 math/mpx-mul4-amd64-sse2.S   | 12 ++++++------
 math/mpx-mul4-x86-sse2.S     | 12 ++++++------
 symm/chacha-x86ish-sse2.S    | 12 ++++++------
 symm/rijndael-x86ish-aesni.S |  6 +++---
 symm/salsa20-x86ish-sse2.S   | 24 ++++++++++++------------
 6 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/base/asm-common.h b/base/asm-common.h
index 8e51ea39..d6a8b012 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -217,11 +217,11 @@ name:
 # define INTADDR__1(addr, got) addr
 #endif
 
-// Permutations for SIMD instructions.  SHUF(D, C, B, A) is an immediate,
-// suitable for use in `pshufd' or `shufpd', which copies element D
-// (0 <= D < 4) of the source to element 3 of the destination, element C to
-// element 2, element B to element 1, and element A to element 0.
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+// Permutations for SIMD instructions.  SHUF(A, B, C, D) is an immediate,
+// suitable for use in `pshufd' or `shufpd', which copies element A
+// (0 <= A < 4) of the source to element 0 of the destination, element B to
+// element 1, element C to element 2, and element D to element 3.
+#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
 
 // Map register names to their individual pieces.
 
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 8b8cd414..64460ca9 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -96,7 +96,7 @@
 .macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
	// of the product in registers D0, D1, D2, D3.
-	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?)
+	pshufd	\d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
 .ifnes "\d1", "nil"
	movdqa	\d1, \slo		// (s'_0, s'_1; s''_0, s''_1)
 .endif
@@ -163,7 +163,7 @@
	// lane 0 or 1 of D; the high two lanes of D are clobbered.  On
	// completion, XMM3 is clobbered.  If CC is `nil', then the
	// contribution which would have been added to it is left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3)	// (?, ?; ?, t = c'' mod B)
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2)	// (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t; 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
@@ -209,11 +209,11 @@
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
 .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 .endif
 .endm
 
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 591a7a8f..11aadc95 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -103,7 +103,7 @@
 .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
 .endif
-	pshufd	\d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?; r_i, ?)
+	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
 .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
 .endif
@@ -171,7 +171,7 @@
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3)	// (?, ?; ?, t = c'' mod B)
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2)	// (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
@@ -209,11 +209,11 @@
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
 .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 .endif
 .endm
 
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index b8f72d53..77047ebe 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2)
 
	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
+	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
@@ -184,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards.  The remaining one can't be moved, though.
-	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
+	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
 
	// Apply the diagonal quarterround to each of the columns
	// simultaneously.
@@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2)
 
	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
@@ -226,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2)
	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the shuffles.
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
 
	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S
index a7a1ece3..dc80f4db 100644
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -211,16 +211,16 @@ FUNC(rijndael_setup_x86ish_aesni)
	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
-	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
+	pshufd	xmm0, xmm0, SHUF(3, 0, 1, 2)
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f
 
	// First word of the cycle.  This is the complicated piece.
 1:	movd	xmm0, eax
-	pshufd	xmm0, xmm0, SHUF(0, 3, 2, 1)
+	pshufd	xmm0, xmm0, SHUF(1, 2, 3, 0)
	aeskeygenassist xmm1, xmm0, 0
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movd	eax, xmm1
	xor	al, [RCON]
	inc	RCON
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index 76ac0ed9..ad4e322b 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -180,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2)
	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
@@ -189,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2)
 
	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
-	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	paddd	xmm4, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
@@ -235,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2)
	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
-	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
+	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
@@ -244,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2)
 
	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
-	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
+	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
	paddd	xmm4, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
@@ -270,9 +270,9 @@ FUNC(salsa20_core_x86ish_sse2)
	// input.  This can be done by juggling values in registers, with the
	// following fancy footwork: some row rotations, a transpose, and
	// some more rotations.
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	// 3, 4, 9, 14
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	// 2, 7, 8, 13
-	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	// 1, 6, 11, 12
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)	// 3, 4, 9, 14
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	// 2, 7, 8, 13
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)	// 1, 6, 11, 12
 
	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
@@ -288,9 +288,9 @@ FUNC(salsa20_core_x86ish_sse2)
	punpckhdq xmm1, xmm3		// 5, 6, 7, 4
	punpckhdq xmm2, xmm5		// 15, 12, 13, 14
 
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	// 4, 5, 6, 7
-	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	// 8, 9, 10, 11
-	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)	// 4, 5, 6, 7
+	pshufd	xmm4, xmm4, SHUF(2, 3, 0, 1)	// 8, 9, 10, 11
+	pshufd	xmm2, xmm2, SHUF(1, 2, 3, 0)	// 12, 13, 14, 15
 
	// Finally we have to write out the result.
	movdqu	[OUT + 0], xmm0
-- 
2.11.0
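
As a quick sanity check on the new encoding, here is a minimal standalone
C sketch.  It is not part of the patch: the file name `shuf-demo.c', the
`emulate_pshufd()' helper, and the test values are invented for
illustration.  It defines SHUF() exactly as the patched base/asm-common.h
does, models `pshufd's lane selection in scalar code, and confirms that
the arguments now list source lanes for destination lanes 0 up to 3.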
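
	/* shuf-demo.c -- sanity check for the new SHUF() argument order.
	 * Hypothetical demo, not part of the patch: emulate_pshufd() and
	 * the test values below are invented for illustration.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* The new macro, as defined in base/asm-common.h: source lane A
	 * lands in destination lane 0, B in lane 1, C in 2, D in 3. */
	#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))

	/* Scalar model of `pshufd dst, src, imm8': destination lane i
	 * takes source lane (imm8 >> 2*i) & 3. */
	static void emulate_pshufd(uint32_t dst[4], const uint32_t src[4],
				   unsigned imm8)
	{
	  for (int i = 0; i < 4; i++)
	    dst[i] = src[(imm8 >> 2*i) & 3];
	}

	int main(void)
	{
	  const uint32_t x[4] = { 10, 11, 12, 13 };	/* lanes 0..3 */
	  uint32_t y[4];

	  /* SHUF(3, 0, 1, 2): rotate the lanes one step, as in the ChaCha
	   * and Salsa20 row rotations above. */
	  emulate_pshufd(y, x, SHUF(3, 0, 1, 2));
	  assert(y[0] == 13 && y[1] == 10 && y[2] == 11 && y[3] == 12);

	  /* SHUF(2, 3, 0, 1): swap the low and high halves. */
	  emulate_pshufd(y, x, SHUF(2, 3, 0, 1));
	  assert(y[0] == 12 && y[1] == 13 && y[2] == 10 && y[3] == 11);

	  printf("SHUF ordering checks passed\n");
	  return 0;
	}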
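
Note that the patch changes only the notation, not the generated code:
the new SHUF(3, 0, 1, 2), for instance, evaluates to 0x93, exactly the
byte the old SHUF(2, 1, 0, 3) produced, so every rewritten call site
assembles to the same instruction.  Build and run the sketch with
something like `cc -o shuf-demo shuf-demo.c && ./shuf-demo'.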