From a117c06f5ee62cbe7812769703eada01843f76ca Mon Sep 17 00:00:00 2001
From: Mark Wooding <mdw@distorted.org.uk>
Date: Mon, 12 Nov 2018 11:03:05 +0000
Subject: [PATCH] base/asm-common.h: Reverse the order of `SHUF' arguments.

The original idea was this: since one can change one's view of how the
bits in an XMM register are divided into lanes on a per-instruction
basis, it would make more sense if I took a single consistent view of
how the bits are arranged, with the least significant on the right and
the most significant on the left.  Therefore, I listed the shuffle
indices from left to right, counting from right to left.

This, I now realise, was a mistake.  The thing which finally made this
clear to me was that it makes the order of indices in the `SHUF' macro
inconsistent with the order of bytes in a table for the SSSE3 `pshufb'
instruction, and I can't do anything about that.

So: change the order of the arguments, and track down all uses of this
macro to fix them.  Sorry about that.

To verify that I got them all:

	for i in $(git grep -l SHUF); do
	  git blame -- $i | grep SHUF
	done | less
---
 base/asm-common.h            | 10 +++++-----
 math/mpx-mul4-amd64-sse2.S   | 12 ++++++------
 math/mpx-mul4-x86-sse2.S     | 12 ++++++------
 symm/chacha-x86ish-sse2.S    | 12 ++++++------
 symm/rijndael-x86ish-aesni.S |  6 +++---
 symm/salsa20-x86ish-sse2.S   | 24 ++++++++++++------------
 6 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/base/asm-common.h b/base/asm-common.h
index 8e51ea39..d6a8b012 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -217,11 +217,11 @@ name:
 # define INTADDR__1(addr, got) addr
 #endif
 
-// Permutations for SIMD instructions.  SHUF(D, C, B, A) is an immediate,
-// suitable for use in `pshufd' or `shufpd', which copies element D
-// (0 <= D < 4) of the source to element 3 of the destination, element C to
-// element 2, element B to element 1, and element A to element 0.
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+// Permutations for SIMD instructions.  SHUF(A, B, C, D) is an immediate,
+// suitable for use in `pshufd' or `shufpd', which copies element A
+// (0 <= A < 4) of the source to element 0 of the destination, element B to
+// element 1, element C to element 2, and element D to element 3.
+#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
 
 // Map register names to their individual pieces.
 
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 8b8cd414..64460ca9 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -96,7 +96,7 @@
 .macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
	// of the product in registers D0, D1, D2, D3.
-	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?)
+	pshufd	\d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
 .ifnes "\d1", "nil"
	movdqa	\d1, \slo		// (s'_0, s'_1; s''_0, s''_1)
 .endif
@@ -163,7 +163,7 @@
	// lane 0 or 1 of D; the high two lanes of D are clobbered.  On
	// completion, XMM3 is clobbered.  If CC is `nil', then the
	// contribution which would have been added to it is left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3)	// (?, ?; ?, t = c'' mod B)
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2)	// (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t; 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
@@ -209,11 +209,11 @@
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
 .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 .endif
 .endm
 
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 591a7a8f..11aadc95 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -103,7 +103,7 @@
 .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
 .endif
-	pshufd	\d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?; r_i, ?)
+	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
 .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
 .endif
@@ -171,7 +171,7 @@
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3)	// (?, ?; ?, t = c'' mod B)
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2)	// (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
@@ -209,11 +209,11 @@
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
 .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 .endif
 .endm
 
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index b8f72d53..77047ebe 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2)
 
	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
+	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
@@ -184,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards.  The remaining one can't be moved, though.
-	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
+	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
 
	// Apply the diagonal quarterround to each of the columns
	// simultaneously.
@@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2)
 
	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
@@ -226,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2)
	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the shuffles.
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
 
	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S
index a7a1ece3..dc80f4db 100644
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -211,16 +211,16 @@ FUNC(rijndael_setup_x86ish_aesni)
	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
-	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
+	pshufd	xmm0, xmm0, SHUF(3, 0, 1, 2)
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f
 
	// First word of the cycle.  This is the complicated piece.
 1:	movd	xmm0, eax
-	pshufd	xmm0, xmm0, SHUF(0, 3, 2, 1)
+	pshufd	xmm0, xmm0, SHUF(1, 2, 3, 0)
	aeskeygenassist xmm1, xmm0, 0
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movd	eax, xmm1
	xor	al, [RCON]
	inc	RCON
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index 76ac0ed9..ad4e322b 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -180,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2)
	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
@@ -189,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2)
 
	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
-	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	paddd	xmm4, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
@@ -235,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2)
	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
-	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
+	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
@@ -244,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2)
 
	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
-	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
+	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
	paddd	xmm4, xmm2
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
@@ -270,9 +270,9 @@ FUNC(salsa20_core_x86ish_sse2)
	// input.  This can be done by juggling values in registers, with the
	// following fancy footwork: some row rotations, a transpose, and
	// some more rotations.
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	// 3, 4, 9, 14
-	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	// 2, 7, 8, 13
-	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	// 1, 6, 11, 12
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)	// 3, 4, 9, 14
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	// 2, 7, 8, 13
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)	// 1, 6, 11, 12
 
	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
@@ -288,9 +288,9 @@ FUNC(salsa20_core_x86ish_sse2)
	punpckhdq xmm1, xmm3		// 5, 6, 7, 4
	punpckhdq xmm2, xmm5		// 15, 12, 13, 14
 
-	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	// 4, 5, 6, 7
-	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	// 8, 9, 10, 11
-	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)	// 4, 5, 6, 7
+	pshufd	xmm4, xmm4, SHUF(2, 3, 0, 1)	// 8, 9, 10, 11
+	pshufd	xmm2, xmm2, SHUF(1, 2, 3, 0)	// 12, 13, 14, 15
 
	// Finally we have to write out the result.
	movdqu	[OUT + 0], xmm0
-- 
2.11.0
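
As a quick sanity check on the new encoding, here is a minimal standalone
C sketch.  It is not part of the patch: the file name `shuf-demo.c', the
`emulate_pshufd()' helper, and the test values are invented for
illustration.  It defines SHUF() exactly as the patched base/asm-common.h
does, models `pshufd's lane selection in scalar code, and confirms that
the arguments now list source lanes for destination lanes 0 up to 3.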
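
	/* shuf-demo.c -- sanity check for the new SHUF() argument order.
	 * Hypothetical demo, not part of the patch: emulate_pshufd() and
	 * the test values below are invented for illustration.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* The new macro, as defined in base/asm-common.h: source lane A
	 * lands in destination lane 0, B in lane 1, C in 2, D in 3. */
	#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))

	/* Scalar model of `pshufd dst, src, imm8': destination lane i
	 * takes source lane (imm8 >> 2*i) & 3. */
	static void emulate_pshufd(uint32_t dst[4], const uint32_t src[4],
				   unsigned imm8)
	{
	  for (int i = 0; i < 4; i++)
	    dst[i] = src[(imm8 >> 2*i) & 3];
	}

	int main(void)
	{
	  const uint32_t x[4] = { 10, 11, 12, 13 };	/* lanes 0..3 */
	  uint32_t y[4];

	  /* SHUF(3, 0, 1, 2): rotate the lanes one step, as in the ChaCha
	   * and Salsa20 row rotations above. */
	  emulate_pshufd(y, x, SHUF(3, 0, 1, 2));
	  assert(y[0] == 13 && y[1] == 10 && y[2] == 11 && y[3] == 12);

	  /* SHUF(2, 3, 0, 1): swap the low and high halves. */
	  emulate_pshufd(y, x, SHUF(2, 3, 0, 1));
	  assert(y[0] == 12 && y[1] == 13 && y[2] == 10 && y[3] == 11);

	  printf("SHUF ordering checks passed\n");
	  return 0;
	}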
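
Note that the patch changes only the notation, not the generated code:
the new SHUF(3, 0, 1, 2), for instance, evaluates to 0x93, exactly the
byte the old SHUF(2, 1, 0, 3) produced, so every rewritten call site
assembles to the same instruction.  Build and run the sketch with
something like `cc -o shuf-demo shuf-demo.c && ./shuf-demo'.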