From: Mark Wooding Date: Sat, 3 Feb 2024 23:02:22 +0000 (+0000) Subject: base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs. X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/981a9e5d5e3af6c06ad8b3f821928852068227e4 base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs. This makes operations which involve changing one's perspective about the SIMD processing elements make significantly more sense. In particular, I hope that this removes a layer of brain-twisting from the GCM code. * Adjust all of the register-contents diagrams so that less significant elements are on the right, rather than on the left. * Change the x86 `SHUF' macro so that the desired pieces are listed in decreasing significance order, so `SHUF(3, 2, 1, 0)' would be a no-op. I would, of course, continue to use big-endian notation on a target which actually used a big-endian ordering natively, but we don't currently support any of them. --- diff --git a/base/asm-common.h b/base/asm-common.h index b4d4a909..9257d762 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -222,11 +222,11 @@ name: # define INTADDR__1(addr, got) addr #endif -// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate, +// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate, // suitable for use in `pshufd' or `shufpd', which copies element A // (0 <= A < 4) of the source to element 0 of the destination, element B to // element 1, element C to element 2, and element D to element 3. -#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d)) +#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a)) // Map register names to their individual pieces. diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index 5a748c60..d4726afa 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -59,9 +59,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v'_1 v''_0 v''_1 -/// 16 v'_2 v'_3 v''_2 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v''_0 v'_1 v'_0 +/// 16 v''_3 v''_2 v'_3 v'_2 /// /// A `pmuludqd' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting @@ -137,32 +137,32 @@ .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces // of the product in registers D0, D1, D2, D3. - pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?) + pshufd \d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i) .ifnes "\d1", "nil" - movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1) + movdqa \d1, \slo // (s''_1, s''_0; s'_1, s'_0) .endif .ifnes "\d3", "nil" - movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3) + movdqa \d3, \shi // (s''_3, s''_2; s'_3, s'_2) .endif .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) + psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) .endif .ifnes "\d2", "nil" - movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) 
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) + psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) + pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) + pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) .endif .ifnes "\d2", "nil" - pmuludq \d2, \shi // (r_i s'_2; r_i s''_2) + pmuludq \d2, \shi // (r_i s''_2; r_i s'_2) .endif - pmuludq \d0, \slo // (r_i s'_0; r_i s''_0) + pmuludq \d0, \slo // (r_i s''_0; r_i s'_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@ -204,10 +204,10 @@ // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. - pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0) - pslldq xmm3, 2 // (t b; 0) - paddq \c, xmm3 // (c' + t b; c'') + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) + psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) + pslldq xmm3, 2 // (0; t b) + paddq \c, xmm3 // (c''; c' + t b) .ifeqs "\pos", "lo" movdqa \d, \c .else @@ -224,10 +224,10 @@ // of the value represented in C are written at POS in D, and the // remaining bits are left at the bottom of T. movdqa \t, \c - psllq \t, 16 // (?; c'' b) - pslldq \c, 8 // (0; c') - paddq \t, \c // (?; c' + c'' b) - psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) + psllq \t, 16 // (c'' b; ?) + pslldq \c, 8 // (c'; 0) + paddq \t, \c // (c' + c'' b; ?) + psrldq \t, 8 // (0; c' + c'' b) = (0; c) .ifeqs "\pos", "lo" movdqa \d, \t .else @@ -240,21 +240,21 @@ // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. - movdqa \b, \a // (a_0, a_1; a_2, a_3) + movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1; c_2, c_3) + movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif - punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) + punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) + punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) + punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) + punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif - pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm @@ -270,10 +270,10 @@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0; y'_2) - punpckhqdq \c0, \c2 // (y''_0; y''_2) - punpcklqdq \u, \c3 // (y'_1; y'_3) - punpckhqdq \c1, \c3 // (y''_1; y''_3) + punpcklqdq \t, \c2 // (y'_2; y'_0) + punpckhqdq \c0, \c2 // (y''_2; y''_0) + punpcklqdq \u, \c3 // (y'_3; y'_1) + punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. 
The high (up to) 48 bits will // go up; the low 16 bits go down. @@ -281,43 +281,43 @@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0; y''_2) - psrlq \c1, 16 // high parts of (y''_1; y''_3) - psrlq \c2, 32 // low parts of (y''_0; y''_2) - psrlq \c3, 32 // low parts of (y''_1; y''_3) + psrlq \c0, 16 // high parts of (y''_2; y''_0) + psrlq \c1, 16 // high parts of (y''_3; y''_1) + psrlq \c2, 32 // low parts of (y''_2; y''_0) + psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0; y''_1) + pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0; y_2) - paddq \u, \c0 // (y_1; y_3) + paddq \t, \c1 // and up: (y_2; y_0) + paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3; 0) + psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0; ?) - movdqa \lo, \t // (y^*_0, ?; ?, ?) - psrldq \t, 8 // (y_2; 0) + movdqa \c3, \t // (?; y_0) + movdqa \lo, \t // (?, ?; ?, y^*_0) + psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) - movdqa \c1, \c3 // (y^*_1, ?; ?, ?) - psrldq \u, 8 // (y_3; 0) + movdqa \c1, \c3 // (?, ?; ?, y^*_1) + psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) + punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@ -334,13 +334,13 @@ // On exit, the carry registers, including XMM15, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. 
- movd xmm0, [rdi + 0] // (a_0; 0) - movd xmm1, [rdi + 4] // (a_1; 0) - movd xmm2, [rdi + 8] // (a_2; 0) - movd xmm15, [rdi + 12] // (a_3; 0) - paddq xmm12, xmm0 // (c'_0 + a_0; c''_0) - paddq xmm13, xmm1 // (c'_1 + a_1; c''_1) - paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) + movd xmm0, [rdi + 0] // (0; a_0) + movd xmm1, [rdi + 4] // (0; a_1) + movd xmm2, [rdi + 8] // (0; a_2) + movd xmm15, [rdi + 12] // (0; a_3) + paddq xmm12, xmm0 // (c''_0; c'_0 + a_0) + paddq xmm13, xmm1 // (c''_1; c'_1 + a_1) + paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- @@ -654,8 +654,8 @@ INTFUNC(mmla4) mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) + punpckldq xmm12, xmm15 // (0, w_1; 0, w_0) + punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2) mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1 accum xmm5, xmm6 @@ -667,10 +667,10 @@ INTFUNC(mmla4) mulcore xmm7, 3, xmm10, xmm11, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) + punpckldq xmm12, xmm2 // (0, 0; 0, w_0) + punpckldq xmm14, xmm2 // (0, 0; 0, w_2) + punpckhdq xmm13, xmm2 // (0, 0; 0, w_1) + punpckhdq xmm15, xmm2 // (0, 0; 0, w_3) // That's lots of pieces. Now we have to assemble the answer. squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@ -736,8 +736,8 @@ INTFUNC(mont4) mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) + punpckldq xmm12, xmm15 // (0, w_1; 0, w_0) + punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2) mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1 accum xmm5, xmm6 @@ -749,10 +749,10 @@ INTFUNC(mont4) mulcore xmm7, 3, xmm8, xmm9, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) + punpckldq xmm12, xmm2 // (0, 0; 0, w_0) + punpckldq xmm14, xmm2 // (0, 0; 0, w_2) + punpckhdq xmm13, xmm2 // (0, 0; 0, w_1) + punpckhdq xmm15, xmm2 // (0, 0; 0, w_3) // That's lots of pieces. Now we have to assemble the answer. 
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@ -1511,9 +1511,9 @@ ENDFUNC .endm .macro testldcarry - movdqu xmm12, [rcx + 0] // (c'_0; c''_0) - movdqu xmm13, [rcx + 16] // (c'_1; c''_1) - movdqu xmm14, [rcx + 32] // (c'_2; c''_2) + movdqu xmm12, [rcx + 0] // (c''_0; c'_0) + movdqu xmm13, [rcx + 16] // (c''_1; c'_1) + movdqu xmm14, [rcx + 32] // (c''_2; c'_2) .endm .macro testtop u=nil @@ -1601,8 +1601,8 @@ FUNC(test_mmul4) testtop r11 call mmul4 testtail - pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) - pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) + pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) + pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1614,8 +1614,8 @@ FUNC(test_mmla4) testtop r11 call mmla4 testtail - pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) - pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) + pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) + pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1627,8 +1627,8 @@ FUNC(test_mont4) testtop call mont4 testtail - pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) - pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) + pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) + pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout diff --git a/math/mpx-mul4-arm-neon.S b/math/mpx-mul4-arm-neon.S index efca7902..8aa01bc0 100644 --- a/math/mpx-mul4-arm-neon.S +++ b/math/mpx-mul4-arm-neon.S @@ -60,9 +60,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v''_0 v'_1 v''_1 -/// 16 v'_2 v''_2 v'_3 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v'_1 v''_0 v'_0 +/// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act @@ -1012,12 +1012,12 @@ ENDFUNC ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 - vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(1)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 - vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(2)] // = n ldr r6, [STKARG(3)] // -> cyv @@ -1029,7 +1029,7 @@ ENDFUNC vld1.32 {q4}, [r3] vmov.i32 q5, #0 - vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(0)] // = n ldr r6, [STKARG(1)] // -> cyv @@ -1044,12 +1044,12 @@ ENDFUNC ldr r14, [STKARG(1)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 - vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(2)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 - vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(3)] // = n ldr r6, [STKARG(4)] // -> cyv @@ -1065,7 +1065,7 @@ ENDFUNC ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 - vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r5, [STKARG(1)] // = n ldr r6, [STKARG(2)] // -> cyv diff --git a/math/mpx-mul4-arm64-simd.S b/math/mpx-mul4-arm64-simd.S index 60eed208..ee33a002 100644 --- a/math/mpx-mul4-arm64-simd.S +++ b/math/mpx-mul4-arm64-simd.S @@ -57,9 +57,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD /// operands, as follows. 
/// -/// Offset 0 4 8 12 -/// 0 v'_0 v''_0 v'_1 v''_1 -/// 16 v'_2 v''_2 v'_3 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v'_1 v''_0 v'_0 +/// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `umull' and `umlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act @@ -230,7 +230,7 @@ // leaving a carry in CG. // // In detail, what happens is as follows. Suppose initially that ZLO = -// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i; // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has // a circuit depth of 3; I don't know how to do better. @@ -1032,12 +1032,12 @@ ENDFUNC .ifeqs "\mode", "dmul" ldr q2, [x4] - zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) - zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2) + zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0) ldr q4, [x5] - zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) mov x16, x1 mov x1, x2 // -> u @@ -1050,8 +1050,8 @@ ENDFUNC .ifeqs "\mode", "smul" ldr q4, [x3] - zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) // x2 // -> x mov x3, x1 // -> c @@ -1061,12 +1061,12 @@ ENDFUNC .ifeqs "\mode", "mmul" ldr q2, [x5] - zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) - zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2) + zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0) ldr q6, [x6] - zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) mov x16, x1 mov x1, x3 // -> u @@ -1082,8 +1082,8 @@ ENDFUNC .ifeqs "\mode", "mont" ldr q6, [x4] - zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3) - zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1) + zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2) + zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0) mov x4, x2 // -> y mov x2, x3 // -> x diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 916adef9..0964de9f 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -58,9 +58,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v'_1 v''_0 v''_1 -/// 16 v'_2 v'_3 v''_2 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v''_0 v'_1 v'_0 +/// 16 v''_3 v''_2 v'_3 v'_2 /// /// A `pmuludq' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting @@ -135,41 +135,41 @@ .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. 
- movd \d0, \r // (r_i, 0; 0, 0) + movd \d0, \r // (0, 0; 0, r_i) .ifnes "\d1", "nil" - movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1) + movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0) .endif .ifnes "\d3", "nil" - movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3) + movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2) .endif - pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?) + pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i) .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) + psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...) .else - movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) + movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) .endif .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) + psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) + pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) + pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2) + pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2) .else pmuludq \d2, [\s + 16] .endif .endif - pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0) + pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@ -210,10 +210,10 @@ // carry registers. On completion, XMM3 is clobbered. If CC is // `nil', then the contribution which would have been added to it is // left in C. - pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0) - pslldq xmm3, 2 // (t b; 0) - paddq \c, xmm3 // (c' + t b; c'') + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) + psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) + pslldq xmm3, 2 // (0; t b) + paddq \c, xmm3 // (c''; c' + t b) movd \d, \c psrlq \c, 32 // floor(c/B) .ifnes "\cc", "nil" @@ -226,10 +226,10 @@ // of the value represented in C are written to D, and the remaining // bits are left at the bottom of T. movdqa \t, \c - psllq \t, 16 // (?; c'' b) - pslldq \c, 8 // (0; c') - paddq \t, \c // (?; c' + c'' b) - psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) + psllq \t, 16 // (c'' b; ?) + pslldq \c, 8 // (c'; 0) + paddq \t, \c // (c' + c'' b; ?) + psrldq \t, 8 // (0; c' + c'' b) = (0; c) movd \d, \t psrldq \t, 4 // (floor(c/B); 0) .endm @@ -238,21 +238,21 @@ // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. 
- movdqa \b, \a // (a_0, a_1; a_2, a_3) + movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1; c_2, c_3) + movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif - punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) + punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) + punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) + punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) + punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif - pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm @@ -268,10 +268,10 @@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0; y'_2) - punpckhqdq \c0, \c2 // (y''_0; y''_2) - punpcklqdq \u, \c3 // (y'_1; y'_3) - punpckhqdq \c1, \c3 // (y''_1; y''_3) + punpcklqdq \t, \c2 // (y'_2; y'_0) + punpckhqdq \c0, \c2 // (y''_2; y''_0) + punpcklqdq \u, \c3 // (y'_3; y'_1) + punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. @@ -279,43 +279,43 @@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0; y''_2) - psrlq \c1, 16 // high parts of (y''_1; y''_3) - psrlq \c2, 32 // low parts of (y''_0; y''_2) - psrlq \c3, 32 // low parts of (y''_1; y''_3) + psrlq \c0, 16 // high parts of (y''_2; y''_0) + psrlq \c1, 16 // high parts of (y''_3; y''_1) + psrlq \c2, 32 // low parts of (y''_2; y''_0) + psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0; y''_1) + pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0; y_2) - paddq \u, \c0 // (y_1; y_3) + paddq \t, \c1 // and up: (y_2; y_0) + paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3; 0) + psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0; ?) - movdqa \lo, \t // (y^*_0, ?; ?, ?) - psrldq \t, 8 // (y_2; 0) + movdqa \c3, \t // (?; y_0) + movdqa \lo, \t // (?, ?; ?, y^*_0) + psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) - movdqa \c1, \c3 // (y^*_1, ?; ?, ?) - psrldq \u, 8 // (y_3; 0) + movdqa \c1, \c3 // (?, ?; ?, y^*_1) + psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) 
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@ -332,14 +332,14 @@ // On exit, the carry registers, including XMM7, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. - movd xmm0, [edi + 0] // (a_0; 0) - movd xmm1, [edi + 4] // (a_1; 0) - movd xmm2, [edi + 8] // (a_2; 0) - movd xmm7, [edi + 12] // (a_3; 0) - - paddq xmm4, xmm0 // (c'_0 + a_0; c''_0) - paddq xmm5, xmm1 // (c'_1 + a_1; c''_1) - paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) + movd xmm0, [edi + 0] // (0; a_0) + movd xmm1, [edi + 4] // (0; a_1) + movd xmm2, [edi + 8] // (0; a_2) + movd xmm7, [edi + 12] // (0; a_3) + + paddq xmm4, xmm0 // (c''_0; c'_0 + a_0) + paddq xmm5, xmm1 // (c''_1; c'_1 + a_1) + paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- @@ -1148,9 +1148,9 @@ ENDFUNC .macro testldcarry c mov ecx, \c // -> c - movdqu xmm4, [ecx + 0] // (c'_0; c''_0) - movdqu xmm5, [ecx + 16] // (c'_1; c''_1) - movdqu xmm6, [ecx + 32] // (c'_2; c''_2) + movdqu xmm4, [ecx + 0] // (c''_0; c'_0) + movdqu xmm5, [ecx + 16] // (c''_1; c'_1) + movdqu xmm6, [ecx + 32] // (c''_2; c'_2) .endm .macro testexpand v=nil, y=nil @@ -1286,8 +1286,8 @@ FUNC(test_mmul4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] - pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) - pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1304,8 +1304,8 @@ FUNC(test_mmla4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] - pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) - pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1322,8 +1322,8 @@ FUNC(test_mont4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] - pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) - pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index 974ec5b5..13a1848c 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -184,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2) // // The shuffles have quite high latency, so they've mostly been // pushed upwards. The remaining one can't be moved, though. - pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) + pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) // Apply the diagonal quarterround to each of the columns // simultaneously. @@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -226,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2) // Finally, finish off undoing the transpose, and we're done for this // doubleround. 
Again, most of this was done above so we don't have // to wait for the shuffles. - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // Decrement the loop counter and see if we should go round again. sub NR, 2 diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S index d5a58f89..8494e42b 100644 --- a/symm/gcm-arm-crypto.S +++ b/symm/gcm-arm-crypto.S @@ -99,19 +99,19 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // q0 = // (u_0; u_1) - // q1 = // (v_0; v_1) + // q0 = // (u_1; u_0) + // q1 = // (v_1; v_0) vmull.p64 q2, d1, d2 // u_1 v_0 vmull.p64 q3, d0, d3 // u_0 v_1 - vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1 - vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0 + vmull.p64 q8, d1, d3 // (t_1; x_3) = u_1 v_1 + vmull.p64 q9, d0, d2 // (x_0; t_0) = u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0 + veor q2, q2, q3 // (m_0; m_1) = u_0 v_1 + u_1 v_0 veor d17, d17, d4 // x_2 = t_1 + m_1 veor d18, d18, d5 // x_1 = t_0 + m_0 - // q8 = // (x_3; x_2) - // q9 = // (x_1; x_0) + // q8 = // (x_2; x_3) + // q9 = // (x_0; x_1) // One-and-a-half problems remain. // @@ -198,11 +198,11 @@ // This is an inconvenient size. There's nothing for it but to do // four multiplications, as if for the 128-bit case. - // q0 = // (u_0 + u_1 t^32; u_2) - // q1 = // (v_0 + v_1 t^32; v_2) + // q0 = // (u_2; u_0 + u_1 t^32) + // q1 = // (v_2; v_0 + v_1 t^32) vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0 vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1 - vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d) + vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (d; 0) vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 // + u_1 v_1 t^64 = f @@ -279,24 +279,24 @@ veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1 // Piece the product together. - veor d17, d17, d22 // q8 = // (x_5; x_4) + veor d17, d17, d22 // q8 = // (x_4; x_5) veor d18, d18, d23 - veor d19, d19, d24 // q9 = // (x_3; x_2) - veor d20, d20, d25 // q10 = // (x_1; x_0) + veor d19, d19, d24 // q9 = // (x_2; x_3) + veor d20, d20, d25 // q10 = // (x_0; x_1) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. - // q8 = // (y_5; y_4) - // q9 = // (y_3; y_2) - // q10 = // (y_1; y_0) - vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t + // q8 = // (y_4; y_5) + // q9 = // (y_2; y_3) + // q10 = // (y_0; y_1) + vshl.u64 q11, q8, #63 // (y_4; y_5) b_i for t vshl.u64 d28, d18, #63 // y_3 b_i for t - vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2 + vshl.u64 q12, q8, #62 // (y_4; y_5) b_i for t^2 vshl.u64 d29, d18, #62 // y_3 b_i for t^2 - vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7 + vshl.u64 q13, q8, #57 // (y_4; y_5) b_i for t^7 vshl.u64 d30, d18, #57 // y_3 b_i for t^7 veor q11, q11, q12 // mix them all together veor d28, d28, d29 @@ -307,14 +307,14 @@ // And finally shift the low bits up. Also, switch the order of the // pieces for output. 
- // q8 = // (y'_5; y'_4) - // q9 = // (y'_3; y'_2) - // q10 = // (y'_1; y'_0) - vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t + // q8 = // (y'_4; y'_5) + // q9 = // (y'_2; y'_3) + // q10 = // (y'_0; y'_1) + vshr.u64 q11, q8, #1 // (y_4; y_5) a_i for t vshr.u64 d28, d18, #1 // y'_3 a_i for t - vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2 + vshr.u64 q12, q8, #2 // (y_4; y_5) a_i for t^2 vshr.u64 d29, d18, #2 // y'_3 a_i for t^2 - vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7 + vshr.u64 q13, q8, #7 // (y_4; y_5) a_i for t^7 vshr.u64 d30, d18, #7 // y'_3 a_i for t^7 veor q8, q8, q11 veor d18, d18, d28 @@ -348,13 +348,13 @@ // 128-bit multiplications already, and Karatsuba is too annoying // there, so there'll be 12 multiplications altogether, rather than // the 16 we'd have if we did this the naïve way. - // q0 = // u_0 = (u_00; u_01) - // q1 = // u_1 = (u_10; u_11) - // q2 = // v_0 = (v_00; v_01) - // q3 = // v_1 = (v_10; v_11) + // q0 = // u_0 = (u_01; u_00) + // q1 = // u_1 = (u_11; u_10) + // q2 = // v_0 = (v_01; v_00) + // q3 = // v_1 = (v_11; v_10) - veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11) - veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11) + veor q8, q0, q1 // u_* = (u_01 + u_11; u_00 + u_10) + veor q9, q2, q3 // v_* = (v_01 + v_11; v_00 + v_10) // Start by building the cross product, q = u_* v_*. vmull.p64 q14, d16, d19 // u_*0 v_*1 @@ -398,16 +398,16 @@ // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. // First, shift the high bits down. - // q8 = // (y_7; y_6) - // q9 = // (y_5; y_4) - // q10 = // (y_3; y_2) - // q11 = // (y_1; y_0) - vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2 - vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2 - vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5 - vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5 - vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10 - vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10 + // q8 = // (y_6; y_7) + // q9 = // (y_4; y_5) + // q10 = // (y_2; y_3) + // q11 = // (y_0; y_1) + vshl.u64 q0, q8, #62 // (y_6; y_7) b_i for t^2 + vshl.u64 q12, q9, #62 // (y_4; y_5) b_i for t^2 + vshl.u64 q1, q8, #59 // (y_6; y_7) b_i for t^5 + vshl.u64 q13, q9, #59 // (y_4; y_5) b_i for t^5 + vshl.u64 q2, q8, #54 // (y_6; y_7) b_i for t^10 + vshl.u64 q14, q9, #54 // (y_4; y_5) b_i for t^10 veor q0, q0, q1 // mix the contributions together veor q12, q12, q13 veor q0, q0, q2 @@ -419,16 +419,16 @@ // And then shift the low bits up. Also, switch the order of the // pieces for output. 
- // q8 = // (y'_7; y'_6) - // q9 = // (y'_5; y'_4) - // q10 = // (y'_3; y'_2) - // q11 = // (y'_1; y'_0) - vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2 - vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2 - vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5 - vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5 - vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10 - vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10 + // q8 = // (y'_6; y'_7) + // q9 = // (y'_4; y'_5) + // q10 = // (y'_2; y'_3) + // q11 = // (y'_0; y'_1) + vshr.u64 q0, q8, #2 // (y_6; y_7) a_i for t^2 + vshr.u64 q12, q9, #2 // (y'_4; y_5) a_i for t^2 + vshr.u64 q1, q8, #5 // (y_6; y_7) a_i for t^5 + vshr.u64 q13, q9, #5 // (y_4; y_5) a_i for t^5 + vshr.u64 q2, q8, #10 // (y_6; y_7) a_i for t^10 + vshr.u64 q14, q9, #10 // (y_4; y_5) a_i for t^10 veor q8, q8, q0 // mix the contributions together veor q1, q1, q2 diff --git a/symm/gcm-arm64-pmull.S b/symm/gcm-arm64-pmull.S index dcd8c450..0e4bd798 100644 --- a/symm/gcm-arm64-pmull.S +++ b/symm/gcm-arm64-pmull.S @@ -71,19 +71,19 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // v0 = // (u_0; u_1) - // v1/v2 = // (v_0; v_1) + // v0 = // (u_1; u_0) + // v1/v2 = // (v_1; v_0) pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0 pmull v4.1q, v0.1d, v2.1d // u_0 v_1 - pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1 - pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0 + pmull2 v5.1q, v0.2d, v2.2d // (x_3; t_1) = u_1 v_1 + pmull v6.1q, v0.1d, v1.1d // (t_0; x_0) = u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0 - vshr128 v4, v3, 64 // (m_1; 0) - vshl128 v3, v3, 64 // (0; m_0) - eor v1.16b, v5.16b, v4.16b // (x_2; x_3) - eor v0.16b, v6.16b, v3.16b // (x_0; x_1) + eor v3.16b, v3.16b, v4.16b // (m_1; m_0) = u_0 v_1 + u_1 v_0 + vshr128 v4, v3, 64 // (0; m_1) + vshl128 v3, v3, 64 // (m_0; 0) + eor v1.16b, v5.16b, v4.16b // (x_3; x_2) + eor v0.16b, v6.16b, v3.16b // (x_1; x_0) // And now the only remaining difficulty is that the result needs to // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 @@ -137,8 +137,8 @@ // leave with z = u v in x2. Clobbers x2--x4. // The multiplication is thankfully easy. - // v0 = // (u; ?) - // v1 = // (v; ?) + // v0 = // (?; u) + // v1 = // (?; v) pmull v0.1q, v0.1d, v1.1d // u v // Now we must reduce. This is essentially the same as the 128-bit @@ -176,12 +176,12 @@ // shift both of them up by four bytes before we start. This will // mean that the high 64 bits of the result (from GCM's viewpoint) // will be zero. - // v0 = // (u_0 + u_1 t^32; u_2) + // v0 = // (u_2; u_0 + u_1 t^32) // v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32) // v2 = // (v_2; v_2) pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0 pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1 - pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0) + pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (0; d) pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 // + u_1 v_1 t^64 = f @@ -238,8 +238,8 @@ // Clobbers v16--v25. // Start multiplying and accumulating pieces of product. - // v0 = // (u_0; u_1) - // v1 = // (u_2; ?) + // v0 = // (u_1; u_0) + // v1 = // (?; u_2) // v2 = // (v_0; v_0) // v3 = // (v_1; v_1) // v4 = // (v_2; v_2) @@ -262,27 +262,27 @@ eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1 // Piece the product together. 
- // v16 = // (a_0; a_1) - // v19 = // (b_0; b_1) - // v17 = // (c_0; c_1) - // v20 = // (d_0; d_1) - // v18 = // (e_0; e_1) - vshl128 v21, v19, 64 // (0; b_0) - ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0) - vshr128 v23, v20, 64 // (d_1; 0) - eor v16.16b, v16.16b, v21.16b // (x_0; x_1) - eor v17.16b, v17.16b, v22.16b // (x_2; x_3) - eor v18.16b, v18.16b, v23.16b // (x_2; x_3) + // v16 = // (a_1; a_0) + // v19 = // (b_1; b_0) + // v17 = // (c_1; c_0) + // v20 = // (d_1; d_0) + // v18 = // (e_1; e_0) + vshl128 v21, v19, 64 // (b_0; 0) + ext v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1) + vshr128 v23, v20, 64 // (0; d_1) + eor v16.16b, v16.16b, v21.16b // (x_1; x_0) + eor v17.16b, v17.16b, v22.16b // (x_3; x_2) + eor v18.16b, v18.16b, v23.16b // (x_3; x_2) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. - // v16 = // (y_0; y_1) - // v17 = // (y_2; y_3) - // v18 = // (y_4; y_5) - mov v19.d[0], v17.d[1] // (y_3; ?) + // v16 = // (y_1; y_0) + // v17 = // (y_3; y_2) + // v18 = // (y_5; y_4) + mov v19.d[0], v17.d[1] // (?; y_3) ushr v23.2d, v18.2d, #63 // hi b_i for t ushr d20, d19, #63 // lo b_i for t @@ -298,15 +298,15 @@ // Permute the high pieces while we fold in the b_i. eor v17.16b, v17.16b, v23.16b vshl128 v20, v20, 64 - mov v19.d[0], v18.d[1] // (y_5; ?) - ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4) + mov v19.d[0], v18.d[1] // (?; y_5) + ext v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3) eor v16.16b, v16.16b, v20.16b // And finally shift the low bits up. - // v16 = // (y'_0; y'_1) - // v17 = // (y'_2; ?) - // v18 = // (y'_3; y'_4) - // v19 = // (y'_5; ?) + // v16 = // (y'_1; y'_0) + // v17 = // (?; y'_2) + // v18 = // (y'_4; y'_3) + // v19 = // (?; y'_5) shl v20.2d, v18.2d, #1 shl d23, d19, #1 shl v21.2d, v18.2d, #2 @@ -345,14 +345,14 @@ // 128-bit multiplications already, and Karatsuba is too annoying // there, so there'll be 12 multiplications altogether, rather than // the 16 we'd have if we did this the naïve way. - // v0 = // u_0 = (u_00; u_01) - // v1 = // u_1 = (u_10; u_11) + // v0 = // u_0 = (u_01; u_00) + // v1 = // u_1 = (u_11; u_10) // v2 = // (v_00; v_00) // v3 = // (v_01; v_01) // v4 = // (v_10; v_10) // v5 = // (v_11; v_11) - eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11) + eor v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10) eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10 eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11 @@ -402,16 +402,16 @@ // Now we must reduce. This is essentially the same as the 192-bit // case above, but more complicated because everything is bigger. // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. 
- // v16 = // (y_0; y_1) - // v17 = // (y_2; y_3) - // v18 = // (y_4; y_5) - // v19 = // (y_6; y_7) - ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2 - ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2 - ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5 - ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5 - ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10 - ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10 + // v16 = // (y_1; y_0) + // v17 = // (y_3; y_2) + // v18 = // (y_5; y_4) + // v19 = // (y_7; y_6) + ushr v24.2d, v18.2d, #62 // (y_5; y_4) b_i for t^2 + ushr v25.2d, v19.2d, #62 // (y_7; y_6) b_i for t^2 + ushr v26.2d, v18.2d, #59 // (y_5; y_4) b_i for t^5 + ushr v27.2d, v19.2d, #59 // (y_7; y_6) b_i for t^5 + ushr v28.2d, v18.2d, #54 // (y_5; y_4) b_i for t^10 + ushr v29.2d, v19.2d, #54 // (y_7; y_6) b_i for t^10 eor v24.16b, v24.16b, v26.16b // mix the contributions together eor v25.16b, v25.16b, v27.16b eor v24.16b, v24.16b, v28.16b @@ -424,16 +424,16 @@ eor v16.16b, v16.16b, v24.16b // And then shift the low bits up. - // v16 = // (y'_0; y'_1) - // v17 = // (y'_2; y'_3) - // v18 = // (y'_4; y'_5) - // v19 = // (y'_6; y'_7) - shl v24.2d, v18.2d, #2 // (y'_4; y_5) a_i for t^2 - shl v25.2d, v19.2d, #2 // (y_6; y_7) a_i for t^2 - shl v26.2d, v18.2d, #5 // (y'_4; y_5) a_i for t^5 - shl v27.2d, v19.2d, #5 // (y_6; y_7) a_i for t^5 - shl v28.2d, v18.2d, #10 // (y'_4; y_5) a_i for t^10 - shl v29.2d, v19.2d, #10 // (y_6; y_7) a_i for t^10 + // v16 = // (y'_1; y'_0) + // v17 = // (y'_3; y'_2) + // v18 = // (y'_5; y'_4) + // v19 = // (y'_7; y'_6) + shl v24.2d, v18.2d, #2 // (y_5; y'_4) a_i for t^2 + shl v25.2d, v19.2d, #2 // (y_7; y_6) a_i for t^2 + shl v26.2d, v18.2d, #5 // (y_5; y'_4) a_i for t^5 + shl v27.2d, v19.2d, #5 // (y_7; y_6) a_i for t^5 + shl v28.2d, v18.2d, #10 // (y_5; y'_4) a_i for t^10 + shl v29.2d, v19.2d, #10 // (y_7; y_6) a_i for t^10 eor v18.16b, v18.16b, v24.16b // mix the contributions together eor v19.16b, v19.16b, v25.16b eor v26.16b, v26.16b, v28.16b diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S index 837abbdd..fadeca58 100644 --- a/symm/gcm-x86ish-pclmul.S +++ b/symm/gcm-x86ish-pclmul.S @@ -113,21 +113,21 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // xmm0 = // (u_1; u_0) - // xmm1 = // (v_1; v_0) - movdqa xmm2, xmm1 // (v_1; v_0) again - movdqa xmm3, xmm0 // (u_1; u_0) again - movdqa xmm4, xmm0 // (u_1; u_0) yet again + // xmm0 = // (u_0; u_1) + // xmm1 = // (v_0; v_1) + movdqa xmm2, xmm1 // (v_0; v_1) again + movdqa xmm3, xmm0 // (u_0; u_1) again + movdqa xmm4, xmm0 // (u_0; u_1) yet again pclmulhqlqdq xmm2, xmm0 // u_1 v_0 pclmullqlqdq xmm0, xmm1 // u_1 v_1 pclmulhqlqdq xmm3, xmm1 // u_0 v_1 pclmulhqhqdq xmm4, xmm1 // u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 - movdqa xmm1, xmm2 // (m_1; m_0) again - pslldq xmm2, 8 // (0; m_1) - psrldq xmm1, 8 // (m_0; 0) + pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1 + movdqa xmm1, xmm2 // (m_0; m_1) again + pslldq xmm2, 8 // (m_1; 0) + psrldq xmm1, 8 // (0; m_0) pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1 pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0 @@ -158,9 +158,9 @@ // word together, and then the low bits, everything will be fine. // First, shift the high bits down. 
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again - movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again - movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again + movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again + movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again + movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again pslld xmm2, 31 // the b_i for t pslld xmm3, 30 // the b_i for t^2 pslld xmm4, 25 // the b_i for t^7 @@ -196,13 +196,13 @@ // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. // First, we must detach the top (`low'!) half of the result. - movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again - psrldq xmm1, 8 // (x_1, x_0; 0, 0) + movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again + psrldq xmm1, 8 // (0, 0; x_0, x_1) // Next, shift the high bits down. - movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again - movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again - movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again + movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again + movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again + movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again pslld xmm2, 31 // b_i for t pslld xmm3, 29 // b_i for t^3 pslld xmm4, 28 // b_i for t^4 @@ -239,11 +239,11 @@ // shift both of them up by four bytes before we start. This will // mean that the high 64 bits of the result (from GCM's viewpoint) // will be zero. - // xmm0 = // (0, u_2; u_1, u_0) - // xmm1 = // (0, v_2; v_1, v_0) - movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again - movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again - movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again + // xmm0 = // (u_0, u_1; u_2, 0) + // xmm1 = // (v_0, v_1; v_2, 0) + movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again + movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again + movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 @@ -255,10 +255,10 @@ // registers. The answer we want is d t^128 + e t^64 + f, where e = // e_0 + e_1. // - // The place values for the two halves are (t^160, t^128; t^96, ?) - // and (?, t^64; t^32, 1). But we also want to shift the high part + // The place values for the two halves are (?, t^96; t^128, t^160) + // and (1, t^32; t^64, ?). But we also want to shift the high part // left by a word, for symmetry's sake. - psrldq xmm0, 8 // (d; 0) = d t^128 + psrldq xmm0, 8 // (0; d) = d t^128 pxor xmm2, xmm3 // e = (e_0 + e_1) movdqa xmm1, xmm4 // f again pxor xmm0, xmm2 // d t^128 + e t^64 @@ -308,15 +308,15 @@ // are unimportant. Clobbers xmm2--xmm7. // Start multiplying and accumulating pieces of product. - // xmm0 = // (u_2; u_1) - // xmm1 = // (u_0; ?) - // xmm2 = // (v_2; v_1) - // xmm3 = // (v_0; ?) - movdqa xmm4, xmm0 // (u_2; u_1) again - movdqa xmm5, xmm0 // (u_2; u_1) yet again - movdqa xmm6, xmm0 // (u_2; u_1) again again - movdqa xmm7, xmm3 // (v_0; ?) 
again - punpcklqdq xmm3, xmm1 // (v_0; u_0) + // xmm0 = // (u_1; u_2) + // xmm1 = // (?; u_0) + // xmm2 = // (v_1; v_2) + // xmm3 = // (?; v_0) + movdqa xmm4, xmm0 // (u_1; u_2) again + movdqa xmm5, xmm0 // (u_1; u_2) yet again + movdqa xmm6, xmm0 // (u_1; u_2) again again + movdqa xmm7, xmm3 // (?; v_0) again + punpcklqdq xmm3, xmm1 // (u_0; v_0) pclmulhqhqdq xmm4, xmm2 // u_1 v_1 pclmullqlqdq xmm1, xmm2 // u_0 v_2 pclmullqhqdq xmm5, xmm2 // u_2 v_1 @@ -324,7 +324,7 @@ pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1 pclmullqlqdq xmm7, xmm0 // u_2 v_0 pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 - movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny + movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 pclmulhqlqdq xmm6, xmm3 // u_1 v_0 @@ -334,50 +334,50 @@ // Next, the piecing together of the product. There's significant // work here to leave the completed pieces in sensible registers. - // xmm0 = // (a_1; a_0) = a = u_2 v_2 - // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 - // xmm1 = // (c_1; c_0) = c = u_0 v_2 + + // xmm0 = // (a_0; a_1) = a = u_2 v_2 + // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1 + // xmm1 = // (c_0; c_1) = c = u_0 v_2 + // u_1 v_1 + u_2 v_0 - // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 - // xmm3 = // (e_1; e_0) = e = u_0 v_0 + // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0 + // xmm3 = // (e_0; e_1) = e = u_0 v_0 // xmm2, xmm4, xmm7 spare - movdqa xmm2, xmm6 // (d_1; d_0) again - movdqa xmm4, xmm5 // (b_1; b_0) again - pslldq xmm6, 8 // (0; d_1) - psrldq xmm5, 8 // (b_0; 0) - psrldq xmm2, 8 // (d_0; 0) - pslldq xmm4, 8 // (0; b_1) - pxor xmm5, xmm6 // (b_0; d_1) - pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1) - pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0) - pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1) + movdqa xmm2, xmm6 // (d_0; d_1) again + movdqa xmm4, xmm5 // (b_0; b_1) again + pslldq xmm6, 8 // (d_1; 0) + psrldq xmm5, 8 // (0; b_0) + psrldq xmm2, 8 // (0; d_0) + pslldq xmm4, 8 // (b_1; 0) + pxor xmm5, xmm6 // (d_1; b_0) + pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1) + pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0) + pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. 
- // xmm0 = // (x_5; x_4) - // xmm1 = // (x_3; x_2) - // xmm2 = // (x_1; x_0) + // xmm0 = // (x_4; x_5) + // xmm1 = // (x_2; x_3) + // xmm2 = // (x_0; x_1) // xmm3--xmm7 spare - movdqa xmm3, xmm0 // (x_5; x_4) copy - movdqa xmm4, xmm0 // (x_5; x_4) copy - movdqa xmm5, xmm0 // (x_5; x_4) copy - pslld xmm3, 31 // (x_5; x_4) b_i for t - pslld xmm4, 30 // (x_5; x_4) b_i for t^2 - pslld xmm5, 25 // (x_5; x_4) b_i for t^7 - movq xmm6, xmm1 // (x_3; 0) copy + movdqa xmm3, xmm0 // (x_4; x_5) copy + movdqa xmm4, xmm0 // (x_4; x_5) copy + movdqa xmm5, xmm0 // (x_4; x_5) copy + pslld xmm3, 31 // (x_4; x_5) b_i for t + pslld xmm4, 30 // (x_4; x_5) b_i for t^2 + pslld xmm5, 25 // (x_4; x_5) b_i for t^7 + movq xmm6, xmm1 // (0; x_3) copy pxor xmm3, xmm4 - movq xmm7, xmm1 // (x_3; 0) copy + movq xmm7, xmm1 // (0; x_3) copy pxor xmm3, xmm5 - movq xmm5, xmm1 // (x_3; 0) copy - movdqa xmm4, xmm3 // (x_5; x_4) b_i combined - pslld xmm6, 31 // (x_3; 0) b_i for t - pslld xmm7, 30 // (x_3; 0) b_i for t^2 - pslld xmm5, 25 // (x_3; 0) b_i for t^7 - psrldq xmm3, 12 // (x_5; x_4) low contrib - pslldq xmm4, 4 // (x_5; x_4) high contrib + movq xmm5, xmm1 // (0; x_3) copy + movdqa xmm4, xmm3 // (x_4; x_5) b_i combined + pslld xmm6, 31 // (0; x_3) b_i for t + pslld xmm7, 30 // (0; x_3) b_i for t^2 + pslld xmm5, 25 // (0; x_3) b_i for t^7 + psrldq xmm3, 12 // (x_4; x_5) low contrib + pslldq xmm4, 4 // (x_4; x_5) high contrib pxor xmm6, xmm7 pxor xmm2, xmm3 pxor xmm6, xmm5 @@ -387,17 +387,17 @@ // And finally shift the low bits up. Unfortunately, we also have to // split the low bits out. - // xmm0 = // (x'_5; x'_4) - // xmm1 = // (x'_3; x'_2) - // xmm2 = // (x'_1; x'_0) - movdqa xmm5, xmm1 // copies of (x'_3; x'_2) + // xmm0 = // (x'_4; x'_5) + // xmm1 = // (x'_2; x'_3) + // xmm2 = // (x'_0; x'_1) + movdqa xmm5, xmm1 // copies of (x'_2; x'_3) movdqa xmm6, xmm1 movdqa xmm7, xmm1 - psrldq xmm1, 8 // bring down (x'_2; ?) - movdqa xmm3, xmm0 // copies of (x'_5; x'_4) + psrldq xmm1, 8 // bring down (?; x'_2) + movdqa xmm3, xmm0 // copies of (x'_4; x'_5) movdqa xmm4, xmm0 - punpcklqdq xmm1, xmm2 // (x'_2; x'_1) - psrldq xmm2, 8 // (x'_0; ?) + punpcklqdq xmm1, xmm2 // (x'_1; x'_2) + psrldq xmm2, 8 // (?; x'_0) pxor xmm2, xmm5 // low half and unit contrib pxor xmm1, xmm0 psrld xmm5, 1 @@ -412,7 +412,7 @@ pxor xmm0, xmm4 pxor xmm5, xmm2 // mix everything together pxor xmm0, xmm1 - movq xmm1, xmm5 // shunt (z_0; ?) into proper place + movq xmm1, xmm5 // shunt (?; z_0) into proper place .endm .macro mul256 @@ -442,10 +442,10 @@ // On x86, there aren't quite enough registers, so spill one for a // bit. On AMD64, we can keep on going, so it's all good. - // xmm0 = // u_1 = (u_11; u_10) - // xmm1 = // u_0 = (u_01; u_00) - // xmm2 = // v_1 = (v_11; v_10) - // xmm3 = // v_0 = (v_01; v_00) + // xmm0 = // u_1 = (u_10; u_11) + // xmm1 = // u_0 = (u_00; u_01) + // xmm2 = // v_1 = (v_10; v_11) + // xmm3 = // v_0 = (v_00; v_01) movdqa xmm4, xmm0 // u_1 again #if CPUFAM_X86 movdqa [SP + 0], xmm3 @@ -453,8 +453,8 @@ movdqa xmm8, xmm3 # define V0 xmm8 #endif - pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) - pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) + pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11) + pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11) // Start by building the cross product, q = u_* v_*. movdqa xmm7, xmm4 // more copies of u_* @@ -588,7 +588,7 @@ // the /last/ byte in the block. If the block size is not a multiple of // 16 bytes, then there must be padding. 
96-bit blocks are weird: the // padding is inserted at the /least/ significant end, so the register -// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most +// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most // significant end. // // * The `words' format consists of a sequence of bytes, as in the @@ -613,9 +613,9 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul) endprologue movdqu xmm0, [A] movdqu xmm1, [K] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul128 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) movdqu [A], xmm0 ret ENDFUNC @@ -653,9 +653,9 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul) endprologue movq xmm0, [A] movq xmm1, [K] - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) mul64 - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) movq [A], xmm0 ret ENDFUNC @@ -696,9 +696,9 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul) movd xmm2, [A + 8] movdqu xmm1, [K] punpcklqdq xmm0, xmm2 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul96 - pshufd xmm1, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm1, xmm0, SHUF(0, 1, 2, 3) psrldq xmm0, 4 movq [A + 0], xmm1 movd [A + 8], xmm0 @@ -750,11 +750,11 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul) movq xmm1, [A + 0] movdqu xmm2, [K + 0] movq xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) mul192 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) movdqu [A + 8], xmm0 movq [A + 0], xmm1 #if CPUFAM_AMD64 && ABI_WIN @@ -824,11 +824,11 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) movdqu xmm1, [A + 0] movdqu xmm2, [K + 0] movdqu xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) mul256 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) movdqu [A + 16], xmm0 movdqu [A + 0], xmm1 #if CPUFAM_X86 diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index f5e5cc9c..ad9236a8 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -209,16 +209,16 @@ FUNC(rijndael_setup_x86ish_aesni) // Fourth word of the cycle, and seven or eight words of key. Do a // byte substitution. movd xmm0, eax - pshufd xmm0, xmm0, SHUF(3, 0, 1, 2) + pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 jmp 2f // First word of the cycle. This is the complicated piece. 
1: movd xmm0, eax - pshufd xmm0, xmm0, SHUF(1, 2, 3, 0) + pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movd eax, xmm1 xor al, [RCON] inc RCON diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index 26bab892..ccf912b6 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -180,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2) // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -189,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2) // a ^= (d + c) <<< 18 movdqa xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -235,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2) // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -244,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2) // a ^= (d + c) <<< 18 movdqa xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) + pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -270,9 +270,9 @@ FUNC(salsa20_core_x86ish_sse2) // input. This can be done by juggling values in registers, with the // following fancy footwork: some row rotations, a transpose, and // some more rotations. - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13 - pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12 + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13 + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12 movdqa xmm4, xmm0 movdqa xmm5, xmm3 @@ -288,9 +288,9 @@ FUNC(salsa20_core_x86ish_sse2) punpckhdq xmm1, xmm3 // 5, 6, 7, 4 punpckhdq xmm2, xmm5 // 15, 12, 13, 14 - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7 - pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11 - pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15 + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7 + pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11 + pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15 // Finally we have to write out the result. movdqu [OUT + 0], xmm0
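
A minimal off-target sketch of the revised `SHUF' convention described in the commit message and in base/asm-common.h above. The macro body is copied verbatim from the new asm-common.h; the simulate_pshufd() helper is hypothetical scaffolding that models the documented `pshufd' semantics (destination element i takes source element (imm8 >> 2i) & 3), so the asserts check that SHUF(3, 2, 1, 0) is now a no-op and that SHUF(0, 1, 2, 3) reverses the 32-bit lanes, as used by the GCM entry points above.

        /* Sketch only: host-side model of the new SHUF() argument order. */
        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        /* The macro exactly as defined in base/asm-common.h after this change:
         * arguments are listed in decreasing significance order. */
        #define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))

        /* Hypothetical model of `pshufd dst, src, imm8': destination element i
         * is source element (imm8 >> 2i) & 3. */
        static void simulate_pshufd(uint32_t dst[4], const uint32_t src[4], int imm8)
        {
          for (int i = 0; i < 4; i++)
            dst[i] = src[(imm8 >> (2*i)) & 3];
        }

        int main(void)
        {
          /* Little-endian element order: x[0] is the least significant lane. */
          const uint32_t x[4] = { 0x00, 0x11, 0x22, 0x33 };
          uint32_t y[4];

          /* SHUF(3, 2, 1, 0) should be the identity under the new convention. */
          simulate_pshufd(y, x, SHUF(3, 2, 1, 0));
          for (int i = 0; i < 4; i++) assert(y[i] == x[i]);

          /* SHUF(0, 1, 2, 3) reverses the lanes, matching the word-reversal
           * shuffles used in the GCM `mulk' entry points above. */
          simulate_pshufd(y, x, SHUF(0, 1, 2, 3));
          for (int i = 0; i < 4; i++) assert(y[i] == x[3 - i]);

          printf("SHUF(3,2,1,0) = %#x (identity), SHUF(0,1,2,3) = %#x (reverse)\n",
                 SHUF(3, 2, 1, 0), SHUF(0, 1, 2, 3));
          return 0;
        }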