X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e7ee4000d06cadd8355404c8ddfb3d16265d24ca..981a9e5d5e3af6c06ad8b3f821928852068227e4:/symm/gcm-x86ish-pclmul.S diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S index 837abbdd..fadeca58 100644 --- a/symm/gcm-x86ish-pclmul.S +++ b/symm/gcm-x86ish-pclmul.S @@ -113,21 +113,21 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // xmm0 = // (u_1; u_0) - // xmm1 = // (v_1; v_0) - movdqa xmm2, xmm1 // (v_1; v_0) again - movdqa xmm3, xmm0 // (u_1; u_0) again - movdqa xmm4, xmm0 // (u_1; u_0) yet again + // xmm0 = // (u_0; u_1) + // xmm1 = // (v_0; v_1) + movdqa xmm2, xmm1 // (v_0; v_1) again + movdqa xmm3, xmm0 // (u_0; u_1) again + movdqa xmm4, xmm0 // (u_0; u_1) yet again pclmulhqlqdq xmm2, xmm0 // u_1 v_0 pclmullqlqdq xmm0, xmm1 // u_1 v_1 pclmulhqlqdq xmm3, xmm1 // u_0 v_1 pclmulhqhqdq xmm4, xmm1 // u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 - movdqa xmm1, xmm2 // (m_1; m_0) again - pslldq xmm2, 8 // (0; m_1) - psrldq xmm1, 8 // (m_0; 0) + pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1 + movdqa xmm1, xmm2 // (m_0; m_1) again + pslldq xmm2, 8 // (m_1; 0) + psrldq xmm1, 8 // (0; m_0) pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1 pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0 @@ -158,9 +158,9 @@ // word together, and then the low bits, everything will be fine. // First, shift the high bits down. - movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again - movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again - movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again + movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again + movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again + movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again pslld xmm2, 31 // the b_i for t pslld xmm3, 30 // the b_i for t^2 pslld xmm4, 25 // the b_i for t^7 @@ -196,13 +196,13 @@ // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. // First, we must detach the top (`low'!) half of the result. - movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again - psrldq xmm1, 8 // (x_1, x_0; 0, 0) + movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again + psrldq xmm1, 8 // (0, 0; x_0, x_1) // Next, shift the high bits down. - movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again - movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again - movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again + movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again + movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again + movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again pslld xmm2, 31 // b_i for t pslld xmm3, 29 // b_i for t^3 pslld xmm4, 28 // b_i for t^4 @@ -239,11 +239,11 @@ // shift both of them up by four bytes before we start. This will // mean that the high 64 bits of the result (from GCM's viewpoint) // will be zero. 
- // xmm0 = // (0, u_2; u_1, u_0) - // xmm1 = // (0, v_2; v_1, v_0) - movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again - movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again - movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again + // xmm0 = // (u_0, u_1; u_2, 0) + // xmm1 = // (v_0, v_1; v_2, 0) + movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again + movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again + movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 @@ -255,10 +255,10 @@ // registers. The answer we want is d t^128 + e t^64 + f, where e = // e_0 + e_1. // - // The place values for the two halves are (t^160, t^128; t^96, ?) - // and (?, t^64; t^32, 1). But we also want to shift the high part + // The place values for the two halves are (?, t^96; t^128, t^160) + // and (1, t^32; t^64, ?). But we also want to shift the high part // left by a word, for symmetry's sake. - psrldq xmm0, 8 // (d; 0) = d t^128 + psrldq xmm0, 8 // (0; d) = d t^128 pxor xmm2, xmm3 // e = (e_0 + e_1) movdqa xmm1, xmm4 // f again pxor xmm0, xmm2 // d t^128 + e t^64 @@ -308,15 +308,15 @@ // are unimportant. Clobbers xmm2--xmm7. // Start multiplying and accumulating pieces of product. - // xmm0 = // (u_2; u_1) - // xmm1 = // (u_0; ?) - // xmm2 = // (v_2; v_1) - // xmm3 = // (v_0; ?) - movdqa xmm4, xmm0 // (u_2; u_1) again - movdqa xmm5, xmm0 // (u_2; u_1) yet again - movdqa xmm6, xmm0 // (u_2; u_1) again again - movdqa xmm7, xmm3 // (v_0; ?) again - punpcklqdq xmm3, xmm1 // (v_0; u_0) + // xmm0 = // (u_1; u_2) + // xmm1 = // (?; u_0) + // xmm2 = // (v_1; v_2) + // xmm3 = // (?; v_0) + movdqa xmm4, xmm0 // (u_1; u_2) again + movdqa xmm5, xmm0 // (u_1; u_2) yet again + movdqa xmm6, xmm0 // (u_1; u_2) again again + movdqa xmm7, xmm3 // (?; v_0) again + punpcklqdq xmm3, xmm1 // (u_0; v_0) pclmulhqhqdq xmm4, xmm2 // u_1 v_1 pclmullqlqdq xmm1, xmm2 // u_0 v_2 pclmullqhqdq xmm5, xmm2 // u_2 v_1 @@ -324,7 +324,7 @@ pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1 pclmullqlqdq xmm7, xmm0 // u_2 v_0 pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 - movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny + movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 pclmulhqlqdq xmm6, xmm3 // u_1 v_0 @@ -334,50 +334,50 @@ // Next, the piecing together of the product. There's significant // work here to leave the completed pieces in sensible registers. 
- // xmm0 = // (a_1; a_0) = a = u_2 v_2 - // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 - // xmm1 = // (c_1; c_0) = c = u_0 v_2 + + // xmm0 = // (a_0; a_1) = a = u_2 v_2 + // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1 + // xmm1 = // (c_0; c_1) = c = u_0 v_2 + // u_1 v_1 + u_2 v_0 - // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 - // xmm3 = // (e_1; e_0) = e = u_0 v_0 + // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0 + // xmm3 = // (e_0; e_1) = e = u_0 v_0 // xmm2, xmm4, xmm7 spare - movdqa xmm2, xmm6 // (d_1; d_0) again - movdqa xmm4, xmm5 // (b_1; b_0) again - pslldq xmm6, 8 // (0; d_1) - psrldq xmm5, 8 // (b_0; 0) - psrldq xmm2, 8 // (d_0; 0) - pslldq xmm4, 8 // (0; b_1) - pxor xmm5, xmm6 // (b_0; d_1) - pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1) - pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0) - pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1) + movdqa xmm2, xmm6 // (d_0; d_1) again + movdqa xmm4, xmm5 // (b_0; b_1) again + pslldq xmm6, 8 // (d_1; 0) + psrldq xmm5, 8 // (0; b_0) + psrldq xmm2, 8 // (0; d_0) + pslldq xmm4, 8 // (b_1; 0) + pxor xmm5, xmm6 // (d_1; b_0) + pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1) + pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0) + pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. - // xmm0 = // (x_5; x_4) - // xmm1 = // (x_3; x_2) - // xmm2 = // (x_1; x_0) + // xmm0 = // (x_4; x_5) + // xmm1 = // (x_2; x_3) + // xmm2 = // (x_0; x_1) // xmm3--xmm7 spare - movdqa xmm3, xmm0 // (x_5; x_4) copy - movdqa xmm4, xmm0 // (x_5; x_4) copy - movdqa xmm5, xmm0 // (x_5; x_4) copy - pslld xmm3, 31 // (x_5; x_4) b_i for t - pslld xmm4, 30 // (x_5; x_4) b_i for t^2 - pslld xmm5, 25 // (x_5; x_4) b_i for t^7 - movq xmm6, xmm1 // (x_3; 0) copy + movdqa xmm3, xmm0 // (x_4; x_5) copy + movdqa xmm4, xmm0 // (x_4; x_5) copy + movdqa xmm5, xmm0 // (x_4; x_5) copy + pslld xmm3, 31 // (x_4; x_5) b_i for t + pslld xmm4, 30 // (x_4; x_5) b_i for t^2 + pslld xmm5, 25 // (x_4; x_5) b_i for t^7 + movq xmm6, xmm1 // (0; x_3) copy pxor xmm3, xmm4 - movq xmm7, xmm1 // (x_3; 0) copy + movq xmm7, xmm1 // (0; x_3) copy pxor xmm3, xmm5 - movq xmm5, xmm1 // (x_3; 0) copy - movdqa xmm4, xmm3 // (x_5; x_4) b_i combined - pslld xmm6, 31 // (x_3; 0) b_i for t - pslld xmm7, 30 // (x_3; 0) b_i for t^2 - pslld xmm5, 25 // (x_3; 0) b_i for t^7 - psrldq xmm3, 12 // (x_5; x_4) low contrib - pslldq xmm4, 4 // (x_5; x_4) high contrib + movq xmm5, xmm1 // (0; x_3) copy + movdqa xmm4, xmm3 // (x_4; x_5) b_i combined + pslld xmm6, 31 // (0; x_3) b_i for t + pslld xmm7, 30 // (0; x_3) b_i for t^2 + pslld xmm5, 25 // (0; x_3) b_i for t^7 + psrldq xmm3, 12 // (x_4; x_5) low contrib + pslldq xmm4, 4 // (x_4; x_5) high contrib pxor xmm6, xmm7 pxor xmm2, xmm3 pxor xmm6, xmm5 @@ -387,17 +387,17 @@ // And finally shift the low bits up. Unfortunately, we also have to // split the low bits out. - // xmm0 = // (x'_5; x'_4) - // xmm1 = // (x'_3; x'_2) - // xmm2 = // (x'_1; x'_0) - movdqa xmm5, xmm1 // copies of (x'_3; x'_2) + // xmm0 = // (x'_4; x'_5) + // xmm1 = // (x'_2; x'_3) + // xmm2 = // (x'_0; x'_1) + movdqa xmm5, xmm1 // copies of (x'_2; x'_3) movdqa xmm6, xmm1 movdqa xmm7, xmm1 - psrldq xmm1, 8 // bring down (x'_2; ?) 
- movdqa xmm3, xmm0 // copies of (x'_5; x'_4) + psrldq xmm1, 8 // bring down (?; x'_2) + movdqa xmm3, xmm0 // copies of (x'_4; x'_5) movdqa xmm4, xmm0 - punpcklqdq xmm1, xmm2 // (x'_2; x'_1) - psrldq xmm2, 8 // (x'_0; ?) + punpcklqdq xmm1, xmm2 // (x'_1; x'_2) + psrldq xmm2, 8 // (?; x'_0) pxor xmm2, xmm5 // low half and unit contrib pxor xmm1, xmm0 psrld xmm5, 1 @@ -412,7 +412,7 @@ pxor xmm0, xmm4 pxor xmm5, xmm2 // mix everything together pxor xmm0, xmm1 - movq xmm1, xmm5 // shunt (z_0; ?) into proper place + movq xmm1, xmm5 // shunt (?; z_0) into proper place .endm .macro mul256 @@ -442,10 +442,10 @@ // On x86, there aren't quite enough registers, so spill one for a // bit. On AMD64, we can keep on going, so it's all good. - // xmm0 = // u_1 = (u_11; u_10) - // xmm1 = // u_0 = (u_01; u_00) - // xmm2 = // v_1 = (v_11; v_10) - // xmm3 = // v_0 = (v_01; v_00) + // xmm0 = // u_1 = (u_10; u_11) + // xmm1 = // u_0 = (u_00; u_01) + // xmm2 = // v_1 = (v_10; v_11) + // xmm3 = // v_0 = (v_00; v_01) movdqa xmm4, xmm0 // u_1 again #if CPUFAM_X86 movdqa [SP + 0], xmm3 @@ -453,8 +453,8 @@ movdqa xmm8, xmm3 # define V0 xmm8 #endif - pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) - pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) + pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11) + pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11) // Start by building the cross product, q = u_* v_*. movdqa xmm7, xmm4 // more copies of u_* @@ -588,7 +588,7 @@ // the /last/ byte in the block. If the block size is not a multiple of // 16 bytes, then there must be padding. 96-bit blocks are weird: the // padding is inserted at the /least/ significant end, so the register -// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most +// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most // significant end. 
// // * The `words' format consists of a sequence of bytes, as in the @@ -613,9 +613,9 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul) endprologue movdqu xmm0, [A] movdqu xmm1, [K] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul128 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) movdqu [A], xmm0 ret ENDFUNC @@ -653,9 +653,9 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul) endprologue movq xmm0, [A] movq xmm1, [K] - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) mul64 - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) movq [A], xmm0 ret ENDFUNC @@ -696,9 +696,9 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul) movd xmm2, [A + 8] movdqu xmm1, [K] punpcklqdq xmm0, xmm2 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul96 - pshufd xmm1, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm1, xmm0, SHUF(0, 1, 2, 3) psrldq xmm0, 4 movq [A + 0], xmm1 movd [A + 8], xmm0 @@ -750,11 +750,11 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul) movq xmm1, [A + 0] movdqu xmm2, [K + 0] movq xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) mul192 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) movdqu [A + 8], xmm0 movq [A + 0], xmm1 #if CPUFAM_AMD64 && ABI_WIN @@ -824,11 +824,11 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) movdqu xmm1, [A + 0] movdqu xmm2, [K + 0] movdqu xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) mul256 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) movdqu [A + 16], xmm0 movdqu [A + 0], xmm1 #if CPUFAM_X86
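As a companion to the comments in the first hunk above, here is a rough, illustrative C sketch of the plain 128-bit carry-less multiply that those comments describe, written with the Intel PCLMULQDQ intrinsics rather than Catacomb's assembler macros. The function name clmul_128x128 and its interface are invented for this sketch; it is not the library's code. It assembles the 256-bit product z = u v over GF(2)[t] as z_1 t^128 + z_0, splitting the middle term m = u_1 v_0 + u_0 v_1 across the two halves exactly as the pslldq/psrldq pair does above. GCM's bit-reflection, the word-order shuffles, and the reduction modulo t^128 + t^7 + t^2 + t + 1 are deliberately omitted. (With GCC or Clang this needs -msse2 -mpclmul or an equivalent target attribute.)

	#include <emmintrin.h>		/* SSE2: XOR and byte shifts */
	#include <wmmintrin.h>		/* PCLMULQDQ: _mm_clmulepi64_si128 */

	/* Carry-less multiply of u = u_1 t^64 + u_0 by v = v_1 t^64 + v_0,
	 * returning the 256-bit product as *z1 . t^128 + *z0.
	 */
	static void clmul_128x128(__m128i u, __m128i v, __m128i *z1, __m128i *z0)
	{
	  __m128i hi, lo, m;

	  hi = _mm_clmulepi64_si128(u, v, 0x11);	/* u_1 v_1 */
	  lo = _mm_clmulepi64_si128(u, v, 0x00);	/* u_0 v_0 */
	  m  = _mm_xor_si128(_mm_clmulepi64_si128(u, v, 0x01),	/* u_1 v_0 */
			     _mm_clmulepi64_si128(u, v, 0x10));	/* u_0 v_1 */

	  /* The middle word m = m_1 t^64 + m_0 straddles the two halves:
	   * m_1 lands in the low 64 bits of z_1, and m_0 in the high 64
	   * bits of z_0, mirroring the pslldq/psrldq lines in the first
	   * hunk above.
	   */
	  *z1 = _mm_xor_si128(hi, _mm_srli_si128(m, 8));	/* u_1 v_1 + m_1 */
	  *z0 = _mm_xor_si128(lo, _mm_slli_si128(m, 8));	/* u_0 v_0 + t^64 m_0 */
	}

The same schoolbook pattern, extended to three 64-bit limbs and followed by the appropriate reduction polynomial, is what the mul192 hunks assemble from their a, b, c, d, e pieces, while the mul256 hunks add a Karatsuba step on top, forming u_* = u_0 + u_1 and v_* = v_0 + v_1 and multiplying those as the cross product, as their comments describe.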