X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/9e6a4409d58d1ed9dfe2de3c6ffaee822e051c9f..HEAD:/symm/gcm-arm64-pmull.S

diff --git a/symm/gcm-arm64-pmull.S b/symm/gcm-arm64-pmull.S
index 97bb3bf2..4935e46d 100644
--- a/symm/gcm-arm64-pmull.S
+++ b/symm/gcm-arm64-pmull.S
@@ -71,19 +71,19 @@
 	// use Karatsuba's identity here, but I suspect that loses more in
 	// the shifting, bit-twiddling, and dependency chains that it gains
 	// in saving a multiplication which otherwise pipelines well.
-	// v0 =			// (u_0; u_1)
-	// v1/v2 =		// (v_0; v_1)
+	// v0 =			// (u_1; u_0)
+	// v1/v2 =		// (v_1; v_0)
 	pmull2	v3.1q, v0.2d, v1.2d	// u_1 v_0
 	pmull	v4.1q, v0.1d, v2.1d	// u_0 v_1
-	pmull2	v5.1q, v0.2d, v2.2d	// (t_1; x_3) = u_1 v_1
-	pmull	v6.1q, v0.1d, v1.1d	// (x_0; t_0) = u_0 v_0
+	pmull2	v5.1q, v0.2d, v2.2d	// (x_3; t_1) = u_1 v_1
+	pmull	v6.1q, v0.1d, v1.1d	// (t_0; x_0) = u_0 v_0
 
 	// Arrange the pieces to form a double-precision polynomial.
-	eor	v3.16b, v3.16b, v4.16b	// (m_0; m_1) = u_0 v_1 + u_1 v_0
-	vshr128	v4, v3, 64		// (m_1; 0)
-	vshl128	v3, v3, 64		// (0; m_0)
-	eor	v1.16b, v5.16b, v4.16b	// (x_2; x_3)
-	eor	v0.16b, v6.16b, v3.16b	// (x_0; x_1)
+	eor	v3.16b, v3.16b, v4.16b	// (m_1; m_0) = u_0 v_1 + u_1 v_0
+	vshr128	v4, v3, 64		// (0; m_1)
+	vshl128	v3, v3, 64		// (m_0; 0)
+	eor	v1.16b, v5.16b, v4.16b	// (x_3; x_2)
+	eor	v0.16b, v6.16b, v3.16b	// (x_1; x_0)
 
 	// And now the only remaining difficulty is that the result needs to
 	// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
@@ -137,8 +137,8 @@
 	// leave with z = u v in x2.  Clobbers x2--x4.
 
 	// The multiplication is thankfully easy.
-	// v0 =			// (u; ?)
-	// v1 =			// (v; ?)
+	// v0 =			// (?; u)
+	// v1 =			// (?; v)
 	pmull	v0.1q, v0.1d, v1.1d	// u v
 
 	// Now we must reduce.  This is essentially the same as the 128-bit
@@ -176,12 +176,12 @@
 	// shift both of them up by four bytes before we start.  This will
 	// mean that the high 64 bits of the result (from GCM's viewpoint)
 	// will be zero.
-	// v0 =			// (u_0 + u_1 t^32; u_2)
+	// v0 =			// (u_2; u_0 + u_1 t^32)
 	// v1 =			// (v_0 + v_1 t^32; v_0 + v_1 t^32)
 	// v2 =			// (v_2; v_2)
 	pmull2	v5.1q, v0.2d, v1.2d	// u_2 (v_0 + v_1 t^32) t^32 = e_0
 	pmull	v4.1q, v0.1d, v2.1d	// v_2 (u_0 + u_1 t^32) t^32 = e_1
-	pmull2	v6.1q, v0.2d, v2.2d	// u_2 v_2 = d = (d; 0)
+	pmull2	v6.1q, v0.2d, v2.2d	// u_2 v_2 = d = (0; d)
 	pmull	v3.1q, v0.1d, v1.1d	// u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
 					//   + u_1 v_1 t^64 = f
 
@@ -238,8 +238,8 @@
 	// Clobbers v16--v25.
 
 	// Start multiplying and accumulating pieces of product.
-	// v0 =			// (u_0; u_1)
-	// v1 =			// (u_2; ?)
+	// v0 =			// (u_1; u_0)
+	// v1 =			// (?; u_2)
 	// v2 =			// (v_0; v_0)
 	// v3 =			// (v_1; v_1)
 	// v4 =			// (v_2; v_2)
@@ -262,27 +262,27 @@
 	eor	v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
 
 	// Piece the product together.
-	// v16 =		// (a_0; a_1)
-	// v19 =		// (b_0; b_1)
-	// v17 =		// (c_0; c_1)
-	// v20 =		// (d_0; d_1)
-	// v18 =		// (e_0; e_1)
-	vshl128	v21, v19, 64		// (0; b_0)
-	ext	v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
-	vshr128	v23, v20, 64		// (d_1; 0)
-	eor	v16.16b, v16.16b, v21.16b // (x_0; x_1)
-	eor	v17.16b, v17.16b, v22.16b // (x_2; x_3)
-	eor	v18.16b, v18.16b, v23.16b // (x_2; x_3)
+	// v16 =		// (a_1; a_0)
+	// v19 =		// (b_1; b_0)
+	// v17 =		// (c_1; c_0)
+	// v20 =		// (d_1; d_0)
+	// v18 =		// (e_1; e_0)
+	vshl128	v21, v19, 64		// (b_0; 0)
+	ext	v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
+	vshr128	v23, v20, 64		// (0; d_1)
+	eor	v16.16b, v16.16b, v21.16b // (x_1; x_0)
+	eor	v17.16b, v17.16b, v22.16b // (x_3; x_2)
+	eor	v18.16b, v18.16b, v23.16b // (x_5; x_4)
 
 	// Next, the reduction.  Our polynomial this time is p(x) = t^192 +
 	// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 	// 128-bit case.  I don't know why.
 
 	// First, shift the high bits down.
-	// v16 =		// (y_0; y_1)
-	// v17 =		// (y_2; y_3)
-	// v18 =		// (y_4; y_5)
-	mov	v19.d[0], v17.d[1]	// (y_3; ?)
+	// v16 =		// (y_1; y_0)
+	// v17 =		// (y_3; y_2)
+	// v18 =		// (y_5; y_4)
+	mov	v19.d[0], v17.d[1]	// (?; y_3)
 	ushr	v23.2d, v18.2d, #63	// hi b_i for t
 	ushr	d20, d19, #63		// lo b_i for t
@@ -298,15 +298,15 @@
 	// Permute the high pieces while we fold in the b_i.
 	eor	v17.16b, v17.16b, v23.16b
 	vshl128	v20, v20, 64
-	mov	v19.d[0], v18.d[1]	// (y_5; ?)
-	ext	v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+	mov	v19.d[0], v18.d[1]	// (?; y_5)
+	ext	v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
 	eor	v16.16b, v16.16b, v20.16b
 
 	// And finally shift the low bits up.
-	// v16 =		// (y'_0; y'_1)
-	// v17 =		// (y'_2; ?)
-	// v18 =		// (y'_3; y'_4)
-	// v19 =		// (y'_5; ?)
+	// v16 =		// (y'_1; y'_0)
+	// v17 =		// (?; y'_2)
+	// v18 =		// (y'_4; y'_3)
+	// v19 =		// (?; y'_5)
 	shl	v20.2d, v18.2d, #1
 	shl	d23, d19, #1
 	shl	v21.2d, v18.2d, #2
@@ -338,21 +338,21 @@
 	//
 	//	  q = r s = (u_0 + u_1) (v_0 + v_1)
 	//	    = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
-	//	    = a + d + c
+	//	    = a + c + b
 	//
 	// The first two terms we've already calculated; the last is the
 	// remaining one we want.  We'll set B = t^128.  We know how to do
 	// 128-bit multiplications already, and Karatsuba is too annoying
 	// there, so there'll be 12 multiplications altogether, rather than
 	// the 16 we'd have if we did this the naïve way.
-	// v0 =			// u_0 = (u_00; u_01)
-	// v1 =			// u_1 = (u_10; u_11)
+	// v0 =			// u_0 = (u_01; u_00)
+	// v1 =			// u_1 = (u_11; u_10)
 	// v2 =			// (v_00; v_00)
 	// v3 =			// (v_01; v_01)
 	// v4 =			// (v_10; v_10)
 	// v5 =			// (v_11; v_11)
 
-	eor	v28.16b, v0.16b, v1.16b	// u_* = (u_00 + u_10; u_01 + u_11)
+	eor	v28.16b, v0.16b, v1.16b	// u_* = (u_01 + u_11; u_00 + u_10)
 	eor	v29.16b, v2.16b, v4.16b	// v_*0 = v_00 + v_10
 	eor	v30.16b, v3.16b, v5.16b	// v_*1 = v_01 + v_11
@@ -402,16 +402,16 @@
 	// Now we must reduce.  This is essentially the same as the 192-bit
 	// case above, but more complicated because everything is bigger.
 	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
-	// v16 =		// (y_0; y_1)
-	// v17 =		// (y_2; y_3)
-	// v18 =		// (y_4; y_5)
-	// v19 =		// (y_6; y_7)
-	ushr	v24.2d, v18.2d, #62	// (y_4; y_5) b_i for t^2
-	ushr	v25.2d, v19.2d, #62	// (y_6; y_7) b_i for t^2
-	ushr	v26.2d, v18.2d, #59	// (y_4; y_5) b_i for t^5
-	ushr	v27.2d, v19.2d, #59	// (y_6; y_7) b_i for t^5
-	ushr	v28.2d, v18.2d, #54	// (y_4; y_5) b_i for t^10
-	ushr	v29.2d, v19.2d, #54	// (y_6; y_7) b_i for t^10
+	// v16 =		// (y_1; y_0)
+	// v17 =		// (y_3; y_2)
+	// v18 =		// (y_5; y_4)
+	// v19 =		// (y_7; y_6)
+	ushr	v24.2d, v18.2d, #62	// (y_5; y_4) b_i for t^2
+	ushr	v25.2d, v19.2d, #62	// (y_7; y_6) b_i for t^2
+	ushr	v26.2d, v18.2d, #59	// (y_5; y_4) b_i for t^5
+	ushr	v27.2d, v19.2d, #59	// (y_7; y_6) b_i for t^5
+	ushr	v28.2d, v18.2d, #54	// (y_5; y_4) b_i for t^10
+	ushr	v29.2d, v19.2d, #54	// (y_7; y_6) b_i for t^10
 	eor	v24.16b, v24.16b, v26.16b // mix the contributions together
 	eor	v25.16b, v25.16b, v27.16b
 	eor	v24.16b, v24.16b, v28.16b
@@ -424,16 +424,16 @@
 	eor	v16.16b, v16.16b, v24.16b
 
 	// And then shift the low bits up.
-	// v16 =		// (y'_0; y'_1)
-	// v17 =		// (y'_2; y'_3)
-	// v18 =		// (y'_4; y'_5)
-	// v19 =		// (y'_6; y'_7)
-	shl	v24.2d, v18.2d, #2	// (y'_4; y_5) a_i for t^2
-	shl	v25.2d, v19.2d, #2	// (y_6; y_7) a_i for t^2
-	shl	v26.2d, v18.2d, #5	// (y'_4; y_5) a_i for t^5
-	shl	v27.2d, v19.2d, #5	// (y_6; y_7) a_i for t^5
-	shl	v28.2d, v18.2d, #10	// (y'_4; y_5) a_i for t^10
-	shl	v29.2d, v19.2d, #10	// (y_6; y_7) a_i for t^10
+	// v16 =		// (y'_1; y'_0)
+	// v17 =		// (y'_3; y'_2)
+	// v18 =		// (y'_5; y'_4)
+	// v19 =		// (y'_7; y'_6)
+	shl	v24.2d, v18.2d, #2	// (y_5; y'_4) a_i for t^2
+	shl	v25.2d, v19.2d, #2	// (y_7; y_6) a_i for t^2
+	shl	v26.2d, v18.2d, #5	// (y_5; y'_4) a_i for t^5
+	shl	v27.2d, v19.2d, #5	// (y_7; y_6) a_i for t^5
+	shl	v28.2d, v18.2d, #10	// (y_5; y'_4) a_i for t^10
+	shl	v29.2d, v19.2d, #10	// (y_7; y_6) a_i for t^10
 	eor	v18.16b, v18.16b, v24.16b // mix the contributions together
 	eor	v19.16b, v19.16b, v25.16b
 	eor	v26.16b, v26.16b, v28.16b
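The comments in the first hunk describe a schoolbook split of the GF(2^128) multiplication into 64-bit halves followed by reduction modulo p(t) = t^128 + t^7 + t^2 + t + 1.  For reference only (not part of the patch), the following is a minimal Python sketch of that structure: polynomials over GF(2) are plain integers with bit i standing for t^i, GCM's bit-reflection and byte-order conventions and the lane-ordering question the patch settles are ignored, and the names clmul and gf128_mul are invented for the sketch rather than taken from Catacomb.

MASK64 = (1 << 64) - 1
MASK128 = (1 << 128) - 1

def clmul(a, b):
    """Carry-less (polynomial) product of two 64-bit operands."""
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def gf128_mul(u, v):
    """Multiply u, v < 2^128 as polynomials over GF(2), reduced mod p(t)."""
    u0, u1 = u & MASK64, u >> 64          # u = u_0 + u_1 t^64
    v0, v1 = v & MASK64, v >> 64          # v = v_0 + v_1 t^64

    # Four 64x64 -> 128-bit products, as in the pmull/pmull2 sequence.
    f = clmul(u0, v0)                     # u_0 v_0
    e = clmul(u1, v1)                     # u_1 v_1
    m = clmul(u0, v1) ^ clmul(u1, v0)     # u_0 v_1 + u_1 v_0

    # Piece together the 256-bit product x = f + m t^64 + e t^128.
    x = f ^ (m << 64) ^ (e << 128)

    # Reduce using t^128 = t^7 + t^2 + t + 1 (mod p); a second fold
    # clears the small overflow left behind by the first one.
    for _ in range(2):
        hi, x = x >> 128, x & MASK128
        x ^= hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7)
    return x

if __name__ == "__main__":
    # t^127 . t = t^128, which wraps around to t^7 + t^2 + t + 1 = 0x87.
    assert gf128_mul(1 << 127, 1 << 1) == 0x87

The assertion exercises the identity t^128 == t^7 + t^2 + t + 1 (mod p), which is the fold the assembly's reduction comments rely on; the 192- and 256-bit hunks above perform the analogous folds for p(t) = t^192 + t^7 + t^2 + t + 1 and p(t) = t^256 + t^10 + t^5 + t^2 + 1.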