base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs.
diff --git a/symm/gcm-arm64-pmull.S b/symm/gcm-arm64-pmull.S
index dcd8c45..0e4bd79 100644
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains than it gains
        // in saving a multiplication which otherwise pipelines well.
-       // v0 =                         // (u_0; u_1)
-       // v1/v2 =                      // (v_0; v_1)
+       // v0 =                         // (u_1; u_0)
+       // v1/v2 =                      // (v_1; v_0)
        pmull2  v3.1q, v0.2d, v1.2d     // u_1 v_0
        pmull   v4.1q, v0.1d, v2.1d     // u_0 v_1
-       pmull2  v5.1q, v0.2d, v2.2d     // (t_1; x_3) = u_1 v_1
-       pmull   v6.1q, v0.1d, v1.1d     // (x_0; t_0) = u_0 v_0
+       pmull2  v5.1q, v0.2d, v2.2d     // (x_3; t_1) = u_1 v_1
+       pmull   v6.1q, v0.1d, v1.1d     // (t_0; x_0) = u_0 v_0
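        // (Note, added for clarity: PMULL multiplies the low 64-bit lanes
        // of its operands as polynomials over GF(2), giving a 128-bit
        // product; PMULL2 does the same with the high lanes.  The four
        // instructions above are therefore the four 64x64-bit partial
        // products of the schoolbook method; Karatsuba would need only
        // three PMULLs, at the cost of extra EORs and shuffles.)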
 
        // Arrange the pieces to form a double-precision polynomial.
-       eor     v3.16b, v3.16b, v4.16b  // (m_0; m_1) = u_0 v_1 + u_1 v_0
-       vshr128 v4, v3, 64              // (m_1; 0)
-       vshl128 v3, v3, 64              // (0; m_0)
-       eor     v1.16b, v5.16b, v4.16b  // (x_2; x_3)
-       eor     v0.16b, v6.16b, v3.16b  // (x_0; x_1)
+       eor     v3.16b, v3.16b, v4.16b  // (m_1; m_0) = u_0 v_1 + u_1 v_0
+       vshr128 v4, v3, 64              // (0; m_1)
+       vshl128 v3, v3, 64              // (m_0; 0)
+       eor     v1.16b, v5.16b, v4.16b  // (x_3; x_2)
+       eor     v0.16b, v6.16b, v3.16b  // (x_1; x_0)
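        // (A worked sketch of the piecing, using the names from the
        // comments above: write u_0 v_0 = x_0 + t_0 t^64, u_1 v_1 =
        // t_1 + x_3 t^64, and m = u_0 v_1 + u_1 v_0 = m_0 + m_1 t^64.
        // Then
        //   u v = x_0 + (t_0 + m_0) t^64 + (t_1 + m_1) t^128 + x_3 t^192,
        // so x_1 = t_0 + m_0 and x_2 = t_1 + m_1, which is exactly what
        // the shifts and EORs above assemble into v0 and v1.)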
 
        // And now the only remaining difficulty is that the result needs to
        // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
        // leave with z = u v in x2.  Clobbers x2--x4.
 
        // The multiplication is thankfully easy.
-       // v0 =                                 // (u; ?)
-       // v1 =                                 // (v; ?)
+       // v0 =                                 // (?; u)
+       // v1 =                                 // (?; v)
        pmull   v0.1q, v0.1d, v1.1d             // u v
 
        // Now we must reduce.  This is essentially the same as the 128-bit
        // shift both of them up by four bytes before we start.  This will
        // mean that the high 64 bits of the result (from GCM's viewpoint)
        // will be zero.
-       // v0 =                         // (u_0 + u_1 t^32; u_2)
+       // v0 =                         // (u_2; u_0 + u_1 t^32)
        // v1 =                         // (v_0 + v_1 t^32; v_0 + v_1 t^32)
        // v2 =                         // (v_2; v_2)
        pmull2  v5.1q, v0.2d, v1.2d     // u_2 (v_0 + v_1 t^32) t^32 = e_0
        pmull   v4.1q, v0.1d, v2.1d     // v_2 (u_0 + u_1 t^32) t^32 = e_1
-       pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (d; 0)
+       pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (0; d)
        pmull   v3.1q, v0.1d, v1.1d     // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f
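        // (A sketch of the algebra, with w = u_0 + u_1 t^32 and
        // x = v_0 + v_1 t^32 as shorthand not used elsewhere in the code:
        // u = w + u_2 t^64 and v = x + v_2 t^64, so
        //   u v = f + (e_0 + e_1) t^64 + d t^128
        // where f = w x, e_0 = u_2 x, e_1 = v_2 w and d = u_2 v_2,
        // ignoring the extra factors of t^32 introduced by the four-byte
        // pre-shift.)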
 
        // Clobbers v16--v25.
 
        // Start multiplying and accumulating pieces of product.
-       // v0 =                         // (u_0; u_1)
-       // v1 =                         // (u_2; ?)
+       // v0 =                         // (u_1; u_0)
+       // v1 =                         // (?; u_2)
        // v2 =                         // (v_0; v_0)
        // v3 =                         // (v_1; v_1)
        // v4 =                         // (v_2; v_2)
        eor     v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
 
        // Piece the product together.
-       // v16 =                        // (a_0; a_1)
-       // v19 =                        // (b_0; b_1)
-       // v17 =                        // (c_0; c_1)
-       // v20 =                        // (d_0; d_1)
-       // v18 =                        // (e_0; e_1)
-       vshl128 v21, v19, 64            // (0; b_0)
-       ext     v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
-       vshr128 v23, v20, 64            // (d_1; 0)
-       eor     v16.16b, v16.16b, v21.16b // (x_0; x_1)
-       eor     v17.16b, v17.16b, v22.16b // (x_2; x_3)
-       eor     v18.16b, v18.16b, v23.16b // (x_2; x_3)
+       // v16 =                        // (a_1; a_0)
+       // v19 =                        // (b_1; b_0)
+       // v17 =                        // (c_1; c_0)
+       // v20 =                        // (d_1; d_0)
+       // v18 =                        // (e_1; e_0)
+       vshl128 v21, v19, 64            // (b_0; 0)
+       ext     v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
+       vshr128 v23, v20, 64            // (0; d_1)
+       eor     v16.16b, v16.16b, v21.16b // (x_1; x_0)
+       eor     v17.16b, v17.16b, v22.16b // (x_3; x_2)
+       eor     v18.16b, v18.16b, v23.16b // (x_5; x_4)
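        // (A worked sketch of the piecing, assuming the elided multiply
        // step computes the schoolbook pieces a = u_0 v_0, b = u_0 v_1 +
        // u_1 v_0, c = u_0 v_2 + u_1 v_1 + u_2 v_0, d = u_1 v_2 + u_2 v_1
        // (as commented above) and e = u_2 v_2, so that
        //   u v = a + b t^64 + c t^128 + d t^192 + e t^256.
        // Splitting each 128-bit piece into 64-bit halves then gives
        //   x_0 = a_0, x_1 = a_1 + b_0, x_2 = b_1 + c_0,
        //   x_3 = c_1 + d_0, x_4 = d_1 + e_0, x_5 = e_1,
        // which is what the EORs above assemble.)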
 
        // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.
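        // (For clarity: p(t) = t^192 + t^7 + t^2 + t + 1 means
        // t^192 == t^7 + t^2 + t + 1 (mod p(t)), so the high 192 bits get
        // folded back in multiplied by t^7 + t^2 + t + 1.  The shift
        // counts depend only on that low-degree tail, which this
        // polynomial shares with the 128-bit one, hence the same magic
        // numbers.)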
 
        // First, shift the high bits down.
-       // v16 =                        // (y_0; y_1)
-       // v17 =                        // (y_2; y_3)
-       // v18 =                        // (y_4; y_5)
-       mov     v19.d[0], v17.d[1]      // (y_3; ?)
+       // v16 =                        // (y_1; y_0)
+       // v17 =                        // (y_3; y_2)
+       // v18 =                        // (y_5; y_4)
+       mov     v19.d[0], v17.d[1]      // (?; y_3)
 
        ushr    v23.2d, v18.2d, #63     // hi b_i for t
        ushr    d20, d19, #63           // lo b_i for t
        // Permute the high pieces while we fold in the b_i.
        eor     v17.16b, v17.16b, v23.16b
        vshl128 v20, v20, 64
-       mov     v19.d[0], v18.d[1]      // (y_5; ?)
-       ext     v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+       mov     v19.d[0], v18.d[1]      // (?; y_5)
+       ext     v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
        eor     v16.16b, v16.16b, v20.16b
 
        // And finally shift the low bits up.
-       // v16 =                        // (y'_0; y'_1)
-       // v17 =                        // (y'_2; ?)
-       // v18 =                        // (y'_3; y'_4)
-       // v19 =                        // (y'_5; ?)
+       // v16 =                        // (y'_1; y'_0)
+       // v17 =                        // (?; y'_2)
+       // v18 =                        // (y'_4; y'_3)
+       // v19 =                        // (?; y'_5)
        shl     v20.2d, v18.2d, #1
        shl     d23, d19, #1
        shl     v21.2d, v18.2d, #2
        // 128-bit multiplications already, and Karatsuba is too annoying
        // there, so there'll be 12 multiplications altogether, rather than
        // the 16 we'd have if we did this the naïve way.
-       // v0 =                         // u_0 = (u_00; u_01)
-       // v1 =                         // u_1 = (u_10; u_11)
+       // v0 =                         // u_0 = (u_01; u_00)
+       // v1 =                         // u_1 = (u_11; u_10)
        // v2 =                         // (v_00; v_00)
        // v3 =                         // (v_01; v_01)
        // v4 =                         // (v_10; v_10)
        // v5 =                         // (v_11; v_11)
 
-       eor     v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
+       eor     v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10)
        eor     v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
        eor     v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
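        // (A sketch of the top-level Karatsuba step: writing z = t^128,
        //   u v = u_0 v_0 + ((u_0 + u_1)(v_0 + v_1) + u_0 v_0
        //                      + u_1 v_1) z + u_1 v_1 z^2,
        // so only three 128x128-bit products are needed -- u_0 v_0,
        // u_1 v_1 and u_* v_* -- each costing four PMULLs as in the
        // 128-bit case, giving the twelve multiplications mentioned
        // above.)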
 
        // Now we must reduce.  This is essentially the same as the 192-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
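        // (For clarity: t^256 == t^10 + t^5 + t^2 + 1 (mod p(t)), so the
        // high 256 bits are folded back in multiplied by that tail; the
        // shift counts below -- 62, 59, 54 for the bits that spill out of
        // their 64-bit lane, and 2, 5, 10 for those that stay within it --
        // are just 64 - k and k for the tail exponents k = 2, 5, 10.)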
-       // v16 =                        // (y_0; y_1)
-       // v17 =                        // (y_2; y_3)
-       // v18 =                        // (y_4; y_5)
-       // v19 =                        // (y_6; y_7)
-       ushr    v24.2d, v18.2d, #62     // (y_4; y_5) b_i for t^2
-       ushr    v25.2d, v19.2d, #62     // (y_6; y_7) b_i for t^2
-       ushr    v26.2d, v18.2d, #59     // (y_4; y_5) b_i for t^5
-       ushr    v27.2d, v19.2d, #59     // (y_6; y_7) b_i for t^5
-       ushr    v28.2d, v18.2d, #54     // (y_4; y_5) b_i for t^10
-       ushr    v29.2d, v19.2d, #54     // (y_6; y_7) b_i for t^10
+       // v16 =                        // (y_1; y_0)
+       // v17 =                        // (y_3; y_2)
+       // v18 =                        // (y_5; y_4)
+       // v19 =                        // (y_7; y_6)
+       ushr    v24.2d, v18.2d, #62     // (y_5; y_4) b_i for t^2
+       ushr    v25.2d, v19.2d, #62     // (y_7; y_6) b_i for t^2
+       ushr    v26.2d, v18.2d, #59     // (y_5; y_4) b_i for t^5
+       ushr    v27.2d, v19.2d, #59     // (y_7; y_6) b_i for t^5
+       ushr    v28.2d, v18.2d, #54     // (y_5; y_4) b_i for t^10
+       ushr    v29.2d, v19.2d, #54     // (y_7; y_6) b_i for t^10
        eor     v24.16b, v24.16b, v26.16b // mix the contributions together
        eor     v25.16b, v25.16b, v27.16b
        eor     v24.16b, v24.16b, v28.16b
        eor     v16.16b, v16.16b, v24.16b
 
        // And then shift the low bits up.
-       // v16 =                        // (y'_0; y'_1)
-       // v17 =                        // (y'_2; y'_3)
-       // v18 =                        // (y'_4; y'_5)
-       // v19 =                        // (y'_6; y'_7)
-       shl     v24.2d, v18.2d, #2      // (y'_4; y_5) a_i for t^2
-       shl     v25.2d, v19.2d, #2      // (y_6; y_7) a_i for t^2
-       shl     v26.2d, v18.2d, #5      // (y'_4; y_5) a_i for t^5
-       shl     v27.2d, v19.2d, #5      // (y_6; y_7) a_i for t^5
-       shl     v28.2d, v18.2d, #10     // (y'_4; y_5) a_i for t^10
-       shl     v29.2d, v19.2d, #10     // (y_6; y_7) a_i for t^10
+       // v16 =                        // (y'_1; y'_0)
+       // v17 =                        // (y'_3; y'_2)
+       // v18 =                        // (y'_5; y'_4)
+       // v19 =                        // (y'_7; y'_6)
+       shl     v24.2d, v18.2d, #2      // (y_5; y'_4) a_i for t^2
+       shl     v25.2d, v19.2d, #2      // (y_7; y_6) a_i for t^2
+       shl     v26.2d, v18.2d, #5      // (y_5; y'_4) a_i for t^5
+       shl     v27.2d, v19.2d, #5      // (y_7; y_6) a_i for t^5
+       shl     v28.2d, v18.2d, #10     // (y_5; y'_4) a_i for t^10
+       shl     v29.2d, v19.2d, #10     // (y_7; y_6) a_i for t^10
        eor     v18.16b, v18.16b, v24.16b // mix the contributions together
        eor     v19.16b, v19.16b, v25.16b
        eor     v26.16b, v26.16b, v28.16b