X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/9e6a4409d58d1ed9dfe2de3c6ffaee822e051c9f..HEAD:/symm/gcm-arm64-pmull.S

diff --git a/symm/gcm-arm64-pmull.S b/symm/gcm-arm64-pmull.S
index 97bb3bf2..4935e46d 100644
--- a/symm/gcm-arm64-pmull.S
+++ b/symm/gcm-arm64-pmull.S
@@ -71,19 +71,19 @@
 	// use Karatsuba's identity here, but I suspect that loses more in
 	// the shifting, bit-twiddling, and dependency chains that it gains
 	// in saving a multiplication which otherwise pipelines well.
-	// v0 =			// (u_0; u_1)
-	// v1/v2 =		// (v_0; v_1)
+	// v0 =			// (u_1; u_0)
+	// v1/v2 =		// (v_1; v_0)
 	pmull2	v3.1q, v0.2d, v1.2d	// u_1 v_0
 	pmull	v4.1q, v0.1d, v2.1d	// u_0 v_1
-	pmull2	v5.1q, v0.2d, v2.2d	// (t_1; x_3) = u_1 v_1
-	pmull	v6.1q, v0.1d, v1.1d	// (x_0; t_0) = u_0 v_0
+	pmull2	v5.1q, v0.2d, v2.2d	// (x_3; t_1) = u_1 v_1
+	pmull	v6.1q, v0.1d, v1.1d	// (t_0; x_0) = u_0 v_0
 
 	// Arrange the pieces to form a double-precision polynomial.
-	eor	v3.16b, v3.16b, v4.16b	// (m_0; m_1) = u_0 v_1 + u_1 v_0
-	vshr128	v4, v3, 64		// (m_1; 0)
-	vshl128	v3, v3, 64		// (0; m_0)
-	eor	v1.16b, v5.16b, v4.16b	// (x_2; x_3)
-	eor	v0.16b, v6.16b, v3.16b	// (x_0; x_1)
+	eor	v3.16b, v3.16b, v4.16b	// (m_1; m_0) = u_0 v_1 + u_1 v_0
+	vshr128	v4, v3, 64		// (0; m_1)
+	vshl128	v3, v3, 64		// (m_0; 0)
+	eor	v1.16b, v5.16b, v4.16b	// (x_3; x_2)
+	eor	v0.16b, v6.16b, v3.16b	// (x_1; x_0)
 
 	// And now the only remaining difficulty is that the result needs to
 	// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
@@ -137,8 +137,8 @@
 	// leave with z = u v in x2.  Clobbers x2--x4.
 
 	// The multiplication is thankfully easy.
-	// v0 =			// (u; ?)
-	// v1 =			// (v; ?)
+	// v0 =			// (?; u)
+	// v1 =			// (?; v)
 	pmull	v0.1q, v0.1d, v1.1d	// u v
 
 	// Now we must reduce.  This is essentially the same as the 128-bit
@@ -176,12 +176,12 @@
 	// shift both of them up by four bytes before we start.  This will
 	// mean that the high 64 bits of the result (from GCM's viewpoint)
 	// will be zero.
-	// v0 =			// (u_0 + u_1 t^32; u_2)
+	// v0 =			// (u_2; u_0 + u_1 t^32)
 	// v1 =			// (v_0 + v_1 t^32; v_0 + v_1 t^32)
 	// v2 =			// (v_2; v_2)
 	pmull2	v5.1q, v0.2d, v1.2d	// u_2 (v_0 + v_1 t^32) t^32 = e_0
 	pmull	v4.1q, v0.1d, v2.1d	// v_2 (u_0 + u_1 t^32) t^32 = e_1
-	pmull2	v6.1q, v0.2d, v2.2d	// u_2 v_2 = d = (d; 0)
+	pmull2	v6.1q, v0.2d, v2.2d	// u_2 v_2 = d = (0; d)
 	pmull	v3.1q, v0.1d, v1.1d	// u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
 					//   + u_1 v_1 t^64 = f
 
@@ -238,8 +238,8 @@
 	// Clobbers v16--v25.
 
 	// Start multiplying and accumulating pieces of product.
-	// v0 =			// (u_0; u_1)
-	// v1 =			// (u_2; ?)
+	// v0 =			// (u_1; u_0)
+	// v1 =			// (?; u_2)
 	// v2 =			// (v_0; v_0)
 	// v3 =			// (v_1; v_1)
 	// v4 =			// (v_2; v_2)
@@ -262,27 +262,27 @@
 	eor	v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
 
 	// Piece the product together.
-	// v16 =		// (a_0; a_1)
-	// v19 =		// (b_0; b_1)
-	// v17 =		// (c_0; c_1)
-	// v20 =		// (d_0; d_1)
-	// v18 =		// (e_0; e_1)
-	vshl128	v21, v19, 64		// (0; b_0)
-	ext	v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
-	vshr128	v23, v20, 64		// (d_1; 0)
-	eor	v16.16b, v16.16b, v21.16b // (x_0; x_1)
-	eor	v17.16b, v17.16b, v22.16b // (x_2; x_3)
-	eor	v18.16b, v18.16b, v23.16b // (x_2; x_3)
+	// v16 =		// (a_1; a_0)
+	// v19 =		// (b_1; b_0)
+	// v17 =		// (c_1; c_0)
+	// v20 =		// (d_1; d_0)
+	// v18 =		// (e_1; e_0)
+	vshl128	v21, v19, 64		// (b_0; 0)
+	ext	v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
+	vshr128	v23, v20, 64		// (0; d_1)
+	eor	v16.16b, v16.16b, v21.16b // (x_1; x_0)
+	eor	v17.16b, v17.16b, v22.16b // (x_3; x_2)
+	eor	v18.16b, v18.16b, v23.16b // (x_5; x_4)
 
 	// Next, the reduction.  Our polynomial this time is p(x) = t^192 +
 	// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 	// 128-bit case.  I don't know why.
 
 	// First, shift the high bits down.
-	// v16 =		// (y_0; y_1)
-	// v17 =		// (y_2; y_3)
-	// v18 =		// (y_4; y_5)
-	mov	v19.d[0], v17.d[1]	// (y_3; ?)
+	// v16 =		// (y_1; y_0)
+	// v17 =		// (y_3; y_2)
+	// v18 =		// (y_5; y_4)
+	mov	v19.d[0], v17.d[1]	// (?; y_3)
 	ushr	v23.2d, v18.2d, #63	// hi b_i for t
 	ushr	d20, d19, #63		// lo b_i for t
@@ -298,15 +298,15 @@
 	// Permute the high pieces while we fold in the b_i.
 	eor	v17.16b, v17.16b, v23.16b
 	vshl128	v20, v20, 64
-	mov	v19.d[0], v18.d[1]	// (y_5; ?)
-	ext	v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+	mov	v19.d[0], v18.d[1]	// (?; y_5)
+	ext	v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
 	eor	v16.16b, v16.16b, v20.16b
 
 	// And finally shift the low bits up.
-	// v16 =		// (y'_0; y'_1)
-	// v17 =		// (y'_2; ?)
-	// v18 =		// (y'_3; y'_4)
-	// v19 =		// (y'_5; ?)
+	// v16 =		// (y'_1; y'_0)
+	// v17 =		// (?; y'_2)
+	// v18 =		// (y'_4; y'_3)
+	// v19 =		// (?; y'_5)
 	shl	v20.2d, v18.2d, #1
 	shl	d23, d19, #1
 	shl	v21.2d, v18.2d, #2
@@ -338,21 +338,21 @@
 	//
 	//	  q = r s = (u_0 + u_1) (v_0 + v_1)
 	//	    = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
-	//	    = a + d + c
+	//	    = a + c + b
 	//
 	// The first two terms we've already calculated; the last is the
 	// remaining one we want.  We'll set B = t^128.  We know how to do
 	// 128-bit multiplications already, and Karatsuba is too annoying
 	// there, so there'll be 12 multiplications altogether, rather than
 	// the 16 we'd have if we did this the naïve way.
-	// v0 =			// u_0 = (u_00; u_01)
-	// v1 =			// u_1 = (u_10; u_11)
+	// v0 =			// u_0 = (u_01; u_00)
+	// v1 =			// u_1 = (u_11; u_10)
 	// v2 =			// (v_00; v_00)
 	// v3 =			// (v_01; v_01)
 	// v4 =			// (v_10; v_10)
 	// v5 =			// (v_11; v_11)
 
-	eor	v28.16b, v0.16b, v1.16b	// u_* = (u_00 + u_10; u_01 + u_11)
+	eor	v28.16b, v0.16b, v1.16b	// u_* = (u_01 + u_11; u_00 + u_10)
 	eor	v29.16b, v2.16b, v4.16b	// v_*0 = v_00 + v_10
 	eor	v30.16b, v3.16b, v5.16b	// v_*1 = v_01 + v_11
@@ -402,16 +402,16 @@
 	// Now we must reduce.  This is essentially the same as the 192-bit
 	// case above, but more complicated because everything is bigger.
 	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
-	// v16 =		// (y_0; y_1)
-	// v17 =		// (y_2; y_3)
-	// v18 =		// (y_4; y_5)
-	// v19 =		// (y_6; y_7)
-	ushr	v24.2d, v18.2d, #62	// (y_4; y_5) b_i for t^2
-	ushr	v25.2d, v19.2d, #62	// (y_6; y_7) b_i for t^2
-	ushr	v26.2d, v18.2d, #59	// (y_4; y_5) b_i for t^5
-	ushr	v27.2d, v19.2d, #59	// (y_6; y_7) b_i for t^5
-	ushr	v28.2d, v18.2d, #54	// (y_4; y_5) b_i for t^10
-	ushr	v29.2d, v19.2d, #54	// (y_6; y_7) b_i for t^10
+	// v16 =		// (y_1; y_0)
+	// v17 =		// (y_3; y_2)
+	// v18 =		// (y_5; y_4)
+	// v19 =		// (y_7; y_6)
+	ushr	v24.2d, v18.2d, #62	// (y_5; y_4) b_i for t^2
+	ushr	v25.2d, v19.2d, #62	// (y_7; y_6) b_i for t^2
+	ushr	v26.2d, v18.2d, #59	// (y_5; y_4) b_i for t^5
+	ushr	v27.2d, v19.2d, #59	// (y_7; y_6) b_i for t^5
+	ushr	v28.2d, v18.2d, #54	// (y_5; y_4) b_i for t^10
+	ushr	v29.2d, v19.2d, #54	// (y_7; y_6) b_i for t^10
 	eor	v24.16b, v24.16b, v26.16b // mix the contributions together
 	eor	v25.16b, v25.16b, v27.16b
 	eor	v24.16b, v24.16b, v28.16b
@@ -424,16 +424,16 @@
 	eor	v16.16b, v16.16b, v24.16b
 
 	// And then shift the low bits up.
-	// v16 =		// (y'_0; y'_1)
-	// v17 =		// (y'_2; y'_3)
-	// v18 =		// (y'_4; y'_5)
-	// v19 =		// (y'_6; y'_7)
-	shl	v24.2d, v18.2d, #2	// (y'_4; y_5) a_i for t^2
-	shl	v25.2d, v19.2d, #2	// (y_6; y_7) a_i for t^2
-	shl	v26.2d, v18.2d, #5	// (y'_4; y_5) a_i for t^5
-	shl	v27.2d, v19.2d, #5	// (y_6; y_7) a_i for t^5
-	shl	v28.2d, v18.2d, #10	// (y'_4; y_5) a_i for t^10
-	shl	v29.2d, v19.2d, #10	// (y_6; y_7) a_i for t^10
+	// v16 =		// (y'_1; y'_0)
+	// v17 =		// (y'_3; y'_2)
+	// v18 =		// (y'_5; y'_4)
+	// v19 =		// (y'_7; y'_6)
+	shl	v24.2d, v18.2d, #2	// (y_5; y'_4) a_i for t^2
+	shl	v25.2d, v19.2d, #2	// (y_7; y_6) a_i for t^2
+	shl	v26.2d, v18.2d, #5	// (y_5; y'_4) a_i for t^5
+	shl	v27.2d, v19.2d, #5	// (y_7; y_6) a_i for t^5
+	shl	v28.2d, v18.2d, #10	// (y_5; y'_4) a_i for t^10
+	shl	v29.2d, v19.2d, #10	// (y_7; y_6) a_i for t^10
 	eor	v18.16b, v18.16b, v24.16b // mix the contributions together
 	eor	v19.16b, v19.16b, v25.16b
 	eor	v26.16b, v26.16b, v28.16b
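The comments in the first hunk describe a schoolbook split of the GF(2^128) multiplication into 64-bit halves followed by reduction modulo p(t) = t^128 + t^7 + t^2 + t + 1.  For reference only (not part of the patch), the following is a minimal Python sketch of that structure: polynomials over GF(2) are plain integers with bit i standing for t^i, GCM's bit-reflection and byte-order conventions and the lane-ordering question the patch settles are ignored, and the names clmul and gf128_mul are invented for the sketch rather than taken from Catacomb.

MASK64 = (1 << 64) - 1
MASK128 = (1 << 128) - 1

def clmul(a, b):
    """Carry-less (polynomial) product of two 64-bit operands."""
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def gf128_mul(u, v):
    """Multiply u, v < 2^128 as polynomials over GF(2), reduced mod p(t)."""
    u0, u1 = u & MASK64, u >> 64          # u = u_0 + u_1 t^64
    v0, v1 = v & MASK64, v >> 64          # v = v_0 + v_1 t^64

    # Four 64x64 -> 128-bit products, as in the pmull/pmull2 sequence.
    f = clmul(u0, v0)                     # u_0 v_0
    e = clmul(u1, v1)                     # u_1 v_1
    m = clmul(u0, v1) ^ clmul(u1, v0)     # u_0 v_1 + u_1 v_0

    # Piece together the 256-bit product x = f + m t^64 + e t^128.
    x = f ^ (m << 64) ^ (e << 128)

    # Reduce using t^128 = t^7 + t^2 + t + 1 (mod p); a second fold
    # clears the small overflow left behind by the first one.
    for _ in range(2):
        hi, x = x >> 128, x & MASK128
        x ^= hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7)
    return x

if __name__ == "__main__":
    # t^127 . t = t^128, which wraps around to t^7 + t^2 + t + 1 = 0x87.
    assert gf128_mul(1 << 127, 1 << 1) == 0x87

The assertion exercises the identity t^128 == t^7 + t^2 + t + 1 (mod p), which is the fold the assembly's reduction comments rely on; the 192- and 256-bit hunks above perform the analogous folds for p(t) = t^192 + t^7 + t^2 + t + 1 and p(t) = t^256 + t^10 + t^5 + t^2 + 1.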