X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e7ee4000d06cadd8355404c8ddfb3d16265d24ca..981a9e5d5e3af6c06ad8b3f821928852068227e4:/symm/gcm-x86ish-pclmul.S diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S index 837abbdd..fadeca58 100644 --- a/symm/gcm-x86ish-pclmul.S +++ b/symm/gcm-x86ish-pclmul.S @@ -113,21 +113,21 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // xmm0 = // (u_1; u_0) - // xmm1 = // (v_1; v_0) - movdqa xmm2, xmm1 // (v_1; v_0) again - movdqa xmm3, xmm0 // (u_1; u_0) again - movdqa xmm4, xmm0 // (u_1; u_0) yet again + // xmm0 = // (u_0; u_1) + // xmm1 = // (v_0; v_1) + movdqa xmm2, xmm1 // (v_0; v_1) again + movdqa xmm3, xmm0 // (u_0; u_1) again + movdqa xmm4, xmm0 // (u_0; u_1) yet again pclmulhqlqdq xmm2, xmm0 // u_1 v_0 pclmullqlqdq xmm0, xmm1 // u_1 v_1 pclmulhqlqdq xmm3, xmm1 // u_0 v_1 pclmulhqhqdq xmm4, xmm1 // u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 - movdqa xmm1, xmm2 // (m_1; m_0) again - pslldq xmm2, 8 // (0; m_1) - psrldq xmm1, 8 // (m_0; 0) + pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1 + movdqa xmm1, xmm2 // (m_0; m_1) again + pslldq xmm2, 8 // (m_1; 0) + psrldq xmm1, 8 // (0; m_0) pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1 pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0 @@ -158,9 +158,9 @@ // word together, and then the low bits, everything will be fine. // First, shift the high bits down. - movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again - movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again - movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again + movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again + movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again + movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again pslld xmm2, 31 // the b_i for t pslld xmm3, 30 // the b_i for t^2 pslld xmm4, 25 // the b_i for t^7 @@ -196,13 +196,13 @@ // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. // First, we must detach the top (`low'!) half of the result. - movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again - psrldq xmm1, 8 // (x_1, x_0; 0, 0) + movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again + psrldq xmm1, 8 // (0, 0; x_0, x_1) // Next, shift the high bits down. - movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again - movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again - movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again + movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again + movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again + movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again pslld xmm2, 31 // b_i for t pslld xmm3, 29 // b_i for t^3 pslld xmm4, 28 // b_i for t^4 @@ -239,11 +239,11 @@ // shift both of them up by four bytes before we start. This will // mean that the high 64 bits of the result (from GCM's viewpoint) // will be zero. 
- // xmm0 = // (0, u_2; u_1, u_0) - // xmm1 = // (0, v_2; v_1, v_0) - movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again - movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again - movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again + // xmm0 = // (u_0, u_1; u_2, 0) + // xmm1 = // (v_0, v_1; v_2, 0) + movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again + movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again + movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 @@ -255,10 +255,10 @@ // registers. The answer we want is d t^128 + e t^64 + f, where e = // e_0 + e_1. // - // The place values for the two halves are (t^160, t^128; t^96, ?) - // and (?, t^64; t^32, 1). But we also want to shift the high part + // The place values for the two halves are (?, t^96; t^128, t^160) + // and (1, t^32; t^64, ?). But we also want to shift the high part // left by a word, for symmetry's sake. - psrldq xmm0, 8 // (d; 0) = d t^128 + psrldq xmm0, 8 // (0; d) = d t^128 pxor xmm2, xmm3 // e = (e_0 + e_1) movdqa xmm1, xmm4 // f again pxor xmm0, xmm2 // d t^128 + e t^64 @@ -308,15 +308,15 @@ // are unimportant. Clobbers xmm2--xmm7. // Start multiplying and accumulating pieces of product. - // xmm0 = // (u_2; u_1) - // xmm1 = // (u_0; ?) - // xmm2 = // (v_2; v_1) - // xmm3 = // (v_0; ?) - movdqa xmm4, xmm0 // (u_2; u_1) again - movdqa xmm5, xmm0 // (u_2; u_1) yet again - movdqa xmm6, xmm0 // (u_2; u_1) again again - movdqa xmm7, xmm3 // (v_0; ?) again - punpcklqdq xmm3, xmm1 // (v_0; u_0) + // xmm0 = // (u_1; u_2) + // xmm1 = // (?; u_0) + // xmm2 = // (v_1; v_2) + // xmm3 = // (?; v_0) + movdqa xmm4, xmm0 // (u_1; u_2) again + movdqa xmm5, xmm0 // (u_1; u_2) yet again + movdqa xmm6, xmm0 // (u_1; u_2) again again + movdqa xmm7, xmm3 // (?; v_0) again + punpcklqdq xmm3, xmm1 // (u_0; v_0) pclmulhqhqdq xmm4, xmm2 // u_1 v_1 pclmullqlqdq xmm1, xmm2 // u_0 v_2 pclmullqhqdq xmm5, xmm2 // u_2 v_1 @@ -324,7 +324,7 @@ pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1 pclmullqlqdq xmm7, xmm0 // u_2 v_0 pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 - movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny + movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 pclmulhqlqdq xmm6, xmm3 // u_1 v_0 @@ -334,50 +334,50 @@ // Next, the piecing together of the product. There's significant // work here to leave the completed pieces in sensible registers. 
- // xmm0 = // (a_1; a_0) = a = u_2 v_2 - // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 - // xmm1 = // (c_1; c_0) = c = u_0 v_2 + + // xmm0 = // (a_0; a_1) = a = u_2 v_2 + // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1 + // xmm1 = // (c_0; c_1) = c = u_0 v_2 + // u_1 v_1 + u_2 v_0 - // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 - // xmm3 = // (e_1; e_0) = e = u_0 v_0 + // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0 + // xmm3 = // (e_0; e_1) = e = u_0 v_0 // xmm2, xmm4, xmm7 spare - movdqa xmm2, xmm6 // (d_1; d_0) again - movdqa xmm4, xmm5 // (b_1; b_0) again - pslldq xmm6, 8 // (0; d_1) - psrldq xmm5, 8 // (b_0; 0) - psrldq xmm2, 8 // (d_0; 0) - pslldq xmm4, 8 // (0; b_1) - pxor xmm5, xmm6 // (b_0; d_1) - pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1) - pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0) - pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1) + movdqa xmm2, xmm6 // (d_0; d_1) again + movdqa xmm4, xmm5 // (b_0; b_1) again + pslldq xmm6, 8 // (d_1; 0) + psrldq xmm5, 8 // (0; b_0) + psrldq xmm2, 8 // (0; d_0) + pslldq xmm4, 8 // (b_1; 0) + pxor xmm5, xmm6 // (d_1; b_0) + pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1) + pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0) + pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. - // xmm0 = // (x_5; x_4) - // xmm1 = // (x_3; x_2) - // xmm2 = // (x_1; x_0) + // xmm0 = // (x_4; x_5) + // xmm1 = // (x_2; x_3) + // xmm2 = // (x_0; x_1) // xmm3--xmm7 spare - movdqa xmm3, xmm0 // (x_5; x_4) copy - movdqa xmm4, xmm0 // (x_5; x_4) copy - movdqa xmm5, xmm0 // (x_5; x_4) copy - pslld xmm3, 31 // (x_5; x_4) b_i for t - pslld xmm4, 30 // (x_5; x_4) b_i for t^2 - pslld xmm5, 25 // (x_5; x_4) b_i for t^7 - movq xmm6, xmm1 // (x_3; 0) copy + movdqa xmm3, xmm0 // (x_4; x_5) copy + movdqa xmm4, xmm0 // (x_4; x_5) copy + movdqa xmm5, xmm0 // (x_4; x_5) copy + pslld xmm3, 31 // (x_4; x_5) b_i for t + pslld xmm4, 30 // (x_4; x_5) b_i for t^2 + pslld xmm5, 25 // (x_4; x_5) b_i for t^7 + movq xmm6, xmm1 // (0; x_3) copy pxor xmm3, xmm4 - movq xmm7, xmm1 // (x_3; 0) copy + movq xmm7, xmm1 // (0; x_3) copy pxor xmm3, xmm5 - movq xmm5, xmm1 // (x_3; 0) copy - movdqa xmm4, xmm3 // (x_5; x_4) b_i combined - pslld xmm6, 31 // (x_3; 0) b_i for t - pslld xmm7, 30 // (x_3; 0) b_i for t^2 - pslld xmm5, 25 // (x_3; 0) b_i for t^7 - psrldq xmm3, 12 // (x_5; x_4) low contrib - pslldq xmm4, 4 // (x_5; x_4) high contrib + movq xmm5, xmm1 // (0; x_3) copy + movdqa xmm4, xmm3 // (x_4; x_5) b_i combined + pslld xmm6, 31 // (0; x_3) b_i for t + pslld xmm7, 30 // (0; x_3) b_i for t^2 + pslld xmm5, 25 // (0; x_3) b_i for t^7 + psrldq xmm3, 12 // (x_4; x_5) low contrib + pslldq xmm4, 4 // (x_4; x_5) high contrib pxor xmm6, xmm7 pxor xmm2, xmm3 pxor xmm6, xmm5 @@ -387,17 +387,17 @@ // And finally shift the low bits up. Unfortunately, we also have to // split the low bits out. - // xmm0 = // (x'_5; x'_4) - // xmm1 = // (x'_3; x'_2) - // xmm2 = // (x'_1; x'_0) - movdqa xmm5, xmm1 // copies of (x'_3; x'_2) + // xmm0 = // (x'_4; x'_5) + // xmm1 = // (x'_2; x'_3) + // xmm2 = // (x'_0; x'_1) + movdqa xmm5, xmm1 // copies of (x'_2; x'_3) movdqa xmm6, xmm1 movdqa xmm7, xmm1 - psrldq xmm1, 8 // bring down (x'_2; ?) 
- movdqa xmm3, xmm0 // copies of (x'_5; x'_4) + psrldq xmm1, 8 // bring down (?; x'_2) + movdqa xmm3, xmm0 // copies of (x'_4; x'_5) movdqa xmm4, xmm0 - punpcklqdq xmm1, xmm2 // (x'_2; x'_1) - psrldq xmm2, 8 // (x'_0; ?) + punpcklqdq xmm1, xmm2 // (x'_1; x'_2) + psrldq xmm2, 8 // (?; x'_0) pxor xmm2, xmm5 // low half and unit contrib pxor xmm1, xmm0 psrld xmm5, 1 @@ -412,7 +412,7 @@ pxor xmm0, xmm4 pxor xmm5, xmm2 // mix everything together pxor xmm0, xmm1 - movq xmm1, xmm5 // shunt (z_0; ?) into proper place + movq xmm1, xmm5 // shunt (?; z_0) into proper place .endm .macro mul256 @@ -442,10 +442,10 @@ // On x86, there aren't quite enough registers, so spill one for a // bit. On AMD64, we can keep on going, so it's all good. - // xmm0 = // u_1 = (u_11; u_10) - // xmm1 = // u_0 = (u_01; u_00) - // xmm2 = // v_1 = (v_11; v_10) - // xmm3 = // v_0 = (v_01; v_00) + // xmm0 = // u_1 = (u_10; u_11) + // xmm1 = // u_0 = (u_00; u_01) + // xmm2 = // v_1 = (v_10; v_11) + // xmm3 = // v_0 = (v_00; v_01) movdqa xmm4, xmm0 // u_1 again #if CPUFAM_X86 movdqa [SP + 0], xmm3 @@ -453,8 +453,8 @@ movdqa xmm8, xmm3 # define V0 xmm8 #endif - pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) - pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) + pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11) + pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11) // Start by building the cross product, q = u_* v_*. movdqa xmm7, xmm4 // more copies of u_* @@ -588,7 +588,7 @@ // the /last/ byte in the block. If the block size is not a multiple of // 16 bytes, then there must be padding. 96-bit blocks are weird: the // padding is inserted at the /least/ significant end, so the register -// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most +// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most // significant end. 
// // * The `words' format consists of a sequence of bytes, as in the @@ -613,9 +613,9 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul) endprologue movdqu xmm0, [A] movdqu xmm1, [K] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul128 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) movdqu [A], xmm0 ret ENDFUNC @@ -653,9 +653,9 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul) endprologue movq xmm0, [A] movq xmm1, [K] - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) mul64 - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) movq [A], xmm0 ret ENDFUNC @@ -696,9 +696,9 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul) movd xmm2, [A + 8] movdqu xmm1, [K] punpcklqdq xmm0, xmm2 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul96 - pshufd xmm1, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm1, xmm0, SHUF(0, 1, 2, 3) psrldq xmm0, 4 movq [A + 0], xmm1 movd [A + 8], xmm0 @@ -750,11 +750,11 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul) movq xmm1, [A + 0] movdqu xmm2, [K + 0] movq xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) mul192 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) movdqu [A + 8], xmm0 movq [A + 0], xmm1 #if CPUFAM_AMD64 && ABI_WIN @@ -824,11 +824,11 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) movdqu xmm1, [A + 0] movdqu xmm2, [K + 0] movdqu xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) mul256 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) movdqu [A + 16], xmm0 movdqu [A + 0], xmm1 #if CPUFAM_X86
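As a companion to the comments in the first hunk above, here is a rough, illustrative C sketch of the plain 128-bit carry-less multiply that those comments describe, written with the Intel PCLMULQDQ intrinsics rather than Catacomb's assembler macros. The function name clmul_128x128 and its interface are invented for this sketch; it is not the library's code. It assembles the 256-bit product z = u v over GF(2)[t] as z_1 t^128 + z_0, splitting the middle term m = u_1 v_0 + u_0 v_1 across the two halves exactly as the pslldq/psrldq pair does above. GCM's bit-reflection, the word-order shuffles, and the reduction modulo t^128 + t^7 + t^2 + t + 1 are deliberately omitted. (With GCC or Clang this needs -msse2 -mpclmul or an equivalent target attribute.)

	#include <emmintrin.h>		/* SSE2: XOR and byte shifts */
	#include <wmmintrin.h>		/* PCLMULQDQ: _mm_clmulepi64_si128 */

	/* Carry-less multiply of u = u_1 t^64 + u_0 by v = v_1 t^64 + v_0,
	 * returning the 256-bit product as *z1 . t^128 + *z0.
	 */
	static void clmul_128x128(__m128i u, __m128i v, __m128i *z1, __m128i *z0)
	{
	  __m128i hi, lo, m;

	  hi = _mm_clmulepi64_si128(u, v, 0x11);	/* u_1 v_1 */
	  lo = _mm_clmulepi64_si128(u, v, 0x00);	/* u_0 v_0 */
	  m  = _mm_xor_si128(_mm_clmulepi64_si128(u, v, 0x01),	/* u_1 v_0 */
			     _mm_clmulepi64_si128(u, v, 0x10));	/* u_0 v_1 */

	  /* The middle word m = m_1 t^64 + m_0 straddles the two halves:
	   * m_1 lands in the low 64 bits of z_1, and m_0 in the high 64
	   * bits of z_0, mirroring the pslldq/psrldq lines in the first
	   * hunk above.
	   */
	  *z1 = _mm_xor_si128(hi, _mm_srli_si128(m, 8));	/* u_1 v_1 + m_1 */
	  *z0 = _mm_xor_si128(lo, _mm_slli_si128(m, 8));	/* u_0 v_0 + t^64 m_0 */
	}

The same schoolbook pattern, extended to three 64-bit limbs and followed by the appropriate reduction polynomial, is what the mul192 hunks assemble from their a, b, c, d, e pieces, while the mul256 hunks add a Karatsuba step on top, forming u_* = u_0 + u_1 and v_* = v_0 + v_1 and multiplying those as the cross product, as their comments describe.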