// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains than it gains
// in saving a multiplication which otherwise pipelines well.
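// (For reference: with u = u_1 t^64 + u_0 and v = v_1 t^64 + v_0,
// Karatsuba's identity over GF(2) is u v = u_1 v_1 t^128 +
// (u_1 v_1 + u_0 v_0 + (u_0 + u_1) (v_0 + v_1)) t^64 + u_0 v_0,
// trading the fourth multiplication for extra xors and shuffles.)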
- // xmm0 = // (u_1; u_0)
- // xmm1 = // (v_1; v_0)
- movdqa xmm2, xmm1 // (v_1; v_0) again
- movdqa xmm3, xmm0 // (u_1; u_0) again
- movdqa xmm4, xmm0 // (u_1; u_0) yet again
+ // xmm0 = // (u_0; u_1)
+ // xmm1 = // (v_0; v_1)
+ movdqa xmm2, xmm1 // (v_0; v_1) again
+ movdqa xmm3, xmm0 // (u_0; u_1) again
+ movdqa xmm4, xmm0 // (u_0; u_1) yet again
pclmulhqlqdq xmm2, xmm0 // u_1 v_0
pclmullqlqdq xmm0, xmm1 // u_1 v_1
pclmulhqlqdq xmm3, xmm1 // u_0 v_1
pclmulhqhqdq xmm4, xmm1 // u_0 v_0
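// (The pclmulXqYqdq mnemonics are local macros wrapping pclmulqdq;
// going by the comments here, X picks the qword of the destination
// and Y the qword of the source, so the first one above multiplies
// the high qword of xmm2, i.e., v_0, by the low qword of xmm0, i.e.,
// u_1.)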
// Arrange the pieces to form a double-precision polynomial.
- pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
- movdqa xmm1, xmm2 // (m_1; m_0) again
- pslldq xmm2, 8 // (0; m_1)
- psrldq xmm1, 8 // (m_0; 0)
+ pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1
+ movdqa xmm1, xmm2 // (m_0; m_1) again
+ pslldq xmm2, 8 // (m_1; 0)
+ psrldq xmm1, 8 // (0; m_0)
pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1
pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0
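// (That is: u v = u_1 v_1 t^128 + m t^64 + u_0 v_0, where the middle
// term m = u_1 v_0 + u_0 v_1 straddles the two halves as m = m_1 t^64
// + m_0: m_1 joins u_1 v_1 in z_1, and m_0 t^64 joins u_0 v_0 in z_0.)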
// word together, and then the low bits, everything will be fine.
// First, shift the high bits down.
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
- movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again
- movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again
+ movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again
+ movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again
+ movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again
pslld xmm2, 31 // the b_i for t
pslld xmm3, 30 // the b_i for t^2
pslld xmm4, 25 // the b_i for t^7
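// (The shift counts encode p(t)'s low terms: t^128 = t^7 + t^2 + t + 1
// (mod p(t)), and in this bit-reversed representation the b_i for t^k
// end up shifted left by 32 - k within each word; the unit term needs
// no shift, and word-boundary spills are fixed up at word granularity
// later.)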
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
// First, we must detach the top (`low'!) half of the result.
- movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again
- psrldq xmm1, 8 // (x_1, x_0; 0, 0)
+ movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again
+ psrldq xmm1, 8 // (0, 0; x_0, x_1)
// Next, shift the high bits down.
- movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again
- movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again
- movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again
+ movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again
+ movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again
+ movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again
pslld xmm2, 31 // b_i for t
pslld xmm3, 29 // b_i for t^3
pslld xmm4, 28 // b_i for t^4
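// (Same trick as before: t^64 = t^4 + t^3 + t + 1 (mod p(t)), so the
// shifts are 32 - k for k = 1, 3, 4.)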
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // xmm0 = // (0, u_2; u_1, u_0)
- // xmm1 = // (0, v_2; v_1, v_0)
- movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again
- movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again
- movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again
+ // xmm0 = // (u_0, u_1; u_2, 0)
+ // xmm1 = // (v_0, v_1; v_2, 0)
+ movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again
+ movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again
+ movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again
pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
- pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d)
+ pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (d; 0)
pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
// registers. The answer we want is d t^128 + e t^64 + f, where e =
// e_0 + e_1.
//
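// Explicitly: write u = u_2 t^64 + u' and v = v_2 t^64 + v', where
// u' = u_1 t^32 + u_0 and v' = v_1 t^32 + v_0; then
// u v = u_2 v_2 t^128 + (u_2 v' + v_2 u') t^64 + u' v'
// = d t^128 + (e_0 + e_1) t^64 + f.
//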
- // The place values for the two halves are (t^160, t^128; t^96, ?)
- // and (?, t^64; t^32, 1). But we also want to shift the high part
+ // The place values for the two halves are (?, t^96; t^128, t^160)
+ // and (1, t^32; t^64, ?). But we also want to shift the high part
// left by a word, for symmetry's sake.
- psrldq xmm0, 8 // (d; 0) = d t^128
+ psrldq xmm0, 8 // (0; d) = d t^128
pxor xmm2, xmm3 // e = (e_0 + e_1)
movdqa xmm1, xmm4 // f again
pxor xmm0, xmm2 // d t^128 + e t^64
// are unimportant. Clobbers xmm2--xmm7.
// Start multiplying and accumulating pieces of product.
- // xmm0 = // (u_2; u_1)
- // xmm1 = // (u_0; ?)
- // xmm2 = // (v_2; v_1)
- // xmm3 = // (v_0; ?)
- movdqa xmm4, xmm0 // (u_2; u_1) again
- movdqa xmm5, xmm0 // (u_2; u_1) yet again
- movdqa xmm6, xmm0 // (u_2; u_1) again again
- movdqa xmm7, xmm3 // (v_0; ?) again
- punpcklqdq xmm3, xmm1 // (v_0; u_0)
+ // xmm0 = // (u_1; u_2)
+ // xmm1 = // (?; u_0)
+ // xmm2 = // (v_1; v_2)
+ // xmm3 = // (?; v_0)
+ movdqa xmm4, xmm0 // (u_1; u_2) again
+ movdqa xmm5, xmm0 // (u_1; u_2) yet again
+ movdqa xmm6, xmm0 // (u_1; u_2) again again
+ movdqa xmm7, xmm3 // (?; v_0) again
+ punpcklqdq xmm3, xmm1 // (u_0; v_0)
pclmulhqhqdq xmm4, xmm2 // u_1 v_1
pclmullqlqdq xmm1, xmm2 // u_0 v_2
pclmullqhqdq xmm5, xmm2 // u_2 v_1
pclmulhqlqdq xmm6, xmm2 // u_1 v_2
pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1
pclmullqlqdq xmm7, xmm0 // u_2 v_0
pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
- movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
+ movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny
pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
pclmulhqlqdq xmm6, xmm3 // u_1 v_0
// Next, the piecing together of the product. There's significant
// work here to leave the completed pieces in sensible registers.
- // xmm0 = // (a_1; a_0) = a = u_2 v_2
- // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
- // xmm1 = // (c_1; c_0) = c = u_0 v_2 +
+ // xmm0 = // (a_0; a_1) = a = u_2 v_2
+ // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
+ // xmm1 = // (c_0; c_1) = c = u_0 v_2 +
// u_1 v_1 + u_2 v_0
- // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
- // xmm3 = // (e_1; e_0) = e = u_0 v_0
+ // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
+ // xmm3 = // (e_0; e_1) = e = u_0 v_0
// xmm2, xmm4, xmm7 spare
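// Explicitly, splitting each 128-bit piece as a = a_1 t^64 + a_0 and
// so on, the product a t^256 + b t^192 + c t^128 + d t^64 + e has
// 64-bit words x_5 = a_1, x_4 = a_0 + b_1, x_3 = b_0 + c_1,
// x_2 = c_0 + d_1, x_1 = d_0 + e_1, and x_0 = e_0.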
- movdqa xmm2, xmm6 // (d_1; d_0) again
- movdqa xmm4, xmm5 // (b_1; b_0) again
- pslldq xmm6, 8 // (0; d_1)
- psrldq xmm5, 8 // (b_0; 0)
- psrldq xmm2, 8 // (d_0; 0)
- pslldq xmm4, 8 // (0; b_1)
- pxor xmm5, xmm6 // (b_0; d_1)
- pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1)
- pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0)
- pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
+ movdqa xmm2, xmm6 // (d_0; d_1) again
+ movdqa xmm4, xmm5 // (b_0; b_1) again
+ pslldq xmm6, 8 // (d_1; 0)
+ psrldq xmm5, 8 // (0; b_0)
+ psrldq xmm2, 8 // (0; d_0)
+ pslldq xmm4, 8 // (b_1; 0)
+ pxor xmm5, xmm6 // (d_1; b_0)
+ pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1)
+ pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0)
+ pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)
// Next, the reduction. Our polynomial this time is p(t) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
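// (Mechanically, at least, it's because this p(t) has the same low
// terms t^7 + t^2 + t + 1 as the 128-bit polynomial, and the shift
// counts depend only on those; why the polynomials match up like
// that is another question.)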
// First, shift the high bits down.
- // xmm0 = // (x_5; x_4)
- // xmm1 = // (x_3; x_2)
- // xmm2 = // (x_1; x_0)
+ // xmm0 = // (x_4; x_5)
+ // xmm1 = // (x_2; x_3)
+ // xmm2 = // (x_0; x_1)
// xmm3--xmm7 spare
- movdqa xmm3, xmm0 // (x_5; x_4) copy
- movdqa xmm4, xmm0 // (x_5; x_4) copy
- movdqa xmm5, xmm0 // (x_5; x_4) copy
- pslld xmm3, 31 // (x_5; x_4) b_i for t
- pslld xmm4, 30 // (x_5; x_4) b_i for t^2
- pslld xmm5, 25 // (x_5; x_4) b_i for t^7
- movq xmm6, xmm1 // (x_3; 0) copy
+ movdqa xmm3, xmm0 // (x_4; x_5) copy
+ movdqa xmm4, xmm0 // (x_4; x_5) copy
+ movdqa xmm5, xmm0 // (x_4; x_5) copy
+ pslld xmm3, 31 // (x_4; x_5) b_i for t
+ pslld xmm4, 30 // (x_4; x_5) b_i for t^2
+ pslld xmm5, 25 // (x_4; x_5) b_i for t^7
+ movq xmm6, xmm1 // (0; x_3) copy
pxor xmm3, xmm4
- movq xmm7, xmm1 // (x_3; 0) copy
+ movq xmm7, xmm1 // (0; x_3) copy
pxor xmm3, xmm5
- movq xmm5, xmm1 // (x_3; 0) copy
- movdqa xmm4, xmm3 // (x_5; x_4) b_i combined
- pslld xmm6, 31 // (x_3; 0) b_i for t
- pslld xmm7, 30 // (x_3; 0) b_i for t^2
- pslld xmm5, 25 // (x_3; 0) b_i for t^7
- psrldq xmm3, 12 // (x_5; x_4) low contrib
- pslldq xmm4, 4 // (x_5; x_4) high contrib
+ movq xmm5, xmm1 // (0; x_3) copy
+ movdqa xmm4, xmm3 // (x_4; x_5) b_i combined
+ pslld xmm6, 31 // (0; x_3) b_i for t
+ pslld xmm7, 30 // (0; x_3) b_i for t^2
+ pslld xmm5, 25 // (0; x_3) b_i for t^7
+ psrldq xmm3, 12 // (x_4; x_5) low contrib
+ pslldq xmm4, 4 // (x_4; x_5) high contrib
pxor xmm6, xmm7
pxor xmm2, xmm3
pxor xmm6, xmm5
// And finally shift the low bits up. Unfortunately, we also have to
// split the low bits out.
- // xmm0 = // (x'_5; x'_4)
- // xmm1 = // (x'_3; x'_2)
- // xmm2 = // (x'_1; x'_0)
- movdqa xmm5, xmm1 // copies of (x'_3; x'_2)
+ // xmm0 = // (x'_4; x'_5)
+ // xmm1 = // (x'_2; x'_3)
+ // xmm2 = // (x'_0; x'_1)
+ movdqa xmm5, xmm1 // copies of (x'_2; x'_3)
movdqa xmm6, xmm1
movdqa xmm7, xmm1
- psrldq xmm1, 8 // bring down (x'_2; ?)
- movdqa xmm3, xmm0 // copies of (x'_5; x'_4)
+ psrldq xmm1, 8 // bring down (?; x'_2)
+ movdqa xmm3, xmm0 // copies of (x'_4; x'_5)
movdqa xmm4, xmm0
- punpcklqdq xmm1, xmm2 // (x'_2; x'_1)
- psrldq xmm2, 8 // (x'_0; ?)
+ punpcklqdq xmm1, xmm2 // (x'_1; x'_2)
+ psrldq xmm2, 8 // (?; x'_0)
pxor xmm2, xmm5 // low half and unit contrib
pxor xmm1, xmm0
psrld xmm5, 1
pxor xmm0, xmm4
pxor xmm5, xmm2 // mix everything together
pxor xmm0, xmm1
- movq xmm1, xmm5 // shunt (z_0; ?) into proper place
+ movq xmm1, xmm5 // shunt (?; z_0) into proper place
.endm
.macro mul256
// On x86, there aren't quite enough registers, so spill one for a
// bit. On AMD64, we can keep on going, so it's all good.
- // xmm0 = // u_1 = (u_11; u_10)
- // xmm1 = // u_0 = (u_01; u_00)
- // xmm2 = // v_1 = (v_11; v_10)
- // xmm3 = // v_0 = (v_01; v_00)
+ // xmm0 = // u_1 = (u_10; u_11)
+ // xmm1 = // u_0 = (u_00; u_01)
+ // xmm2 = // v_1 = (v_10; v_11)
+ // xmm3 = // v_0 = (v_00; v_01)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
movdqa [SP + 0], xmm3
# define V0 [SP + 0]
#endif
#if CPUFAM_AMD64
movdqa xmm8, xmm3
# define V0 xmm8
#endif
- pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10)
- pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10)
+ pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11)
+ pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11)
// Start by building the cross product, q = u_* v_*.
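// (This is Karatsuba's identity from the earlier remark: u v =
// u_1 v_1 t^256 + (q + u_1 v_1 + u_0 v_0) t^128 + u_0 v_0, so three
// 128-bit multiplications suffice instead of four.)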
movdqa xmm7, xmm4 // more copies of u_*
// the /last/ byte in the block. If the block size is not a multiple of
// 16 bytes, then there must be padding. 96-bit blocks are weird: the
// padding is inserted at the /least/ significant end, so the register
-// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
// significant end.
//
// * The `words' format consists of a sequence of bytes, as in the
endprologue
movdqu xmm0, [A]
movdqu xmm1, [K]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
mul128
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
movdqu [A], xmm0
ret
ENDFUNC
endprologue
movq xmm0, [A]
movq xmm1, [K]
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
mul64
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
movq [A], xmm0
ret
ENDFUNC
movd xmm2, [A + 8]
movdqu xmm1, [K]
punpcklqdq xmm0, xmm2
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
mul96
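// (The result comes back with mul96's padding word still at the
// bottom: put the low 64 bits back in external order, and shift the
// padding out of the way so the top 32 bits are in reach of movd.)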
- pshufd xmm1, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm0, SHUF(0, 1, 2, 3)
psrldq xmm0, 4
movq [A + 0], xmm1
movd [A + 8], xmm0
movq xmm1, [A + 0]
movdqu xmm2, [K + 0]
movq xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
mul192
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
movdqu [A + 8], xmm0
movq [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
movdqu xmm1, [A + 0]
movdqu xmm2, [K + 0]
movdqu xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
mul256
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
movdqu [A + 16], xmm0
movdqu [A + 0], xmm1
#if CPUFAM_X86