// u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
//
// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
- // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
+ // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
// Then
//
- // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
+ // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
// = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
//
// which is almost the bit-reversal of u v, only it's shifted right
- // by one place. Oh, well: we'll have to shift it back later.
+ // by one place. Putting this another way, what we have is actually
+ // the bit reversal of the product u v t. We could get the correct
+ // answer (modulo p(t)) if we'd sneakily divided one of the operands
+ // by t before we started. Conveniently, v is actually the secret
+ // value k set up by the GCM `mktable' function, so we can arrange to
+ // actually store k/t (mod p(t)) and then the product will come out
+ // correct (modulo p(t)) and we won't have anything more to worry
+ // about here.
//
// That was important to think about, but there's not a great deal to
// do about it yet other than to convert what we've got from the
movdqa xmm1, xmm2 // (m_1; m_0) again
pslldq xmm2, 8 // (0; m_1)
psrldq xmm1, 8 // (m_0; 0)
- pxor xmm0, xmm2 // x_1 = u_1 v_1 + m_1
- pxor xmm1, xmm4 // x_0 = u_0 v_0 + t^64 m_0
-
- // Two problems remain. The first is that this product is shifted
- // left (from GCM's backwards perspective) by one place, which is
- // annoying. Let's take care of that now. Once this is done, we'll
- // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
- // low half of the product and xmm0 the high half. (The following
- // diagrams show bit 0 consistently on the right.)
- //
- // xmm1
- // ,-------------.-------------.-------------.-------------.
- // | 0 x_0-x_30 | x_31-x_62 | x_63-x_94 | x_95-x_126 |
- // `-------------^-------------^-------------^-------------'
- //
- // xmm0
- // ,-------------.-------------.-------------.-------------.
- // | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
- // `-------------^-------------^-------------^-------------'
- //
- // We start by shifting each 32-bit lane right (from GCM's point of
- // view -- physically, left) by one place, which gives us this:
- //
- // low (xmm3)
- // ,-------------.-------------.-------------.-------------.
- // | x_0-x_30 0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
- // `-------------^-------------^-------------^-------------'
- //
- // high (xmm2)
- // ,-------------.-------------.-------------.-------------.
- // |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
- // `-------------^-------------^-------------^-------------'
- //
- // but we've lost a bunch of bits. We separately shift each lane
- // left by 31 places to give us the bits we lost.
- //
- // low (xmm1)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 |
- // `-------------^-------------^-------------^-------------'
- //
- // high (xmm0)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
- // `-------------^-------------^-------------^-------------'
- //
- // Which is close, but we don't get a cigar yet. To get the missing
- // bits into position, we shift each of these right by a lane, but,
- // alas, the x_127 falls off, so, separately, we shift the high
- // register left by three lanes, so that everything is lined up
- // properly when we OR them all together:
- //
- // low (xmm1)
- // ,-------------.-------------.-------------.-------------.
- // ? 0...0 x_31 | 0...0 x_63 | 0...0 x_95 | 0...0 |
- // `-------------^-------------^-------------^-------------'
- //
- // wrap (xmm4)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 | 0...0 | 0...0 | 0...0 x_127 |
- // `-------------^-------------^-------------^-------------'
- //
- // high (xmm0)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 | 0...0 |
- // `-------------^-------------^-------------^-------------'
- //
- // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
- // low 128 coefficients, while the `high' registers (xmm0, xmm2)
- // collect the high 127 registers, leaving a zero bit at the most
- // significant end as we expect.
-
- // xmm0 = // (x_7, x_6; x_5, x_4)
- // xmm1 = // (x_3, x_2; x_1, x_0)
- movdqa xmm3, xmm1 // (x_3, x_2; x_1, x_0) again
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
- psrld xmm1, 31 // shifted left; just the carries
- psrld xmm0, 31
- pslld xmm3, 1 // shifted right, but dropped carries
- pslld xmm2, 1
- movdqa xmm4, xmm0 // another copy for the carry around
- pslldq xmm1, 4 // move carries over
- pslldq xmm0, 4
- psrldq xmm4, 12 // the big carry wraps around
- por xmm1, xmm3
- por xmm0, xmm2 // (y_7, y_6; y_5, y_4)
- por xmm1, xmm4 // (y_3, y_2; y_1, y_0)
-
- // And the other problem is that the result needs to be reduced
+ pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1
+ pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0
+
+ // The remaining problem is that the result needs to be reduced
// modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
// t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
// such that z_0 + z_1 R = u v using the identity R = t^128: now we
// must collapse the two halves of z together using the other
// identity R = t^7 + t^2 + t + 1.
//
// We do this by working on each 32-bit word of the high half of z
- // separately, so consider y_i, for some 4 <= i < 8. Certainly, y_i
- // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
+ // separately, so consider x_i, for some 4 <= i < 8. Certainly, x_i
+ // t^{32i} = x_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) x_i t^{32(i-4)},
// but we can't use that directly without breaking up the 32-bit word
- // structure. Instead, we start by considering just y_i t^7
- // t^{32(i-4)}, which again looks tricky. Now, split y_i = a_i +
+ // structure. Instead, we start by considering just x_i t^7
+ // t^{32(i-4)}, which again looks tricky. Now, split x_i = a_i +
// t^25 b_i, with deg a_i < 25; then
//
- // y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
+ // x_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
//
- // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
+ // We can similarly decompose x_i t^2 and x_i t into a pair of 32-bit
// contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
// splits are different. This is lovely, with one small snag: when
- // we do this to y_7, we end up with a contribution back into the
+ // we do this to x_7, we end up with a contribution back into the
// t^128 coefficient word. But notice that only the low seven bits
// of this word are affected, so there's no knock-on contribution
// into the t^32 word. Therefore, if we handle the high bits of each
// word together, and then the low bits, everything will be fine.
// First, shift the high bits down.
- movdqa xmm2, xmm0 // (y_7, y_6; y_5, y_4) again
- movdqa xmm3, xmm0 // (y_7, y_6; y_5, y_4) yet again
- movdqa xmm4, xmm0 // (y_7, y_6; y_5, y_4) again again
+ movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
+ movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again
+ movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again
pslld xmm2, 31 // the b_i for t
pslld xmm3, 30 // the b_i for t^2
pslld xmm4, 25 // the b_i for t^7
// respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4.
// The multiplication is thankfully easy.
- pclmullqlqdq xmm0, xmm1 // u v
-
- // Shift the product up by one place. After this, we're in GCM
- // bizarro-world.
- movdqa xmm1, xmm0 // u v again
- psrld xmm0, 31 // shifted left; just the carries
- pslld xmm1, 1 // shifted right, but dropped carries
- pslldq xmm0, 4 // move carries over
- por xmm1, xmm0 // (y_3, y_2; y_1, y_0)
+ pclmullqlqdq xmm1, xmm0 // u v
// Now we must reduce. This is essentially the same as the 128-bit
// case above, but mostly simpler because everything is smaller. The
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
// First, we must detach the top (`low'!) half of the result.
- movdqa xmm0, xmm1 // (y_3, y_2; y_1, y_0) again
- psrldq xmm1, 8 // (y_1, y_0; 0, 0)
+ movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again
+ psrldq xmm1, 8 // (x_1, x_0; 0, 0)
// Next, shift the high bits down.
- movdqa xmm2, xmm0 // (y_3, y_2; ?, ?) again
- movdqa xmm3, xmm0 // (y_3, y_2; ?, ?) yet again
- movdqa xmm4, xmm0 // (y_3, y_2; ?, ?) again again
+ movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again
+ movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again
+ movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again
pslld xmm2, 31 // b_i for t
pslld xmm3, 29 // b_i for t^3
pslld xmm4, 28 // b_i for t^4
// e_0 + e_1.
//
// The place values for the two halves are (t^160, t^128; t^96, ?)
- // and (?, t^64; t^32, 1).
+ // and (?, t^64; t^32, 1). But we also want to shift the high part
+ // left by a word, for symmetry's sake.
psrldq xmm0, 8 // (d; 0) = d t^128
pxor xmm2, xmm3 // e = (e_0 + e_1)
movdqa xmm1, xmm4 // f again
pxor xmm0, xmm2 // d t^128 + e t^64
psrldq xmm2, 12 // e[31..0] t^64
psrldq xmm1, 4 // f[95..0]
- pslldq xmm4, 8 // f[127..96]
+ pslldq xmm4, 12 // f[127..96], shifted
+ pslldq xmm0, 4 // shift high 96 bits
pxor xmm1, xmm2 // low 96 bits of result
pxor xmm0, xmm4 // high 96 bits of result
- // Next, shift everything one bit to the left to compensate for GCM's
- // strange ordering. This will be easier if we shift up the high
- // half by a word before we start. After this we're in GCM bizarro-
- // world.
- movdqa xmm3, xmm1 // low half again
- pslldq xmm0, 4 // shift high half
- psrld xmm1, 31 // shift low half down: just carries
- movdqa xmm2, xmm0 // copy high half
- pslld xmm3, 1 // shift low half down: drop carries
- psrld xmm0, 31 // shift high half up: just carries
- pslld xmm2, 1 // shift high half down: drop carries
- movdqa xmm4, xmm0 // copy high carries for carry-around
- pslldq xmm0, 4 // shift carries down
- pslldq xmm1, 4
- psrldq xmm4, 12 // the big carry wraps around
- por xmm1, xmm3
- por xmm0, xmm2
- por xmm1, xmm4
-
// Finally, the reduction. This is essentially the same as the
// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
// t^9 + t^6 + 1. The degrees are larger but not enough to cause
movdqa xmm4, xmm0 // (u_2; u_1) again
movdqa xmm5, xmm0 // (u_2; u_1) yet again
movdqa xmm6, xmm0 // (u_2; u_1) again again
- movdqa xmm7, xmm1 // (u_0; ?) again
- punpcklqdq xmm1, xmm3 // (u_0; v_0)
+ movdqa xmm7, xmm3 // (v_0; ?) again
+ punpcklqdq xmm3, xmm1 // (v_0; u_0)
pclmulhqhqdq xmm4, xmm2 // u_1 v_1
- pclmullqlqdq xmm3, xmm0 // u_2 v_0
+ pclmullqlqdq xmm1, xmm2 // u_0 v_2
pclmullqhqdq xmm5, xmm2 // u_2 v_1
pclmulhqlqdq xmm6, xmm2 // u_1 v_2
- pxor xmm4, xmm3 // u_2 v_0 + u_1 v_1
- pclmullqlqdq xmm7, xmm2 // u_0 v_2
+ pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1
+ pclmullqlqdq xmm7, xmm0 // u_2 v_0
pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
- pxor xmm4, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
+ pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
- pclmulhqhqdq xmm6, xmm1 // u_1 v_0
- pclmulhqlqdq xmm2, xmm1 // u_0 v_1
- pclmullqhqdq xmm1, xmm1 // e = u_0 v_0
- pxor xmm2, xmm6 // d = u_1 v_0 + u_0 v_1
+ pclmulhqlqdq xmm6, xmm3 // u_1 v_0
+ pclmulhqhqdq xmm2, xmm3 // u_0 v_1
+ pclmullqhqdq xmm3, xmm3 // e = u_0 v_0
+ pxor xmm6, xmm2 // d = u_1 v_0 + u_0 v_1
- // Next, the piecing together of the product.
+ // Next, the piecing together of the product. There's significant
+ // work here to leave the completed pieces in sensible registers.
// xmm0 = // (a_1; a_0) = a = u_2 v_2
// xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
- // xmm4 = // (c_1; c_0) = c = u_0 v_2 +
+ // xmm1 = // (c_1; c_0) = c = u_0 v_2 +
// u_1 v_1 + u_2 v_0
- // xmm2 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
- // xmm1 = // (e_1; e_0) = e = u_0 v_0
- // xmm3, xmm6, xmm7 spare
- movdqa xmm3, xmm2 // (d_1; d_0) again
- movdqa xmm6, xmm5 // (b_1; b_0) again
- pslldq xmm2, 8 // (0; d_1)
+ // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
+ // xmm3 = // (e_1; e_0) = e = u_0 v_0
+ // xmm2, xmm4, xmm7 spare
+ movdqa xmm2, xmm6 // (d_1; d_0) again
+ movdqa xmm4, xmm5 // (b_1; b_0) again
+ pslldq xmm6, 8 // (0; d_1)
psrldq xmm5, 8 // (b_0; 0)
- psrldq xmm3, 8 // (d_0; 0)
- pslldq xmm6, 8 // (0; b_1)
- pxor xmm5, xmm2 // (b_0; d_1)
- pxor xmm0, xmm6 // x_2 = (a_1; a_0 + b_1)
- pxor xmm3, xmm1 // x_0 = (e_1 + d_0; e_0)
- pxor xmm4, xmm5 // x_1 = (b_0 + c_1; c_0 + d_1)
-
- // Now, shift it right (from GCM's point of view) by one bit, and try
- // to leave the result in less random registers. After this, we'll
- // be in GCM bizarro-world.
- // xmm1, xmm2, xmm5, xmm6, xmm7 spare
- movdqa xmm5, xmm0 // copy x_2
- movdqa xmm1, xmm4 // copy x_1
- movdqa xmm2, xmm3 // copy x_0
- psrld xmm0, 31 // x_2 carries
- psrld xmm4, 31 // x_1 carries
- psrld xmm3, 31 // x_0 carries
- pslld xmm5, 1 // x_2 shifted
- pslld xmm1, 1 // x_1 shifted
- pslld xmm2, 1 // x_0 shifted
- movdqa xmm6, xmm0 // x_2 carry copy
- movdqa xmm7, xmm4 // x_1 carry copy
- pslldq xmm0, 4 // x_2 carry shifted
- pslldq xmm4, 4 // x_1 carry shifted
- pslldq xmm3, 4 // x_0 carry shifted
- psrldq xmm6, 12 // x_2 carry out
- psrldq xmm7, 12 // x_1 carry out
- por xmm0, xmm5 // (y_5; y_4)
- por xmm1, xmm4
- por xmm2, xmm3
- por xmm1, xmm6 // (y_3; y_2)
- por xmm2, xmm7 // (y_1; y_0)
+ psrldq xmm2, 8 // (d_0; 0)
+ pslldq xmm4, 8 // (0; b_1)
+ pxor xmm5, xmm6 // (b_0; d_1)
+ pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1)
+ pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0)
+ pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
// Next, the reduction. Our polynomial this time is p(t) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // xmm0 = // (y_5; y_4)
- // xmm1 = // (y_3; y_2)
- // xmm2 = // (y_1; y_0)
+ // xmm0 = // (x_5; x_4)
+ // xmm1 = // (x_3; x_2)
+ // xmm2 = // (x_1; x_0)
// xmm3--xmm7 spare
- movdqa xmm3, xmm0 // (y_5; y_4) copy
- movdqa xmm4, xmm0 // (y_5; y_4) copy
- movdqa xmm5, xmm0 // (y_5; y_4) copy
- pslld xmm3, 31 // (y_5; y_4) b_i for t
- pslld xmm4, 30 // (y_5; y_4) b_i for t^2
- pslld xmm5, 25 // (y_5; y_4) b_i for t^7
- movq xmm6, xmm1 // (y_3; 0) copy
+ movdqa xmm3, xmm0 // (x_5; x_4) copy
+ movdqa xmm4, xmm0 // (x_5; x_4) copy
+ movdqa xmm5, xmm0 // (x_5; x_4) copy
+ pslld xmm3, 31 // (x_5; x_4) b_i for t
+ pslld xmm4, 30 // (x_5; x_4) b_i for t^2
+ pslld xmm5, 25 // (x_5; x_4) b_i for t^7
+ movq xmm6, xmm1 // (x_3; 0) copy
pxor xmm3, xmm4
- movq xmm7, xmm1 // (y_3; 0) copy
+ movq xmm7, xmm1 // (x_3; 0) copy
pxor xmm3, xmm5
- movq xmm5, xmm1 // (y_3; 0) copy
- movdqa xmm4, xmm3 // (y_5; y_4) b_i combined
- pslld xmm6, 31 // (y_3; 0) b_i for t
- pslld xmm7, 30 // (y_3; 0) b_i for t^2
- pslld xmm5, 25 // (y_3; 0) b_i for t^7
- psrldq xmm3, 12 // (y_5; y_4) low contrib
- pslldq xmm4, 4 // (y_5; y_4) high contrib
+ movq xmm5, xmm1 // (x_3; 0) copy
+ movdqa xmm4, xmm3 // (x_5; x_4) b_i combined
+ pslld xmm6, 31 // (x_3; 0) b_i for t
+ pslld xmm7, 30 // (x_3; 0) b_i for t^2
+ pslld xmm5, 25 // (x_3; 0) b_i for t^7
+ psrldq xmm3, 12 // (x_5; x_4) low contrib
+ pslldq xmm4, 4 // (x_5; x_4) high contrib
pxor xmm6, xmm7
pxor xmm2, xmm3
pxor xmm6, xmm5
// And finally shift the low bits up. Unfortunately, we also have to
// split the low bits out.
- // xmm0 = // (y'_5; y'_4)
- // xmm1 = // (y'_3; y'_2)
- // xmm2 = // (y'_1; y'_0)
- movdqa xmm5, xmm1 // copies of (y'_3; y'_2)
+ // xmm0 = // (x'_5; x'_4)
+ // xmm1 = // (x'_3; x'_2)
+ // xmm2 = // (x'_1; x'_0)
+ movdqa xmm5, xmm1 // copies of (x'_3; x'_2)
movdqa xmm6, xmm1
movdqa xmm7, xmm1
- psrldq xmm1, 8 // bring down (y'_2; ?)
- movdqa xmm3, xmm0 // copies of (y'_5; y'_4)
+ psrldq xmm1, 8 // bring down (x'_2; ?)
+ movdqa xmm3, xmm0 // copies of (x'_5; x'_4)
movdqa xmm4, xmm0
- punpcklqdq xmm1, xmm2 // (y'_2; y'_1)
- psrldq xmm2, 8 // (y'_0; ?)
+ punpcklqdq xmm1, xmm2 // (x'_2; x'_1)
+ psrldq xmm2, 8 // (x'_0; ?)
pxor xmm2, xmm5 // low half and unit contrib
pxor xmm1, xmm0
psrld xmm5, 1
//
// q = r s = (u_0 + u_1) (v_0 + v_1)
// = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
- // = a + d + c
+ // = a + c + b
//
// The first two terms we've already calculated; the last is the
// remaining one we want. We'll set B = t^128. We know how to do
// xmm3 = // v_0 = (v_01; v_00)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
- movdqa [esp + 0], xmm3
+ movdqa [SP + 0], xmm3
#elif CPUFAM_AMD64
movdqa xmm8, xmm3
# define V0 xmm8
pclmullqlqdq xmm4, xmm2 // u_11 v_11
pclmulhqhqdq xmm7, xmm2 // u_10 v_10
#if CPUFAM_X86
- movdqa xmm2, [esp + 0]
+ movdqa xmm2, [SP + 0]
# define V0 xmm2
#endif
pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10
movdqa xmm3, xmm0
pslldq xmm0, 8
psrldq xmm3, 8
- pxor xmm4, xmm0 // x_1 = a_1
+ pxor xmm4, xmm0 // x_3 = a_1
pxor xmm7, xmm3 // a_0
// Mix that into the product now forming in xmm4--xmm7.
#undef V0
- // Now we need to shift that whole lot one bit to the left. This
- // will also give us an opportunity to put the product back in
- // xmm0--xmm3. This is a slightly merry dance because it's nearly
- // pipelined but we don't have enough registers.
- //
- // After this, we'll be in GCM bizarro-world.
- movdqa xmm0, xmm4 // x_3 again
- psrld xmm4, 31 // x_3 carries
- pslld xmm0, 1 // x_3 shifted left
- movdqa xmm3, xmm4 // x_3 copy carries
- movdqa xmm1, xmm5 // x_2 again
- pslldq xmm4, 4 // x_3 carries shifted up
- psrld xmm5, 31 // x_2 carries
- psrldq xmm3, 12 // x_3 big carry out
- pslld xmm1, 1 // x_2 shifted left
- por xmm0, xmm4 // x_3 mixed together
- movdqa xmm4, xmm5 // x_2 copy carries
- movdqa xmm2, xmm6 // x_1 again
- pslldq xmm5, 4 // x_2 carries shifted up
- psrld xmm6, 31 // x_1 carries
- psrldq xmm4, 12 // x_2 big carry out
- pslld xmm2, 1 // x_1 shifted
- por xmm1, xmm5 // x_2 mixed together
- movdqa xmm5, xmm6 // x_1 copy carries
- por xmm1, xmm3 // x_2 with carry from x_3
- movdqa xmm3, xmm7 // x_0 again
- pslldq xmm6, 4 // x_1 carries shifted up
- psrld xmm7, 31 // x_2 carries
- psrldq xmm5, 12 // x_1 big carry out
- pslld xmm3, 1 // x_0 shifted
- por xmm2, xmm6 // x_1 mixed together
- pslldq xmm7, 4 // x_0 carries shifted up
- por xmm2, xmm4 // x_1 with carry from x_2
- por xmm3, xmm7 // x_0 mixed together
- por xmm3, xmm5 // x_0 with carry from x_1
-
// Now we must reduce. This is essentially the same as the 128-bit
// case above, but more complicated because everything is bigger.
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
// First, shift the high bits down.
- movdqa xmm4, xmm0 // y_3 again
- movdqa xmm5, xmm0 // y_3 yet again
- movdqa xmm6, xmm0 // y_3 again again
- pslld xmm4, 30 // y_3: b_i for t^2
- pslld xmm5, 27 // y_3: b_i for t^5
- pslld xmm6, 22 // y_3: b_i for t^10
- movdqa xmm7, xmm1 // y_2 again
- pxor xmm4, xmm5
- movdqa xmm5, xmm1 // y_2 again
- pxor xmm4, xmm6
- movdqa xmm6, xmm1 // y_2 again
- pslld xmm7, 30 // y_2: b_i for t^2
- pslld xmm5, 27 // y_2: b_i for t^5
- pslld xmm6, 22 // y_2: b_i for t^10
- pxor xmm7, xmm5
- movdqa xmm5, xmm4
- pxor xmm7, xmm6
- psrldq xmm4, 4
- movdqa xmm6, xmm7
- pslldq xmm5, 12
- psrldq xmm7, 4
- pxor xmm2, xmm4
- pslldq xmm6, 12
- pxor xmm3, xmm7
- pxor xmm1, xmm5
- pxor xmm2, xmm6
+ movdqa xmm0, xmm4 // x_3 again
+ movdqa xmm1, xmm4 // x_3 yet again
+ movdqa xmm2, xmm4 // x_3 again again
+ pslld xmm0, 30 // x_3: b_i for t^2
+ pslld xmm1, 27 // x_3: b_i for t^5
+ pslld xmm2, 22 // x_3: b_i for t^10
+ movdqa xmm3, xmm5 // x_2 again
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm5 // x_2 again
+ pxor xmm0, xmm2 // b_3
+ movdqa xmm2, xmm5 // x_2 again
+ pslld xmm3, 30 // x_2: b_i for t^2
+ pslld xmm1, 27 // x_2: b_i for t^5
+ pslld xmm2, 22 // x_2: b_i for t^10
+ pxor xmm3, xmm1
+ movdqa xmm1, xmm0
+ pxor xmm3, xmm2 // b_2
+ psrldq xmm0, 4
+ movdqa xmm2, xmm3
+ pslldq xmm1, 12
+ psrldq xmm3, 4
+ pxor xmm6, xmm0
+ pslldq xmm2, 12
+ pxor xmm7, xmm3
+ pxor xmm5, xmm1
+ pxor xmm6, xmm2
// And then shift the low bits up.
- movdqa xmm4, xmm0 // y_3 again
- movdqa xmm5, xmm1 // y_2 again
- movdqa xmm6, xmm0 // y_3 yet again
- movdqa xmm7, xmm1 // y_2 yet again
- pxor xmm2, xmm0 // y_1 and unit contrib from y_3
- pxor xmm3, xmm1 // y_0 and unit contrib from y_2
- psrld xmm0, 2
- psrld xmm1, 2
- psrld xmm4, 5
- psrld xmm5, 5
- psrld xmm6, 10
- psrld xmm7, 10
- pxor xmm0, xmm2 // y_1, with y_3 units and t^2
- pxor xmm1, xmm3 // y_0, with y_2 units and t^2
- pxor xmm4, xmm6 // y_3 t^5 and t^10 contribs
- pxor xmm5, xmm7 // y_2 t^5 and t^10 contribs
+ movdqa xmm0, xmm4 // x_3 again
+ movdqa xmm1, xmm5 // x_2 again
+ movdqa xmm2, xmm4 // x_3 yet again
+ movdqa xmm3, xmm5 // x_2 yet again
+ pxor xmm6, xmm4 // x_1 and unit contrib from x_3
+ pxor xmm7, xmm5 // x_0 and unit contrib from x_2
+ psrld xmm4, 2
+ psrld xmm5, 2
+ psrld xmm0, 5
+ psrld xmm1, 5
+ psrld xmm2, 10
+ psrld xmm3, 10
+ pxor xmm4, xmm6 // x_1, with x_3 units and t^2
+ pxor xmm5, xmm7 // x_0, with x_2 units and t^2
+ pxor xmm0, xmm2 // x_3 t^5 and t^10 contribs
+ pxor xmm1, xmm3 // x_2 t^5 and t^10 contribs
pxor xmm0, xmm4 // high half of reduced result
pxor xmm1, xmm5 // low half; all done
.endm
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movdqu xmm0, [A]
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movq xmm0, [A]
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movq xmm0, [A + 0]
// updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 2*16 + 8
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
// A is updated with the product A K.
#if CPUFAM_X86
- pushreg ebp
+ pushreg BP
setfp
- mov A, [esp + 8]
- mov K, [esp + 12]
- and esp, ~15
- sub esp, 16
+ mov A, [SP + 8]
+ mov K, [SP + 12]
+ stalloc 16
+ and SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
movdqu [A + 0], xmm1
#if CPUFAM_X86
dropfp
- popreg ebp
+ popreg BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0
// exit, A is updated with the product A K.
#if CPUFAM_X86
- pushreg ebp
+ pushreg BP
setfp
- mov A, [esp + 8]
- mov K, [esp + 12]
- and esp, ~15
+ mov A, [SP + 8]
+ mov K, [SP + 12]
+ stalloc 16
ldgot ecx
- sub esp, 16
+ and SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
movdqu [A + 0], xmm1
#if CPUFAM_X86
dropfp
- popreg ebp
+ popreg BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0