- // Combine the middle (Karatsuba-style) term m = t^64 m_1 + m_0.
- // It straddles the boundary between the two 64-bit product halves,
- // so split it and fold each piece into the appropriate half.
- // (Presumably xmm0 = u_1 v_1 and xmm4 = u_0 v_0 from the preceding
- // multiplications, as implied by the result comments below.)
- pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
- movdqa xmm1, xmm2 // (m_1; m_0) again
- pslldq xmm2, 8 // (0; m_1)
- psrldq xmm1, 8 // (m_0; 0)
- pxor xmm0, xmm2 // x_1 = u_1 v_1 + m_1
- pxor xmm1, xmm4 // x_0 = u_0 v_0 + t^64 m_0
-
- // Two problems remain. The first is that this product is shifted
- // left (from GCM's backwards perspective) by one place, which is
- // annoying. Let's take care of that now. Once this is done, we'll
- // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
- // low half of the product and xmm0 the high half. (The following
- // diagrams show bit 0 consistently on the right.)
- //
- // xmm1
- // ,-------------.-------------.-------------.-------------.
- // | 0 x_0-x_30 | x_31-x_62 | x_63-x_94 | x_95-x_126 |
- // `-------------^-------------^-------------^-------------'
- //
- // xmm0
- // ,-------------.-------------.-------------.-------------.
- // | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
- // `-------------^-------------^-------------^-------------'
- //
- // We start by shifting each 32-bit lane right (from GCM's point of
- // view -- physically, left) by one place, which gives us this:
- //
- // low (xmm3)
- // ,-------------.-------------.-------------.-------------.
- // | x_0-x_30 0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
- // `-------------^-------------^-------------^-------------'
- //
- // high (xmm2)
- // ,-------------.-------------.-------------.-------------.
- // |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
- // `-------------^-------------^-------------^-------------'
- //
- // but we've lost a bunch of bits. We separately shift each lane
- // left by 31 places to give us the bits we lost.
- //
- // low (xmm1)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 |
- // `-------------^-------------^-------------^-------------'
- //
- // high (xmm0)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
- // `-------------^-------------^-------------^-------------'
- //
- // Which is close, but we don't get a cigar yet. To get the missing
- // bits into position, we shift each of these right by a lane, but,
- // alas, the x_127 falls off, so, separately, we shift the high
- // register left by three lanes, so that everything is lined up
- // properly when we OR them all together:
- //
- // low (xmm1)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 | 0...0 |
- // `-------------^-------------^-------------^-------------'
- //
- // wrap (xmm4)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 | 0...0 | 0...0 | 0...0 x_127 |
- // `-------------^-------------^-------------^-------------'
- //
- // high (xmm0)
- // ,-------------.-------------.-------------.-------------.
- // | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 | 0...0 |
- // `-------------^-------------^-------------^-------------'
- //
- // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
- // low 128 coefficients, while the `high' registers (xmm0, xmm2)
- // collect the high 127 coefficients, leaving a zero bit at the most
- // significant end as we expect.
-
- // Carry out the plan described above.  NB: `left'/`right' in the
- // per-instruction comments below are in GCM's backwards bit
- // ordering; the physical shift direction is the opposite (e.g.,
- // `psrld ..., 31' physically shifts right, which is a GCM left
- // shift extracting each lane's carry bit).
- //
- // xmm0 = // (x_7, x_6; x_5, x_4)
- // xmm1 = // (x_3, x_2; x_1, x_0)
- movdqa xmm3, xmm1 // (x_3, x_2; x_1, x_0) again
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
- psrld xmm1, 31 // shifted left; just the carries
- psrld xmm0, 31 // ... ditto for the high half
- pslld xmm3, 1 // shifted right, but dropped carries
- pslld xmm2, 1 // ... ditto for the high half
- movdqa xmm4, xmm0 // another copy for the carry around
- pslldq xmm1, 4 // move carries over
- pslldq xmm0, 4 // ... and the high half's carries
- psrldq xmm4, 12 // the big carry wraps around
- por xmm1, xmm3 // reunite carries with the shifted low half
- por xmm0, xmm2 // (y_7, y_6; y_5, y_4)
- por xmm1, xmm4 // (y_3, y_2; y_1, y_0)
-
- // And the other problem is that the result needs to be reduced