- // q9 q8
- // ,-------------.-------------. ,-------------.-------------.
- // | 0 x_0-x_62 | x_63-x_126 | | x_127-x_190 | x_191-x_254 |
- // `-------------^-------------' `-------------^-------------'
- // d19 d18 d17 d16
- //
- // We start by shifting each 64-bit lane right (from GCM's point of
- // view -- physically, left) by one place, which gives us this:
- //
- // low (q9) high (q8)
- // ,-------------.-------------. ,-------------.-------------.
- // | x_0-x_62 0 |x_64-x_126 0 | |x_128-x_190 0|x_192-x_254 0|
- // `-------------^-------------' `-------------^-------------'
- // d19 d18 d17 d16
- //
- // but we've lost a bunch of bits. We separately shift each lane
- // left (physically, right) by 63 places to give us the bits we lost.
- //
- // low (q3) high (q2)
- // ,-------------.-------------. ,-------------.-------------.
- // | 0...0 | 0...0 x_63 | | 0...0 x_127 | 0...0 x_191 |
- // `-------------^-------------' `-------------^-------------'
- // d6 d5 d4
- //
- // Since we can address each of these pieces individually, putting
- // them together is relatively straightforward.
-
-
- vshr.u64 d6, d18, #63 // shifted left; just the carry out of d18 (must read d18 before q9 is overwritten)
- vshl.u64 q9, q9, #1 // shifted right, but dropped carries
- vshr.u64 q2, q8, #63 // d5, d4 = carries out of d17, d16 (must read q8 before it is overwritten)
- vshl.u64 q8, q8, #1 // shifted right, but dropped carries
- vorr d0, d19, d6 // y_0 = shifted d19 plus the carry from d18
- vorr d1, d18, d5 // y_1 = shifted d18 plus the carry from d17
- vorr d2, d17, d4 // y_2 = shifted d17 plus the carry from d16
- vmov d3, d16 // y_3 = shifted d16; nothing carries into it
-
- // And the other one is that the result needs to be reduced modulo
- // p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + t^2 + t +
- // 1 in our field. So far, we've calculated z_0 and z_1 such that
- // z_0 + z_1 R = u v using the identity R = t^128: now we must
- // collapse the two halves of y together using the other identity R =
- // t^7 + t^2 + t + 1.
- //
- // We do this by working on y_2 and y_3 separately, so consider y_i
- // for i = 2 or 3. Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
- // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that