+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// GCM acceleration for x86 processors
+///
+/// (c) 2018 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software: you can redistribute it and/or modify it
+/// under the terms of the GNU Library General Public License as published
+/// by the Free Software Foundation; either version 2 of the License, or
+/// (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful, but
+/// WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+/// Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb. If not, write to the Free Software
+/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+/// USA.
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include "config.h"
+#include "asm-common.h"
+
+ .arch .pclmul
+
+ .text
+
+///--------------------------------------------------------------------------
+/// Common register allocation.
+
+#if CPUFAM_X86
+# define A eax
+# define K edx
+#elif CPUFAM_AMD64 && ABI_SYSV
+# define A rdi
+# define K rsi
+#elif CPUFAM_AMD64 && ABI_WIN
+# define A rcx
+# define K rdx
+#endif
+
+///--------------------------------------------------------------------------
+/// Multiplication macros.
+
+ // The good news is that we have a fancy instruction to do the
+ // multiplications. The bad news is that it's not particularly well-
+ // suited to the job.
+ //
+ // For one thing, it only does a 64-bit multiplication, so in general
+ // we'll need to synthesize the full-width multiply by hand. For
+ // another thing, it doesn't help with the reduction, so we have to
+ // do that by hand too. And, finally, GCM has crazy bit ordering,
+ // and the instruction does nothing useful for that at all.
+ //
+ // Focusing on that last problem first: the bits aren't in monotonic
+ // significance order unless we permute them. If we reverse the byte
+ // order, then we'll have the bits in monotonic order, but backwards,
+ // so the degree-0 coefficient will be in the most-significant bit.
+ //
+ // This is less of a difficulty than it seems at first, because
+ // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
+ // SUM_{0<=j<n} v_j t^j; then
+ //
+ // u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
+ //
+ // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
+ // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
+ // Then
+ //
+ // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
+ // = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
+ //
+ // which is almost the bit-reversal of u v, only it's shifted right
+ // by one place. Oh, well: we'll have to shift it back later.
+ //
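+ // The identity above is easy to check exhaustively at a small size.
+ // Here is an illustrative C sketch (not part of this code): for 8-bit
+ // operands it verifies that multiplying the bit-reversed inputs gives
+ // the 16-bit bit-reversal of the product, shifted right by one place.
+ //
+ //   #include <assert.h>
+ //   #include <stdint.h>
+ //
+ //   /* Carry-less multiply of 8-bit operands. */
+ //   static uint16_t clmul8(uint8_t x, uint8_t y)
+ //   {
+ //     uint16_t z = 0;
+ //     for (int i = 0; i < 8; i++)
+ //       if ((y >> i) & 1) z ^= (uint16_t)x << i;
+ //     return z;
+ //   }
+ //
+ //   /* Reverse the low n bits of x. */
+ //   static uint32_t rev(uint32_t x, int n)
+ //   {
+ //     uint32_t r = 0;
+ //     for (int i = 0; i < n; i++)
+ //       if ((x >> i) & 1) r |= 1u << (n - 1 - i);
+ //     return r;
+ //   }
+ //
+ //   int main(void)
+ //   {
+ //     for (unsigned u = 0; u < 256; u++)
+ //       for (unsigned v = 0; v < 256; v++)
+ //         assert(clmul8(rev(u, 8), rev(v, 8)) ==
+ //                rev(clmul8(u, v), 16) >> 1);
+ //     return 0;
+ //   }
+ //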
+ // That was important to think about, but there's not a great deal to
+ // do about it yet other than to convert what we've got from the
+ // blockcipher's byte-ordering convention to our big-endian
+ // convention. Since this depends on the blockcipher convention,
+ // we'll leave the caller to cope with this: the macros here will
+ // assume that the operands are in `register' format, which is the
+ // byte-reversal of the external representation, padded at the
+ // most-significant end except for 96-bit blocks, which are
+ // zero-padded at the least-significant end (see `mul96' for the
+ // details). In the commentary, pieces of polynomial are numbered
+ // according to the degree of the coefficients, so the unit
+ // coefficient of some polynomial a is in a_0. Register contents are
+ // written like `(a; b)' or `(d, c; b, a)', listing the pieces from
+ // the physically least-significant end of the register; since the
+ // bit order is reversed, the highest-degree piece therefore comes
+ // first.
+ //
+ // The commentary for `mul128' is the most detailed. The other
+ // macros assume that you've already read and understood that.
+
+.macro mul128
+ // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
+ // u v in xmm0. Clobbers xmm1--xmm4.
+
+ // First for the double-precision multiplication. It's tempting to
+ // use Karatsuba's identity here, but I suspect that loses more in
+ // the shifting, bit-twiddling, and dependency chains that it gains
+ // in saving a multiplication which otherwise pipelines well.
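+ //
+ // For reference, the same four-multiply arrangement, written with C
+ // intrinsics -- an illustrative sketch only (it assumes a
+ // PCLMUL-capable target and isn't used anywhere here):
+ //
+ //   #include <wmmintrin.h>
+ //
+ //   /* 128x128 -> 255-bit carry-less multiply, schoolbook style. */
+ //   static void clmul_128x128(__m128i *hi, __m128i *lo,
+ //                             __m128i u, __m128i v)
+ //   {
+ //     __m128i hh = _mm_clmulepi64_si128(u, v, 0x11); /* hi(u) hi(v) */
+ //     __m128i ll = _mm_clmulepi64_si128(u, v, 0x00); /* lo(u) lo(v) */
+ //     __m128i hl = _mm_clmulepi64_si128(u, v, 0x01); /* hi(u) lo(v) */
+ //     __m128i lh = _mm_clmulepi64_si128(u, v, 0x10); /* lo(u) hi(v) */
+ //     __m128i mid = _mm_xor_si128(hl, lh);
+ //     *hi = _mm_xor_si128(hh, _mm_srli_si128(mid, 8));
+ //     *lo = _mm_xor_si128(ll, _mm_slli_si128(mid, 8));
+ //   }
+ //
+ // The pxor/pslldq/psrldq sequence below is exactly this mixing of the
+ // middle terms.
+ //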
+ // xmm0 = // (u_1; u_0)
+ // xmm1 = // (v_1; v_0)
+ movdqa xmm2, xmm1 // (v_1; v_0) again
+ movdqa xmm3, xmm0 // (u_1; u_0) again
+ movdqa xmm4, xmm0 // (u_1; u_0) yet again
+ pclmulhqlqdq xmm2, xmm0 // u_1 v_0
+ pclmullqlqdq xmm0, xmm1 // u_1 v_1
+ pclmulhqlqdq xmm3, xmm1 // u_0 v_1
+ pclmulhqhqdq xmm4, xmm1 // u_0 v_0
+
+ // Arrange the pieces to form a double-precision polynomial.
+ pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
+ movdqa xmm1, xmm2 // (m_1; m_0) again
+ pslldq xmm2, 8 // (0; m_1)
+ psrldq xmm1, 8 // (m_0; 0)
+ pxor xmm0, xmm2 // x_1 = u_1 v_1 + m_1
+ pxor xmm1, xmm4 // x_0 = u_0 v_0 + t^64 m_0
+
+ // Two problems remain. The first is that this product is shifted
+ // left (from GCM's backwards perspective) by one place, which is
+ // annoying. Let's take care of that now. Once this is done, we'll
+ // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
+ // low half of the product and xmm0 the high half. (The following
+ // diagrams show the physical layout of each register, with the
+ // least-significant bit on the right.)
+ //
+ // xmm1
+ // ,-------------.-------------.-------------.-------------.
+ // | 0 x_0-x_30 | x_31-x_62 | x_63-x_94 | x_95-x_126 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // xmm0
+ // ,-------------.-------------.-------------.-------------.
+ // | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // We start by shifting each 32-bit lane right (from GCM's point of
+ // view -- physically, left) by one place, which gives us this:
+ //
+ // low (xmm3)
+ // ,-------------.-------------.-------------.-------------.
+ // | x_0-x_30 0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
+ // `-------------^-------------^-------------^-------------'
+ //
+ // high (xmm2)
+ // ,-------------.-------------.-------------.-------------.
+ // |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
+ // `-------------^-------------^-------------^-------------'
+ //
+ // but we've lost a bunch of bits. We separately shift each lane
+ // left by 31 places to give us the bits we lost.
+ //
+ // low (xmm1)
+ // ,-------------.-------------.-------------.-------------.
+ // | 0...0 | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // high (xmm0)
+ // ,-------------.-------------.-------------.-------------.
+ // | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // Which is close, but we don't get a cigar yet. To get the missing
+ // bits into position, we shift each of these right by a lane, but,
+ // alas, the x_127 falls off, so, separately, we shift the high
+ // register left by three lanes, so that everything is lined up
+ // properly when we OR them all together:
+ //
+ // low (xmm1)
+ // ,-------------.-------------.-------------.-------------.
+ // | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 | 0...0 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // wrap (xmm4)
+ // ,-------------.-------------.-------------.-------------.
+ // | 0...0 | 0...0 | 0...0 | 0...0 x_127 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // high (xmm0)
+ // ,-------------.-------------.-------------.-------------.
+ // | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 | 0...0 |
+ // `-------------^-------------^-------------^-------------'
+ //
+ // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
+ // low 128 coefficients, while the `high' registers (xmm0, xmm2)
+ // collect the high 127 coefficients, leaving a zero bit at the most
+ // significant end as we expect.
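+ //
+ // In scalar terms, the lane-shift-with-carries trick looks like this
+ // C sketch of a single register held as four 32-bit lanes (purely
+ // illustrative; x[0] is the least-significant lane, and the returned
+ // `big carry' out of x[3] is what wraps into the other register via
+ // the psrldq-by-12 step):
+ //
+ //   #include <stdint.h>
+ //
+ //   static uint32_t shl1_lanes(uint32_t x[4])
+ //   {
+ //     uint32_t c[4];
+ //     for (int i = 0; i < 4; i++) c[i] = x[i] >> 31; /* psrld ..., 31 */
+ //     for (int i = 0; i < 4; i++) x[i] <<= 1;        /* pslld ..., 1 */
+ //     for (int i = 3; i > 0; i--) x[i] |= c[i - 1];  /* pslldq ..., 4 */
+ //     return c[3];                                   /* psrldq ..., 12 */
+ //   }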
+
+ // xmm0 = // (x_7, x_6; x_5, x_4)
+ // xmm1 = // (x_3, x_2; x_1, x_0)
+ movdqa xmm3, xmm1 // (x_3, x_2; x_1, x_0) again
+ movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
+ psrld xmm1, 31 // shifted left; just the carries
+ psrld xmm0, 31
+ pslld xmm3, 1 // shifted right, but dropped carries
+ pslld xmm2, 1
+ movdqa xmm4, xmm0 // another copy for the carry around
+ pslldq xmm1, 4 // move carries over
+ pslldq xmm0, 4
+ psrldq xmm4, 12 // the big carry wraps around
+ por xmm1, xmm3
+ por xmm0, xmm2 // (y_7, y_6; y_5, y_4)
+ por xmm1, xmm4 // (y_3, y_2; y_1, y_0)
+
+ // And the other problem is that the result needs to be reduced
+ // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
+ // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
+ // such that z_0 + z_1 R = u v using the identity R = t^128: now we
+ // must collapse the two halves of z together using the other
+ // identity R = t^7 + t^2 + t + 1.
+ //
+ // We do this by working on each 32-bit word of the high half of z
+ // separately, so consider y_i, for some 4 <= i < 8. Certainly, y_i
+ // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
+ // but we can't use that directly without breaking up the 32-bit word
+ // structure. Instead, we start by considering just y_i t^7
+ // t^{32(i-4)}, which again looks tricky. Now, split y_i = a_i +
+ // t^25 b_i, with deg a_i < 25; then
+ //
+ // y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
+ //
+ // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
+ // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
+ // splits are different. This is lovely, with one small snag: when
+ // we do this to y_7, we end up with a contribution back into the
+ // t^128 coefficient word. But notice that only the low seven bits
+ // of this word are affected, so there's no knock-on contribution
+ // into the t^32 word. Therefore, if we handle the high bits of each
+ // word together, and then the low bits, everything will be fine.
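+ //
+ // Here's the same word-at-a-time fold as a plain C sketch, written in
+ // the natural (non-reflected) bit order -- purely illustrative, and
+ // not part of this code. The code below does the equivalent job on
+ // all four words at once, in the reflected order (which is why its
+ // shift directions look mirrored), doing all the high-bit parts
+ // before all the low-bit parts as described above.
+ //
+ //   #include <stdint.h>
+ //
+ //   /* Fold word i (4 <= i < 8) of the 256-bit product w[0..7] down,
+ //    * using t^128 = t^7 + t^2 + t + 1; w[j] holds the coefficients
+ //    * of t^{32j}, ..., t^{32j+31}, with t^{32j} in bit 0. */
+ //   static void fold_word(uint32_t w[8], int i)
+ //   {
+ //     uint32_t y = w[i];
+ //     w[i - 4] ^= y ^ (y << 1) ^ (y << 2) ^ (y << 7); /* stays in word */
+ //     w[i - 3] ^= (y >> 31) ^ (y >> 30) ^ (y >> 25);  /* overflows up */
+ //   }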
+
+ // First, shift the high bits down.
+ movdqa xmm2, xmm0 // (y_7, y_6; y_5, y_4) again
+ movdqa xmm3, xmm0 // (y_7, y_6; y_5, y_4) yet again
+ movdqa xmm4, xmm0 // (y_7, y_6; y_5, y_4) again again
+ pslld xmm2, 31 // the b_i for t
+ pslld xmm3, 30 // the b_i for t^2
+ pslld xmm4, 25 // the b_i for t^7
+ pxor xmm2, xmm3 // add them all together
+ pxor xmm2, xmm4
+ movdqa xmm3, xmm2 // and a copy for later
+ psrldq xmm2, 4 // contribution into low half
+ pslldq xmm3, 12 // and high half
+ pxor xmm1, xmm2
+ pxor xmm0, xmm3
+
+ // And then shift the low bits up.
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pxor xmm1, xmm0 // mix in the unit contribution
+ psrld xmm0, 1
+ psrld xmm2, 2
+ psrld xmm3, 7
+ pxor xmm1, xmm2 // low half, unit, and t^2 contribs
+ pxor xmm0, xmm3 // t and t^7 contribs
+ pxor xmm0, xmm1 // mix them together and we're done
+.endm
+
+.macro mul64
+ // Enter with u and v in the low halves of xmm0 and xmm1
+ // respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4.
+
+ // The multiplication is thankfully easy.
+ pclmullqlqdq xmm0, xmm1 // u v
+
+ // Shift the product up by one place. After this, we're in GCM
+ // bizarro-world.
+ movdqa xmm1, xmm0 // u v again
+ psrld xmm0, 31 // shifted left; just the carries
+ pslld xmm1, 1 // shifted right, but dropped carries
+ pslldq xmm0, 4 // move carries over
+ por xmm1, xmm0 // (y_3, y_2; y_1, y_0)
+
+ // Now we must reduce. This is essentially the same as the 128-bit
+ // case above, but mostly simpler because everything is smaller. The
+ // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
+
+ // First, we must detach the top (`low'!) half of the result.
+ movdqa xmm0, xmm1 // (y_3, y_2; y_1, y_0) again
+ psrldq xmm1, 8 // (y_1, y_0; 0, 0)
+
+ // Next, shift the high bits down.
+ movdqa xmm2, xmm0 // (y_3, y_2; ?, ?) again
+ movdqa xmm3, xmm0 // (y_3, y_2; ?, ?) yet again
+ movdqa xmm4, xmm0 // (y_3, y_2; ?, ?) again again
+ pslld xmm2, 31 // b_i for t
+ pslld xmm3, 29 // b_i for t^3
+ pslld xmm4, 28 // b_i for t^4
+ pxor xmm2, xmm3 // add them all together
+ pxor xmm2, xmm4
+ movdqa xmm3, xmm2 // and a copy for later
+ movq xmm2, xmm2 // zap high half
+ pslldq xmm3, 4 // contribution into high half
+ psrldq xmm2, 4 // and low half
+ pxor xmm0, xmm3
+ pxor xmm1, xmm2
+
+ // And then shift the low bits up.
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pxor xmm1, xmm0 // mix in the unit contribution
+ psrld xmm0, 1
+ psrld xmm2, 3
+ psrld xmm3, 4
+ pxor xmm1, xmm2 // low half, unit, and t^3 contribs
+ pxor xmm0, xmm3 // t and t^4 contribs
+ pxor xmm0, xmm1 // mix them together and we're done
+.endm
+
+.macro mul96
+ // Enter with u and v in the /high/ three words of xmm0 and xmm1
+ // respectively (and zero in the low word); leave with z = u v in the
+ // high three words of xmm0, and /junk/ in the low word. Clobbers
+ // xmm1--xmm4.
+
+ // This is an inconvenient size. There's nothing for it but to do
+ // four multiplications, as if for the 128-bit case. It's possible
+ // that there's cruft in the top 32 bits of the input registers, so
+ // shift both of them up by four bytes before we start. This will
+ // mean that the high 64 bits of the result (from GCM's viewpoint)
+ // will be zero.
+ // xmm0 = // (0, u_2; u_1, u_0)
+ // xmm1 = // (0, v_2; v_1, v_0)
+ movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again
+ movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again
+ movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again
+ pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
+ pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d)
+ pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
+ pclmulhqhqdq xmm4, xmm1 // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
+ // + u_1 v_1 t^64 = f
+
+ // Extract the high and low halves of the 192-bit result. We don't
+ // need to be too picky about the unused high words of the result
+ // registers. The answer we want is d t^128 + e t^64 + f, where e =
+ // e_0 + e_1.
+ //
+ // The place values for the two halves are (t^160, t^128; t^96, ?)
+ // and (?, t^64; t^32, 1).
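+ //
+ // Spelling out the decomposition assumed here: write u = u_2 t^64 +
+ // u' and v = v_2 t^64 + v', where u' = u_1 t^32 + u_0 and
+ // v' = v_1 t^32 + v_0. Then
+ //
+ //	u v = (u_2 v_2) t^128 + (u_2 v' + v_2 u') t^64 + u' v'
+ //	    = d t^128 + (e_0 + e_1) t^64 + f
+ //
+ // which is exactly the sum assembled below.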
+ psrldq xmm0, 8 // (d; 0) = d t^128
+ pxor xmm2, xmm3 // e = (e_0 + e_1)
+ movdqa xmm1, xmm4 // f again
+ pxor xmm0, xmm2 // d t^128 + e t^64
+ psrldq xmm2, 12 // e[31..0] t^64
+ psrldq xmm1, 4 // f[95..0]
+ pslldq xmm4, 8 // f[127..96]
+ pxor xmm1, xmm2 // low 96 bits of result
+ pxor xmm0, xmm4 // high 96 bits of result
+
+ // Next, shift everything one bit to the left to compensate for GCM's
+ // strange ordering. This will be easier if we shift up the high
+ // half by a word before we start. After this we're in GCM bizarro-
+ // world.
+ movdqa xmm3, xmm1 // low half again
+ pslldq xmm0, 4 // shift high half
+ psrld xmm1, 31 // shift low half down: just carries
+ movdqa xmm2, xmm0 // copy high half
+ pslld xmm3, 1 // shift low half down: drop carries
+ psrld xmm0, 31 // shift high half down: just carries
+ pslld xmm2, 1 // shift high half down: drop carries
+ movdqa xmm4, xmm0 // copy high carries for carry-around
+ pslldq xmm0, 4 // shift carries down
+ pslldq xmm1, 4
+ psrldq xmm4, 12 // the big carry wraps around
+ por xmm1, xmm3
+ por xmm0, xmm2
+ por xmm1, xmm4
+
+ // Finally, the reduction. This is essentially the same as the
+ // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
+ // t^9 + t^6 + 1. The degrees are larger but not enough to cause
+ // trouble for the general approach.
+
+ // First, shift the high bits down.
+ movdqa xmm2, xmm0 // copies of the high part
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+ pslld xmm2, 26 // b_i for t^6
+ pslld xmm3, 23 // b_i for t^9
+ pslld xmm4, 22 // b_i for t^10
+ pxor xmm2, xmm3 // add them all together
+ pslldq xmm1, 4 // shift low part up to match
+ pxor xmm2, xmm4
+ movdqa xmm3, xmm2 // and a copy for later
+ pslldq xmm2, 8 // contribution to high half
+ psrldq xmm3, 4 // contribution to low half
+ pxor xmm1, xmm3
+ pxor xmm0, xmm2
+
+ // And then shift the low bits up.
+ movdqa xmm2, xmm0 // copies of the high part
+ movdqa xmm3, xmm0
+ pxor xmm1, xmm0 // mix in the unit contribution
+ psrld xmm0, 6
+ psrld xmm2, 9
+ psrld xmm3, 10
+ pxor xmm1, xmm2 // low half, unit, and t^9 contribs
+ pxor xmm0, xmm3 // t^6 and t^10 contribs
+ pxor xmm0, xmm1 // mix them together and we're done
+.endm
+
+.macro mul192
+ // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
+ // with z = u v in xmm0/xmm1 -- the top halves of the high registers
+ // are unimportant. Clobbers xmm2--xmm7.
+
+ // Start multiplying and accumulating pieces of product.
+ // xmm0 = // (u_2; u_1)
+ // xmm1 = // (u_0; ?)
+ // xmm2 = // (v_2; v_1)
+ // xmm3 = // (v_0; ?)
+ movdqa xmm4, xmm0 // (u_2; u_1) again
+ movdqa xmm5, xmm0 // (u_2; u_1) yet again
+ movdqa xmm6, xmm0 // (u_2; u_1) again again
+ movdqa xmm7, xmm1 // (u_0; ?) again
+ punpcklqdq xmm1, xmm3 // (u_0; v_0)
+ pclmulhqhqdq xmm4, xmm2 // u_1 v_1
+ pclmullqlqdq xmm3, xmm0 // u_2 v_0
+ pclmullqhqdq xmm5, xmm2 // u_2 v_1
+ pclmulhqlqdq xmm6, xmm2 // u_1 v_2
+ pxor xmm4, xmm3 // u_2 v_0 + u_1 v_1
+ pclmullqlqdq xmm7, xmm2 // u_0 v_2
+ pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
+ movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
+ pxor xmm4, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
+ pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
+ pclmulhqhqdq xmm6, xmm1 // u_1 v_0
+ pclmulhqlqdq xmm2, xmm1 // u_0 v_1
+ pclmullqhqdq xmm1, xmm1 // e = u_0 v_0
+ pxor xmm2, xmm6 // d = u_1 v_0 + u_0 v_1
+
+ // Next, the piecing together of the product.
+ // xmm0 = // (a_1; a_0) = a = u_2 v_2
+ // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
+ // xmm4 = // (c_1; c_0) = c = u_0 v_2 +
+ // u_1 v_1 + u_2 v_0
+ // xmm2 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
+ // xmm1 = // (e_1; e_0) = e = u_0 v_0
+ // xmm3, xmm6, xmm7 spare
+ movdqa xmm3, xmm2 // (d_1; d_0) again
+ movdqa xmm6, xmm5 // (b_1; b_0) again
+ pslldq xmm2, 8 // (0; d_1)
+ psrldq xmm5, 8 // (b_0; 0)
+ psrldq xmm3, 8 // (d_0; 0)
+ pslldq xmm6, 8 // (0; b_1)
+ pxor xmm5, xmm2 // (b_0; d_1)
+ pxor xmm0, xmm6 // x_2 = (a_1; a_0 + b_1)
+ pxor xmm3, xmm1 // x_0 = (e_1 + d_0; e_0)
+ pxor xmm4, xmm5 // x_1 = (b_0 + c_1; c_0 + d_1)
+
+ // Now, shift it right (from GCM's point of view) by one bit, and try
+ // to leave the result in less random registers. After this, we'll
+ // be in GCM bizarro-world.
+ // xmm1, xmm2, xmm5, xmm6, xmm7 spare
+ movdqa xmm5, xmm0 // copy x_2
+ movdqa xmm1, xmm4 // copy x_1
+ movdqa xmm2, xmm3 // copy x_0
+ psrld xmm0, 31 // x_2 carries
+ psrld xmm4, 31 // x_1 carries
+ psrld xmm3, 31 // x_0 carries
+ pslld xmm5, 1 // x_2 shifted
+ pslld xmm1, 1 // x_1 shifted
+ pslld xmm2, 1 // x_0 shifted
+ movdqa xmm6, xmm0 // x_2 carry copy
+ movdqa xmm7, xmm4 // x_1 carry copy
+ pslldq xmm0, 4 // x_2 carry shifted
+ pslldq xmm4, 4 // x_1 carry shifted
+ pslldq xmm3, 4 // x_0 carry shifted
+ psrldq xmm6, 12 // x_2 carry out
+ psrldq xmm7, 12 // x_1 carry out
+ por xmm0, xmm5 // (y_5; y_4)
+ por xmm1, xmm4
+ por xmm2, xmm3
+ por xmm1, xmm6 // (y_3; y_2)
+ por xmm2, xmm7 // (y_1; y_0)
+
+ // Next, the reduction. Our polynomial this time is p(t) = t^192 +
+ // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
+ // 128-bit case. I don't know why.
+
+ // First, shift the high bits down.
+ // xmm0 = // (y_5; y_4)
+ // xmm1 = // (y_3; y_2)
+ // xmm2 = // (y_1; y_0)
+ // xmm3--xmm7 spare
+ movdqa xmm3, xmm0 // (y_5; y_4) copy
+ movdqa xmm4, xmm0 // (y_5; y_4) copy
+ movdqa xmm5, xmm0 // (y_5; y_4) copy
+ pslld xmm3, 31 // (y_5; y_4) b_i for t
+ pslld xmm4, 30 // (y_5; y_4) b_i for t^2
+ pslld xmm5, 25 // (y_5; y_4) b_i for t^7
+ movq xmm6, xmm1 // (y_3; 0) copy
+ pxor xmm3, xmm4
+ movq xmm7, xmm1 // (y_3; 0) copy
+ pxor xmm3, xmm5
+ movq xmm5, xmm1 // (y_3; 0) copy
+ movdqa xmm4, xmm3 // (y_5; y_4) b_i combined
+ pslld xmm6, 31 // (y_3; 0) b_i for t
+ pslld xmm7, 30 // (y_3; 0) b_i for t^2
+ pslld xmm5, 25 // (y_3; 0) b_i for t^7
+ psrldq xmm3, 12 // (y_5; y_4) low contrib
+ pslldq xmm4, 4 // (y_5; y_4) high contrib
+ pxor xmm6, xmm7
+ pxor xmm2, xmm3
+ pxor xmm6, xmm5
+ pxor xmm1, xmm4
+ pslldq xmm6, 4
+ pxor xmm2, xmm6
+
+ // And finally shift the low bits up. Unfortunately, we also have to
+ // split the low bits out.
+ // xmm0 = // (y'_5; y'_4)
+ // xmm1 = // (y'_3; y'_2)
+ // xmm2 = // (y'_1; y'_0)
+ movdqa xmm5, xmm1 // copies of (y'_3; y'_2)
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+ psrldq xmm1, 8 // bring down (y'_2; ?)
+ movdqa xmm3, xmm0 // copies of (y'_5; y'_4)
+ movdqa xmm4, xmm0
+ punpcklqdq xmm1, xmm2 // (y'_2; y'_1)
+ psrldq xmm2, 8 // (y'_0; ?)
+ pxor xmm2, xmm5 // low half and unit contrib
+ pxor xmm1, xmm0
+ psrld xmm5, 1
+ psrld xmm0, 1
+ psrld xmm6, 2
+ psrld xmm3, 2
+ psrld xmm7, 7
+ psrld xmm4, 7
+ pxor xmm2, xmm6 // low half, unit, t^2 contribs
+ pxor xmm1, xmm3
+ pxor xmm5, xmm7 // t and t^7 contribs
+ pxor xmm0, xmm4
+ pxor xmm5, xmm2 // mix everything together
+ pxor xmm0, xmm1
+ movq xmm1, xmm5 // shunt (z_0; ?) into proper place
+.endm
+
+.macro mul256
+ // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
+ // with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86,
+ // requires 16 bytes of aligned space at SP; on amd64, also clobbers
+ // xmm8.
+
+ // Now it's starting to look worthwhile to do Karatsuba. Suppose
+ // u = u_0 + u_1 B and v = v_0 + v_1 B. Then
+ //
+ // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
+ //
+ // Name these coefficients of B^i, in the order displayed, c, b, and
+ // a respectively, and let r = u_0 + u_1 and s = v_0 + v_1. Then
+ // observe that
+ //
+ // q = r s = (u_0 + u_1) (v_0 + v_1)
+ // = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
+ // = c + a + b
+ //
+ // The first two terms we've already calculated; the last is the
+ // remaining one we want. We'll set B = t^128. We know how to do
+ // 128-bit multiplications already, and Karatsuba is too annoying
+ // there, so there'll be 12 multiplications altogether, rather than
+ // the 16 we'd have if we did this the naïve way.
+ //
+ // On x86, there aren't quite enough registers, so spill one for a
+ // bit. On AMD64, we can keep on going, so it's all good.
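+ //
+ // The same three-multiplication trick at a toy size, as a
+ // self-contained C illustration (not used by this code):
+ //
+ //   #include <stdint.h>
+ //
+ //   /* Carry-less multiply of 16-bit operands. */
+ //   static uint32_t clmul16(uint16_t x, uint16_t y)
+ //   {
+ //     uint32_t z = 0;
+ //     for (int i = 0; i < 16; i++)
+ //       if ((y >> i) & 1) z ^= (uint32_t)x << i;
+ //     return z;
+ //   }
+ //
+ //   /* Karatsuba with B = t^16: a = u_1 v_1, c = u_0 v_0,
+ //    * q = (u_0 + u_1) (v_0 + v_1), and b = q + a + c. */
+ //   static uint64_t clmul32_karatsuba(uint32_t u, uint32_t v)
+ //   {
+ //     uint16_t u0 = u, u1 = u >> 16, v0 = v, v1 = v >> 16;
+ //     uint32_t a = clmul16(u1, v1), c = clmul16(u0, v0);
+ //     uint32_t q = clmul16(u0 ^ u1, v0 ^ v1);
+ //     uint32_t b = q ^ a ^ c;
+ //     return ((uint64_t)a << 32) ^ ((uint64_t)b << 16) ^ c;
+ //   }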
+
+ // xmm0 = // u_1 = (u_11; u_10)
+ // xmm1 = // u_0 = (u_01; u_00)
+ // xmm2 = // v_1 = (v_11; v_10)
+ // xmm3 = // v_0 = (v_01; v_00)
+ movdqa xmm4, xmm0 // u_1 again
+#if CPUFAM_X86
+ movdqa [esp + 0], xmm3
+#elif CPUFAM_AMD64
+ movdqa xmm8, xmm3
+# define V0 xmm8
+#endif
+ pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10)
+ pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10)
+
+ // Start by building the cross product, q = u_* v_*.
+ movdqa xmm7, xmm4 // more copies of u_*
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ pclmullqhqdq xmm4, xmm3 // u_*1 v_*0
+ pclmulhqlqdq xmm7, xmm3 // u_*0 v_*1
+ pclmullqlqdq xmm5, xmm3 // u_*1 v_*1
+ pclmulhqhqdq xmm6, xmm3 // u_*0 v_*0
+ pxor xmm4, xmm7 // u_*1 v_*0 + u_*0 v_*1
+ movdqa xmm7, xmm4
+ pslldq xmm4, 8
+ psrldq xmm7, 8
+ pxor xmm5, xmm4 // q_1
+ pxor xmm6, xmm7 // q_0
+
+ // Next, work on the high half, a = u_1 v_1.
+ movdqa xmm3, xmm0 // more copies of u_1
+ movdqa xmm4, xmm0
+ movdqa xmm7, xmm0
+ pclmullqhqdq xmm0, xmm2 // u_11 v_10
+ pclmulhqlqdq xmm3, xmm2 // u_10 v_11
+ pclmullqlqdq xmm4, xmm2 // u_11 v_11
+ pclmulhqhqdq xmm7, xmm2 // u_10 v_10
+#if CPUFAM_X86
+ movdqa xmm2, [esp + 0]
+# define V0 xmm2
+#endif
+ pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10
+ movdqa xmm3, xmm0
+ pslldq xmm0, 8
+ psrldq xmm3, 8
+ pxor xmm4, xmm0 // x_3 = a_1
+ pxor xmm7, xmm3 // a_0
+
+ // Mix that into the product now forming in xmm4--xmm7.
+ pxor xmm5, xmm4 // a_1 + q_1
+ pxor xmm6, xmm7 // a_0 + q_0
+ pxor xmm5, xmm7 // a_0 + (a_1 + q_1)
+
+ // Finally, the low half, c = u_0 v_0.
+ movdqa xmm0, xmm1 // more copies of u_0
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm1
+ pclmullqhqdq xmm1, V0 // u_01 v_00
+ pclmulhqlqdq xmm0, V0 // u_00 v_01
+ pclmullqlqdq xmm3, V0 // u_01 v_01
+ pclmulhqhqdq xmm7, V0 // u_00 v_00
+ pxor xmm0, xmm1 // u_00 v_01 + u_01 v_00
+ movdqa xmm1, xmm0
+ pslldq xmm0, 8
+ psrldq xmm1, 8
+ pxor xmm3, xmm0 // c_1
+ pxor xmm7, xmm1 // x_0 = c_0
+
+ // And mix that in to complete the product.
+ pxor xmm6, xmm3 // (a_0 + q_0) + c_1
+ pxor xmm5, xmm3 // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
+ pxor xmm6, xmm7 // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1
+
+#undef V0
+
+ // Now we need to shift that whole lot one bit to the left. This
+ // will also give us an opportunity to put the product back in
+ // xmm0--xmm3. This is a slightly merry dance because it's nearly
+ // pipelined but we don't have enough registers.
+ //
+ // After this, we'll be in GCM bizarro-world.
+ movdqa xmm0, xmm4 // x_3 again
+ psrld xmm4, 31 // x_3 carries
+ pslld xmm0, 1 // x_3 shifted left
+ movdqa xmm3, xmm4 // x_3 copy carries
+ movdqa xmm1, xmm5 // x_2 again
+ pslldq xmm4, 4 // x_3 carries shifted up
+ psrld xmm5, 31 // x_2 carries
+ psrldq xmm3, 12 // x_3 big carry out
+ pslld xmm1, 1 // x_2 shifted left
+ por xmm0, xmm4 // x_3 mixed together
+ movdqa xmm4, xmm5 // x_2 copy carries
+ movdqa xmm2, xmm6 // x_1 again
+ pslldq xmm5, 4 // x_2 carries shifted up
+ psrld xmm6, 31 // x_1 carries
+ psrldq xmm4, 12 // x_2 big carry out
+ pslld xmm2, 1 // x_1 shifted
+ por xmm1, xmm5 // x_2 mixed together
+ movdqa xmm5, xmm6 // x_1 copy carries
+ por xmm1, xmm3 // x_2 with carry from x_3
+ movdqa xmm3, xmm7 // x_0 again
+ pslldq xmm6, 4 // x_1 carries shifted up
+ psrld xmm7, 31 // x_0 carries
+ psrldq xmm5, 12 // x_1 big carry out
+ pslld xmm3, 1 // x_0 shifted
+ por xmm2, xmm6 // x_1 mixed together
+ pslldq xmm7, 4 // x_0 carries shifted up
+ por xmm2, xmm4 // x_1 with carry from x_2
+ por xmm3, xmm7 // x_0 mixed together
+ por xmm3, xmm5 // x_0 with carry from x_1
+
+ // Now we must reduce. This is essentially the same as the 128-bit
+ // case above, but more complicated because everything is bigger.
+ // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
+
+ // First, shift the high bits down.
+ movdqa xmm4, xmm0 // y_3 again
+ movdqa xmm5, xmm0 // y_3 yet again
+ movdqa xmm6, xmm0 // y_3 again again
+ pslld xmm4, 30 // y_3: b_i for t^2
+ pslld xmm5, 27 // y_3: b_i for t^5
+ pslld xmm6, 22 // y_3: b_i for t^10
+ movdqa xmm7, xmm1 // y_2 again
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm1 // y_2 again
+ pxor xmm4, xmm6
+ movdqa xmm6, xmm1 // y_2 again
+ pslld xmm7, 30 // y_2: b_i for t^2
+ pslld xmm5, 27 // y_2: b_i for t^5
+ pslld xmm6, 22 // y_2: b_i for t^10
+ pxor xmm7, xmm5
+ movdqa xmm5, xmm4
+ pxor xmm7, xmm6
+ psrldq xmm4, 4
+ movdqa xmm6, xmm7
+ pslldq xmm5, 12
+ psrldq xmm7, 4
+ pxor xmm2, xmm4
+ pslldq xmm6, 12
+ pxor xmm3, xmm7
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+
+ // And then shift the low bits up.
+ movdqa xmm4, xmm0 // y_3 again
+ movdqa xmm5, xmm1 // y_2 again
+ movdqa xmm6, xmm0 // y_3 yet again
+ movdqa xmm7, xmm1 // y_2 yet again
+ pxor xmm2, xmm0 // y_1 and unit contrib from y_3
+ pxor xmm3, xmm1 // y_0 and unit contrib from y_2
+ psrld xmm0, 2
+ psrld xmm1, 2
+ psrld xmm4, 5
+ psrld xmm5, 5
+ psrld xmm6, 10
+ psrld xmm7, 10
+ pxor xmm0, xmm2 // y_1, with y_3 units and t^2
+ pxor xmm1, xmm3 // y_0, with y_2 units and t^2
+ pxor xmm4, xmm6 // y_3 t^5 and t^10 contribs
+ pxor xmm5, xmm7 // y_2 t^5 and t^10 contribs
+ pxor xmm0, xmm4 // high half of reduced result
+ pxor xmm1, xmm5 // low half; all done
+.endm
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+// There are a number of representations of field elements in this code and
+// it can be confusing.
+//
+// * The `external format' consists of a sequence of contiguous bytes in
+// memory called a `block'. The GCM spec explains how to interpret this
+// block as an element of a finite field. As discussed extensively, this
+// representation is very annoying for a number of reasons. On the other
+// hand, this code never actually deals with it directly.
+//
+// * The `register format' consists of one or more XMM registers, depending
+// on the block size. The bytes in these registers are in reverse order
+// -- so the least-significant byte of the lowest-numbered register holds
+// the /last/ byte in the block. If the block size is not a multiple of
+// 16 bytes, then there must be padding. 96-bit blocks are weird: the
+// padding is inserted at the /least/ significant end, so the register
+// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+// significant end.
+//
+// * The `words' format consists of a sequence of bytes, as in the
+// `external format', but, according to the blockcipher in use, the bytes
+// within each 32-bit word may be reversed (`big-endian') or not
+// (`little-endian'). Accordingly, there are separate entry points for
+// each variant, identified with `b' or `l'.
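+//
+// As a byte-level C sketch of the conversions the entry points below
+// do with pshufd/pshufb (illustrative only; it assumes a little-endian
+// host, and shows the register format as the sixteen bytes of an XMM
+// register in increasing order of significance):
+//
+//   #include <stdint.h>
+//
+//   /* Little-endian words format -> register format: reverse all the
+//    * bytes; this is what the swaptab_128l shuffle does. */
+//   static void regfmt_from_lwords(uint8_t r[16], const uint8_t b[16])
+//   {
+//     for (int i = 0; i < 16; i++) r[i] = b[15 - i];
+//   }
+//
+//   /* Big-endian words format -> register format: the bytes within
+//    * each 32-bit word are already reversed, so reversing the order
+//    * of the words finishes the job; this is the pshufd in the `b'
+//    * entry points. */
+//   static void regfmt_from_bwords(uint8_t r[16], const uint8_t b[16])
+//   {
+//     for (int w = 0; w < 4; w++)
+//       for (int j = 0; j < 4; j++)
+//         r[4*w + j] = b[4*(3 - w) + j];
+//   }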
+
+#define SSEFUNC(f) \
+ FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC; \
+ FUNC(f)
+
+SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
+ // On entry, A points to a 128-bit field element in big-endian words
+ // format; K points to a field-element in register format. On exit,
+ // A is updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+#endif
+ endprologue
+ movdqu xmm0, [A]
+ movdqu xmm1, [K]
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ mul128
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ movdqu [A], xmm0
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
+ // On entry, A points to a 128-bit field element in little-endian
+ // words format; K points to a field-element in register format. On
+ // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+ ldgot ecx
+#endif
+ endprologue
+ movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
+ movdqu xmm0, [A]
+ movdqu xmm1, [K]
+ pshufb xmm0, xmm7
+ mul128
+ pshufb xmm0, xmm7
+ movdqu [A], xmm0
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
+ // On entry, A points to a 64-bit field element in big-endian words
+ // format; K points to a field-element in register format. On exit,
+ // A is updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+#endif
+ endprologue
+ movq xmm0, [A]
+ movq xmm1, [K]
+ pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ mul64
+ pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ movq [A], xmm0
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
+ // On entry, A points to a 64-bit field element in little-endian
+ // words format; K points to a field-element in register format. On
+ // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+ ldgot ecx
+#endif
+ endprologue
+ movdqa xmm7, [INTADDR(swaptab_64l, ecx)]
+ movq xmm0, [A]
+ movq xmm1, [K]
+ pshufb xmm0, xmm7
+ mul64
+ pshufb xmm0, xmm7
+ movq [A], xmm0
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
+ // On entry, A points to a 96-bit field element in big-endian words
+ // format; K points to a field-element in register format (i.e., 16
+ // bytes, with the first four bytes zero). On exit, A is updated
+ // with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+#endif
+ endprologue
+ movq xmm0, [A + 0]
+ movd xmm2, [A + 8]
+ movdqu xmm1, [K]
+ punpcklqdq xmm0, xmm2
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ mul96
+ pshufd xmm1, xmm0, SHUF(3, 2, 1, 0)
+ psrldq xmm0, 4
+ movq [A + 0], xmm1
+ movd [A + 8], xmm0
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
+ // On entry, A points to a 96-bit field element in little-endian
+ // words format; K points to a field-element in register format
+ // (i.e., 16 bytes, with the first four bytes zero). On exit, A is
+ // updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+ ldgot ecx
+#endif
+ endprologue
+ movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
+ movq xmm0, [A + 0]
+ movd xmm2, [A + 8]
+ movdqu xmm1, [K]
+ punpcklqdq xmm0, xmm2
+ pshufb xmm0, xmm7
+ mul96
+ pshufb xmm0, xmm7
+ movq [A + 0], xmm0
+ psrldq xmm0, 8
+ movd [A + 8], xmm0
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
+ // On entry, A points to a 192-bit field element in big-endian words
+ // format; K points to a field-element in register format. On exit,
+ // A is updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ stalloc 2*16 + 8
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+#endif
+ endprologue
+ movdqu xmm0, [A + 8]
+ movq xmm1, [A + 0]
+ movdqu xmm2, [K + 0]
+ movq xmm3, [K + 16]
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ mul192
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ movdqu [A + 8], xmm0
+ movq [A + 0], xmm1
+#if CPUFAM_AMD64 && ABI_WIN
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ stfree 2*16 + 8
+#endif
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
+ // On entry, A points to a 192-bit field element in little-endian
+ // words format; K points to a field-element in register format. On
+ // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+ mov A, [esp + 4]
+ mov K, [esp + 8]
+ ldgot ecx
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ stalloc 2*16 + 8
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+#endif
+ endprologue
+ movdqu xmm0, [A + 8]
+ movq xmm1, [A + 0]
+ movdqu xmm2, [K + 0]
+ movq xmm3, [K + 16]
+ pshufb xmm0, [INTADDR(swaptab_128l, ecx)]
+ pshufb xmm1, [INTADDR(swaptab_64l, ecx)]
+ mul192
+ pshufb xmm0, [INTADDR(swaptab_128l, ecx)]
+ pshufb xmm1, [INTADDR(swaptab_64l, ecx)]
+ movdqu [A + 8], xmm0
+ movq [A + 0], xmm1
+#if CPUFAM_AMD64 && ABI_WIN
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ stfree 2*16 + 8
+#endif
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
+ // On entry, A points to a 256-bit field element in big-endian words
+ // format; K points to a field-element in register format. On exit,
+ // A is updated with the product A K.
+
+#if CPUFAM_X86
+ pushreg ebp
+ setfp
+ mov A, [esp + 8]
+ mov K, [esp + 12]
+ and esp, ~15
+ sub esp, 16
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ stalloc 3*16 + 8
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+ savexmm xmm8, 32
+#endif
+ endprologue
+ movdqu xmm0, [A + 16]
+ movdqu xmm1, [A + 0]
+ movdqu xmm2, [K + 0]
+ movdqu xmm3, [K + 16]
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ mul256
+ pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ movdqu [A + 16], xmm0
+ movdqu [A + 0], xmm1
+#if CPUFAM_X86
+ dropfp
+ popreg ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ rstrxmm xmm8, 32
+ stfree 3*16 + 8
+#endif
+ ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
+ // On entry, A points to a 256-bit field element in little-endian
+ // words format; K points to a field-element in register format. On
+ // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+ pushreg ebp
+ setfp
+ mov A, [esp + 8]
+ mov K, [esp + 12]
+ and esp, ~15
+ ldgot ecx
+ sub esp, 16
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ stalloc 3*16 + 8
+ savexmm xmm6, 0
+ savexmm xmm7, 16
+ savexmm xmm8, 32
+#endif
+ endprologue
+ movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
+ movdqu xmm0, [A + 16]
+ movdqu xmm1, [A + 0]
+ movdqu xmm2, [K + 0]
+ movdqu xmm3, [K + 16]
+ pshufb xmm0, xmm7
+ pshufb xmm1, xmm7
+ mul256
+ movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
+ pshufb xmm0, xmm7
+ pshufb xmm1, xmm7
+ movdqu [A + 16], xmm0
+ movdqu [A + 0], xmm1
+#if CPUFAM_X86
+ dropfp
+ popreg ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ rstrxmm xmm6, 0
+ rstrxmm xmm7, 16
+ rstrxmm xmm8, 32
+ stfree 3*16 + 8
+#endif
+ ret
+ENDFUNC
+
+ RODATA
+
+ .balign 16
+swaptab_128l:
+ // Table for byte-swapping little-endian words-format blocks larger
+ // than 64 bits.
+ .byte 15, 14, 13, 12, 11, 10, 9, 8
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+
+ .balign 16
+swaptab_64l:
+ // Table for byte-swapping 64-bit little-endian words-format blocks.
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+ .byte 255, 255, 255, 255, 255, 255, 255, 255
+
+///----- That's all, folks --------------------------------------------------