/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM processors
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   armv8-a
        .fpu    crypto-neon-fp-armv8

        .text

///--------------------------------------------------------------------------
/// Multiplication macros.
        // The good news is that we have a fancy instruction to do the
        // multiplications.  The bad news is that it's not particularly well-
        // suited to the job.
        //
        // For one thing, it only does a 64-bit multiplication, so in general
        // we'll need to synthesize the full-width multiply by hand.  For
        // another thing, it doesn't help with the reduction, so we have to
        // do that by hand too.  And, finally, GCM has crazy bit ordering,
        // and the instruction does nothing useful for that at all.
        //
        // Focusing on that last problem first: the bits aren't in monotonic
        // significance order unless we permute them.  If we reverse the byte
        // order, then we'll have the bits in monotonic order, but backwards,
        // so the degree-0 coefficient will be in the most-significant bit.
        //
        // This is less of a difficulty than it seems at first, because
        // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
        // SUM_{0<=j<n} v_j t^j; then
        //
        //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
        //
        // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
        // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
        // Then
        //
        //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
        //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
        //
        // which is almost the bit-reversal of u v, only it's shifted right
        // by one place.  Oh, well: we'll have to shift it back later.
        //
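        // A tiny worked example, with n = 4 for the sake of exposition (an
        // illustrative width, not one the code below implements): let
        // u = t^2 and v = t^3, so that u v = t^5.  Then ũ = t and ṽ = 1, so
        // ũ ṽ = t; but the full eight-bit reversal of u v is t^{7-5} = t^2.
        // So ũ ṽ is indeed the reversal of u v, shifted one place.
        //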
        // That was important to think about, but there's not a great deal to
        // do about it yet other than to convert what we've got from the
        // blockcipher's byte-ordering convention to our big-endian
        // convention.  Since this depends on the blockcipher convention,
        // we'll leave the caller to cope with this: the macros here will
        // assume that the operands are in `register' format, which is the
        // same as the external representation, except that the bytes within
        // each 64-bit piece are reversed.  In the commentary, pieces of
        // polynomial are numbered according to the degree of the
        // coefficients, so the unit coefficient of some polynomial a is in
        // a_0.
        //
        // The commentary for `mul128' is the most detailed.  The other
        // macros assume that you've already read and understood that.
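        //
        // As a cross-check on all of the above, here is a bit-serial
        // reference implementation of the 128-bit multiply, as a portable
        // C99 sketch of ours (not part of Catacomb; needs <stdint.h>).  The
        // operands are taken in external format, with `hi' holding the
        // block's first eight bytes as a big-endian integer, so the
        // degree-0 coefficient lives in the most significant bit of `hi'.
        //
        //      typedef struct { uint64_t hi, lo; } f128;
        //
        //      static f128 f128_mul(f128 x, f128 y)
        //      {
        //        f128 z = { 0, 0 }, v = y;
        //        uint64_t c; unsigned i;
        //
        //        for (i = 0; i < 128; i++) {
        //          /* add v t^i into z if the t^i coefficient of x is set */
        //          if ((i < 64 ? x.hi >> (63 - i) : x.lo >> (127 - i)) & 1)
        //            { z.hi ^= v.hi; z.lo ^= v.lo; }
        //          /* multiply v by t, reducing by t^128 = t^7 + t^2 + t + 1:
        //           * in this representation that's a right shift, folding the
        //           * degree-127 coefficient back in as the byte 11100001 */
        //          c = v.lo & 1;
        //          v.lo = (v.lo >> 1) | (v.hi << 63);
        //          v.hi >>= 1;
        //          if (c) v.hi ^= 0xe100000000000000ULL;
        //        }
        //        return z;
        //      }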
86 | ||
87 | .macro mul128 | |
88 | // Enter with u and v in q0 and q1 respectively; leave with z = u v | |
89 | // in q0. Clobbers q1--q3, q8, q9. | |
90 | ||
91 | // First for the double-precision multiplication. It's tempting to | |
92 | // use Karatsuba's identity here, but I suspect that loses more in | |
93 | // the shifting, bit-twiddling, and dependency chains that it gains | |
94 | // in saving a multiplication which otherwise pipelines well. | |
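        // (Karatsuba would get away with three `vmull.p64's here instead of
        // the four below, in exchange for that extra fixing-up work.)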
        // q0 =                         // (u_0; u_1)
        // q1 =                         // (v_0; v_1)
        vmull.p64 q2, d1, d2            // u_1 v_0
        vmull.p64 q3, d0, d3            // u_0 v_1
        vmull.p64 q8, d1, d3            // (x_3; t_1) = u_1 v_1
        vmull.p64 q9, d0, d2            // (t_0; x_0) = u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        veor    q2, q2, q3              // (m_1; m_0) = u_0 v_1 + u_1 v_0
        veor    d17, d17, d4            // x_2 = t_1 + m_1
        veor    d18, d18, d5            // x_1 = t_0 + m_0
        // q8 =                         // (x_3; x_2)
        // q9 =                         // (x_1; x_0)

        // Two-and-a-half problems remain.  The first is that this product is
        // shifted left by one place (from GCM's point of view -- physically,
        // right), which is annoying.  Let's take care of that now.  Once
        // this is done, we'll be properly in GCM's backwards bit-ordering.
        //
        // The half a problem is that the result wants to have its 64-bit
        // halves switched.  Here turns out to be the best place to arrange
        // for that.
        //
        //                q9                             q8
        //      ,-------------.-------------.  ,-------------.-------------.
        //      | 0  x_0-x_62 | x_63-x_126  |  | x_127-x_190 | x_191-x_254 |
        //      `-------------^-------------'  `-------------^-------------'
        //            d19           d18              d17           d16
        //
        // We start by shifting each 64-bit lane right (from GCM's point of
        // view -- physically, left) by one place, which gives us this:
        //
        //                low (q9)                      high (q8)
        //      ,-------------.-------------.  ,-------------.-------------.
        //      | x_0-x_62  0 | x_64-x_126 0|  |x_128-x_190 0|x_192-x_254 0|
        //      `-------------^-------------'  `-------------^-------------'
        //            d19           d18              d17           d16
        //
        // but we've lost a bunch of bits.  We separately shift each lane
        // left by 63 places to give us the bits we lost.
        //
        //                low (q3)                      high (q2)
        //      ,-------------.-------------.  ,-------------.-------------.
        //      |    0...0    | 0...0  x_63 |  | 0...0 x_127 | 0...0 x_191 |
        //      `-------------^-------------'  `-------------^-------------'
        //                          d6               d5            d4
        //
        // Since we can address each of these pieces individually, putting
        // them together is relatively straightforward.

        vshr.u64 d6, d18, #63           // shifted left; just the carries
        vshl.u64 q9, q9, #1             // shifted right, but dropped carries
        vshr.u64 q2, q8, #63
        vshl.u64 q8, q8, #1
        vorr    d0, d19, d6             // y_0
        vorr    d1, d18, d5             // y_1
        vorr    d2, d17, d4             // y_2
        vmov    d3, d16                 // y_3

        // And the other one is that the result needs to be reduced modulo
        // p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 + t^2 + t +
        // 1 in our field.  So far, we've calculated z_0 and z_1 such that
        // z_0 + z_1 R = u v, using the identity R = t^128: now we must
        // collapse the two halves of z together using the other identity R =
        // t^7 + t^2 + t + 1.
        //
        // We do this by working on z_1's two pieces y_2 and y_3 separately,
        // so consider y_i for i = 2 or 3.  Certainly, y_i t^{64i} =
        // y_i R t^{64(i-2)} = (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we
        // can't use that directly without breaking up the 64-bit word
        // structure.  Instead, we start by considering just
        // y_i t^7 t^{64(i-2)}, which again looks tricky.  Now, split
        // y_i = a_i + t^57 b_i, with deg a_i < 57; then
        //
        //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
        // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
        // splits are different.  This is lovely, with one small snag: when
        // we do this to y_3, we end up with a contribution back into the
        // t^128 coefficient word.  But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^64 word.  Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.
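        //
        // Concretely, for a tap t^e of the polynomial (e = 1, 2, 7), b_i is
        // the top e coefficients of y_i, which a physical left shift by
        // 64 - e lines up as a contribution to the next word up, while
        // a_i t^e is a physical right shift by e: hence the shift immediates
        // #63/#62/#57 and #1/#2/#7 below.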
180 | ||
181 | // First, shift the high bits down. | |
182 | vshl.u64 q2, q1, #63 // the b_i for t | |
183 | vshl.u64 q3, q1, #62 // the b_i for t^2 | |
184 | vshl.u64 q8, q1, #57 // the b_i for t^7 | |
185 | veor q2, q2, q3 // add them all together | |
186 | veor q2, q2, q8 | |
187 | veor d2, d2, d5 // contribution into low half | |
188 | veor d1, d1, d4 // and high half | |
189 | ||
190 | // And then shift the low bits up. | |
191 | vshr.u64 q2, q1, #1 | |
192 | vshr.u64 q3, q1, #2 | |
193 | vshr.u64 q8, q1, #7 | |
194 | veor q0, q0, q1 // mix in the unit contribution | |
195 | veor q2, q2, q3 // t and t^2 contribs | |
196 | veor q0, q0, q8 // low, unit, and t^7 contribs | |
197 | veor q0, q0, q2 // mix them together and we're done | |
198 | .endm | |
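
        // For reference, the reduction step in isolation, as a C99 sketch
        // of our own (the function name and array layout are invented for
        // the purpose).  Each word is in GCM's backwards order, so y[i]
        // carries degrees 64 i, ..., 64 i + 63, with the degree-64 i
        // coefficient in its most significant bit; on exit the result is in
        // y[0] and y[1].
        //
        //      #include <stdint.h>
        //
        //      static void gcm128_reduce(uint64_t y[4])
        //      {
        //        /* the b_i pieces, spilling into the next word up; updating
        //         * y[2] first is safe precisely because the fold-back from
        //         * y[3] touches only bits that the shifts by 57 or more
        //         * never see */
        //        y[2] ^= (y[3] << 63) ^ (y[3] << 62) ^ (y[3] << 57);
        //        y[1] ^= (y[2] << 63) ^ (y[2] << 62) ^ (y[2] << 57);
        //
        //        /* the unit and a_i t^e parts, two words down */
        //        y[1] ^= y[3] ^ (y[3] >> 1) ^ (y[3] >> 2) ^ (y[3] >> 7);
        //        y[0] ^= y[2] ^ (y[2] >> 1) ^ (y[2] >> 2) ^ (y[2] >> 7);
        //      }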
199 | ||
200 | .macro mul64 | |
201 | // Enter with u and v in the low halves of d0 and d1 respectively; | |
202 | // leave with z = u v in d0. Clobbers d1--d5. | |
203 | ||
204 | // The multiplication is thankfully easy. | |
205 | vmull.p64 q0, d0, d1 // u v | |
206 | ||
207 | // Shift the product up by one place, and swap the two halves. After | |
208 | // this, we're in GCM bizarro-world. | |
209 | vshr.u64 d2, d0, #63 // shifted left; just the carries | |
210 | vshl.u64 d3, d1, #1 // low half right | |
211 | vshl.u64 d1, d0, #1 // high half shifted right | |
212 | vorr d0, d3, d2 // mix in the carries | |
213 | ||
214 | // Now we must reduce. This is essentially the same as the 128-bit | |
215 | // case above, but mostly simpler because everything is smaller. The | |
216 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. | |
217 | ||
218 | // First, shift the high bits down. | |
219 | vshl.u64 d2, d1, #63 // b_i for t | |
220 | vshl.u64 d3, d1, #61 // b_i for t^3 | |
221 | vshl.u64 d4, d1, #60 // b_i for t^4 | |
222 | veor d2, d2, d3 // add them all together | |
223 | veor d2, d2, d4 | |
224 | veor d1, d1, d2 // contribution back into high half | |
225 | ||
226 | // And then shift the low bits up. | |
227 | vshr.u64 d2, d1, #1 | |
228 | vshr.u64 d3, d1, #3 | |
229 | vshr.u64 d4, d1, #4 | |
230 | veor d0, d0, d1 // mix in the unit contribution | |
231 | veor d2, d2, d3 // t and t^3 contribs | |
232 | veor d0, d0, d4 // low, unit, and t^4 | |
233 | veor d0, d0, d2 // mix them together and we're done | |
234 | .endm | |
235 | ||
236 | .macro mul96 | |
237 | // Enter with u and v in the most-significant three words of q0 and | |
238 | // q1 respectively, and zero in the low words, and zero in q15; leave | |
239 | // with z = u v in the high three words of q0, and /junk/ in the low | |
240 | // word. Clobbers ???. | |
241 | ||
242 | // This is an inconvenient size. There's nothing for it but to do | |
243 | // four multiplications, as if for the 128-bit case. It's possible | |
244 | // that there's cruft in the top 32 bits of the input registers, so | |
245 | // shift both of them up by four bytes before we start. This will | |
246 | // mean that the high 64 bits of the result (from GCM's viewpoint) | |
247 | // will be zero. | |
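        //
        // Explicitly, with u = u' + u_2 t^64 and v = v' + v_2 t^64, where
        // u' = u_0 + u_1 t^32 and v' = v_0 + v_1 t^32, the four products
        // here combine as
        //
        //      u v = u_2 v_2 t^128 + (u_2 v' + v_2 u') t^64 + u' v'
        //          = d t^128 + (e_0 + e_1) t^64 + f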
        // q0 =                         // (u_0 + u_1 t^32; u_2)
        // q1 =                         // (v_0 + v_1 t^32; v_2)
        vmull.p64 q8, d1, d2            // u_2 (v_0 + v_1 t^32) = e_0
        vmull.p64 q9, d0, d3            // v_2 (u_0 + u_1 t^32) = e_1
        vmull.p64 q3, d1, d3            // u_2 v_2 t^64 = d = (0; d)
        vmull.p64 q0, d0, d2            // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result.  The answer
        // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
        // bits of the answer will end up in q0, and the high 96 bits will
        // end up in q1; we'll need both of these to have zero in their
        // bottom 32 bits.
        //
        // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
        // in the low 96 bits of a SIMD register, with junk in the top 32
        // bits; and top(x) is the high 96 bits, also arranged in the low 96
        // bits of a register, with /zero/ in the top 32 bits.
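        //
        // (The whole-register shift macros `vshl128' and `vshr128' used
        // below come from `asm-common.h', included above, and rely on q15
        // being zero -- which is why the entry conditions demand it; the
        // callers arrange this with the `vzero' macro.)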
        veor    q8, q8, q9              // e_0 + e_1 = e
        vshr128 q1, q3, 32              // top(d t^128)
        vext.8  d19, d16, d17, #4       // top(e t^64)
        vshl.u64 d16, d0, #32           // top(f), sort of
        veor    d3, d3, d19             // q1 = top(d t^128 + e t^64)
        veor    d0, d0, d17             // q0 = bot([d t^128] + e t^64 + f)
        veor    d3, d3, d16             // q1 = top(d t^128 + e t^64 + f)

        // Shift the product right by one place (from GCM's point of view),
        // but, unusually, don't swap the halves, because we need to work on
        // the 32-bit pieces later.  After this, we're in GCM bizarro-world.
        // q0 =                         // (?, x_2; x_1, x_0)
        // q1 =                         // (0, x_5; x_4, x_3)
        vshr.u64 d4, d0, #63            // carry from d0 to d1
        vshr.u64 d5, d2, #63            // carry from d2 to d3
        vshr.u32 d6, d3, #31            // carry from d3 to d0
        vshl.u64 q0, q0, #1             // shift low half
        vshl.u64 q1, q1, #1             // shift high half
        vorr    d1, d1, d4
        vorr    d0, d0, d6
        vorr    d3, d3, d5

        // Finally, the reduction.  This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
        // trouble for the general approach.
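        //
        // Since 96 isn't a multiple of 64, we work in 32-bit lanes here:
        // for a tap t^e the shift immediates are 32 - e and e, i.e.,
        // #26/#23/#22 on the way down and #6/#9/#10 back up.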
292 | ||
293 | // First, shift the high bits down. | |
294 | vshl.u32 q2, q1, #26 // b_i for t^6 | |
295 | vshl.u32 q3, q1, #23 // b_i for t^9 | |
296 | vshl.u32 q8, q1, #22 // b_i for t^10 | |
297 | veor q2, q2, q3 // add them all together | |
298 | veor q2, q2, q8 | |
299 | vshl128 q3, q2, 64 // contribution into high half | |
300 | vshr128 q2, q2, 32 // and low half | |
301 | veor q1, q1, q3 // mix them in | |
302 | veor q0, q0, q2 | |
303 | ||
304 | // And then shift the low bits up. | |
305 | vshr.u32 q2, q1, #6 | |
306 | vshr.u32 q3, q1, #9 | |
307 | veor q0, q0, q1 // mix in the unit contribution | |
308 | vshr.u32 q8, q1, #10 | |
309 | veor q2, q2, q3 // mix together t^6 and t^9 | |
310 | veor q0, q0, q8 // mix in t^10 | |
311 | veor q0, q0, q2 // and the rest | |
312 | ||
313 | // And finally swap the two halves. | |
314 | vswp d0, d1 | |
315 | .endm | |
316 | ||
317 | .macro mul192 | |
318 | // Enter with u and v in d0--d2 and d3--d5 respectively; leave | |
319 | // with z = u v in d0--d2. Clobbers q8--q15. | |
320 | ||
321 | // Start multiplying and accumulating pieces of product. | |
322 | // (d0; d1; d2) = // (u_0; u_1; u_2) | |
323 | // (d3; d4; d5) = // (v_0; v_1; v_2) | |
324 | vmull.p64 q10, d0, d3 // e = u_0 v_0 | |
325 | ||
326 | vmull.p64 q12, d0, d4 // u_0 v_1 | |
327 | vmull.p64 q13, d1, d3 // u_1 v_0 | |
328 | ||
329 | vmull.p64 q9, d0, d5 // u_0 v_2 | |
330 | vmull.p64 q14, d1, d4 // u_1 v_1 | |
331 | vmull.p64 q15, d2, d3 // u_2 v_0 | |
332 | veor q12, q12, q13 // d = u_0 v_1 + u_1 v_0 | |
333 | ||
334 | vmull.p64 q11, d1, d5 // u_1 v_2 | |
335 | vmull.p64 q13, d2, d4 // u_2 v_1 | |
336 | veor q9, q9, q14 // u_0 v_2 + u_1 v_1 | |
337 | ||
338 | vmull.p64 q8, d2, d5 // a = u_2 v_2 | |
339 | veor q9, q9, q15 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 | |
340 | veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1 | |
341 | ||
342 | // Piece the product together. | |
343 | veor d17, d17, d22 // q8 = // (x_5; x_4) | |
344 | veor d18, d18, d23 | |
345 | veor d19, d19, d24 // q9 = // (x_3; x_2) | |
346 | veor d20, d20, d25 // q10 = // (x_1; x_0) | |
347 | ||
348 | // Shift the product right by one place (from GCM's point of view). | |
349 | vshr.u64 q11, q8, #63 // carry from d16/d17 to d17/d18 | |
350 | vshr.u64 q12, q9, #63 // carry from d18/d19 to d19/d20 | |
351 | vshr.u64 d26, d20, #63 // carry from d20 to d21 | |
352 | vshl.u64 q8, q8, #1 // shift everything down | |
353 | vshl.u64 q9, q9, #1 | |
354 | vshl.u64 q10, q10, #1 | |
355 | vorr d17, d17, d22 // and mix in the carries | |
356 | vorr d18, d18, d23 | |
357 | vorr d19, d19, d24 | |
358 | vorr d20, d20, d25 | |
359 | vorr d21, d21, d26 | |
360 | ||

        // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.
364 | ||
365 | // First, shift the high bits down. | |
366 | // q8 = // (y_5; y_4) | |
367 | // q9 = // (y_3; y_2) | |
368 | // q10 = // (y_1; y_0) | |
369 | vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t | |
370 | vshl.u64 d28, d18, #63 // y_3 b_i for t | |
371 | vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2 | |
372 | vshl.u64 d29, d18, #62 // y_3 b_i for t^2 | |
373 | vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7 | |
374 | vshl.u64 d30, d18, #57 // y_3 b_i for t^7 | |
375 | veor q11, q11, q12 // mix them all together | |
376 | veor d28, d28, d29 | |
377 | veor q11, q11, q13 | |
378 | veor d28, d28, d30 | |
379 | veor q9, q9, q11 | |
380 | veor d20, d20, d28 | |
381 | ||

        // And finally shift the low bits up.  Also, switch the order of the
        // pieces for output.
        // q8 =                         // (y'_5; y'_4)
        // q9 =                         // (y'_3; y'_2)
        // q10 =                        // (y'_1; y'_0)
        vshr.u64 q11, q8, #1            // (y'_5; y'_4) a_i for t
        vshr.u64 d28, d18, #1           // y'_3 a_i for t
        vshr.u64 q12, q8, #2            // (y'_5; y'_4) a_i for t^2
        vshr.u64 d29, d18, #2           // y'_3 a_i for t^2
        vshr.u64 q13, q8, #7            // (y'_5; y'_4) a_i for t^7
        vshr.u64 d30, d18, #7           // y'_3 a_i for t^7
        veor    q8, q8, q11
        veor    d18, d18, d28
        veor    q12, q12, q13
        veor    d29, d29, d30
        veor    q8, q8, q12
        veor    d18, d18, d29
        veor    d0, d21, d18
        veor    d1, d20, d17
        veor    d2, d19, d16
.endm
403 | ||
404 | .macro mul256 | |
405 | // Enter with u and v in q0/q1 and q2/q3 respectively; leave | |
406 | // with z = u v in q0/q1. Clobbers q8--q15. | |
407 | ||
408 | // Now it's starting to look worthwhile to do Karatsuba. Suppose | |
409 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then | |
410 | // | |
411 | // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 | |
412 | // | |
413 | // Name these coefficients of B^i be a, b, and c, respectively, and | |
414 | // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that | |
415 | // | |
416 | // q = r s = (u_0 + u_1) (v_0 + v_1) | |
417 | // = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0) | |
418 | // = a + d + c | |
419 | // | |
420 | // The first two terms we've already calculated; the last is the | |
421 | // remaining one we want. We'll set B = t^128. We know how to do | |
422 | // 128-bit multiplications already, and Karatsuba is too annoying | |
423 | // there, so there'll be 12 multiplications altogether, rather than | |
424 | // the 16 we'd have if we did this the naïve way. | |
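        //
        // Since we're in characteristic 2, addition and subtraction are the
        // same thing, so we can recover the remaining coefficient as
        //
        //      b = q + a + c
        //
        // which is what the mixing below works out.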
        // q0 =                         // u_0 = (u_00; u_01)
        // q1 =                         // u_1 = (u_10; u_11)
        // q2 =                         // v_0 = (v_00; v_01)
        // q3 =                         // v_1 = (v_10; v_11)

        veor    q8, q0, q1              // u_* = (u_00 + u_10; u_01 + u_11)
        veor    q9, q2, q3              // v_* = (v_00 + v_10; v_01 + v_11)

        // Start by building the cross product, q = u_* v_*.
        vmull.p64 q14, d16, d19         // u_*0 v_*1
        vmull.p64 q15, d17, d18         // u_*1 v_*0
        vmull.p64 q12, d17, d19         // u_*1 v_*1
        vmull.p64 q13, d16, d18         // u_*0 v_*0
        veor    q14, q14, q15           // u_*0 v_*1 + u_*1 v_*0
        veor    d25, d25, d28           // q12 =        // q_1
        veor    d26, d26, d29           // q13 =        // q_0

        // Next, work on the low half, a = u_0 v_0.
        vmull.p64 q14, d0, d5           // u_00 v_01
        vmull.p64 q15, d1, d4           // u_01 v_00
        vmull.p64 q10, d1, d5           // u_01 v_01
        vmull.p64 q11, d0, d4           // u_00 v_00
        veor    q14, q14, q15           // u_00 v_01 + u_01 v_00
        veor    d21, d21, d28           // q10 =        // a_1
        veor    d22, d22, d29           // q11 =        // a_0

        // Mix the pieces we have so far.
        veor    q12, q12, q10
        veor    q13, q13, q11

        // Finally, the high half, c = u_1 v_1.
        vmull.p64 q14, d2, d7           // u_10 v_11
        vmull.p64 q15, d3, d6           // u_11 v_10
        vmull.p64 q8, d3, d7            // u_11 v_11
        vmull.p64 q9, d2, d6            // u_10 v_10
        veor    q14, q14, q15           // u_10 v_11 + u_11 v_10
        veor    d17, d17, d28           // q8 =         // c_1
        veor    d18, d18, d29           // q9 =         // c_0

        // Finish mixing the product together.
        veor    q12, q12, q8            // q12 =        // b_1
        veor    q13, q13, q9            // q13 =        // b_0
        veor    q9, q9, q12
        veor    q10, q10, q13

        // Shift the product right by one place (from GCM's point of view).
        vshr.u64 q0, q8, #63            // carry from d16/d17 to d17/d18
        vshr.u64 q1, q9, #63            // carry from d18/d19 to d19/d20
        vshr.u64 q2, q10, #63           // carry from d20/d21 to d21/d22
        vshr.u64 d6, d22, #63           // carry from d22 to d23
        vshl.u64 q8, q8, #1             // shift everything down
        vshl.u64 q9, q9, #1
        vshl.u64 q10, q10, #1
        vshl.u64 q11, q11, #1
        vorr    d17, d17, d0
        vorr    d18, d18, d1
        vorr    d19, d19, d2
        vorr    d20, d20, d3
        vorr    d21, d21, d4
        vorr    d22, d22, d5
        vorr    d23, d23, d6
487 | // Now we must reduce. This is essentially the same as the 192-bit | |
488 | // case above, but more complicated because everything is bigger. | |
489 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. | |
490 | ||
491 | // First, shift the high bits down. | |
492 | // q8 = // (y_7; y_6) | |
493 | // q9 = // (y_5; y_4) | |
494 | // q10 = // (y_3; y_2) | |
495 | // q11 = // (y_1; y_0) | |
496 | vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2 | |
497 | vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2 | |
498 | vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5 | |
499 | vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5 | |
500 | vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10 | |
501 | vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10 | |
502 | veor q0, q0, q1 // mix the contributions together | |
503 | veor q12, q12, q13 | |
504 | veor q0, q0, q2 | |
505 | veor q12, q12, q14 | |
506 | veor d19, d19, d0 // and combine into the lower pieces | |
507 | veor d20, d20, d1 | |
508 | veor d21, d21, d24 | |
509 | veor d22, d22, d25 | |
510 | ||
511 | // And then shift the low bits up. Also, switch the order of the | |
512 | // pieces for output. | |
513 | // q8 = // (y'_7; y'_6) | |
514 | // q9 = // (y'_5; y'_4) | |
515 | // q10 = // (y'_3; y'_2) | |
516 | // q11 = // (y'_1; y'_0) | |
517 | vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2 | |
518 | vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2 | |
519 | vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5 | |
520 | vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5 | |
521 | vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10 | |
522 | vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10 | |
523 | ||
524 | veor q8, q8, q0 // mix the contributions together | |
525 | veor q1, q1, q2 | |
526 | veor q9, q9, q12 | |
527 | veor q13, q13, q14 | |
528 | veor q8, q8, q1 | |
529 | veor q9, q9, q13 | |
530 | veor d3, d20, d16 // and output | |
531 | veor d2, d21, d17 | |
532 | veor d1, d22, d18 | |
533 | veor d0, d23, d19 | |
534 | .endm | |
535 | ||
536 | ///-------------------------------------------------------------------------- | |
537 | /// Main code. | |
538 | ||
539 | // There are a number of representations of field elements in this code and | |
540 | // it can be confusing. | |
541 | // | |
542 | // * The `external format' consists of a sequence of contiguous bytes in | |
543 | // memory called a `block'. The GCM spec explains how to interpret this | |
544 | // block as an element of a finite field. As discussed extensively, this | |
545 | // representation is very annoying for a number of reasons. On the other | |
546 | // hand, this code never actually deals with it directly. | |
547 | // | |
548 | // * The `register format' consists of one or more NEON registers, | |
549 | // depending on the block size. The bytes in each 64-bit lane of these | |
550 | // registers are in reverse order, compared to the external format. | |
551 | // | |
552 | // * The `words' format consists of a sequence of bytes, as in the | |
553 | // `external format', but, according to the blockcipher in use, the bytes | |
554 | // within each 32-bit word may be reversed (`big-endian') or not | |
555 | // (`little-endian'). Accordingly, there are separate entry points for | |
556 | // each variant, identified with `b' or `l'. | |
557 | ||
FUNC(gcm_mulk_128b_arm_crypto)
        // On entry, r0 points to a 128-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8  {q0}, [r0]
        vld1.8  {q1}, [r1]
        vrev64.32 q0, q0
        mul128
        vrev64.32 q0, q0
        vst1.8  {q0}, [r0]
        bx      r14
ENDFUNC
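
        // By way of illustration, a C caller for the function above looks
        // something like this (a sketch only: the precise prototype and
        // integer types are declared elsewhere in Catacomb, and setting up
        // the key in register format is out of scope here):
        //
        //      extern void gcm_mulk_128b_arm_crypto(uint32_t *a,
        //                                           const uint32_t *k);
        //
        //      /* multiply the accumulator A, held as four big-endian
        //       * 32-bit words, by the precomputed key K in place */
        //      gcm_mulk_128b_arm_crypto(a, k);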
571 | ||
572 | FUNC(gcm_mulk_128l_arm_crypto) | |
573 | // On entry, r0 points to a 128-bit field element A in little-endian | |
574 | // words format; r1 points to a field-element K in register format. | |
575 | // On exit, A is updated with the product A K. | |
576 | ||
577 | vld1.8 {q0}, [r0] | |
578 | vld1.8 {q1}, [r1] | |
579 | vrev64.8 q0, q0 | |
580 | mul128 | |
581 | vrev64.8 q0, q0 | |
582 | vst1.8 {q0}, [r0] | |
583 | bx r14 | |
584 | ENDFUNC | |
585 | ||
586 | FUNC(gcm_mulk_64b_arm_crypto) | |
587 | // On entry, r0 points to a 64-bit field element A in big-endian | |
588 | // words format; r1 points to a field-element K in register format. | |
589 | // On exit, A is updated with the product A K. | |
590 | ||
591 | vld1.8 {d0}, [r0] | |
592 | vld1.8 {d1}, [r1] | |
593 | vrev64.32 d0, d0 | |
594 | mul64 | |
595 | vrev64.32 d0, d0 | |
596 | vst1.8 {d0}, [r0] | |
597 | bx r14 | |
598 | ENDFUNC | |
599 | ||
600 | FUNC(gcm_mulk_64l_arm_crypto) | |
601 | // On entry, r0 points to a 64-bit field element A in little-endian | |
602 | // words format; r1 points to a field-element K in register format. | |
603 | // On exit, A is updated with the product A K. | |
604 | ||
605 | vld1.8 {d0}, [r0] | |
606 | vld1.8 {d1}, [r1] | |
607 | vrev64.8 d0, d0 | |
608 | vzero | |
609 | mul64 | |
610 | vrev64.8 d0, d0 | |
611 | vst1.8 {d0}, [r0] | |
612 | bx r14 | |
613 | ENDFUNC | |
614 | ||
615 | FUNC(gcm_mulk_96b_arm_crypto) | |
616 | // On entry, r0 points to a 96-bit field element A in big-endian | |
617 | // words format; r1 points to a field-element K in register format. | |
618 | // On exit, A is updated with the product A K. | |
619 | ||
620 | ldr r3, [r0, #8] | |
621 | mov r12, #0 | |
622 | vld1.8 {d0}, [r0] | |
623 | vld1.8 {q1}, [r1] | |
624 | vrev64.32 d0, d0 | |
625 | vmov d1, r12, r3 | |
626 | vzero | |
627 | mul96 | |
628 | vrev64.32 d0, d0 | |
629 | vmov r3, d1[1] | |
630 | vst1.8 {d0}, [r0] | |
631 | str r3, [r0, #8] | |
632 | bx r14 | |
633 | ENDFUNC | |
634 | ||
FUNC(gcm_mulk_96l_arm_crypto)
        // On entry, r0 points to a 96-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.
639 | ||
640 | ldr r3, [r0, #8] | |
641 | mov r12, #0 | |
642 | vld1.8 {d0}, [r0] | |
643 | vld1.8 {q1}, [r1] | |
644 | vmov d1, r3, r12 | |
645 | vrev64.8 q0, q0 | |
646 | mul96 | |
647 | vrev64.8 q0, q0 | |
648 | vmov r3, d1[0] | |
649 | vst1.8 {d0}, [r0] | |
650 | str r3, [r0, #8] | |
651 | bx r14 | |
652 | ENDFUNC | |
653 | ||
654 | FUNC(gcm_mulk_192b_arm_crypto) | |
655 | // On entry, r0 points to a 192-bit field element A in big-endian | |
656 | // words format; r1 points to a field-element K in register format. | |
657 | // On exit, A is updated with the product A K. | |
658 | ||
659 | vld1.8 {d0-d2}, [r0] | |
660 | vld1.8 {d3-d5}, [r1] | |
661 | vrev64.32 q0, q0 | |
662 | vrev64.32 d2, d2 | |
663 | mul192 | |
664 | vrev64.32 q0, q0 | |
665 | vrev64.32 d2, d2 | |
666 | vst1.8 {d0-d2}, [r0] | |
667 | bx r14 | |
668 | ENDFUNC | |
669 | ||
670 | FUNC(gcm_mulk_192l_arm_crypto) | |
671 | // On entry, r0 points to a 192-bit field element A in little-endian | |
672 | // words format; r1 points to a field-element K in register format. | |
673 | // On exit, A is updated with the product A K. | |
674 | ||
675 | vld1.8 {d0-d2}, [r0] | |
676 | vld1.8 {d3-d5}, [r1] | |
677 | vrev64.8 q0, q0 | |
678 | vrev64.8 d2, d2 | |
679 | mul192 | |
680 | vrev64.8 q0, q0 | |
681 | vrev64.8 d2, d2 | |
682 | vst1.8 {d0-d2}, [r0] | |
683 | bx r14 | |
684 | ENDFUNC | |
685 | ||
686 | FUNC(gcm_mulk_256b_arm_crypto) | |
687 | // On entry, r0 points to a 256-bit field element A in big-endian | |
688 | // words format; r1 points to a field-element K in register format. | |
689 | // On exit, A is updated with the product A K. | |
690 | ||
691 | vld1.8 {q0, q1}, [r0] | |
692 | vld1.8 {q2, q3}, [r1] | |
693 | vrev64.32 q0, q0 | |
694 | vrev64.32 q1, q1 | |
695 | mul256 | |
696 | vrev64.32 q0, q0 | |
697 | vrev64.32 q1, q1 | |
698 | vst1.8 {q0, q1}, [r0] | |
699 | bx r14 | |
700 | ENDFUNC | |
701 | ||
702 | FUNC(gcm_mulk_256l_arm_crypto) | |
703 | // On entry, r0 points to a 256-bit field element A in little-endian | |
704 | // words format; r1 points to a field-element K in register format. | |
705 | // On exit, A is updated with the product A K. | |
706 | ||
707 | vld1.8 {q0, q1}, [r0] | |
708 | vld1.8 {q2, q3}, [r1] | |
709 | vrev64.8 q0, q0 | |
710 | vrev64.8 q1, q1 | |
711 | mul256 | |
712 | vrev64.8 q0, q0 | |
713 | vrev64.8 q1, q1 | |
714 | vst1.8 {q0, q1}, [r0] | |
715 | bx r14 | |
716 | ENDFUNC | |
717 | ||
718 | ///----- That's all, folks -------------------------------------------------- |