[catacomb] / symm / gcm-arm64-pmull.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM64 processors
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a+crypto

	.text

///--------------------------------------------------------------------------
/// Multiplication macros.

	// The good news is that we have a fancy instruction to do the
	// multiplications.  The bad news is that it's not particularly well-
	// suited to the job.
	//
	// For one thing, it only does a 64-bit multiplication, so in general
	// we'll need to synthesize the full-width multiply by hand.  For
	// another thing, it doesn't help with the reduction, so we have to
	// do that by hand too.  And, finally, GCM has crazy bit ordering,
	// and the instruction does nothing useful for that at all.
	//
	// Focusing on that last problem first: the bits aren't in monotonic
	// significance order unless we permute them.  Fortunately, ARM64 has
	// an instruction which will just permute the bits in each byte for
	// us, so we don't have to worry about this very much.
	//
	// Our main weapons, the `pmull' and `pmull2' instructions, work on
	// 64-bit operands, in half of a vector register, and produce 128-bit
	// results.  But neither of them will multiply the high half of one
	// vector by the low half of a second one, so we have a problem,
	// which we solve by representing one of the operands redundantly:
	// rather than packing the 64-bit pieces together, we duplicate each
	// 64-bit piece across both halves of a register.
	//
	// The commentary for `mul128' is the most detailed.  The other
	// macros assume that you've already read and understood that.

.macro	mul128
	// Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
	// leave with z = u v in v0.  Clobbers v1--v6.

	// First for the double-precision multiplication.  It's tempting to
	// use Karatsuba's identity here, but I suspect that loses more in
	// the shifting, bit-twiddling, and dependency chains that it gains
	// in saving a multiplication which otherwise pipelines well.
	// v0 =				// (u_0; u_1)
	// v1/v2 =			// (v_0; v_1)
	pmull2	v3.1q, v0.2d, v1.2d	// u_1 v_0
	pmull	v4.1q, v0.1d, v2.1d	// u_0 v_1
	pmull2	v5.1q, v0.2d, v2.2d	// (t_1; x_3) = u_1 v_1
	pmull	v6.1q, v0.1d, v1.1d	// (x_0; t_0) = u_0 v_0

	// Arrange the pieces to form a double-precision polynomial.
	eor	v3.16b, v3.16b, v4.16b	// (m_0; m_1) = u_0 v_1 + u_1 v_0
	vshr128	v4, v3, 64		// (m_1; 0)
	vshl128	v3, v3, 64		// (0; m_0)
	eor	v1.16b, v5.16b, v4.16b	// (x_2; x_3)
	eor	v0.16b, v6.16b, v3.16b	// (x_0; x_1)

	// And now the only remaining difficulty is that the result needs to
	// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
	// = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
	// and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
	// now we must collapse the two halves of y together using the other
	// identity R = t^7 + t^2 + t + 1.
	//
	// We do this by working on y_2 and y_3 separately, so consider y_i
	// for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2) =
	// (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
	// directly without breaking up the 64-bit word structure.  Instead,
	// we start by considering just y_i t^7 t^{64(i-2)}, which again
	// looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
	// then
	//
	//	y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
	//
	// We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
	// contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
	// splits are different.  This is lovely, with one small snag: when
	// we do this to y_3, we end up with a contribution back into the
	// t^128 coefficient word.  But notice that only the low seven bits
	// of this word are affected, so there's no knock-on contribution
	// into the t^64 word.  Therefore, if we handle the high bits of each
	// word together, and then the low bits, everything will be fine.

	// First, shift the high bits down.
	ushr	v2.2d, v1.2d, #63	// the b_i for t
	ushr	v3.2d, v1.2d, #62	// the b_i for t^2
	ushr	v4.2d, v1.2d, #57	// the b_i for t^7
	eor	v2.16b, v2.16b, v3.16b	// add them all together
	eor	v2.16b, v2.16b, v4.16b
	vshr128	v3, v2, 64
	vshl128	v4, v2, 64
	eor	v1.16b, v1.16b, v3.16b	// contribution into high half
	eor	v0.16b, v0.16b, v4.16b	// and low half

	// And then shift the low bits up.
	shl	v2.2d, v1.2d, #1
	shl	v3.2d, v1.2d, #2
	shl	v4.2d, v1.2d, #7
	eor	v1.16b, v1.16b, v2.16b	// unit and t contribs
	eor	v3.16b, v3.16b, v4.16b	// t^2 and t^7 contribs
	eor	v0.16b, v0.16b, v1.16b	// mix everything together
	eor	v0.16b, v0.16b, v3.16b	// ... and we're done
.endm

.macro	mul64
	// Enter with u and v in the low halves of v0 and v1, respectively;
	// leave with z = u v in x2.  Clobbers x2--x4.

	// The multiplication is thankfully easy.
	// v0 =					// (u; ?)
	// v1 =					// (v; ?)
	pmull	v0.1q, v0.1d, v1.1d		// u v

	// Now we must reduce.  This is essentially the same as the 128-bit
	// case above, but mostly simpler because everything is smaller.  The
	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

	// Before we get stuck in, transfer the product to general-purpose
	// registers.
	mov	x3, v0.d[1]
	mov	x2, v0.d[0]

	// First, shift the high bits down.
	eor	x4, x3, x3, lsr #1	// pre-mix t^3 and t^4
	eor	x3, x3, x3, lsr #63	// mix in t contribution
	eor	x3, x3, x4, lsr #60	// shift and mix in t^3 and t^4

	// And then shift the low bits up.
	eor	x3, x3, x3, lsl #1	// mix unit and t; pre-mix t^3, t^4
	eor	x2, x2, x3		// fold them in
	eor	x2, x2, x3, lsl #3	// and t^3 and t^4
.endm

.macro	mul96
	// Enter with u in the least-significant 96 bits of v0, with zero in
	// the upper 32 bits, and with the least-significant 64 bits of v in
	// both halves of v1, and the upper 32 bits of v in the low 32 bits
	// of each half of v2, with zero in the upper 32 bits; and with zero
	// in v31.  Yes, that's a bit hairy.  Leave with the product u v in
	// the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
	// v1--v6.

	// This is an inconvenient size.  There's nothing for it but to do
	// four multiplications, as if for the 128-bit case.  It's possible
	// that there's cruft in the top 32 bits of the input registers, so
	// shift both of them up by four bytes before we start.  This will
	// mean that the high 64 bits of the result (from GCM's viewpoint)
	// will be zero.
	// v0 =				// (u_0 + u_1 t^32; u_2)
	// v1 =				// (v_0 + v_1 t^32; v_0 + v_1 t^32)
	// v2 =				// (v_2; v_2)
	pmull2	v5.1q, v0.2d, v1.2d	// u_2 (v_0 + v_1 t^32) t^32 = e_0
	pmull	v4.1q, v0.1d, v2.1d	// v_2 (u_0 + u_1 t^32) t^32 = e_1
	pmull2	v6.1q, v0.2d, v2.2d	// u_2 v_2 = d = (d; 0)
	pmull	v3.1q, v0.1d, v1.1d	// u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
					//   + u_1 v_1 t^64 = f

	// Extract the high and low halves of the 192-bit result.  The answer
	// we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
	// bits of the answer will end up in v0, with junk in the top 32
	// bits; the high 96 bits will end up in v1, which must have zero in
	// its top 32 bits.
	//
	// Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
	// in the low 96 bits of a SIMD register, with junk in the top 32
	// bits; and top(x) is the high 96 bits, also arranged in the low 96
	// bits of a register, with /zero/ in the top 32 bits.
	eor	v4.16b, v4.16b, v5.16b	// e_0 + e_1 = e
	vshl128	v6, v6, 32		// top(d t^128)
	vshr128	v5, v4, 32		// top(e t^64)
	vshl128	v4, v4, 64		// bot(e t^64)
	vshr128	v1, v3, 96		// top(f)
	eor	v6.16b, v6.16b, v5.16b	// top(d t^128 + e t^64)
	eor	v0.16b, v3.16b, v4.16b	// bot([d t^128] + e t^64 + f)
	eor	v1.16b, v1.16b, v6.16b	// top(e t^64 + d t^128 + f)

	// Finally, the reduction.  This is essentially the same as the
	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	// t^9 + t^6 + 1.  The degrees are larger but not enough to cause
	// trouble for the general approach.  Unfortunately, we have to do
	// this in 32-bit pieces rather than 64.

	// First, shift the high bits down.
	ushr	v2.4s, v1.4s, #26	// the b_i for t^6
	ushr	v3.4s, v1.4s, #23	// the b_i for t^9
	ushr	v4.4s, v1.4s, #22	// the b_i for t^10
	eor	v2.16b, v2.16b, v3.16b	// add them all together
	eor	v2.16b, v2.16b, v4.16b
	vshr128	v3, v2, 64		// contribution for high half
	vshl128	v2, v2, 32		// contribution for low half
	eor	v1.16b, v1.16b, v3.16b	// apply to high half
	eor	v0.16b, v0.16b, v2.16b	// and low half

	// And then shift the low bits up.
	shl	v2.4s, v1.4s, #6
	shl	v3.4s, v1.4s, #9
	shl	v4.4s, v1.4s, #10
	eor	v1.16b, v1.16b, v2.16b	// unit and t^6 contribs
	eor	v3.16b, v3.16b, v4.16b	// t^9 and t^10 contribs
	eor	v0.16b, v0.16b, v1.16b	// mix everything together
	eor	v0.16b, v0.16b, v3.16b	// ... and we're done
.endm

.macro	mul192
	// Enter with u in v0 and the less-significant half of v1, with v
	// duplicated across both halves of v2/v3/v4, and with zero in v31.
	// Leave with the product u v in v0 and the bottom half of v1.
	// Clobbers v16--v25.

	// Start multiplying and accumulating pieces of product.
	// v0 =				// (u_0; u_1)
	// v1 =				// (u_2; ?)
	// v2 =				// (v_0; v_0)
	// v3 =				// (v_1; v_1)
	// v4 =				// (v_2; v_2)
	pmull	v16.1q, v0.1d, v2.1d	//   a = u_0 v_0

	pmull	v19.1q, v0.1d, v3.1d	//	 u_0 v_1
	pmull2	v21.1q, v0.2d, v2.2d	//	 u_1 v_0

	pmull	v17.1q, v0.1d, v4.1d	//	 u_0 v_2
	pmull2	v22.1q, v0.2d, v3.2d	//	 u_1 v_1
	pmull	v23.1q, v1.1d, v2.1d	//	 u_2 v_0
	 eor	v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0

	pmull2	v20.1q, v0.2d, v4.2d	//	 u_1 v_2
	pmull	v24.1q, v1.1d, v3.1d	//	 u_2 v_1
	 eor	v17.16b, v17.16b, v22.16b //	 u_0 v_2 + u_1 v_1

	pmull	v18.1q, v1.1d, v4.1d	//   e = u_2 v_2
	 eor	v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_1
	 eor	v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1

	// Piece the product together.
	// v16 =			// (a_0; a_1)
	// v19 =			// (b_0; b_1)
	// v17 =			// (c_0; c_1)
	// v20 =			// (d_0; d_1)
	// v18 =			// (e_0; e_1)
	vshl128	v21, v19, 64		// (0; b_0)
	ext	v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
	vshr128	v23, v20, 64		// (d_1; 0)
	eor	v16.16b, v16.16b, v21.16b // (x_0; x_1)
	eor	v17.16b, v17.16b, v22.16b // (x_2; x_3)
	eor	v18.16b, v18.16b, v23.16b // (x_2; x_3)

	// Next, the reduction.  Our polynomial this time is p(x) = t^192 +
	// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
	// 128-bit case.  I don't know why.

	// First, shift the high bits down.
	// v16 =			// (y_0; y_1)
	// v17 =			// (y_2; y_3)
	// v18 =			// (y_4; y_5)
	mov	v19.d[0], v17.d[1]	// (y_3; ?)

	ushr	v23.2d, v18.2d, #63	// hi b_i for t
	ushr	d20, d19, #63		// lo b_i for t
	ushr	v24.2d, v18.2d, #62	// hi b_i for t^2
	ushr	d21, d19, #62		// lo b_i for t^2
	ushr	v25.2d, v18.2d, #57	// hi b_i for t^7
	ushr	d22, d19, #57		// lo b_i for t^7
	eor	v23.16b, v23.16b, v24.16b // mix them all together
	eor	v20.8b, v20.8b, v21.8b
	eor	v23.16b, v23.16b, v25.16b
	eor	v20.8b, v20.8b, v22.8b

	// Permute the high pieces while we fold in the b_i.
	eor	v17.16b, v17.16b, v23.16b
	vshl128	v20, v20, 64
	mov	v19.d[0], v18.d[1]	// (y_5; ?)
	ext	v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
	eor	v16.16b, v16.16b, v20.16b

	// And finally shift the low bits up.
	// v16 =			// (y'_0; y'_1)
	// v17 =			// (y'_2; ?)
	// v18 =			// (y'_3; y'_4)
	// v19 =			// (y'_5; ?)
	shl	v20.2d, v18.2d, #1
	shl	d23, d19, #1
	shl	v21.2d, v18.2d, #2
	shl	d24, d19, #2
	shl	v22.2d, v18.2d, #7
	shl	d25, d19, #7
	eor	v18.16b, v18.16b, v20.16b // unit and t contribs
	eor	v19.8b, v19.8b, v23.8b
	eor	v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
	eor	v24.8b, v24.8b, v25.8b
	eor	v18.16b, v18.16b, v21.16b // all contribs
	eor	v19.8b, v19.8b, v24.8b
	eor	v0.16b, v16.16b, v18.16b // mix them into the low half
	eor	v1.8b, v17.8b, v19.8b
.endm

.macro	mul256
	// Enter with u in v0/v1, with v duplicated across both halves of
	// v2--v5, and with zero in v31.  Leave with the product u v in
	// v0/v1.  Clobbers ???.

	// Now it's starting to look worthwhile to do Karatsuba.  Suppose
	// u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
	//
	//	u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	//
	// Name these coefficients of B^i be a, b, and c, respectively, and
	// let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
	//
	//	q = r s = (u_0 + u_1) (v_0 + v_1)
	//	  = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
	//	  = a + d + c
	//
	// The first two terms we've already calculated; the last is the
	// remaining one we want.  We'll set B = t^128.  We know how to do
	// 128-bit multiplications already, and Karatsuba is too annoying
	// there, so there'll be 12 multiplications altogether, rather than
	// the 16 we'd have if we did this the naïve way.
	// v0 =				// u_0 = (u_00; u_01)
	// v1 =				// u_1 = (u_10; u_11)
	// v2 =				// (v_00; v_00)
	// v3 =				// (v_01; v_01)
	// v4 =				// (v_10; v_10)
	// v5 =				// (v_11; v_11)

	eor	v28.16b, v0.16b, v1.16b	// u_* = (u_00 + u_10; u_01 + u_11)
	eor	v29.16b, v2.16b, v4.16b	// v_*0 = v_00 + v_10
	eor	v30.16b, v3.16b, v5.16b	// v_*1 = v_01 + v_11

	// Start by building the cross product, q = u_* v_*.
	pmull	v24.1q, v28.1d, v30.1d	// u_*0 v_*1
	pmull2	v25.1q, v28.2d, v29.2d	// u_*1 v_*0
	pmull	v20.1q, v28.1d, v29.1d	// u_*0 v_*0
	pmull2	v21.1q, v28.2d, v30.2d	// u_*1 v_*1
	eor	v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
	vshr128	v25, v24, 64
	vshl128	v24, v24, 64
	eor	v20.16b, v20.16b, v24.16b // q_0
	eor	v21.16b, v21.16b, v25.16b // q_1

	// Next, work on the low half, a = u_0 v_0
	pmull	v24.1q, v0.1d, v3.1d	// u_00 v_01
	pmull2	v25.1q, v0.2d, v2.2d	// u_01 v_00
	pmull	v16.1q, v0.1d, v2.1d	// u_00 v_00
	pmull2	v17.1q, v0.2d, v3.2d	// u_01 v_01
	eor	v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
	vshr128	v25, v24, 64
	vshl128	v24, v24, 64
	eor	v16.16b, v16.16b, v24.16b // a_0
	eor	v17.16b, v17.16b, v25.16b // a_1

	// Mix the pieces we have so far.
	eor	v20.16b, v20.16b, v16.16b
	eor	v21.16b, v21.16b, v17.16b

	// Finally, work on the high half, c = u_1 v_1
	pmull	v24.1q, v1.1d, v5.1d	// u_10 v_11
	pmull2	v25.1q, v1.2d, v4.2d	// u_11 v_10
	pmull	v18.1q, v1.1d, v4.1d	// u_10 v_10
	pmull2	v19.1q, v1.2d, v5.2d	// u_11 v_11
	eor	v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
	vshr128	v25, v24, 64
	vshl128	v24, v24, 64
	eor	v18.16b, v18.16b, v24.16b // c_0
	eor	v19.16b, v19.16b, v25.16b // c_1

	// Finish mixing the product together.
	eor	v20.16b, v20.16b, v18.16b
	eor	v21.16b, v21.16b, v19.16b
	eor	v17.16b, v17.16b, v20.16b
	eor	v18.16b, v18.16b, v21.16b

	// Now we must reduce.  This is essentially the same as the 192-bit
	// case above, but more complicated because everything is bigger.
	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
	// v16 =			// (y_0; y_1)
	// v17 =			// (y_2; y_3)
	// v18 =			// (y_4; y_5)
	// v19 =			// (y_6; y_7)
	ushr	v24.2d, v18.2d, #62	// (y_4; y_5) b_i for t^2
	ushr	v25.2d, v19.2d, #62	// (y_6; y_7) b_i for t^2
	ushr	v26.2d, v18.2d, #59	// (y_4; y_5) b_i for t^5
	ushr	v27.2d, v19.2d, #59	// (y_6; y_7) b_i for t^5
	ushr	v28.2d, v18.2d, #54	// (y_4; y_5) b_i for t^10
	ushr	v29.2d, v19.2d, #54	// (y_6; y_7) b_i for t^10
	eor	v24.16b, v24.16b, v26.16b // mix the contributions together
	eor	v25.16b, v25.16b, v27.16b
	eor	v24.16b, v24.16b, v28.16b
	eor	v25.16b, v25.16b, v29.16b
	vshr128	v26, v25, 64		// slide contribs into position
	ext	v25.16b, v24.16b, v25.16b, #8
	vshl128	v24, v24, 64
	eor	v18.16b, v18.16b, v26.16b
	eor	v17.16b, v17.16b, v25.16b
	eor	v16.16b, v16.16b, v24.16b

	// And then shift the low bits up.
	// v16 =			// (y'_0; y'_1)
	// v17 =			// (y'_2; y'_3)
	// v18 =			// (y'_4; y'_5)
	// v19 =			// (y'_6; y'_7)
	shl	v24.2d, v18.2d, #2	// (y'_4; y_5) a_i for t^2
	shl	v25.2d, v19.2d, #2	// (y_6; y_7) a_i for t^2
	shl	v26.2d, v18.2d, #5	// (y'_4; y_5) a_i for t^5
	shl	v27.2d, v19.2d, #5	// (y_6; y_7) a_i for t^5
	shl	v28.2d, v18.2d, #10	// (y'_4; y_5) a_i for t^10
	shl	v29.2d, v19.2d, #10	// (y_6; y_7) a_i for t^10
	eor	v18.16b, v18.16b, v24.16b // mix the contributions together
	eor	v19.16b, v19.16b, v25.16b
	eor	v26.16b, v26.16b, v28.16b
	eor	v27.16b, v27.16b, v29.16b
	eor	v18.16b, v18.16b, v26.16b
	eor	v19.16b, v19.16b, v27.16b
	eor	v0.16b, v16.16b, v18.16b
	eor	v1.16b, v17.16b, v19.16b
.endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more SIMD registers,
//     depending on the block size.  The bits in each byte are reversed,
//     compared to the external format, which makes the polynomials
//     completely vanilla, unlike all of the other GCM implementations.
//
//   * The `table format' is just like the `register format', only the two
//     halves of 128-bit SIMD register are the same, so we need twice as many
//     registers.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.

FUNC(gcm_mulk_128b_arm64_pmull)
	// On entry, x0 points to a 128-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0]
	ldp	q1, q2, [x1]
	rev32	v0.16b, v0.16b
	vzero
	rbit	v0.16b, v0.16b
	mul128
	rbit	v0.16b, v0.16b
	rev32	v0.16b, v0.16b
	str	q0, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_128l_arm64_pmull)
	// On entry, x0 points to a 128-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0]
	ldp	q1, q2, [x1]
	vzero
	rbit	v0.16b, v0.16b
	mul128
	rbit	v0.16b, v0.16b
	str	q0, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_64b_arm64_pmull)
	// On entry, x0 points to a 64-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	d0, [x0]
	ldr	q1, [x1]
	rev32	v0.8b, v0.8b
	rbit	v0.8b, v0.8b
	mul64
	rbit	x2, x2
	ror	x2, x2, #32
	str	x2, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_64l_arm64_pmull)
	// On entry, x0 points to a 64-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	d0, [x0]
	ldr	q1, [x1]
	rbit	v0.8b, v0.8b
	mul64
	rbit	x2, x2
	rev	x2, x2
	str	x2, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_96b_arm64_pmull)
	// On entry, x0 points to a 96-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	w2, [x0, #8]
	ldr	d0, [x0, #0]
	mov	v0.d[1], x2
	ldp	q1, q2, [x1]
	rev32	v0.16b, v0.16b
	vzero
	rbit	v0.16b, v0.16b
	mul96
	rbit	v0.16b, v0.16b
	rev32	v0.16b, v0.16b
	mov	w2, v0.s[2]
	str	d0, [x0, #0]
	str	w2, [x0, #8]
	ret
ENDFUNC

FUNC(gcm_mulk_96l_arm64_pmull)
	// On entry, x0 points to a 96-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	d0, [x0, #0]
	ldr	w2, [x0, #8]
	mov	v0.d[1], x2
	ldp	q1, q2, [x1]
	rbit	v0.16b, v0.16b
	vzero
	mul96
	rbit	v0.16b, v0.16b
	mov	w2, v0.s[2]
	str	d0, [x0, #0]
	str	w2, [x0, #8]
	ret
ENDFUNC

FUNC(gcm_mulk_192b_arm64_pmull)
	// On entry, x0 points to a 192-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0, #0]
	ldr	d1, [x0, #16]
	ldp	q2, q3, [x1, #0]
	ldr	q4, [x1, #32]
	rev32	v0.16b, v0.16b
	rev32	v1.8b, v1.8b
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	vzero
	mul192
	rev32	v0.16b, v0.16b
	rev32	v1.8b, v1.8b
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	str	q0, [x0, #0]
	str	d1, [x0, #16]
	ret
ENDFUNC

FUNC(gcm_mulk_192l_arm64_pmull)
	// On entry, x0 points to a 192-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0, #0]
	ldr	d1, [x0, #16]
	ldp	q2, q3, [x1, #0]
	ldr	q4, [x1, #32]
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	vzero
	mul192
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	str	q0, [x0, #0]
	str	d1, [x0, #16]
	ret
ENDFUNC

FUNC(gcm_mulk_256b_arm64_pmull)
	// On entry, x0 points to a 256-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldp	q0, q1, [x0]
	ldp	q2, q3, [x1, #0]
	ldp	q4, q5, [x1, #32]
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	vzero
	mul256
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	stp	q0, q1, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_256l_arm64_pmull)
	// On entry, x0 points to a 256-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldp	q0, q1, [x0]
	ldp	q2, q3, [x1, #0]
	ldp	q4, q5, [x1, #32]
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	vzero
	mul256
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	stp	q0, q1, [x0]
	ret
ENDFUNC

///----- That's all, folks --------------------------------------------------