| 1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
| 2 | /// |
| 3 | /// GCM acceleration for ARM64 processors |
| 4 | /// |
| 5 | /// (c) 2019 Straylight/Edgeware |
| 6 | /// |
| 7 | |
| 8 | ///----- Licensing notice --------------------------------------------------- |
| 9 | /// |
| 10 | /// This file is part of Catacomb. |
| 11 | /// |
| 12 | /// Catacomb is free software: you can redistribute it and/or modify it |
| 13 | /// under the terms of the GNU Library General Public License as published |
| 14 | /// by the Free Software Foundation; either version 2 of the License, or |
| 15 | /// (at your option) any later version. |
| 16 | /// |
| 17 | /// Catacomb is distributed in the hope that it will be useful, but |
| 18 | /// WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 20 | /// Library General Public License for more details. |
| 21 | /// |
| 22 | /// You should have received a copy of the GNU Library General Public |
| 23 | /// License along with Catacomb. If not, write to the Free Software |
| 24 | /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | /// USA. |
| 26 | |
| 27 | ///-------------------------------------------------------------------------- |
| 28 | /// Preliminaries. |
| 29 | |
| 30 | #include "config.h" |
| 31 | #include "asm-common.h" |
| 32 | |
| 33 | .arch armv8-a+crypto |
| 34 | |
| 35 | .text |
| 36 | |
| 37 | ///-------------------------------------------------------------------------- |
| 38 | /// Multiplication macros. |
| 39 | |
| 40 | // The good news is that we have a fancy instruction to do the |
| 41 | // multiplications. The bad news is that it's not particularly well- |
| 42 | // suited to the job. |
| 43 | // |
| 44 | // For one thing, it only does a 64-bit multiplication, so in general |
| 45 | // we'll need to synthesize the full-width multiply by hand. For |
| 46 | // another thing, it doesn't help with the reduction, so we have to |
| 47 | // do that by hand too. And, finally, GCM has crazy bit ordering, |
| 48 | // and the instruction does nothing useful for that at all. |
| 49 | // |
| 50 | // Focusing on that last problem first: the bits aren't in monotonic |
// significance order unless we permute them.  Fortunately, ARM64 has
// an instruction, `rbit', which will just reverse the bits in each
// byte for us, so we don't have to worry about this very much.
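// For example, the external byte 0xa0 denotes 1 + t^2, since GCM
// takes the most significant bit of each byte as its lowest-degree
// coefficient; after `rbit', that byte reads 0x05, with bit i
// carrying the coefficient of t^i, which is the order we want.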
| 54 | // |
| 55 | // Our main weapons, the `pmull' and `pmull2' instructions, work on |
| 56 | // 64-bit operands, in half of a vector register, and produce 128-bit |
| 57 | // results. But neither of them will multiply the high half of one |
| 58 | // vector by the low half of a second one, so we have a problem, |
| 59 | // which we solve by representing one of the operands redundantly: |
| 60 | // rather than packing the 64-bit pieces together, we duplicate each |
| 61 | // 64-bit piece across both halves of a register. |
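// So, for example, a 128-bit operand v = (v_0; v_1) is presented as
// two registers (v_0; v_0) and (v_1; v_1): `pmull' against
// (u_0; u_1) then computes u_0 v_j, and `pmull2' computes u_1 v_j,
// which covers all four partial products we need.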
| 62 | // |
| 63 | // The commentary for `mul128' is the most detailed. The other |
| 64 | // macros assume that you've already read and understood that. |
| 65 | |
| 66 | .macro mul128 |
| 67 | // Enter with u and v in v0 and v1/v2 respectively, and 0 in v31; |
| 68 | // leave with z = u v in v0. Clobbers v1--v6. |
| 69 | |
| 70 | // First for the double-precision multiplication. It's tempting to |
| 71 | // use Karatsuba's identity here, but I suspect that loses more in |
| 72 | // the shifting, bit-twiddling, and dependency chains that it gains |
| 73 | // in saving a multiplication which otherwise pipelines well. |
| 74 | // v0 = // (u_0; u_1) |
| 75 | // v1/v2 = // (v_0; v_1) |
| 76 | pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0 |
| 77 | pmull v4.1q, v0.1d, v2.1d // u_0 v_1 |
| 78 | pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1 |
| 79 | pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0 |
| 80 | |
| 81 | // Arrange the pieces to form a double-precision polynomial. |
| 82 | eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0 |
| 83 | vshr128 v4, v3, 64 // (m_1; 0) |
| 84 | vshl128 v3, v3, 64 // (0; m_0) |
| 85 | eor v1.16b, v5.16b, v4.16b // (x_2; x_3) |
| 86 | eor v0.16b, v6.16b, v3.16b // (x_0; x_1) |
| 87 | |
| 88 | // And now the only remaining difficulty is that the result needs to |
| 89 | // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 |
// = t^7 + t^2 + t + 1 in our field.  Write the 256-bit product as
// y = y_0 + y_1 t^64 + y_2 t^128 + y_3 t^192, so that the y_i are
// the 64-bit words x_i we've just assembled.  So far, we've
// calculated z_0 = (y_0; y_1) and z_1 = (y_2; y_3) such that
// z_0 + z_1 R = u v using the identity R = t^128: now we must
// collapse the two halves of y together using the other identity
// R = t^7 + t^2 + t + 1.
//
// We do this by working on y_2 and y_3 separately, so consider y_i
// for i = 2 or 3.  Certainly, y_i t^{64 i} = y_i R t^{64(i-2)} =
| 97 | // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that |
| 98 | // directly without breaking up the 64-bit word structure. Instead, |
| 99 | // we start by considering just y_i t^7 t^{64(i-2)}, which again |
| 100 | // looks tricky. Now, split y_i = a_i + t^57 b_i, with deg a_i < 57; |
| 101 | // then |
| 102 | // |
| 103 | // y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)} |
| 104 | // |
| 105 | // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit |
| 106 | // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the |
| 107 | // splits are different. This is lovely, with one small snag: when |
| 108 | // we do this to y_3, we end up with a contribution back into the |
| 109 | // t^128 coefficient word. But notice that only the low seven bits |
| 110 | // of this word are affected, so there's no knock-on contribution |
| 111 | // into the t^64 word. Therefore, if we handle the high bits of each |
| 112 | // word together, and then the low bits, everything will be fine. |
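//
// Spelling the three splits out: write y_i = a_i + t^63 b_i for the
// t contribution, y_i = a_i + t^62 b_i for t^2, and y_i = a_i +
// t^57 b_i for t^7; in each case, y_i t^k t^{64(i-2)} =
// a_i t^k t^{64(i-2)} + b_i t^{64(i-1)}.  The `ushr's below (by
// 64 - k, i.e., #63, #62, #57) pick out exactly these b_i, and the
// `shl's afterwards (by k) supply the a_i t^k parts.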
| 113 | |
| 114 | // First, shift the high bits down. |
| 115 | ushr v2.2d, v1.2d, #63 // the b_i for t |
| 116 | ushr v3.2d, v1.2d, #62 // the b_i for t^2 |
| 117 | ushr v4.2d, v1.2d, #57 // the b_i for t^7 |
| 118 | eor v2.16b, v2.16b, v3.16b // add them all together |
| 119 | eor v2.16b, v2.16b, v4.16b |
| 120 | vshr128 v3, v2, 64 |
| 121 | vshl128 v4, v2, 64 |
| 122 | eor v1.16b, v1.16b, v3.16b // contribution into high half |
| 123 | eor v0.16b, v0.16b, v4.16b // and low half |
| 124 | |
| 125 | // And then shift the low bits up. |
| 126 | shl v2.2d, v1.2d, #1 |
| 127 | shl v3.2d, v1.2d, #2 |
| 128 | shl v4.2d, v1.2d, #7 |
| 129 | eor v1.16b, v1.16b, v2.16b // unit and t contribs |
| 130 | eor v3.16b, v3.16b, v4.16b // t^2 and t^7 contribs |
| 131 | eor v0.16b, v0.16b, v1.16b // mix everything together |
| 132 | eor v0.16b, v0.16b, v3.16b // ... and we're done |
| 133 | .endm |
| 134 | |
| 135 | .macro mul64 |
| 136 | // Enter with u and v in the low halves of v0 and v1, respectively; |
// leave with z = u v in x2.  Clobbers x2--x4 and v0.
| 138 | |
| 139 | // The multiplication is thankfully easy. |
| 140 | // v0 = // (u; ?) |
| 141 | // v1 = // (v; ?) |
| 142 | pmull v0.1q, v0.1d, v1.1d // u v |
| 143 | |
| 144 | // Now we must reduce. This is essentially the same as the 128-bit |
| 145 | // case above, but mostly simpler because everything is smaller. The |
| 146 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. |
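//
// As a rough C sketch of what follows (`hi' and `lo' are just
// illustrative names for the two 64-bit halves of the product, not
// part of any real interface):
//
// hi ^= (hi >> 63) ^ (hi >> 61) ^ (hi >> 60);
// lo ^= hi ^ (hi << 1) ^ (hi << 3) ^ (hi << 4);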
| 147 | |
| 148 | // Before we get stuck in, transfer the product to general-purpose |
| 149 | // registers. |
| 150 | mov x3, v0.d[1] |
| 151 | mov x2, v0.d[0] |
| 152 | |
| 153 | // First, shift the high bits down. |
| 154 | eor x4, x3, x3, lsr #1 // pre-mix t^3 and t^4 |
| 155 | eor x3, x3, x3, lsr #63 // mix in t contribution |
| 156 | eor x3, x3, x4, lsr #60 // shift and mix in t^3 and t^4 |
| 157 | |
| 158 | // And then shift the low bits up. |
| 159 | eor x3, x3, x3, lsl #1 // mix unit and t; pre-mix t^3, t^4 |
| 160 | eor x2, x2, x3 // fold them in |
| 161 | eor x2, x2, x3, lsl #3 // and t^3 and t^4 |
| 162 | .endm |
| 163 | |
| 164 | .macro mul96 |
| 165 | // Enter with u in the least-significant 96 bits of v0, with zero in |
| 166 | // the upper 32 bits, and with the least-significant 64 bits of v in |
| 167 | // both halves of v1, and the upper 32 bits of v in the low 32 bits |
| 168 | // of each half of v2, with zero in the upper 32 bits; and with zero |
| 169 | // in v31. Yes, that's a bit hairy. Leave with the product u v in |
| 170 | // the low 96 bits of v0, and /junk/ in the high 32 bits. Clobbers |
| 171 | // v1--v6. |
| 172 | |
// This is an inconvenient size.  There's nothing for it but to do
// four multiplications, as if for the 128-bit case.  The inputs
// arrive with zero in the top 32 bits of their registers, so the
// product has degree less than 191 and the high 64 bits of the
// full-width result (from GCM's viewpoint) will be zero.
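// Specifically, writing u = u_0 + u_1 t^32 + u_2 t^64 and
// v = v_0 + v_1 t^32 + v_2 t^64, the pieces we need are
//
// f = (u_0 + u_1 t^32) (v_0 + v_1 t^32),
// e_0 = u_2 (v_0 + v_1 t^32), e_1 = (u_0 + u_1 t^32) v_2,
// d = u_2 v_2,
//
// so that u v = d t^128 + (e_0 + e_1) t^64 + f.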
| 179 | // v0 = // (u_0 + u_1 t^32; u_2) |
| 180 | // v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32) |
| 181 | // v2 = // (v_2; v_2) |
pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) = e_0
pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) = e_1
| 184 | pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0) |
| 185 | pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 |
| 186 | // + u_1 v_1 t^64 = f |
| 187 | |
| 188 | // Extract the high and low halves of the 192-bit result. The answer |
| 189 | // we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96 |
| 190 | // bits of the answer will end up in v0, with junk in the top 32 |
| 191 | // bits; the high 96 bits will end up in v1, which must have zero in |
| 192 | // its top 32 bits. |
| 193 | // |
| 194 | // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged |
| 195 | // in the low 96 bits of a SIMD register, with junk in the top 32 |
| 196 | // bits; and top(x) is the high 96 bits, also arranged in the low 96 |
| 197 | // bits of a register, with /zero/ in the top 32 bits. |
| 198 | eor v4.16b, v4.16b, v5.16b // e_0 + e_1 = e |
| 199 | vshl128 v6, v6, 32 // top(d t^128) |
| 200 | vshr128 v5, v4, 32 // top(e t^64) |
| 201 | vshl128 v4, v4, 64 // bot(e t^64) |
| 202 | vshr128 v1, v3, 96 // top(f) |
| 203 | eor v6.16b, v6.16b, v5.16b // top(d t^128 + e t^64) |
| 204 | eor v0.16b, v3.16b, v4.16b // bot([d t^128] + e t^64 + f) |
| 205 | eor v1.16b, v1.16b, v6.16b // top(e t^64 + d t^128 + f) |
| 206 | |
| 207 | // Finally, the reduction. This is essentially the same as the |
| 208 | // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 + |
| 209 | // t^9 + t^6 + 1. The degrees are larger but not enough to cause |
| 210 | // trouble for the general approach. Unfortunately, we have to do |
| 211 | // this in 32-bit pieces rather than 64. |
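// The same trick applies word by word: for each nonconstant term
// t^k of the tail t^10 + t^9 + t^6 + 1, split each high word as
// y_i = a_i + t^{32-k} b_i, so the `ushr's by 32 - k (#22, #23,
// #26) collect the b_i and the `shl's by k (#10, #9, #6) supply the
// a_i t^k parts.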
| 212 | |
| 213 | // First, shift the high bits down. |
| 214 | ushr v2.4s, v1.4s, #26 // the b_i for t^6 |
| 215 | ushr v3.4s, v1.4s, #23 // the b_i for t^9 |
| 216 | ushr v4.4s, v1.4s, #22 // the b_i for t^10 |
| 217 | eor v2.16b, v2.16b, v3.16b // add them all together |
| 218 | eor v2.16b, v2.16b, v4.16b |
| 219 | vshr128 v3, v2, 64 // contribution for high half |
| 220 | vshl128 v2, v2, 32 // contribution for low half |
| 221 | eor v1.16b, v1.16b, v3.16b // apply to high half |
| 222 | eor v0.16b, v0.16b, v2.16b // and low half |
| 223 | |
| 224 | // And then shift the low bits up. |
| 225 | shl v2.4s, v1.4s, #6 |
| 226 | shl v3.4s, v1.4s, #9 |
| 227 | shl v4.4s, v1.4s, #10 |
| 228 | eor v1.16b, v1.16b, v2.16b // unit and t^6 contribs |
| 229 | eor v3.16b, v3.16b, v4.16b // t^9 and t^10 contribs |
| 230 | eor v0.16b, v0.16b, v1.16b // mix everything together |
| 231 | eor v0.16b, v0.16b, v3.16b // ... and we're done |
| 232 | .endm |
| 233 | |
| 234 | .macro mul192 |
| 235 | // Enter with u in v0 and the less-significant half of v1, with v |
| 236 | // duplicated across both halves of v2/v3/v4, and with zero in v31. |
| 237 | // Leave with the product u v in v0 and the bottom half of v1. |
| 238 | // Clobbers v16--v25. |
| 239 | |
| 240 | // Start multiplying and accumulating pieces of product. |
| 241 | // v0 = // (u_0; u_1) |
| 242 | // v1 = // (u_2; ?) |
| 243 | // v2 = // (v_0; v_0) |
| 244 | // v3 = // (v_1; v_1) |
| 245 | // v4 = // (v_2; v_2) |
| 246 | pmull v16.1q, v0.1d, v2.1d // a = u_0 v_0 |
| 247 | |
| 248 | pmull v19.1q, v0.1d, v3.1d // u_0 v_1 |
| 249 | pmull2 v21.1q, v0.2d, v2.2d // u_1 v_0 |
| 250 | |
| 251 | pmull v17.1q, v0.1d, v4.1d // u_0 v_2 |
| 252 | pmull2 v22.1q, v0.2d, v3.2d // u_1 v_1 |
| 253 | pmull v23.1q, v1.1d, v2.1d // u_2 v_0 |
| 254 | eor v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0 |
| 255 | |
| 256 | pmull2 v20.1q, v0.2d, v4.2d // u_1 v_2 |
| 257 | pmull v24.1q, v1.1d, v3.1d // u_2 v_1 |
| 258 | eor v17.16b, v17.16b, v22.16b // u_0 v_2 + u_1 v_1 |
| 259 | |
| 260 | pmull v18.1q, v1.1d, v4.1d // e = u_2 v_2 |
eor v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_0
| 262 | eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1 |
| 263 | |
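// In polynomial terms, the product is now
//
// u v = a + b t^64 + c t^128 + d t^192 + e t^256,
//
// whose 64-bit words we shall call x_0, ..., x_5 below.
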
| 264 | // Piece the product together. |
| 265 | // v16 = // (a_0; a_1) |
| 266 | // v19 = // (b_0; b_1) |
| 267 | // v17 = // (c_0; c_1) |
| 268 | // v20 = // (d_0; d_1) |
| 269 | // v18 = // (e_0; e_1) |
| 270 | vshl128 v21, v19, 64 // (0; b_0) |
| 271 | ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0) |
| 272 | vshr128 v23, v20, 64 // (d_1; 0) |
| 273 | eor v16.16b, v16.16b, v21.16b // (x_0; x_1) |
| 274 | eor v17.16b, v17.16b, v22.16b // (x_2; x_3) |
eor v18.16b, v18.16b, v23.16b // (x_4; x_5)
| 276 | |
// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
| 278 | // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the |
| 279 | // 128-bit case. I don't know why. |
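// (The coincidence is less mysterious than it looks: the shift
// counts come only from the exponents 7, 2, and 1, and the 192-bit
// modulus has the same tail t^7 + t^2 + t + 1 as the 128-bit one,
// so the same numbers fall out.)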
| 280 | |
| 281 | // First, shift the high bits down. |
| 282 | // v16 = // (y_0; y_1) |
| 283 | // v17 = // (y_2; y_3) |
| 284 | // v18 = // (y_4; y_5) |
| 285 | mov v19.d[0], v17.d[1] // (y_3; ?) |
| 286 | |
| 287 | ushr v23.2d, v18.2d, #63 // hi b_i for t |
| 288 | ushr d20, d19, #63 // lo b_i for t |
| 289 | ushr v24.2d, v18.2d, #62 // hi b_i for t^2 |
| 290 | ushr d21, d19, #62 // lo b_i for t^2 |
| 291 | ushr v25.2d, v18.2d, #57 // hi b_i for t^7 |
| 292 | ushr d22, d19, #57 // lo b_i for t^7 |
| 293 | eor v23.16b, v23.16b, v24.16b // mix them all together |
| 294 | eor v20.8b, v20.8b, v21.8b |
| 295 | eor v23.16b, v23.16b, v25.16b |
| 296 | eor v20.8b, v20.8b, v22.8b |
| 297 | |
| 298 | // Permute the high pieces while we fold in the b_i. |
| 299 | eor v17.16b, v17.16b, v23.16b |
| 300 | vshl128 v20, v20, 64 |
| 301 | mov v19.d[0], v18.d[1] // (y_5; ?) |
| 302 | ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4) |
| 303 | eor v16.16b, v16.16b, v20.16b |
| 304 | |
| 305 | // And finally shift the low bits up. |
| 306 | // v16 = // (y'_0; y'_1) |
| 307 | // v17 = // (y'_2; ?) |
| 308 | // v18 = // (y'_3; y'_4) |
| 309 | // v19 = // (y'_5; ?) |
| 310 | shl v20.2d, v18.2d, #1 |
| 311 | shl d23, d19, #1 |
| 312 | shl v21.2d, v18.2d, #2 |
| 313 | shl d24, d19, #2 |
| 314 | shl v22.2d, v18.2d, #7 |
| 315 | shl d25, d19, #7 |
| 316 | eor v18.16b, v18.16b, v20.16b // unit and t contribs |
| 317 | eor v19.8b, v19.8b, v23.8b |
| 318 | eor v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs |
| 319 | eor v24.8b, v24.8b, v25.8b |
| 320 | eor v18.16b, v18.16b, v21.16b // all contribs |
| 321 | eor v19.8b, v19.8b, v24.8b |
| 322 | eor v0.16b, v16.16b, v18.16b // mix them into the low half |
| 323 | eor v1.8b, v17.8b, v19.8b |
| 324 | .endm |
| 325 | |
| 326 | .macro mul256 |
| 327 | // Enter with u in v0/v1, with v duplicated across both halves of |
| 328 | // v2--v5, and with zero in v31. Leave with the product u v in |
// v0/v1.  Clobbers v16--v21 and v24--v30.
| 330 | |
| 331 | // Now it's starting to look worthwhile to do Karatsuba. Suppose |
| 332 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then |
| 333 | // |
| 334 | // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 |
| 335 | // |
// Call the coefficients of B^i a, b, and c, respectively, and let
// r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
//
// q = r s = (u_0 + u_1) (v_0 + v_1)
// = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
// = a + c + b.
| 342 | // |
| 343 | // The first two terms we've already calculated; the last is the |
| 344 | // remaining one we want. We'll set B = t^128. We know how to do |
| 345 | // 128-bit multiplications already, and Karatsuba is too annoying |
| 346 | // there, so there'll be 12 multiplications altogether, rather than |
| 347 | // the 16 we'd have if we did this the naïve way. |
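//
// Since addition here is just XOR, the missing coefficient is
// b = q + a + c, and the whole product is
//
// u v = a + (q + a + c) B + c B^2.
//
// The code below computes q first, then a, then c, accumulating the
// middle coefficient as it goes.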
| 348 | // v0 = // u_0 = (u_00; u_01) |
| 349 | // v1 = // u_1 = (u_10; u_11) |
| 350 | // v2 = // (v_00; v_00) |
| 351 | // v3 = // (v_01; v_01) |
| 352 | // v4 = // (v_10; v_10) |
| 353 | // v5 = // (v_11; v_11) |
| 354 | |
| 355 | eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11) |
| 356 | eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10 |
| 357 | eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11 |
| 358 | |
| 359 | // Start by building the cross product, q = u_* v_*. |
| 360 | pmull v24.1q, v28.1d, v30.1d // u_*0 v_*1 |
| 361 | pmull2 v25.1q, v28.2d, v29.2d // u_*1 v_*0 |
| 362 | pmull v20.1q, v28.1d, v29.1d // u_*0 v_*0 |
| 363 | pmull2 v21.1q, v28.2d, v30.2d // u_*1 v_*1 |
| 364 | eor v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0 |
| 365 | vshr128 v25, v24, 64 |
| 366 | vshl128 v24, v24, 64 |
| 367 | eor v20.16b, v20.16b, v24.16b // q_0 |
| 368 | eor v21.16b, v21.16b, v25.16b // q_1 |
| 369 | |
| 370 | // Next, work on the low half, a = u_0 v_0 |
| 371 | pmull v24.1q, v0.1d, v3.1d // u_00 v_01 |
| 372 | pmull2 v25.1q, v0.2d, v2.2d // u_01 v_00 |
| 373 | pmull v16.1q, v0.1d, v2.1d // u_00 v_00 |
| 374 | pmull2 v17.1q, v0.2d, v3.2d // u_01 v_01 |
| 375 | eor v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00 |
| 376 | vshr128 v25, v24, 64 |
| 377 | vshl128 v24, v24, 64 |
| 378 | eor v16.16b, v16.16b, v24.16b // a_0 |
| 379 | eor v17.16b, v17.16b, v25.16b // a_1 |
| 380 | |
| 381 | // Mix the pieces we have so far. |
eor v20.16b, v20.16b, v16.16b // q_0 + a_0
eor v21.16b, v21.16b, v17.16b // q_1 + a_1
| 384 | |
| 385 | // Finally, work on the high half, c = u_1 v_1 |
| 386 | pmull v24.1q, v1.1d, v5.1d // u_10 v_11 |
| 387 | pmull2 v25.1q, v1.2d, v4.2d // u_11 v_10 |
| 388 | pmull v18.1q, v1.1d, v4.1d // u_10 v_10 |
| 389 | pmull2 v19.1q, v1.2d, v5.2d // u_11 v_11 |
| 390 | eor v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10 |
| 391 | vshr128 v25, v24, 64 |
| 392 | vshl128 v24, v24, 64 |
| 393 | eor v18.16b, v18.16b, v24.16b // c_0 |
| 394 | eor v19.16b, v19.16b, v25.16b // c_1 |
| 395 | |
| 396 | // Finish mixing the product together. |
eor v20.16b, v20.16b, v18.16b // q_0 + a_0 + c_0 = b_0
eor v21.16b, v21.16b, v19.16b // q_1 + a_1 + c_1 = b_1
eor v17.16b, v17.16b, v20.16b // a_1 + b_0 = (y_2; y_3)
eor v18.16b, v18.16b, v21.16b // c_0 + b_1 = (y_4; y_5)
| 401 | |
| 402 | // Now we must reduce. This is essentially the same as the 192-bit |
| 403 | // case above, but more complicated because everything is bigger. |
| 404 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. |
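// As before, the shift counts are 64 - k for the `ushr's and k for
// the `shl's, for each exponent k in {2, 5, 10}.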
| 405 | // v16 = // (y_0; y_1) |
| 406 | // v17 = // (y_2; y_3) |
| 407 | // v18 = // (y_4; y_5) |
| 408 | // v19 = // (y_6; y_7) |
| 409 | ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2 |
| 410 | ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2 |
| 411 | ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5 |
| 412 | ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5 |
| 413 | ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10 |
| 414 | ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10 |
| 415 | eor v24.16b, v24.16b, v26.16b // mix the contributions together |
| 416 | eor v25.16b, v25.16b, v27.16b |
| 417 | eor v24.16b, v24.16b, v28.16b |
| 418 | eor v25.16b, v25.16b, v29.16b |
| 419 | vshr128 v26, v25, 64 // slide contribs into position |
| 420 | ext v25.16b, v24.16b, v25.16b, #8 |
| 421 | vshl128 v24, v24, 64 |
| 422 | eor v18.16b, v18.16b, v26.16b |
| 423 | eor v17.16b, v17.16b, v25.16b |
| 424 | eor v16.16b, v16.16b, v24.16b |
| 425 | |
| 426 | // And then shift the low bits up. |
| 427 | // v16 = // (y'_0; y'_1) |
| 428 | // v17 = // (y'_2; y'_3) |
| 429 | // v18 = // (y'_4; y'_5) |
| 430 | // v19 = // (y'_6; y'_7) |
shl v24.2d, v18.2d, #2 // (y'_4; y'_5) a_i for t^2
shl v25.2d, v19.2d, #2 // (y'_6; y'_7) a_i for t^2
shl v26.2d, v18.2d, #5 // (y'_4; y'_5) a_i for t^5
shl v27.2d, v19.2d, #5 // (y'_6; y'_7) a_i for t^5
shl v28.2d, v18.2d, #10 // (y'_4; y'_5) a_i for t^10
shl v29.2d, v19.2d, #10 // (y'_6; y'_7) a_i for t^10
| 437 | eor v18.16b, v18.16b, v24.16b // mix the contributions together |
| 438 | eor v19.16b, v19.16b, v25.16b |
| 439 | eor v26.16b, v26.16b, v28.16b |
| 440 | eor v27.16b, v27.16b, v29.16b |
| 441 | eor v18.16b, v18.16b, v26.16b |
| 442 | eor v19.16b, v19.16b, v27.16b |
| 443 | eor v0.16b, v16.16b, v18.16b |
| 444 | eor v1.16b, v17.16b, v19.16b |
| 445 | .endm |
| 446 | |
| 447 | ///-------------------------------------------------------------------------- |
| 448 | /// Main code. |
| 449 | |
| 450 | // There are a number of representations of field elements in this code and |
| 451 | // it can be confusing. |
| 452 | // |
| 453 | // * The `external format' consists of a sequence of contiguous bytes in |
| 454 | // memory called a `block'. The GCM spec explains how to interpret this |
| 455 | // block as an element of a finite field. As discussed extensively, this |
| 456 | // representation is very annoying for a number of reasons. On the other |
| 457 | // hand, this code never actually deals with it directly. |
| 458 | // |
| 459 | // * The `register format' consists of one or more SIMD registers, |
| 460 | // depending on the block size. The bits in each byte are reversed, |
| 461 | // compared to the external format, which makes the polynomials |
| 462 | // completely vanilla, unlike all of the other GCM implementations. |
| 463 | // |
| 464 | // * The `table format' is just like the `register format', only the two |
// halves of each 128-bit SIMD register are the same, so we need
// twice as many registers.
| 467 | // |
| 468 | // * The `words' format consists of a sequence of bytes, as in the |
| 469 | // `external format', but, according to the blockcipher in use, the bytes |
| 470 | // within each 32-bit word may be reversed (`big-endian') or not |
| 471 | // (`little-endian'). Accordingly, there are separate entry points for |
| 472 | // each variant, identified with `b' or `l'. |
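//
// Conversion between the words format and the register format is
// done on entry to and exit from each function below: the `b'
// variants byte-swap each 32-bit word (with `rev32', or equivalent
// scalar shuffling in the 64-bit case) as well as doing the `rbit'
// bit-reversal, while the `l' variants need only the `rbit'.  The
// functions for sizes larger than 64 bits also invoke `vzero',
// which presumably supplies the zero in v31 that the multiplication
// macros expect.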
| 473 | |
| 474 | FUNC(gcm_mulk_128b_arm64_pmull) |
| 475 | // On entry, x0 points to a 128-bit field element A in big-endian |
| 476 | // words format; x1 points to a field-element K in table format. On |
| 477 | // exit, A is updated with the product A K. |
| 478 | |
| 479 | ldr q0, [x0] |
| 480 | ldp q1, q2, [x1] |
| 481 | rev32 v0.16b, v0.16b |
| 482 | vzero |
| 483 | rbit v0.16b, v0.16b |
| 484 | mul128 |
| 485 | rbit v0.16b, v0.16b |
| 486 | rev32 v0.16b, v0.16b |
| 487 | str q0, [x0] |
| 488 | ret |
| 489 | ENDFUNC |
| 490 | |
| 491 | FUNC(gcm_mulk_128l_arm64_pmull) |
| 492 | // On entry, x0 points to a 128-bit field element A in little-endian |
| 493 | // words format; x1 points to a field-element K in table format. On |
| 494 | // exit, A is updated with the product A K. |
| 495 | |
| 496 | ldr q0, [x0] |
| 497 | ldp q1, q2, [x1] |
| 498 | vzero |
| 499 | rbit v0.16b, v0.16b |
| 500 | mul128 |
| 501 | rbit v0.16b, v0.16b |
| 502 | str q0, [x0] |
| 503 | ret |
| 504 | ENDFUNC |
| 505 | |
| 506 | FUNC(gcm_mulk_64b_arm64_pmull) |
| 507 | // On entry, x0 points to a 64-bit field element A in big-endian |
| 508 | // words format; x1 points to a field-element K in table format. On |
| 509 | // exit, A is updated with the product A K. |
| 510 | |
| 511 | ldr d0, [x0] |
| 512 | ldr q1, [x1] |
| 513 | rev32 v0.8b, v0.8b |
| 514 | rbit v0.8b, v0.8b |
| 515 | mul64 |
| 516 | rbit x2, x2 |
| 517 | ror x2, x2, #32 |
| 518 | str x2, [x0] |
| 519 | ret |
| 520 | ENDFUNC |
| 521 | |
| 522 | FUNC(gcm_mulk_64l_arm64_pmull) |
| 523 | // On entry, x0 points to a 64-bit field element A in little-endian |
| 524 | // words format; x1 points to a field-element K in table format. On |
| 525 | // exit, A is updated with the product A K. |
| 526 | |
| 527 | ldr d0, [x0] |
| 528 | ldr q1, [x1] |
| 529 | rbit v0.8b, v0.8b |
| 530 | mul64 |
| 531 | rbit x2, x2 |
| 532 | rev x2, x2 |
| 533 | str x2, [x0] |
| 534 | ret |
| 535 | ENDFUNC |
| 536 | |
| 537 | FUNC(gcm_mulk_96b_arm64_pmull) |
| 538 | // On entry, x0 points to a 96-bit field element A in big-endian |
| 539 | // words format; x1 points to a field-element K in table format. On |
| 540 | // exit, A is updated with the product A K. |
| 541 | |
| 542 | ldr w2, [x0, #8] |
| 543 | ldr d0, [x0, #0] |
| 544 | mov v0.d[1], x2 |
| 545 | ldp q1, q2, [x1] |
| 546 | rev32 v0.16b, v0.16b |
| 547 | vzero |
| 548 | rbit v0.16b, v0.16b |
| 549 | mul96 |
| 550 | rbit v0.16b, v0.16b |
| 551 | rev32 v0.16b, v0.16b |
| 552 | mov w2, v0.s[2] |
| 553 | str d0, [x0, #0] |
| 554 | str w2, [x0, #8] |
| 555 | ret |
| 556 | ENDFUNC |
| 557 | |
| 558 | FUNC(gcm_mulk_96l_arm64_pmull) |
| 559 | // On entry, x0 points to a 96-bit field element A in little-endian |
| 560 | // words format; x1 points to a field-element K in table format. On |
| 561 | // exit, A is updated with the product A K. |
| 562 | |
| 563 | ldr d0, [x0, #0] |
| 564 | ldr w2, [x0, #8] |
| 565 | mov v0.d[1], x2 |
| 566 | ldp q1, q2, [x1] |
| 567 | rbit v0.16b, v0.16b |
| 568 | vzero |
| 569 | mul96 |
| 570 | rbit v0.16b, v0.16b |
| 571 | mov w2, v0.s[2] |
| 572 | str d0, [x0, #0] |
| 573 | str w2, [x0, #8] |
| 574 | ret |
| 575 | ENDFUNC |
| 576 | |
| 577 | FUNC(gcm_mulk_192b_arm64_pmull) |
| 578 | // On entry, x0 points to a 192-bit field element A in big-endian |
| 579 | // words format; x1 points to a field-element K in table format. On |
| 580 | // exit, A is updated with the product A K. |
| 581 | |
| 582 | ldr q0, [x0, #0] |
| 583 | ldr d1, [x0, #16] |
| 584 | ldp q2, q3, [x1, #0] |
| 585 | ldr q4, [x1, #32] |
| 586 | rev32 v0.16b, v0.16b |
| 587 | rev32 v1.8b, v1.8b |
| 588 | rbit v0.16b, v0.16b |
| 589 | rbit v1.8b, v1.8b |
| 590 | vzero |
| 591 | mul192 |
| 592 | rev32 v0.16b, v0.16b |
| 593 | rev32 v1.8b, v1.8b |
| 594 | rbit v0.16b, v0.16b |
| 595 | rbit v1.8b, v1.8b |
| 596 | str q0, [x0, #0] |
| 597 | str d1, [x0, #16] |
| 598 | ret |
| 599 | ENDFUNC |
| 600 | |
| 601 | FUNC(gcm_mulk_192l_arm64_pmull) |
| 602 | // On entry, x0 points to a 192-bit field element A in little-endian |
| 603 | // words format; x1 points to a field-element K in table format. On |
| 604 | // exit, A is updated with the product A K. |
| 605 | |
| 606 | ldr q0, [x0, #0] |
| 607 | ldr d1, [x0, #16] |
| 608 | ldp q2, q3, [x1, #0] |
| 609 | ldr q4, [x1, #32] |
| 610 | rbit v0.16b, v0.16b |
| 611 | rbit v1.8b, v1.8b |
| 612 | vzero |
| 613 | mul192 |
| 614 | rbit v0.16b, v0.16b |
| 615 | rbit v1.8b, v1.8b |
| 616 | str q0, [x0, #0] |
| 617 | str d1, [x0, #16] |
| 618 | ret |
| 619 | ENDFUNC |
| 620 | |
| 621 | FUNC(gcm_mulk_256b_arm64_pmull) |
| 622 | // On entry, x0 points to a 256-bit field element A in big-endian |
| 623 | // words format; x1 points to a field-element K in table format. On |
| 624 | // exit, A is updated with the product A K. |
| 625 | |
| 626 | ldp q0, q1, [x0] |
| 627 | ldp q2, q3, [x1, #0] |
| 628 | ldp q4, q5, [x1, #32] |
| 629 | rev32 v0.16b, v0.16b |
| 630 | rev32 v1.16b, v1.16b |
| 631 | rbit v0.16b, v0.16b |
| 632 | rbit v1.16b, v1.16b |
| 633 | vzero |
| 634 | mul256 |
| 635 | rev32 v0.16b, v0.16b |
| 636 | rev32 v1.16b, v1.16b |
| 637 | rbit v0.16b, v0.16b |
| 638 | rbit v1.16b, v1.16b |
| 639 | stp q0, q1, [x0] |
| 640 | ret |
| 641 | ENDFUNC |
| 642 | |
| 643 | FUNC(gcm_mulk_256l_arm64_pmull) |
| 644 | // On entry, x0 points to a 256-bit field element A in little-endian |
| 645 | // words format; x1 points to a field-element K in table format. On |
| 646 | // exit, A is updated with the product A K. |
| 647 | |
| 648 | ldp q0, q1, [x0] |
| 649 | ldp q2, q3, [x1, #0] |
| 650 | ldp q4, q5, [x1, #32] |
| 651 | rbit v0.16b, v0.16b |
| 652 | rbit v1.16b, v1.16b |
| 653 | vzero |
| 654 | mul256 |
| 655 | rbit v0.16b, v0.16b |
| 656 | rbit v1.16b, v1.16b |
| 657 | stp q0, q1, [x0] |
| 658 | ret |
| 659 | ENDFUNC |
| 660 | |
| 661 | ///----- That's all, folks -------------------------------------------------- |