| 1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
| 2 | /// |
| 3 | /// GCM acceleration for ARM processors |
| 4 | /// |
| 5 | /// (c) 2019 Straylight/Edgeware |
| 6 | /// |
| 7 | |
| 8 | ///----- Licensing notice --------------------------------------------------- |
| 9 | /// |
| 10 | /// This file is part of Catacomb. |
| 11 | /// |
| 12 | /// Catacomb is free software: you can redistribute it and/or modify it |
| 13 | /// under the terms of the GNU Library General Public License as published |
| 14 | /// by the Free Software Foundation; either version 2 of the License, or |
| 15 | /// (at your option) any later version. |
| 16 | /// |
| 17 | /// Catacomb is distributed in the hope that it will be useful, but |
| 18 | /// WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 20 | /// Library General Public License for more details. |
| 21 | /// |
| 22 | /// You should have received a copy of the GNU Library General Public |
| 23 | /// License along with Catacomb. If not, write to the Free Software |
| 24 | /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | /// USA. |
| 26 | |
| 27 | ///-------------------------------------------------------------------------- |
| 28 | /// Preliminaries. |
| 29 | |
| 30 | #include "config.h" |
| 31 | #include "asm-common.h" |
| 32 | |
| 33 | .arch armv8-a |
| 34 | .fpu crypto-neon-fp-armv8 |
| 35 | |
| 36 | .text |
| 37 | |
| 38 | ///-------------------------------------------------------------------------- |
| 39 | /// Multiplication macros. |
| 40 | |
| 41 | // The good news is that we have a fancy instruction to do the |
| 42 | // multiplications. The bad news is that it's not particularly well- |
| 43 | // suited to the job. |
| 44 | // |
| 45 | // For one thing, it only does a 64-bit multiplication, so in general |
| 46 | // we'll need to synthesize the full-width multiply by hand. For |
| 47 | // another thing, it doesn't help with the reduction, so we have to |
| 48 | // do that by hand too. And, finally, GCM has crazy bit ordering, |
| 49 | // and the instruction does nothing useful for that at all. |
| 50 | // |
| 51 | // Focusing on that last problem first: the bits aren't in monotonic |
| 52 | // significance order unless we permute them. If we reverse the byte |
| 53 | // order, then we'll have the bits in monotonic order, but backwards, |
| 54 | // so the degree-0 coefficient will be in the most-significant bit. |
| 55 | // |
| 56 | // This is less of a difficulty than it seems at first, because |
| 57 | // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v = |
| 58 | // SUM_{0<=j<n} v_j t^j; then |
| 59 | // |
| 60 | // u v = SUM_{0<=i,j<n} u_i v_j t^{i+j} |
| 61 | // |
| 62 | // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i |
| 63 | // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards. |
| 64 | // Then |
| 65 | // |
| 66 | // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j} |
| 67 | // = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)} |
| 68 | // |
| 69 | // which is almost the bit-reversal of u v, only it's shifted right |
| 70 | // by one place. Oh, well: we'll have to shift it back later. |
| 71 | // |
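// As a quick sanity check of that identity, here's the whole thing in
// Python (purely illustrative commentary, not part of the build; all
// of the names are made up for the example):
//
//	def clmul(u, v):		# carry-free binary multiply
//	    z = 0
//	    while v:
//	        if v & 1: z ^= u
//	        u <<= 1; v >>= 1
//	    return z
//
//	def rev(x, w):			# reverse x as a w-bit string
//	    return int(format(x, '0%db' % w)[::-1], 2)
//
//	n = 8; u, v = 0x57, 0xd9	# arbitrary example inputs
//	assert clmul(rev(u, n), rev(v, n)) == rev(clmul(u, v), 2*n) >> 1
//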
| 72 | // That was important to think about, but there's not a great deal to |
| 73 | // do about it yet other than to convert what we've got from the |
| 74 | // blockcipher's byte-ordering convention to our big-endian |
| 75 | // convention. Since this depends on the blockcipher convention, |
| 76 | // we'll leave the caller to cope with this: the macros here will |
| 77 | // assume that the operands are in `register' format, which is the |
| 78 | // same as the external representation, except that the bytes within |
| 79 | // each 64-bit piece are reversed. In the commentary, pieces of |
| 80 | // polynomial are numbered according to the degree of the |
| 81 | // coefficients, so the unit coefficient of some polynomial a is in |
| 82 | // a_0. |
| 83 | // |
| 84 | // The commentary for `mul128' is the most detailed. The other |
| 85 | // macros assume that you've already read and understood that. |
| 86 | |
| 87 | .macro mul128 |
| 88 | // Enter with u and v in q0 and q1 respectively; leave with z = u v |
| 89 | // in q0. Clobbers q1--q3, q8, q9. |
| 90 | |
| 91 | // First for the double-precision multiplication. It's tempting to |
| 92 | // use Karatsuba's identity here, but I suspect that loses more in |
| 93 | // the shifting, bit-twiddling, and dependency chains that it gains |
| 94 | // in saving a multiplication which otherwise pipelines well. |
| 95 | // q0 = // (u_0; u_1) |
| 96 | // q1 = // (v_0; v_1) |
| 97 | vmull.p64 q2, d1, d2 // u_1 v_0 |
| 98 | vmull.p64 q3, d0, d3 // u_0 v_1 |
| 99 | vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1 |
| 100 | vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0 |
| 101 | |
| 102 | // Arrange the pieces to form a double-precision polynomial. |
| 103 | veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0 |
| 104 | veor d17, d17, d4 // x_2 = t_1 + m_1 |
| 105 | veor d18, d18, d5 // x_1 = t_0 + m_0 |
| 106 | // q8 = // (x_3; x_2) |
| 107 | // q9 = // (x_1; x_0) |
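	//
	// (Bit-reversal aside, what we've just done is the plain
	// schoolbook splitting into four multiplications; as a
	// forward-order Python sketch, with `clmul' from the commentary
	// above -- illustrative only:
	//
	//	M64 = (1 << 64) - 1
	//	def mul128_schoolbook(u, v):
	//	    u0, u1 = u & M64, u >> 64
	//	    v0, v1 = v & M64, v >> 64
	//	    mid = clmul(u0, v1) ^ clmul(u1, v0)
	//	    return (clmul(u0, v0) ^ (mid << 64)
	//	            ^ (clmul(u1, v1) << 128))
	//
	// -- though here the pieces are still bit-reversed within their
	// 64-bit lanes.)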
| 108 | |
| 109 | // Two-and-a-half problems remain. The first is that this product is |
| 110 | // shifted left by one place, which is annoying. Let's take care of |
| 111 | // that now. Once this is done, we'll be properly in GCM's backwards |
| 112 | // bit-ordering. |
| 113 | // |
| 114 | // The half a problem is that the result wants to have its 64-bit |
| 115 | // halves switched. Here turns out to be the best place to arrange |
| 116 | // for that. |
| 117 | // |
| 118 | // q9 q8 |
| 119 | // ,-------------.-------------. ,-------------.-------------. |
| 120 | // | 0 x_0-x_62 | x_63-x_126 | | x_127-x_190 | x_191-x_254 | |
| 121 | // `-------------^-------------' `-------------^-------------' |
| 122 | // d19 d18 d17 d16 |
| 123 | // |
	// We start by shifting each 64-bit lane right (from GCM's point of
| 125 | // view -- physically, left) by one place, which gives us this: |
| 126 | // |
| 127 | // low (q9) high (q8) |
| 128 | // ,-------------.-------------. ,-------------.-------------. |
| 129 | // | x_0-x_62 0 |x_64-x_126 0 | |x_128-x_190 0|x_192-x_254 0| |
| 130 | // `-------------^-------------' `-------------^-------------' |
| 131 | // d19 d18 d17 d16 |
| 132 | // |
| 133 | // but we've lost a bunch of bits. We separately shift each lane |
	// left by 63 places to give us the bits we lost.
| 135 | // |
| 136 | // low (q3) high (q2) |
| 137 | // ,-------------.-------------. ,-------------.-------------. |
| 138 | // | 0...0 | 0...0 x_63 | | 0...0 x_127 | 0...0 x_191 | |
| 139 | // `-------------^-------------' `-------------^-------------' |
	//	    d7             d6              d5             d4
| 141 | // |
| 142 | // Since we can address each of these pieces individually, putting |
| 143 | // them together is relatively straightforward. |
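	//
	// (Per lane, this is the familiar scalar carry idiom:
	// physically, y = ((x << 1) & M64) | (x' >> 63), where x' is the
	// neighbouring lane whose top bit would otherwise be lost.)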
| 144 | |
| 145 | |
| 146 | vshr.u64 d6, d18, #63 // shifted left; just the carries |
| 147 | vshl.u64 q9, q9, #1 // shifted right, but dropped carries |
| 148 | vshr.u64 q2, q8, #63 |
| 149 | vshl.u64 q8, q8, #1 |
| 150 | vorr d0, d19, d6 // y_0 |
| 151 | vorr d1, d18, d5 // y_1 |
| 152 | vorr d2, d17, d4 // y_2 |
| 153 | vmov d3, d16 // y_3 |
| 154 | |
| 155 | // And the other one is that the result needs to be reduced modulo |
| 156 | // p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + t^2 + t + |
| 157 | // 1 in our field. So far, we've calculated z_0 and z_1 such that |
| 158 | // z_0 + z_1 R = u v using the identity R = t^128: now we must |
| 159 | // collapse the two halves of y together using the other identity R = |
| 160 | // t^7 + t^2 + t + 1. |
| 161 | // |
| 162 | // We do this by working on y_2 and y_3 separately, so consider y_i |
	// for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
| 164 | // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that |
| 165 | // directly without breaking up the 64-bit word structure. Instead, |
| 166 | // we start by considering just y_i t^7 t^{64(i-2)}, which again |
| 167 | // looks tricky. Now, split y_i = a_i + t^57 b_i, with deg a_i < 57; |
| 168 | // then |
| 169 | // |
| 170 | // y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)} |
| 171 | // |
| 172 | // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit |
| 173 | // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the |
| 174 | // splits are different. This is lovely, with one small snag: when |
| 175 | // we do this to y_3, we end up with a contribution back into the |
| 176 | // t^128 coefficient word. But notice that only the low seven bits |
| 177 | // of this word are affected, so there's no knock-on contribution |
| 178 | // into the t^64 word. Therefore, if we handle the high bits of each |
| 179 | // word together, and then the low bits, everything will be fine. |
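	//
	// In ordinary forward bit order, the whole reduction is a simple
	// two-pass fold; as a Python sketch (illustrative only, and
	// generic in the polynomial, since the other sizes below reduce
	// the same way):
	//
	//	def fold(z, n, ks):
	//	    # z mod p(t), where p(t) = t^n + SUM(t^k for k in ks) + 1
	//	    for _ in range(2):	# twice, for the knock-on bits
	//	        hi, z = z >> n, z & ((1 << n) - 1)
	//	        z ^= hi
	//	        for k in ks: z ^= hi << k
	//	    return z
	//
	// The job at hand is fold(z, 128, (1, 2, 7)), only bit-reversed
	// and cut into 64-bit pieces.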
| 180 | |
| 181 | // First, shift the high bits down. |
| 182 | vshl.u64 q2, q1, #63 // the b_i for t |
| 183 | vshl.u64 q3, q1, #62 // the b_i for t^2 |
| 184 | vshl.u64 q8, q1, #57 // the b_i for t^7 |
| 185 | veor q2, q2, q3 // add them all together |
| 186 | veor q2, q2, q8 |
	veor d2, d2, d5 // contribution into high half
	veor d1, d1, d4 // and low half
| 189 | |
| 190 | // And then shift the low bits up. |
| 191 | vshr.u64 q2, q1, #1 |
| 192 | vshr.u64 q3, q1, #2 |
| 193 | vshr.u64 q8, q1, #7 |
| 194 | veor q0, q0, q1 // mix in the unit contribution |
| 195 | veor q2, q2, q3 // t and t^2 contribs |
| 196 | veor q0, q0, q8 // low, unit, and t^7 contribs |
| 197 | veor q0, q0, q2 // mix them together and we're done |
| 198 | .endm |
| 199 | |
| 200 | .macro mul64 |
| 201 | // Enter with u and v in the low halves of d0 and d1 respectively; |
| 202 | // leave with z = u v in d0. Clobbers d1--d5. |
| 203 | |
| 204 | // The multiplication is thankfully easy. |
| 205 | vmull.p64 q0, d0, d1 // u v |
| 206 | |
| 207 | // Shift the product up by one place, and swap the two halves. After |
| 208 | // this, we're in GCM bizarro-world. |
| 209 | vshr.u64 d2, d0, #63 // shifted left; just the carries |
	vshl.u64 d3, d1, #1 // low half shifted right
| 211 | vshl.u64 d1, d0, #1 // high half shifted right |
| 212 | vorr d0, d3, d2 // mix in the carries |
| 213 | |
| 214 | // Now we must reduce. This is essentially the same as the 128-bit |
| 215 | // case above, but mostly simpler because everything is smaller. The |
| 216 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. |
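	//
	// (In the forward-order sketch above, this is fold(z, 64,
	// (1, 3, 4)); the shift counts below are just 64 - 1, 64 - 3,
	// and 64 - 4.)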
| 217 | |
| 218 | // First, shift the high bits down. |
| 219 | vshl.u64 d2, d1, #63 // b_i for t |
| 220 | vshl.u64 d3, d1, #61 // b_i for t^3 |
| 221 | vshl.u64 d4, d1, #60 // b_i for t^4 |
| 222 | veor d2, d2, d3 // add them all together |
| 223 | veor d2, d2, d4 |
| 224 | veor d1, d1, d2 // contribution back into high half |
| 225 | |
| 226 | // And then shift the low bits up. |
| 227 | vshr.u64 d2, d1, #1 |
| 228 | vshr.u64 d3, d1, #3 |
| 229 | vshr.u64 d4, d1, #4 |
| 230 | veor d0, d0, d1 // mix in the unit contribution |
| 231 | veor d2, d2, d3 // t and t^3 contribs |
| 232 | veor d0, d0, d4 // low, unit, and t^4 |
| 233 | veor d0, d0, d2 // mix them together and we're done |
| 234 | .endm |
| 235 | |
| 236 | .macro mul96 |
| 237 | // Enter with u and v in the most-significant three words of q0 and |
| 238 | // q1 respectively, and zero in the low words, and zero in q15; leave |
| 239 | // with z = u v in the high three words of q0, and /junk/ in the low |
	// word.  Clobbers q1--q3, q8, q9.
| 241 | |
| 242 | // This is an inconvenient size. There's nothing for it but to do |
| 243 | // four multiplications, as if for the 128-bit case. It's possible |
	// that there's cruft in the top 32 bits of the input registers,
	// which is why both operands must arrive shifted up by four bytes,
	// with zero in the low words.  This will
| 246 | // mean that the high 64 bits of the result (from GCM's viewpoint) |
| 247 | // will be zero. |
| 248 | // q0 = // (u_0 + u_1 t^32; u_2) |
| 249 | // q1 = // (v_0 + v_1 t^32; v_2) |
| 250 | vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0 |
| 251 | vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1 |
| 252 | vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d) |
| 253 | vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 |
| 254 | // + u_1 v_1 t^64 = f |
| 255 | |
| 256 | // Extract the high and low halves of the 192-bit result. The answer |
| 257 | // we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96 |
| 258 | // bits of the answer will end up in q0, and the high 96 bits will |
| 259 | // end up in q1; we'll need both of these to have zero in their |
| 260 | // bottom 32 bits. |
| 261 | // |
| 262 | // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged |
| 263 | // in the low 96 bits of a SIMD register, with junk in the top 32 |
| 264 | // bits; and top(x) is the high 96 bits, also arranged in the low 96 |
| 265 | // bits of a register, with /zero/ in the top 32 bits. |
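	//
	// (As a forward-order Python sketch, with `clmul' and M64 as
	// above -- illustrative only:
	//
	//	def mul96_pieces(u, v):		# u, v < 2^96
	//	    u0, u2 = u & M64, u >> 64	# u0 = u_0 + u_1 t^32
	//	    v0, v2 = v & M64, v >> 64
	//	    d = clmul(u2, v2)
	//	    e = clmul(u2, v0) ^ clmul(u0, v2)
	//	    f = clmul(u0, v0)
	//	    return f ^ (e << 64) ^ (d << 128)
	//
	// though, again, the real pieces are bit-reversed and arranged
	// differently.)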
| 266 | veor q8, q8, q9 // e_0 + e_1 = e |
| 267 | vshr128 q1, q3, 32 // top(d t^128) |
| 268 | vext.8 d19, d16, d17, #4 // top(e t^64) |
| 269 | vshl.u64 d16, d0, #32 // top(f), sort of |
| 270 | veor d3, d3, d19 // q1 = top(d t^128 + e t^64) |
| 271 | veor d0, d0, d17 // q0 = bot([d t^128] + e t^64 + f) |
| 272 | veor d3, d3, d16 // q1 = top(d t^128 + e t^64 + f) |
| 273 | |
| 274 | // Shift the product right by one place (from GCM's point of view), |
| 275 | // but, unusually, don't swap the halves, because we need to work on |
| 276 | // the 32-bit pieces later. After this, we're in GCM bizarro-world. |
| 277 | // q0 = // (?, x_2; x_1, x_0) |
| 278 | // q1 = // (0, x_5; x_4, x_3) |
| 279 | vshr.u64 d4, d0, #63 // carry from d0 to d1 |
| 280 | vshr.u64 d5, d2, #63 // carry from d2 to d3 |
| 281 | vshr.u32 d6, d3, #31 // carry from d3 to d0 |
| 282 | vshl.u64 q0, q0, #1 // shift low half |
| 283 | vshl.u64 q1, q1, #1 // shift high half |
| 284 | vorr d1, d1, d4 |
| 285 | vorr d0, d0, d6 |
| 286 | vorr d3, d3, d5 |
| 287 | |
| 288 | // Finally, the reduction. This is essentially the same as the |
| 289 | // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 + |
| 290 | // t^9 + t^6 + 1. The degrees are larger but not enough to cause |
| 291 | // trouble for the general approach. |
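	//
	// (Forward-order: this is fold(z, 96, (6, 9, 10)) in the sketch
	// above.  The shifts here act on 32-bit lanes, so the counts are
	// 32 - 6, 32 - 9, and 32 - 10.)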
| 292 | |
| 293 | // First, shift the high bits down. |
| 294 | vshl.u32 q2, q1, #26 // b_i for t^6 |
| 295 | vshl.u32 q3, q1, #23 // b_i for t^9 |
| 296 | vshl.u32 q8, q1, #22 // b_i for t^10 |
| 297 | veor q2, q2, q3 // add them all together |
| 298 | veor q2, q2, q8 |
| 299 | vshl128 q3, q2, 64 // contribution into high half |
| 300 | vshr128 q2, q2, 32 // and low half |
| 301 | veor q1, q1, q3 // mix them in |
| 302 | veor q0, q0, q2 |
| 303 | |
| 304 | // And then shift the low bits up. |
| 305 | vshr.u32 q2, q1, #6 |
| 306 | vshr.u32 q3, q1, #9 |
| 307 | veor q0, q0, q1 // mix in the unit contribution |
| 308 | vshr.u32 q8, q1, #10 |
| 309 | veor q2, q2, q3 // mix together t^6 and t^9 |
| 310 | veor q0, q0, q8 // mix in t^10 |
| 311 | veor q0, q0, q2 // and the rest |
| 312 | |
| 313 | // And finally swap the two halves. |
| 314 | vswp d0, d1 |
| 315 | .endm |
| 316 | |
| 317 | .macro mul192 |
| 318 | // Enter with u and v in d0--d2 and d3--d5 respectively; leave |
| 319 | // with z = u v in d0--d2. Clobbers q8--q15. |
| 320 | |
| 321 | // Start multiplying and accumulating pieces of product. |
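	//
	// (This is the plain nine-multiplication schoolbook product; as
	// a forward-order Python sketch, with `clmul' and M64 as above
	// -- illustrative only:
	//
	//	def mul192_schoolbook(u, v):
	//	    U = [(u >> 64*i) & M64 for i in range(3)]
	//	    V = [(v >> 64*j) & M64 for j in range(3)]
	//	    z = 0
	//	    for i in range(3):
	//	        for j in range(3):
	//	            z ^= clmul(U[i], V[j]) << 64*(i + j)
	//	    return z
	//
	// -- the veor instructions below accumulate exactly these nine
	// partial products.)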
| 322 | // (d0; d1; d2) = // (u_0; u_1; u_2) |
| 323 | // (d3; d4; d5) = // (v_0; v_1; v_2) |
| 324 | vmull.p64 q10, d0, d3 // e = u_0 v_0 |
| 325 | |
| 326 | vmull.p64 q12, d0, d4 // u_0 v_1 |
| 327 | vmull.p64 q13, d1, d3 // u_1 v_0 |
| 328 | |
| 329 | vmull.p64 q9, d0, d5 // u_0 v_2 |
| 330 | vmull.p64 q14, d1, d4 // u_1 v_1 |
| 331 | vmull.p64 q15, d2, d3 // u_2 v_0 |
| 332 | veor q12, q12, q13 // d = u_0 v_1 + u_1 v_0 |
| 333 | |
| 334 | vmull.p64 q11, d1, d5 // u_1 v_2 |
| 335 | vmull.p64 q13, d2, d4 // u_2 v_1 |
| 336 | veor q9, q9, q14 // u_0 v_2 + u_1 v_1 |
| 337 | |
| 338 | vmull.p64 q8, d2, d5 // a = u_2 v_2 |
| 339 | veor q9, q9, q15 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 |
| 340 | veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1 |
| 341 | |
| 342 | // Piece the product together. |
| 343 | veor d17, d17, d22 // q8 = // (x_5; x_4) |
| 344 | veor d18, d18, d23 |
| 345 | veor d19, d19, d24 // q9 = // (x_3; x_2) |
| 346 | veor d20, d20, d25 // q10 = // (x_1; x_0) |
| 347 | |
| 348 | // Shift the product right by one place (from GCM's point of view). |
| 349 | vshr.u64 q11, q8, #63 // carry from d16/d17 to d17/d18 |
| 350 | vshr.u64 q12, q9, #63 // carry from d18/d19 to d19/d20 |
| 351 | vshr.u64 d26, d20, #63 // carry from d20 to d21 |
| 352 | vshl.u64 q8, q8, #1 // shift everything down |
| 353 | vshl.u64 q9, q9, #1 |
| 354 | vshl.u64 q10, q10, #1 |
| 355 | vorr d17, d17, d22 // and mix in the carries |
| 356 | vorr d18, d18, d23 |
| 357 | vorr d19, d19, d24 |
| 358 | vorr d20, d20, d25 |
| 359 | vorr d21, d21, d26 |
| 360 | |
	// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
| 362 | // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the |
| 363 | // 128-bit case. I don't know why. |
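	//
	// (Forward-order: this is fold(z, 192, (1, 2, 7)) in the sketch
	// above.)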
| 364 | |
| 365 | // First, shift the high bits down. |
| 366 | // q8 = // (y_5; y_4) |
| 367 | // q9 = // (y_3; y_2) |
| 368 | // q10 = // (y_1; y_0) |
| 369 | vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t |
| 370 | vshl.u64 d28, d18, #63 // y_3 b_i for t |
| 371 | vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2 |
| 372 | vshl.u64 d29, d18, #62 // y_3 b_i for t^2 |
| 373 | vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7 |
| 374 | vshl.u64 d30, d18, #57 // y_3 b_i for t^7 |
| 375 | veor q11, q11, q12 // mix them all together |
| 376 | veor d28, d28, d29 |
| 377 | veor q11, q11, q13 |
| 378 | veor d28, d28, d30 |
| 379 | veor q9, q9, q11 |
| 380 | veor d20, d20, d28 |
| 381 | |
| 382 | // And finally shift the low bits up. Also, switch the order of the |
| 383 | // pieces for output. |
| 384 | // q8 = // (y'_5; y'_4) |
| 385 | // q9 = // (y'_3; y'_2) |
| 386 | // q10 = // (y'_1; y'_0) |
| 387 | vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t |
| 388 | vshr.u64 d28, d18, #1 // y'_3 a_i for t |
| 389 | vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2 |
| 390 | vshr.u64 d29, d18, #2 // y'_3 a_i for t^2 |
| 391 | vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7 |
| 392 | vshr.u64 d30, d18, #7 // y'_3 a_i for t^7 |
| 393 | veor q8, q8, q11 |
| 394 | veor d18, d18, d28 |
| 395 | veor q12, q12, q13 |
| 396 | veor d29, d29, d30 |
| 397 | veor q8, q8, q12 |
| 398 | veor d18, d18, d29 |
| 399 | veor d0, d21, d18 |
| 400 | veor d1, d20, d17 |
| 401 | veor d2, d19, d16 |
| 402 | .endm |
| 403 | |
| 404 | .macro mul256 |
| 405 | // Enter with u and v in q0/q1 and q2/q3 respectively; leave |
| 406 | // with z = u v in q0/q1. Clobbers q8--q15. |
| 407 | |
| 408 | // Now it's starting to look worthwhile to do Karatsuba. Suppose |
| 409 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then |
| 410 | // |
| 411 | // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 |
| 412 | // |
	// Name the coefficients of B^i a, b, and c, in that order, and
| 414 | // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that |
| 415 | // |
| 416 | // q = r s = (u_0 + u_1) (v_0 + v_1) |
	//	        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	//	        = a + c + b
| 419 | // |
| 420 | // The first two terms we've already calculated; the last is the |
| 421 | // remaining one we want. We'll set B = t^128. We know how to do |
| 422 | // 128-bit multiplications already, and Karatsuba is too annoying |
| 423 | // there, so there'll be 12 multiplications altogether, rather than |
| 424 | // the 16 we'd have if we did this the naïve way. |
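	//
	// (Forward-order Python sketch of the plan, with `clmul' as
	// above -- illustrative only:
	//
	//	M128 = (1 << 128) - 1
	//	def mul256_karatsuba(u, v):	# B = t^128
	//	    u0, u1 = u & M128, u >> 128
	//	    v0, v1 = v & M128, v >> 128
	//	    a = clmul(u0, v0)
	//	    c = clmul(u1, v1)
	//	    q = clmul(u0 ^ u1, v0 ^ v1)
	//	    return a ^ ((q ^ a ^ c) << 128) ^ (c << 256)
	//
	// with each of the three 128-bit multiplications standing for
	// four vmull.p64 instructions, hence the twelve.)
	//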
| 425 | // q0 = // u_0 = (u_00; u_01) |
| 426 | // q1 = // u_1 = (u_10; u_11) |
| 427 | // q2 = // v_0 = (v_00; v_01) |
| 428 | // q3 = // v_1 = (v_10; v_11) |
| 429 | |
| 430 | veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11) |
| 431 | veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11) |
| 432 | |
| 433 | // Start by building the cross product, q = u_* v_*. |
| 434 | vmull.p64 q14, d16, d19 // u_*0 v_*1 |
| 435 | vmull.p64 q15, d17, d18 // u_*1 v_*0 |
| 436 | vmull.p64 q12, d17, d19 // u_*1 v_*1 |
| 437 | vmull.p64 q13, d16, d18 // u_*0 v_*0 |
| 438 | veor q14, q14, q15 // u_*0 v_*1 + u_*1 v_*0 |
| 439 | veor d25, d25, d28 // q12 = // q_1 |
| 440 | veor d26, d26, d29 // q13 = // q_0 |
| 441 | |
| 442 | // Next, work on the low half, a = u_0 v_0. |
| 443 | vmull.p64 q14, d0, d5 // u_00 v_01 |
| 444 | vmull.p64 q15, d1, d4 // u_01 v_00 |
| 445 | vmull.p64 q10, d1, d5 // u_01 v_01 |
| 446 | vmull.p64 q11, d0, d4 // u_00 v_00 |
| 447 | veor q14, q14, q15 // u_00 v_01 + u_01 v_00 |
| 448 | veor d21, d21, d28 // q10 = // a_1 |
| 449 | veor d22, d22, d29 // q11 = // a_0 |
| 450 | |
| 451 | // Mix the pieces we have so far. |
| 452 | veor q12, q12, q10 |
| 453 | veor q13, q13, q11 |
| 454 | |
| 455 | // Finally, the high half, c = u_1 v_1. |
| 456 | vmull.p64 q14, d2, d7 // u_10 v_11 |
| 457 | vmull.p64 q15, d3, d6 // u_11 v_10 |
| 458 | vmull.p64 q8, d3, d7 // u_11 v_11 |
| 459 | vmull.p64 q9, d2, d6 // u_10 v_10 |
| 460 | veor q14, q14, q15 // u_10 v_11 + u_11 v_10 |
| 461 | veor d17, d17, d28 // q8 = // c_1 |
| 462 | veor d18, d18, d29 // q9 = // c_0 |
| 463 | |
| 464 | // Finish mixing the product together. |
| 465 | veor q12, q12, q8 // q12 = // b_1 |
| 466 | veor q13, q13, q9 // q13 = // b_0 |
| 467 | veor q9, q9, q12 |
| 468 | veor q10, q10, q13 |
| 469 | |
| 470 | // Shift the product right by one place (from GCM's point of view). |
| 471 | vshr.u64 q0, q8, #63 // carry from d16/d17 to d17/d18 |
| 472 | vshr.u64 q1, q9, #63 // carry from d18/d19 to d19/d20 |
| 473 | vshr.u64 q2, q10, #63 // carry from d20/d21 to d21/d22 |
| 474 | vshr.u64 d6, d22, #63 // carry from d22 to d23 |
	vshl.u64 q8, q8, #1 // shift everything down
| 476 | vshl.u64 q9, q9, #1 |
| 477 | vshl.u64 q10, q10, #1 |
| 478 | vshl.u64 q11, q11, #1 |
| 479 | vorr d17, d17, d0 |
| 480 | vorr d18, d18, d1 |
| 481 | vorr d19, d19, d2 |
| 482 | vorr d20, d20, d3 |
| 483 | vorr d21, d21, d4 |
| 484 | vorr d22, d22, d5 |
| 485 | vorr d23, d23, d6 |
| 486 | |
| 487 | // Now we must reduce. This is essentially the same as the 192-bit |
| 488 | // case above, but more complicated because everything is bigger. |
| 489 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. |
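	//
	// (Forward-order: this is fold(z, 256, (2, 5, 10)) in the sketch
	// above.)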
| 490 | |
| 491 | // First, shift the high bits down. |
| 492 | // q8 = // (y_7; y_6) |
| 493 | // q9 = // (y_5; y_4) |
| 494 | // q10 = // (y_3; y_2) |
| 495 | // q11 = // (y_1; y_0) |
| 496 | vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2 |
| 497 | vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2 |
| 498 | vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5 |
| 499 | vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5 |
| 500 | vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10 |
| 501 | vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10 |
| 502 | veor q0, q0, q1 // mix the contributions together |
| 503 | veor q12, q12, q13 |
| 504 | veor q0, q0, q2 |
| 505 | veor q12, q12, q14 |
| 506 | veor d19, d19, d0 // and combine into the lower pieces |
| 507 | veor d20, d20, d1 |
| 508 | veor d21, d21, d24 |
| 509 | veor d22, d22, d25 |
| 510 | |
| 511 | // And then shift the low bits up. Also, switch the order of the |
| 512 | // pieces for output. |
| 513 | // q8 = // (y'_7; y'_6) |
| 514 | // q9 = // (y'_5; y'_4) |
| 515 | // q10 = // (y'_3; y'_2) |
| 516 | // q11 = // (y'_1; y'_0) |
| 517 | vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2 |
| 518 | vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2 |
| 519 | vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5 |
| 520 | vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5 |
| 521 | vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10 |
| 522 | vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10 |
| 523 | |
| 524 | veor q8, q8, q0 // mix the contributions together |
| 525 | veor q1, q1, q2 |
| 526 | veor q9, q9, q12 |
| 527 | veor q13, q13, q14 |
| 528 | veor q8, q8, q1 |
| 529 | veor q9, q9, q13 |
| 530 | veor d3, d20, d16 // and output |
| 531 | veor d2, d21, d17 |
| 532 | veor d1, d22, d18 |
| 533 | veor d0, d23, d19 |
| 534 | .endm |
| 535 | |
| 536 | ///-------------------------------------------------------------------------- |
| 537 | /// Main code. |
| 538 | |
| 539 | // There are a number of representations of field elements in this code and |
| 540 | // it can be confusing. |
| 541 | // |
| 542 | // * The `external format' consists of a sequence of contiguous bytes in |
| 543 | // memory called a `block'. The GCM spec explains how to interpret this |
| 544 | // block as an element of a finite field. As discussed extensively, this |
| 545 | // representation is very annoying for a number of reasons. On the other |
| 546 | // hand, this code never actually deals with it directly. |
| 547 | // |
| 548 | // * The `register format' consists of one or more NEON registers, |
| 549 | // depending on the block size. The bytes in each 64-bit lane of these |
| 550 | // registers are in reverse order, compared to the external format. |
| 551 | // |
| 552 | // * The `words' format consists of a sequence of bytes, as in the |
| 553 | // `external format', but, according to the blockcipher in use, the bytes |
| 554 | // within each 32-bit word may be reversed (`big-endian') or not |
| 555 | // (`little-endian'). Accordingly, there are separate entry points for |
| 556 | // each variant, identified with `b' or `l'. |
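//
// As a concrete Python illustration of the external-to-register
// conversion (illustrative only):
//
//	def to_register(blk):		# len(blk) a multiple of 8
//	    return b''.join(blk[i:i + 8][::-1]
//	                    for i in range(0, len(blk), 8))
//
// i.e., reverse the bytes within each 64-bit piece.  The `b' words
// format additionally byte-swaps each 32-bit word relative to the
// external format; the `l' format keeps the external byte order as-is.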
| 557 | |
| 558 | FUNC(gcm_mulk_128b_arm_crypto) |
| 559 | // On entry, r0 points to a 128-bit field element A in big-endian |
| 560 | // words format; r1 points to a field-element K in register format. |
| 561 | // On exit, A is updated with the product A K. |
| 562 | |
| 563 | vld1.8 {q0}, [r0] |
| 564 | vld1.8 {q1}, [r1] |
| 565 | vrev64.32 q0, q0 |
| 566 | mul128 |
| 567 | vrev64.32 q0, q0 |
| 568 | vst1.8 {q0}, [r0] |
| 569 | bx r14 |
| 570 | ENDFUNC |
| 571 | |
| 572 | FUNC(gcm_mulk_128l_arm_crypto) |
| 573 | // On entry, r0 points to a 128-bit field element A in little-endian |
| 574 | // words format; r1 points to a field-element K in register format. |
| 575 | // On exit, A is updated with the product A K. |
| 576 | |
| 577 | vld1.8 {q0}, [r0] |
| 578 | vld1.8 {q1}, [r1] |
| 579 | vrev64.8 q0, q0 |
| 580 | mul128 |
| 581 | vrev64.8 q0, q0 |
| 582 | vst1.8 {q0}, [r0] |
| 583 | bx r14 |
| 584 | ENDFUNC |
| 585 | |
| 586 | FUNC(gcm_mulk_64b_arm_crypto) |
| 587 | // On entry, r0 points to a 64-bit field element A in big-endian |
| 588 | // words format; r1 points to a field-element K in register format. |
| 589 | // On exit, A is updated with the product A K. |
| 590 | |
| 591 | vld1.8 {d0}, [r0] |
| 592 | vld1.8 {d1}, [r1] |
| 593 | vrev64.32 d0, d0 |
| 594 | mul64 |
| 595 | vrev64.32 d0, d0 |
| 596 | vst1.8 {d0}, [r0] |
| 597 | bx r14 |
| 598 | ENDFUNC |
| 599 | |
| 600 | FUNC(gcm_mulk_64l_arm_crypto) |
| 601 | // On entry, r0 points to a 64-bit field element A in little-endian |
| 602 | // words format; r1 points to a field-element K in register format. |
| 603 | // On exit, A is updated with the product A K. |
| 604 | |
| 605 | vld1.8 {d0}, [r0] |
| 606 | vld1.8 {d1}, [r1] |
| 607 | vrev64.8 d0, d0 |
| 609 | mul64 |
| 610 | vrev64.8 d0, d0 |
| 611 | vst1.8 {d0}, [r0] |
| 612 | bx r14 |
| 613 | ENDFUNC |
| 614 | |
| 615 | FUNC(gcm_mulk_96b_arm_crypto) |
| 616 | // On entry, r0 points to a 96-bit field element A in big-endian |
| 617 | // words format; r1 points to a field-element K in register format. |
| 618 | // On exit, A is updated with the product A K. |
| 619 | |
| 620 | ldr r3, [r0, #8] |
| 621 | mov r12, #0 |
| 622 | vld1.8 {d0}, [r0] |
| 623 | vld1.8 {q1}, [r1] |
| 624 | vrev64.32 d0, d0 |
| 625 | vmov d1, r12, r3 |
| 626 | vzero |
| 627 | mul96 |
| 628 | vrev64.32 d0, d0 |
| 629 | vmov r3, d1[1] |
| 630 | vst1.8 {d0}, [r0] |
| 631 | str r3, [r0, #8] |
| 632 | bx r14 |
| 633 | ENDFUNC |
| 634 | |
| 635 | FUNC(gcm_mulk_96l_arm_crypto) |
	// On entry, r0 points to a 96-bit field element A in little-endian
| 637 | // words format; r1 points to a field-element K in register format. |
| 638 | // On exit, A is updated with the product A K. |
| 639 | |
| 640 | ldr r3, [r0, #8] |
| 641 | mov r12, #0 |
| 642 | vld1.8 {d0}, [r0] |
| 643 | vld1.8 {q1}, [r1] |
| 644 | vmov d1, r3, r12 |
	vrev64.8 q0, q0
	vzero
| 646 | mul96 |
| 647 | vrev64.8 q0, q0 |
| 648 | vmov r3, d1[0] |
| 649 | vst1.8 {d0}, [r0] |
| 650 | str r3, [r0, #8] |
| 651 | bx r14 |
| 652 | ENDFUNC |
| 653 | |
| 654 | FUNC(gcm_mulk_192b_arm_crypto) |
| 655 | // On entry, r0 points to a 192-bit field element A in big-endian |
| 656 | // words format; r1 points to a field-element K in register format. |
| 657 | // On exit, A is updated with the product A K. |
| 658 | |
| 659 | vld1.8 {d0-d2}, [r0] |
| 660 | vld1.8 {d3-d5}, [r1] |
| 661 | vrev64.32 q0, q0 |
| 662 | vrev64.32 d2, d2 |
| 663 | mul192 |
| 664 | vrev64.32 q0, q0 |
| 665 | vrev64.32 d2, d2 |
| 666 | vst1.8 {d0-d2}, [r0] |
| 667 | bx r14 |
| 668 | ENDFUNC |
| 669 | |
| 670 | FUNC(gcm_mulk_192l_arm_crypto) |
| 671 | // On entry, r0 points to a 192-bit field element A in little-endian |
| 672 | // words format; r1 points to a field-element K in register format. |
| 673 | // On exit, A is updated with the product A K. |
| 674 | |
| 675 | vld1.8 {d0-d2}, [r0] |
| 676 | vld1.8 {d3-d5}, [r1] |
| 677 | vrev64.8 q0, q0 |
| 678 | vrev64.8 d2, d2 |
| 679 | mul192 |
| 680 | vrev64.8 q0, q0 |
| 681 | vrev64.8 d2, d2 |
| 682 | vst1.8 {d0-d2}, [r0] |
| 683 | bx r14 |
| 684 | ENDFUNC |
| 685 | |
| 686 | FUNC(gcm_mulk_256b_arm_crypto) |
| 687 | // On entry, r0 points to a 256-bit field element A in big-endian |
| 688 | // words format; r1 points to a field-element K in register format. |
| 689 | // On exit, A is updated with the product A K. |
| 690 | |
| 691 | vld1.8 {q0, q1}, [r0] |
| 692 | vld1.8 {q2, q3}, [r1] |
| 693 | vrev64.32 q0, q0 |
| 694 | vrev64.32 q1, q1 |
| 695 | mul256 |
| 696 | vrev64.32 q0, q0 |
| 697 | vrev64.32 q1, q1 |
| 698 | vst1.8 {q0, q1}, [r0] |
| 699 | bx r14 |
| 700 | ENDFUNC |
| 701 | |
| 702 | FUNC(gcm_mulk_256l_arm_crypto) |
| 703 | // On entry, r0 points to a 256-bit field element A in little-endian |
| 704 | // words format; r1 points to a field-element K in register format. |
| 705 | // On exit, A is updated with the product A K. |
| 706 | |
| 707 | vld1.8 {q0, q1}, [r0] |
| 708 | vld1.8 {q2, q3}, [r1] |
| 709 | vrev64.8 q0, q0 |
| 710 | vrev64.8 q1, q1 |
| 711 | mul256 |
| 712 | vrev64.8 q0, q0 |
| 713 | vrev64.8 q1, q1 |
| 714 | vst1.8 {q0, q1}, [r0] |
| 715 | bx r14 |
| 716 | ENDFUNC |
| 717 | |
| 718 | ///----- That's all, folks -------------------------------------------------- |