mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// GCM acceleration for ARM processors
	4	///
	5	/// (c) 2019 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software: you can redistribute it and/or modify it
	13	/// under the terms of the GNU Library General Public License as published
	14	/// by the Free Software Foundation; either version 2 of the License, or
	15	/// (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful, but
	18	/// WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	/// Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb. If not, write to the Free Software
	24	/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	/// USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.arch armv8-a
	34	.fpu crypto-neon-fp-armv8
	35
	36	.text
	37
	38	///--------------------------------------------------------------------------
	39	/// Multiplication macros.
	40
	41	// The good news is that we have a fancy instruction to do the
	42	// multiplications. The bad news is that it's not particularly well-
	43	// suited to the job.
	44	//
	45	// For one thing, it only does a 64-bit multiplication, so in general
	46	// we'll need to synthesize the full-width multiply by hand. For
	47	// another thing, it doesn't help with the reduction, so we have to
	48	// do that by hand too. And, finally, GCM has crazy bit ordering,
	49	// and the instruction does nothing useful for that at all.
	50	//
	51	// Focusing on that last problem first: the bits aren't in monotonic
	52	// significance order unless we permute them. If we reverse the byte
	53	// order, then we'll have the bits in monotonic order, but backwards,
	54	// so the degree-0 coefficient will be in the most-significant bit.
	55	//
	56	// This is less of a difficulty than it seems at first, because
	57	// algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
	58	// SUM_{0<=j<n} v_j t^j; then
	59	//
	60	// u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
	61	//
	62	// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
	63	// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
	64	// Then
	65	//
	66	// ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
	67	// = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
	68	//
	69	// which is almost the bit-reversal of u v, only it's shifted right
	70	// by one place. Putting this another way, what we have is actually
	71	// the bit reversal of the product u v t. We could get the correct
	72	// answer (modulo p(t)) if we'd sneakily divided one of the operands
	73	// by t before we started. Conveniently, v is actually the secret
	74	// value k set up by the GCM `mktable' function, so we can arrange to
	75	// actually store k/t (mod p(t)) and then the product will come out
	76	// correct (modulo p(t)) and we won't have anything more to worry
	77	// about here.
	78	//
	79	// That was important to think about, but there's not a great deal to
	80	// do about it yet other than to convert what we've got from the
	81	// blockcipher's byte-ordering convention to our big-endian
	82	// convention. Since this depends on the blockcipher convention,
	83	// we'll leave the caller to cope with this: the macros here will
	84	// assume that the operands are in `register' format, which is the
	85	// same as the external representation, except that the bytes within
	86	// each 64-bit piece are reversed. In the commentary, pieces of
	87	// polynomial are numbered according to the degree of the
	88	// coefficients, so the unit coefficient of some polynomial a is in
	89	// a_0.
	90	//
	91	// The commentary for `mul128' is the most detailed. The other
	92	// macros assume that you've already read and understood that.
	93
	94	.macro mul128
	95	// Enter with u and v in q0 and q1 respectively; leave with z = u v
	96	// in q0. Clobbers q1--q3, q8, q9.
	97
	98	// First for the double-precision multiplication. It's tempting to
	99	// use Karatsuba's identity here, but I suspect that loses more in
	100	// the shifting, bit-twiddling, and dependency chains than it gains
	101	// in saving a multiplication which otherwise pipelines well.
	102	// q0 = // (u_0; u_1)
	103	// q1 = // (v_0; v_1)
	104	vmull.p64 q2, d1, d2 // u_1 v_0
	105	vmull.p64 q3, d0, d3 // u_0 v_1
	106	vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1
	107	vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0
	108
	109	// Arrange the pieces to form a double-precision polynomial.
	110	veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0
	111	veor d17, d17, d4 // x_2 = t_1 + m_1
	112	veor d18, d18, d5 // x_1 = t_0 + m_0
	113	// q8 = // (x_3; x_2)
	114	// q9 = // (x_1; x_0)
	115
	116	// One-and-a-half problems remain.
	117	//
	118	// The full-size problem is that the result needs to be reduced
	119	// modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
	120	// t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
	121	// such that z_0 + z_1 R = u v using the identity R = t^128: now we
	122	// must collapse the two halves z_0 and z_1 together using the other
	123	// identity R = t^7 + t^2 + t + 1.
	124	//
	125	// We do this by working on x_2 and x_3 separately, so consider x_i
	126	// for i = 2 or 3. Certainly, x_i t^{64i} = x_i R t^{64(i-2)} =
	127	// (t^7 + t^2 + t + 1) x_i t^{64(i-2)}, but we can't use that
	128	// directly without breaking up the 64-bit word structure. Instead,
	129	// we start by considering just x_i t^7 t^{64(i-2)}, which again
	130	// looks tricky. Now, split x_i = a_i + t^57 b_i, with deg a_i < 57;
	131	// then
	132	//
	133	// x_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
	134	//
	135	// We can similarly decompose x_i t^2 and x_i t into a pair of 64-bit
	136	// contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
	137	// splits are different. This is lovely, with one small snag: when
	138	// we do this to x_3, we end up with a contribution back into the
	139	// t^128 coefficient word. But notice that only the low seven bits
	140	// of this word are affected, so there's no knock-on contribution
	141	// into the t^64 word. Therefore, if we handle the high bits of each
	142	// word together, and then the low bits, everything will be fine.
	143
	144	// First, shift the high bits down.
	145	vshl.u64 q2, q8, #63 // the b_i for t
	146	vshl.u64 q3, q8, #62 // the b_i for t^2
	147	vshl.u64 q0, q8, #57 // the b_i for t^7
	148	veor q2, q2, q3 // add them all together
	149	veor q2, q2, q0
	150	veor d18, d18, d5 // contribution into low half
	151	veor d17, d17, d4 // and high half
	152
	153	// And then shift the low bits up.
	154	vshr.u64 q2, q8, #1 // the a_i for t
	155	vshr.u64 q3, q8, #2 // the a_i for t^2
	156	vshr.u64 q1, q8, #7 // the a_i for t^7
	157	veor q8, q8, q9 // mix in the unit contribution
	158	veor q2, q2, q3 // t and t^2 contribs
	159	veor q1, q1, q8 // low, unit, and t^7 contribs
	160	veor d1, d2, d4 // mix them together and swap halves
	161	veor d0, d3, d5
	162	.endm
	163
	164	.macro mul64
	165	// Enter with u and v in d0 and d1 (the two halves of q0) respectively;
	166	// leave with z = u v in d0. Clobbers d1--d4.
	167
	168	// The multiplication is thankfully easy.
	169	vmull.p64 q0, d0, d1 // u v
	170
	171	// Now we must reduce. This is essentially the same as the 128-bit
	172	// case above, but mostly simpler because everything is smaller. The
	173	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
	174
	175	// First, shift the high bits down.
	176	vshl.u64 d2, d0, #63 // b_i for t
	177	vshl.u64 d3, d0, #61 // b_i for t^3
	178	vshl.u64 d4, d0, #60 // b_i for t^4
	179	veor d2, d2, d3 // add them all together
	180	veor d2, d2, d4
	181	veor d0, d0, d2 // contribution back into high half
	182
	183	// And then shift the low bits up.
	184	vshr.u64 d2, d0, #1 // a_i for t
	185	vshr.u64 d3, d0, #3 // a_i for t^3
	186	vshr.u64 d4, d0, #4 // a_i for t^4
	187	veor d0, d0, d1 // mix in the unit contribution
	188	veor d2, d2, d3 // t and t^3 contribs
	189	veor d0, d0, d4 // low, unit, and t^4
	190	veor d0, d0, d2 // mix them together and we're done
	191	.endm
	192
	193	.macro mul96
	194	// Enter with u and v in the most-significant three words of q0 and
	195	// q1 respectively, and zero in the low words, and zero in q15; leave
	196	// with z = u v in the high three words of q0, and /junk/ in the low
	197	// word. Clobbers q1--q3, q8, q9.
	198
	199	// This is an inconvenient size. There's nothing for it but to do
	200	// four multiplications, as if for the 128-bit case.
	201	// q0 = // (u_0 + u_1 t^32; u_2)
	202	// q1 = // (v_0 + v_1 t^32; v_2)
	203	vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0
	204	vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1
	205	vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d)
	206	vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
	207	// + u_1 v_1 t^64 = f
	208
	209	// Extract the high and low halves of the 192-bit result. The answer
	210	// we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96
	211	// bits of the answer will end up in q0, and the high 96 bits will
	212	// end up in q1; we'll need both of these to have zero in their
	213	// bottom 32 bits.
	214	//
	215	// Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
	216	// in the low 96 bits of a SIMD register, with junk in the top 32
	217	// bits; and top(x) is the high 96 bits, also arranged in the low 96
	218	// bits of a register, with /zero/ in the top 32 bits.
	219	veor q8, q8, q9 // e_0 + e_1 = e
	220	vshr128 q1, q3, 32 // top(d t^128)
	221	vext.8 d19, d16, d17, #4 // top(e t^64)
	222	vshl.u64 d16, d0, #32 // top(f), sort of
	223	veor d3, d3, d19 // q1 = top(d t^128 + e t^64)
	224	veor d0, d0, d17 // q0 = bot([d t^128] + e t^64 + f)
	225	veor d3, d3, d16 // q1 = top(d t^128 + e t^64 + f)
	226
	227	// Finally, the reduction. This is essentially the same as the
	228	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	229	// t^9 + t^6 + 1. The degrees are larger but not enough to cause
	230	// trouble for the general approach.
	231
	232	// First, shift the high bits down.
	233	vshl.u32 q2, q1, #26 // b_i for t^6
	234	vshl.u32 q3, q1, #23 // b_i for t^9
	235	vshl.u32 q8, q1, #22 // b_i for t^10
	236	veor q2, q2, q3 // add them all together
	237	veor q2, q2, q8
	238	vshl128 q3, q2, 64 // contribution into high half
	239	vshr128 q2, q2, 32 // and low half
	240	veor q1, q1, q3 // mix them in
	241	veor q0, q0, q2
	242
	243	// And then shift the low bits up.
	244	vshr.u32 q2, q1, #6 // a_i for t^6
	245	vshr.u32 q3, q1, #9 // a_i for t^9
	246	veor q0, q0, q1 // mix in the unit contribution
	247	vshr.u32 q8, q1, #10 // a_i for t^10
	248	veor q2, q2, q3 // mix together t^6 and t^9
	249	veor q0, q0, q8 // mix in t^10
	250	veor q0, q0, q2 // and the rest
	251
	252	// And finally swap the two halves.
	253	vswp d0, d1
	254	.endm
	255
	256	.macro mul192
	257	// Enter with u and v in d0--d2 and d3--d5 respectively; leave
	258	// with z = u v in d0--d2. Clobbers q8--q15.
	259
	260	// Start multiplying and accumulating pieces of product.
	261	// (d0; d1; d2) = // (u_0; u_1; u_2)
	262	// (d3; d4; d5) = // (v_0; v_1; v_2)
	263	vmull.p64 q10, d0, d3 // e = u_0 v_0
	264
	265	vmull.p64 q12, d0, d4 // u_0 v_1
	266	vmull.p64 q13, d1, d3 // u_1 v_0
	267
	268	vmull.p64 q9, d0, d5 // u_0 v_2
	269	vmull.p64 q14, d1, d4 // u_1 v_1
	270	vmull.p64 q15, d2, d3 // u_2 v_0
	271	veor q12, q12, q13 // d = u_0 v_1 + u_1 v_0
	272
	273	vmull.p64 q11, d1, d5 // u_1 v_2
	274	vmull.p64 q13, d2, d4 // u_2 v_1
	275	veor q9, q9, q14 // u_0 v_2 + u_1 v_1
	276
	277	vmull.p64 q8, d2, d5 // a = u_2 v_2
	278	veor q9, q9, q15 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
	279	veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1
	280
	281	// Piece the product together.
	282	veor d17, d17, d22 // q8 = // (x_5; x_4)
	283	veor d18, d18, d23
	284	veor d19, d19, d24 // q9 = // (x_3; x_2)
	285	veor d20, d20, d25 // q10 = // (x_1; x_0)
	286
	287	// Next, the reduction. Our polynomial this time is p(t) = t^192 +
	288	// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
	289	// 128-bit case. I don't know why.
	290
	291	// First, shift the high bits down.
	292	// q8 = // (y_5; y_4)
	293	// q9 = // (y_3; y_2)
	294	// q10 = // (y_1; y_0)
	295	vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t
	296	vshl.u64 d28, d18, #63 // y_3 b_i for t
	297	vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2
	298	vshl.u64 d29, d18, #62 // y_3 b_i for t^2
	299	vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7
	300	vshl.u64 d30, d18, #57 // y_3 b_i for t^7
	301	veor q11, q11, q12 // mix them all together
	302	veor d28, d28, d29
	303	veor q11, q11, q13
	304	veor d28, d28, d30
	305	veor q9, q9, q11
	306	veor d20, d20, d28
	307
	308	// And finally shift the low bits up. Also, switch the order of the
	309	// pieces for output.
	310	// q8 = // (y'_5; y'_4)
	311	// q9 = // (y'_3; y'_2)
	312	// q10 = // (y'_1; y'_0)
	313	vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t
	314	vshr.u64 d28, d18, #1 // y'_3 a_i for t
	315	vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2
	316	vshr.u64 d29, d18, #2 // y'_3 a_i for t^2
	317	vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7
	318	vshr.u64 d30, d18, #7 // y'_3 a_i for t^7
	319	veor q8, q8, q11 // unit and t contribs
	320	veor d18, d18, d28
	321	veor q12, q12, q13 // t^2 and t^7 contribs
	322	veor d29, d29, d30
	323	veor q8, q8, q12 // all four contribs together
	324	veor d18, d18, d29
	325	veor d0, d21, d18 // z_0
	326	veor d1, d20, d17 // z_1
	327	veor d2, d19, d16 // z_2
	328	.endm
	329
	330	.macro mul256
	331	// Enter with u and v in q0/q1 and q2/q3 respectively; leave
	332	// with z = u v in q0/q1. Clobbers q2, q8--q15.
	333
	334	// Now it's starting to look worthwhile to do Karatsuba. Suppose
	335	// u = u_0 + u_1 B and v = v_0 + v_1 B. Then
	336	//
	337	// u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	338	//
	339	// Let these coefficients of B^i be a, b, and c, respectively, and
	340	// let r = u_0 + u_1 and s = v_0 + v_1. Then observe that
	341	//
	342	// q = r s = (u_0 + u_1) (v_0 + v_1)
	343	// = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	344	// = a + c + b
	345	//
	346	// The first two terms we've already calculated; the last is the
	347	// remaining one we want. We'll set B = t^128. We know how to do
	348	// 128-bit multiplications already, and Karatsuba is too annoying
	349	// there, so there'll be 12 multiplications altogether, rather than
	350	// the 16 we'd have if we did this the naïve way.
	351	// q0 = // u_0 = (u_00; u_01)
	352	// q1 = // u_1 = (u_10; u_11)
	353	// q2 = // v_0 = (v_00; v_01)
	354	// q3 = // v_1 = (v_10; v_11)
	355
	356	veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11)
	357	veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11)
	358
	359	// Start by building the cross product, q = u_* v_*.
	360	vmull.p64 q14, d16, d19 // u_*0 v_*1
	361	vmull.p64 q15, d17, d18 // u_*1 v_*0
	362	vmull.p64 q12, d17, d19 // u_*1 v_*1
	363	vmull.p64 q13, d16, d18 // u_*0 v_*0
	364	veor q14, q14, q15 // u_*0 v_*1 + u_*1 v_*0
	365	veor d25, d25, d28 // q12 = // q_1
	366	veor d26, d26, d29 // q13 = // q_0
	367
	368	// Next, work on the low half, a = u_0 v_0.
	369	vmull.p64 q14, d0, d5 // u_00 v_01
	370	vmull.p64 q15, d1, d4 // u_01 v_00
	371	vmull.p64 q10, d1, d5 // u_01 v_01
	372	vmull.p64 q11, d0, d4 // u_00 v_00
	373	veor q14, q14, q15 // u_00 v_01 + u_01 v_00
	374	veor d21, d21, d28 // q10 = // a_1
	375	veor d22, d22, d29 // q11 = // a_0
	376
	377	// Mix the pieces we have so far.
	378	veor q12, q12, q10 // q_1 + a_1
	379	veor q13, q13, q11 // q_0 + a_0
	380
	381	// Finally, the high half, c = u_1 v_1.
	382	vmull.p64 q14, d2, d7 // u_10 v_11
	383	vmull.p64 q15, d3, d6 // u_11 v_10
	384	vmull.p64 q8, d3, d7 // u_11 v_11
	385	vmull.p64 q9, d2, d6 // u_10 v_10
	386	veor q14, q14, q15 // u_10 v_11 + u_11 v_10
	387	veor d17, d17, d28 // q8 = // c_1
	388	veor d18, d18, d29 // q9 = // c_0
	389
	390	// Finish mixing the product together.
	391	veor q12, q12, q8 // q12 = // b_1
	392	veor q13, q13, q9 // q13 = // b_0
	393	veor q9, q9, q12 // c_0 + b_1
	394	veor q10, q10, q13 // a_1 + b_0
	395
	396	// Now we must reduce. This is essentially the same as the 192-bit
	397	// case above, but more complicated because everything is bigger.
	398	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
	399
	400	// First, shift the high bits down.
	401	// q8 = // (y_7; y_6)
	402	// q9 = // (y_5; y_4)
	403	// q10 = // (y_3; y_2)
	404	// q11 = // (y_1; y_0)
	405	vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2
	406	vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2
	407	vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5
	408	vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5
	409	vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10
	410	vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10
	411	veor q0, q0, q1 // mix the contributions together
	412	veor q12, q12, q13
	413	veor q0, q0, q2
	414	veor q12, q12, q14
	415	veor d19, d19, d0 // and combine into the lower pieces
	416	veor d20, d20, d1
	417	veor d21, d21, d24
	418	veor d22, d22, d25
	419
	420	// And then shift the low bits up. Also, switch the order of the
	421	// pieces for output.
	422	// q8 = // (y'_7; y'_6)
	423	// q9 = // (y'_5; y'_4)
	424	// q10 = // (y'_3; y'_2)
	425	// q11 = // (y'_1; y'_0)
	426	vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2
	427	vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2
	428	vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5
	429	vshr.u64 q13, q9, #5 // (y_5; y'_4) a_i for t^5
	430	vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10
	431	vshr.u64 q14, q9, #10 // (y_5; y'_4) a_i for t^10
	432
	433	veor q8, q8, q0 // mix the contributions together
	434	veor q1, q1, q2
	435	veor q9, q9, q12
	436	veor q13, q13, q14
	437	veor q8, q8, q1
	438	veor q9, q9, q13
	439	veor d3, d20, d16 // and output
	440	veor d2, d21, d17 // z_2
	441	veor d1, d22, d18 // z_1
	442	veor d0, d23, d19 // z_0
	443	.endm
	444
	445	///--------------------------------------------------------------------------
	446	/// Main code.
	447
	448	// There are a number of representations of field elements in this code and
	449	// it can be confusing.
	450	//
	451	// * The `external format' consists of a sequence of contiguous bytes in
	452	// memory called a `block'. The GCM spec explains how to interpret this
	453	// block as an element of a finite field. As discussed extensively, this
	454	// representation is very annoying for a number of reasons. On the other
	455	// hand, this code never actually deals with it directly.
	456	//
	457	// * The `register format' consists of one or more NEON registers,
	458	// depending on the block size. The bytes in each 64-bit lane of these
	459	// registers are in reverse order, compared to the external format.
	460	//
	461	// * The `words' format consists of a sequence of bytes, as in the
	462	// `external format', but, according to the blockcipher in use, the bytes
	463	// within each 32-bit word may be reversed (`big-endian') or not
	464	// (`little-endian'). Accordingly, there are separate entry points for
	465	// each variant, identified with `b' or `l'.
	466
	467	FUNC(gcm_mulk_128b_arm_crypto)
	468	// Multiply in place: r0 is the address of a 128-bit field element A
	469	// held in big-endian words format, and r1 is the address of a field
	470	// element K in register format. A is overwritten with A K.
	471
	472	vld1.8 {q1}, [r1] // q1 = K, used as loaded
	473	vld1.8 {q0}, [r0] // q0 = A in words format
	474	vrev64.32 q0, q0 // swap the words in each 64-bit lane
	475	mul128
	476	vrev64.32 q0, q0 // swap the words back again
	477	vst1.8 {q0}, [r0] // write the product over A
	478	bx lr
	479	ENDFUNC
	480
	481	FUNC(gcm_mulk_128l_arm_crypto)
	482	// Multiply in place: r0 is the address of a 128-bit field element A
	483	// held in little-endian words format, and r1 is the address of a
	484	// field element K in register format. A is overwritten with A K.
	485
	486	vld1.8 {q1}, [r1] // q1 = K, used as loaded
	487	vld1.8 {q0}, [r0] // q0 = A in words format
	488	vrev64.8 q0, q0 // reverse the bytes in each 64-bit lane
	489	mul128
	490	vrev64.8 q0, q0 // reverse the bytes back again
	491	vst1.8 {q0}, [r0] // write the product over A
	492	bx lr
	493	ENDFUNC
	494
	495	FUNC(gcm_mulk_64b_arm_crypto)
	496	// Multiply in place: r0 is the address of a 64-bit field element A
	497	// held in big-endian words format, and r1 is the address of a field
	498	// element K in register format. A is overwritten with A K.
	499
	500	vld1.8 {d1}, [r1] // d1 = K, used as loaded
	501	vld1.8 {d0}, [r0] // d0 = A in words format
	502	vrev64.32 d0, d0 // swap the two words of A
	503	mul64
	504	vrev64.32 d0, d0 // swap the words back again
	505	vst1.8 {d0}, [r0] // write the product over A
	506	bx lr
	507	ENDFUNC
	508
	509	FUNC(gcm_mulk_64l_arm_crypto)
	510	// On entry, r0 points to a 64-bit field element A in little-endian
	511	// words format; r1 points to a field-element K in register format.
	512	// On exit, A is updated with the product A K. No zero register is
	513	// set up, because `mul64' touches only d0--d4 and never reads q15.
	514
	515	vld1.8 {d0}, [r0] // d0 = A in words format
	516	vld1.8 {d1}, [r1] // d1 = K in register format
	517	vrev64.8 d0, d0 // reverse the bytes to get register format
	518	mul64
	519	vrev64.8 d0, d0 // and reverse them back for storing
	520	vst1.8 {d0}, [r0]
	521	bx r14
	522	ENDFUNC
	523
	524	FUNC(gcm_mulk_96b_arm_crypto)
	525	// Multiply in place: r0 is the address of a 96-bit field element A
	526	// held in big-endian words format, and r1 is the address of a field
	527	// element K in register format. A is overwritten with A K.
	528
	529	vld1.8 {q1}, [r1] // q1 = K, used as loaded
	530	vld1.8 {d0}, [r0] // first two words of A
	531	ldr r3, [r0, #8] // third word of A
	532	mov r12, #0
	533	vrev64.32 d0, d0 // swap the two words in d0
	534	vmov d1, r12, r3 // d1 = third word above a zero word
	535	vzero
	536	mul96
	537	vrev64.32 d0, d0 // swap the words back again
	538	vmov r3, d1[1] // third word of the product
	539	vst1.8 {d0}, [r0]
	540	str r3, [r0, #8]
	541	bx lr
	542	ENDFUNC
	543
	544	FUNC(gcm_mulk_96l_arm_crypto)
	545	// On entry, r0 points to a 96-bit field element A in little-endian words
	546	// format and r1 to K in register format; on exit, A holds A K.
	547
	548	ldr r3, [r0, #8] // third word of A
	549	mov r12, #0
	550	vld1.8 {d0}, [r0] // first two words of A
	551	vld1.8 {q1}, [r1] // q1 = K, used as loaded
	552	vmov d1, r3, r12 // d1 = zero word above the third word
	553	vrev64.8 q0, q0 // reverse the bytes in each 64-bit lane
	554	vzero // `mul96' requires zero in q15
	555	mul96
	556	vrev64.8 q0, q0 // reverse the bytes back again
	557	vmov r3, d1[0] // third word of the product
	558	vst1.8 {d0}, [r0]
	559	str r3, [r0, #8]
	560	bx r14
	561	ENDFUNC
	562
	563	FUNC(gcm_mulk_192b_arm_crypto)
	564	// Multiply in place: r0 is the address of a 192-bit field element A
	565	// held in big-endian words format, and r1 is the address of a field
	566	// element K in register format. A is overwritten with A K.
	567
	568	vld1.8 {d3-d5}, [r1] // d3--d5 = K, used as loaded
	569	vld1.8 {d0-d2}, [r0] // d0--d2 = A in words format
	570	vrev64.32 d2, d2 // swap the words in each 64-bit lane
	571	vrev64.32 q0, q0
	572	mul192
	573	vrev64.32 d2, d2 // swap the words back again
	574	vrev64.32 q0, q0
	575	vst1.8 {d0-d2}, [r0] // write the product over A
	576	bx lr
	577	ENDFUNC
	578
	579	FUNC(gcm_mulk_192l_arm_crypto)
	580	// Multiply in place: r0 is the address of a 192-bit field element A
	581	// held in little-endian words format, and r1 is the address of a
	582	// field element K in register format. A is overwritten with A K.
	583
	584	vld1.8 {d3-d5}, [r1] // d3--d5 = K, used as loaded
	585	vld1.8 {d0-d2}, [r0] // d0--d2 = A in words format
	586	vrev64.8 d2, d2 // reverse the bytes in each 64-bit lane
	587	vrev64.8 q0, q0
	588	mul192
	589	vrev64.8 d2, d2 // reverse the bytes back again
	590	vrev64.8 q0, q0
	591	vst1.8 {d0-d2}, [r0] // write the product over A
	592	bx lr
	593	ENDFUNC
	594
	595	FUNC(gcm_mulk_256b_arm_crypto)
	596	// Multiply in place: r0 is the address of a 256-bit field element A
	597	// held in big-endian words format, and r1 is the address of a field
	598	// element K in register format. A is overwritten with A K.
	599
	600	vld1.8 {q2, q3}, [r1] // q2/q3 = K, used as loaded
	601	vld1.8 {q0, q1}, [r0] // q0/q1 = A in words format
	602	vrev64.32 q1, q1 // swap the words in each 64-bit lane
	603	vrev64.32 q0, q0
	604	mul256
	605	vrev64.32 q1, q1 // swap the words back again
	606	vrev64.32 q0, q0
	607	vst1.8 {q0, q1}, [r0] // write the product over A
	608	bx lr
	609	ENDFUNC
	610
	611	FUNC(gcm_mulk_256l_arm_crypto)
	612	// Multiply in place: r0 is the address of a 256-bit field element A
	613	// held in little-endian words format, and r1 is the address of a
	614	// field element K in register format. A is overwritten with A K.
	615
	616	vld1.8 {q2, q3}, [r1] // q2/q3 = K, used as loaded
	617	vld1.8 {q0, q1}, [r0] // q0/q1 = A in words format
	618	vrev64.8 q1, q1 // reverse the bytes in each 64-bit lane
	619	vrev64.8 q0, q0
	620	mul256
	621	vrev64.8 q1, q1 // reverse the bytes back again
	622	vrev64.8 q0, q0
	623	vst1.8 {q0, q1}, [r0] // write the product over A
	624	bx lr
	625	ENDFUNC
	626
	627	///----- That's all, folks --------------------------------------------------