[catacomb] / symm / gcm-arm-crypto.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM processors
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb. If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch armv8-a
	.fpu crypto-neon-fp-armv8

	.text

///--------------------------------------------------------------------------
/// Multiplication macros.

// The good news is that we have a fancy instruction to do the
// multiplications. The bad news is that it's not particularly well-
// suited to the job.
//
// For one thing, it only does a 64-bit multiplication, so in general
// we'll need to synthesize the full-width multiply by hand. For
// another thing, it doesn't help with the reduction, so we have to
// do that by hand too. And, finally, GCM has crazy bit ordering,
// and the instruction does nothing useful for that at all.
//
// Focusing on that last problem first: the bits aren't in monotonic
// significance order unless we permute them. If we reverse the byte
// order, then we'll have the bits in monotonic order, but backwards,
// so the degree-0 coefficient will be in the most-significant bit.
//
// This is less of a difficulty than it seems at first, because
// algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
// SUM_{0<=j<n} v_j t^j; then
//
//	u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
//
// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
// Then
//
//	ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
//	    = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
//
// which is almost the bit-reversal of u v, only it's shifted right
// by one place. Putting this another way, what we have is actually
// the bit reversal of the product u v t. We could get the correct
// answer (modulo p(t)) if we'd sneakily divided one of the operands
// by t before we started. Conveniently, v is actually the secret
// value k set up by the GCM `mktable' function, so we can arrange to
// actually store k/t (mod p(t)) and then the product will come out
// correct (modulo p(t)) and we won't have anything more to worry
// about here.
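//
// As a tiny worked example, take n = 4 and u = v = t, so that
// u v = t^2.  Reversing the bits gives ũ = ṽ = t^2, whence
// ũ ṽ = t^4 = t^{2n-2-(1+1)}; and t^4 is exactly the bit-reversal,
// within a 2n-bit window, of u v t = t^3, since 2n - 1 - 3 = 4.
// Dividing one operand by t in advance cancels that stray factor
// of t.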
//
// That was important to think about, but there's not a great deal to
// do about it yet other than to convert what we've got from the
// blockcipher's byte-ordering convention to our big-endian
// convention. Since this depends on the blockcipher convention,
// we'll leave the caller to cope with this: the macros here will
// assume that the operands are in `register' format, which is the
// same as the external representation, except that the bytes within
// each 64-bit piece are reversed. In the commentary, pieces of
// polynomial are numbered according to the degree of the
// coefficients, so the unit coefficient of some polynomial a is in
// a_0.
//
// The commentary for `mul128' is the most detailed. The other
// macros assume that you've already read and understood that.

	.macro mul128
	// Enter with u and v in q0 and q1 respectively; leave with z = u v
	// in q0. Clobbers q1--q3, q8, q9.

	// First for the double-precision multiplication. It's tempting to
	// use Karatsuba's identity here, but I suspect that loses more in
	// the shifting, bit-twiddling, and dependency chains that it gains
	// in saving a multiplication which otherwise pipelines well.
	// q0 =			// (u_1; u_0)
	// q1 =			// (v_1; v_0)
	vmull.p64 q2, d1, d2		// u_1 v_0
	vmull.p64 q3, d0, d3		// u_0 v_1
	vmull.p64 q8, d1, d3		// (t_1; x_3) = u_1 v_1
	vmull.p64 q9, d0, d2		// (x_0; t_0) = u_0 v_0

	// Arrange the pieces to form a double-precision polynomial.
	veor q2, q2, q3			// (m_0; m_1) = u_0 v_1 + u_1 v_0
	veor d17, d17, d4		// x_2 = t_1 + m_1
	veor d18, d18, d5		// x_1 = t_0 + m_0
	// q8 =			// (x_2; x_3)
	// q9 =			// (x_0; x_1)

	// One-and-a-half problems remain.
	//
	// The full-size problem is that the result needs to be reduced
	// modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
	// t^2 + t + 1 in our field. So far, we've calculated z_0 = x_0 +
	// x_1 t^64 and z_1 = x_2 + x_3 t^64 such that z_0 + z_1 R = u v,
	// using the identity R = t^128: now we must collapse the two
	// halves of z together using the other identity R = t^7 + t^2 +
	// t + 1.
	//
	// We do this by working on x_2 and x_3 separately, so consider x_i
	// for i = 2 or 3. Certainly, x_i t^{64i} = x_i R t^{64(i-2)} =
	// (t^7 + t^2 + t + 1) x_i t^{64(i-2)}, but we can't use that
	// directly without breaking up the 64-bit word structure. Instead,
	// we start by considering just x_i t^7 t^{64(i-2)}, which again
	// looks tricky. Now, split x_i = a_i + t^57 b_i, with deg a_i < 57;
	// then
	//
	//	x_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
	//
	// We can similarly decompose x_i t^2 and x_i t into a pair of 64-bit
	// contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
	// splits are different. This is lovely, with one small snag: when
	// we do this to x_3, we end up with a contribution back into the
	// t^128 coefficient word. But notice that only the low seven bits
	// of this word are affected, so there's no knock-on contribution
	// into the t^64 word. Therefore, if we handle the high bits of each
	// word together, and then the low bits, everything will be fine.
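	//
	// Concretely: because the bit order within each lane is reversed,
	// multiplying by t^k shows up as a machine shift right by k; and,
	// for the t^k term, b_i comprises the top k coefficients, which
	// live in the low k machine bits and must land in the high k bits
	// of the neighbouring word, i.e. a machine shift left by 64 - k.
	// Hence the shift counts below: 63, 62, and 57 (that is, 64 - k
	// for k = 1, 2, 7) for the b_i, and 1, 2, and 7 for the a_i.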

	// First, shift the high bits down.
	vshl.u64 q2, q8, #63		// the b_i for t
	vshl.u64 q3, q8, #62		// the b_i for t^2
	vshl.u64 q0, q8, #57		// the b_i for t^7
	veor q2, q2, q3			// add them all together
	veor q2, q2, q0
	veor d18, d18, d5		// contribution into low half
	veor d17, d17, d4		// and high half

	// And then shift the low bits up.
	vshr.u64 q2, q8, #1
	vshr.u64 q3, q8, #2
	vshr.u64 q1, q8, #7
	veor q8, q8, q9			// mix in the unit contribution
	veor q2, q2, q3			// t and t^2 contribs
	veor q1, q1, q8			// low, unit, and t^7 contribs
	veor d1, d2, d4			// mix them together and swap halves
	veor d0, d3, d5
	.endm

	.macro mul64
	// Enter with u and v in the low halves of d0 and d1 respectively;
	// leave with z = u v in d0. Clobbers d1--d5.

	// The multiplication is thankfully easy.
	vmull.p64 q0, d0, d1		// u v

	// Now we must reduce. This is essentially the same as the 128-bit
	// case above, but mostly simpler because everything is smaller. The
	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

	// First, shift the high bits down.
	vshl.u64 d2, d0, #63		// b_i for t
	vshl.u64 d3, d0, #61		// b_i for t^3
	vshl.u64 d4, d0, #60		// b_i for t^4
	veor d2, d2, d3			// add them all together
	veor d2, d2, d4
	veor d0, d0, d2			// contribution back into high half

	// And then shift the low bits up.
	vshr.u64 d2, d0, #1
	vshr.u64 d3, d0, #3
	vshr.u64 d4, d0, #4
	veor d0, d0, d1			// mix in the unit contribution
	veor d2, d2, d3			// t and t^3 contribs
	veor d0, d0, d4			// low, unit, and t^4
	veor d0, d0, d2			// mix them together and we're done
	.endm

	.macro mul96
	// Enter with u and v in the most-significant three words of q0 and
	// q1 respectively, and zero in the low words, and zero in q15; leave
	// with z = u v in the high three words of q0, and /junk/ in the low
	// word. Clobbers q1--q3, q8, q9.

	// This is an inconvenient size. There's nothing for it but to do
	// four multiplications, as if for the 128-bit case.
	// q0 =			// (u_2; u_0 + u_1 t^32)
	// q1 =			// (v_2; v_0 + v_1 t^32)
	vmull.p64 q8, d1, d2		// u_2 (v_0 + v_1 t^32) = e_0
	vmull.p64 q9, d0, d3		// v_2 (u_0 + u_1 t^32) = e_1
	vmull.p64 q3, d1, d3		// u_2 v_2 t^64 = d = (d; 0)
	vmull.p64 q0, d0, d2		// u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
					//   + u_1 v_1 t^64 = f
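	// In other words, writing U = u_0 + u_1 t^32 and V = v_0 + v_1 t^32,
	// we have u = U + u_2 t^64 and v = V + v_2 t^64, so that
	//
	//	u v = U V + (u_2 V + v_2 U) t^64 + u_2 v_2 t^128
	//	    = f + (e_0 + e_1) t^64 + u_2 v_2 t^128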

	// Extract the high and low halves of the 192-bit result. The answer
	// we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96
	// bits of the answer will end up in q0, and the high 96 bits will
	// end up in q1; we'll need both of these to have zero in their
	// bottom 32 bits.
	//
	// Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
	// in the low 96 bits of a SIMD register, with junk in the top 32
	// bits; and top(x) is the high 96 bits, also arranged in the low 96
	// bits of a register, with /zero/ in the top 32 bits.
	veor q8, q8, q9			// e_0 + e_1 = e
	vshr128 q1, q3, 32		// top(d t^128)
	vext.8 d19, d16, d17, #4	// top(e t^64)
	vshl.u64 d16, d0, #32		// top(f), sort of
	veor d3, d3, d19		// q1 = top(d t^128 + e t^64)
	veor d0, d0, d17		// q0 = bot([d t^128] + e t^64 + f)
	veor d3, d3, d16		// q1 = top(d t^128 + e t^64 + f)

	// Finally, the reduction. This is essentially the same as the
	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	// t^9 + t^6 + 1. The degrees are larger but not enough to cause
	// trouble for the general approach.
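	//
	// The pieces here are 32-bit words rather than 64-bit ones, so, by
	// the same reasoning as in `mul128', the b_i shift counts below are
	// 32 - k: 26, 23, and 22 for t^6, t^9, and t^10 respectively, while
	// the a_i shifts are simply 6, 9, and 10.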

	// First, shift the high bits down.
	vshl.u32 q2, q1, #26		// b_i for t^6
	vshl.u32 q3, q1, #23		// b_i for t^9
	vshl.u32 q8, q1, #22		// b_i for t^10
	veor q2, q2, q3			// add them all together
	veor q2, q2, q8
	vshl128 q3, q2, 64		// contribution into high half
	vshr128 q2, q2, 32		// and low half
	veor q1, q1, q3			// mix them in
	veor q0, q0, q2

	// And then shift the low bits up.
	vshr.u32 q2, q1, #6
	vshr.u32 q3, q1, #9
	veor q0, q0, q1			// mix in the unit contribution
	vshr.u32 q8, q1, #10
	veor q2, q2, q3			// mix together t^6 and t^9
	veor q0, q0, q8			// mix in t^10
	veor q0, q0, q2			// and the rest

	// And finally swap the two halves.
	vswp d0, d1
	.endm

	.macro mul192
	// Enter with u and v in d0--d2 and d3--d5 respectively; leave
	// with z = u v in d0--d2. Clobbers q8--q15.

	// Start multiplying and accumulating pieces of product.
	// (d0; d1; d2) =		// (u_0; u_1; u_2)
	// (d3; d4; d5) =		// (v_0; v_1; v_2)
	vmull.p64 q10, d0, d3		// e = u_0 v_0

	vmull.p64 q12, d0, d4		// u_0 v_1
	vmull.p64 q13, d1, d3		// u_1 v_0

	vmull.p64 q9, d0, d5		// u_0 v_2
	vmull.p64 q14, d1, d4		// u_1 v_1
	vmull.p64 q15, d2, d3		// u_2 v_0
	veor q12, q12, q13		// d = u_0 v_1 + u_1 v_0

	vmull.p64 q11, d1, d5		// u_1 v_2
	vmull.p64 q13, d2, d4		// u_2 v_1
	veor q9, q9, q14		// u_0 v_2 + u_1 v_1

	vmull.p64 q8, d2, d5		// a = u_2 v_2
	veor q9, q9, q15		// c = u_0 v_2 + u_1 v_1 + u_2 v_0
	veor q11, q11, q13		// b = u_1 v_2 + u_2 v_1

	// Piece the product together.
	veor d17, d17, d22		// q8 =		// (x_4; x_5)
	veor d18, d18, d23
	veor d19, d19, d24		// q9 =		// (x_2; x_3)
	veor d20, d20, d25		// q10 =	// (x_0; x_1)

	// Next, the reduction. Our polynomial this time is p(t) = t^192 +
	// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
	// 128-bit case. I don't know why.

	// First, shift the high bits down.
	// q8 =			// (y_4; y_5)
	// q9 =			// (y_2; y_3)
	// q10 =		// (y_0; y_1)
	vshl.u64 q11, q8, #63		// (y_4; y_5) b_i for t
	vshl.u64 d28, d18, #63		// y_3 b_i for t
	vshl.u64 q12, q8, #62		// (y_4; y_5) b_i for t^2
	vshl.u64 d29, d18, #62		// y_3 b_i for t^2
	vshl.u64 q13, q8, #57		// (y_4; y_5) b_i for t^7
	vshl.u64 d30, d18, #57		// y_3 b_i for t^7
	veor q11, q11, q12		// mix them all together
	veor d28, d28, d29
	veor q11, q11, q13
	veor d28, d28, d30
	veor q9, q9, q11
	veor d20, d20, d28

	// And finally shift the low bits up. Also, switch the order of the
	// pieces for output.
	// q8 =			// (y'_4; y'_5)
	// q9 =			// (y'_2; y'_3)
	// q10 =		// (y'_0; y'_1)
	vshr.u64 q11, q8, #1		// (y_4; y_5) a_i for t
	vshr.u64 d28, d18, #1		// y'_3 a_i for t
	vshr.u64 q12, q8, #2		// (y_4; y_5) a_i for t^2
	vshr.u64 d29, d18, #2		// y'_3 a_i for t^2
	vshr.u64 q13, q8, #7		// (y_4; y_5) a_i for t^7
	vshr.u64 d30, d18, #7		// y'_3 a_i for t^7
	veor q8, q8, q11
	veor d18, d18, d28
	veor q12, q12, q13
	veor d29, d29, d30
	veor q8, q8, q12
	veor d18, d18, d29
	veor d0, d21, d18
	veor d1, d20, d17
	veor d2, d19, d16
	.endm

	.macro mul256
	// Enter with u and v in q0/q1 and q2/q3 respectively; leave
	// with z = u v in q0/q1. Clobbers q8--q15.

	// Now it's starting to look worthwhile to do Karatsuba. Suppose
	// u = u_0 + u_1 B and v = v_0 + v_1 B. Then
	//
	//	u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	//
	// Let the coefficients of B^i be a, b, and c, respectively, and
	// let r = u_0 + u_1 and s = v_0 + v_1. Then observe that
	//
	//	q = r s = (u_0 + u_1) (v_0 + v_1)
	//	        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	//	        = a + c + b
	//
	// The first two terms we've already calculated; the last is the
	// remaining one we want. We'll set B = t^128. We know how to do
	// 128-bit multiplications already, and Karatsuba is too annoying
	// there, so there'll be 12 multiplications altogether, rather than
	// the 16 we'd have if we did this the naïve way.
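	//
	// That is, b = q + a + c; and each of the three 128-bit products
	// a, c, and q costs four `vmull.p64' instructions, giving
	// 3 x 4 = 12 rather than the 4 x 4 = 16 of the schoolbook method.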
	// q0 =			// u_0 = (u_01; u_00)
	// q1 =			// u_1 = (u_11; u_10)
	// q2 =			// v_0 = (v_01; v_00)
	// q3 =			// v_1 = (v_11; v_10)

	veor q8, q0, q1			// u_* = (u_01 + u_11; u_00 + u_10)
	veor q9, q2, q3			// v_* = (v_01 + v_11; v_00 + v_10)

	// Start by building the cross product, q = u_* v_*.
	vmull.p64 q14, d16, d19		// u_*0 v_*1
	vmull.p64 q15, d17, d18		// u_*1 v_*0
	vmull.p64 q12, d17, d19		// u_*1 v_*1
	vmull.p64 q13, d16, d18		// u_*0 v_*0
	veor q14, q14, q15		// u_*0 v_*1 + u_*1 v_*0
	veor d25, d25, d28		// q12 =	// q_1
	veor d26, d26, d29		// q13 =	// q_0

	// Next, work on the low half, a = u_0 v_0.
	vmull.p64 q14, d0, d5		// u_00 v_01
	vmull.p64 q15, d1, d4		// u_01 v_00
	vmull.p64 q10, d1, d5		// u_01 v_01
	vmull.p64 q11, d0, d4		// u_00 v_00
	veor q14, q14, q15		// u_00 v_01 + u_01 v_00
	veor d21, d21, d28		// q10 =	// a_1
	veor d22, d22, d29		// q11 =	// a_0

	// Mix the pieces we have so far.
	veor q12, q12, q10
	veor q13, q13, q11

	// Finally, the high half, c = u_1 v_1.
	vmull.p64 q14, d2, d7		// u_10 v_11
	vmull.p64 q15, d3, d6		// u_11 v_10
	vmull.p64 q8, d3, d7		// u_11 v_11
	vmull.p64 q9, d2, d6		// u_10 v_10
	veor q14, q14, q15		// u_10 v_11 + u_11 v_10
	veor d17, d17, d28		// q8 =		// c_1
	veor d18, d18, d29		// q9 =		// c_0

	// Finish mixing the product together.
	veor q12, q12, q8		// q12 =	// b_1
	veor q13, q13, q9		// q13 =	// b_0
	veor q9, q9, q12
	veor q10, q10, q13

	// Now we must reduce. This is essentially the same as the 192-bit
	// case above, but more complicated because everything is bigger.
	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

	// First, shift the high bits down.
	// q8 =			// (y_6; y_7)
	// q9 =			// (y_4; y_5)
	// q10 =		// (y_2; y_3)
	// q11 =		// (y_0; y_1)
	vshl.u64 q0, q8, #62		// (y_6; y_7) b_i for t^2
	vshl.u64 q12, q9, #62		// (y_4; y_5) b_i for t^2
	vshl.u64 q1, q8, #59		// (y_6; y_7) b_i for t^5
	vshl.u64 q13, q9, #59		// (y_4; y_5) b_i for t^5
	vshl.u64 q2, q8, #54		// (y_6; y_7) b_i for t^10
	vshl.u64 q14, q9, #54		// (y_4; y_5) b_i for t^10
	veor q0, q0, q1			// mix the contributions together
	veor q12, q12, q13
	veor q0, q0, q2
	veor q12, q12, q14
	veor d19, d19, d0		// and combine into the lower pieces
	veor d20, d20, d1
	veor d21, d21, d24
	veor d22, d22, d25

	// And then shift the low bits up. Also, switch the order of the
	// pieces for output.
	// q8 =			// (y'_6; y'_7)
	// q9 =			// (y'_4; y'_5)
	// q10 =		// (y'_2; y'_3)
	// q11 =		// (y'_0; y'_1)
	vshr.u64 q0, q8, #2		// (y_6; y_7) a_i for t^2
	vshr.u64 q12, q9, #2		// (y'_4; y_5) a_i for t^2
	vshr.u64 q1, q8, #5		// (y_6; y_7) a_i for t^5
	vshr.u64 q13, q9, #5		// (y_4; y_5) a_i for t^5
	vshr.u64 q2, q8, #10		// (y_6; y_7) a_i for t^10
	vshr.u64 q14, q9, #10		// (y_4; y_5) a_i for t^10

	veor q8, q8, q0			// mix the contributions together
	veor q1, q1, q2
	veor q9, q9, q12
	veor q13, q13, q14
	veor q8, q8, q1
	veor q9, q9, q13
	veor d3, d20, d16		// and output
	veor d2, d21, d17
	veor d1, d22, d18
	veor d0, d23, d19
	.endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'. The GCM spec explains how to interpret this
//     block as an element of a finite field. As discussed extensively, this
//     representation is very annoying for a number of reasons. On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more NEON registers,
//     depending on the block size. The bytes in each 64-bit lane of these
//     registers are in reverse order, compared to the external format.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian'). Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.
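//
// Converting words format into register format is just a byte
// permutation within each 64-bit lane: if a lane's external bytes are
// b_0 b_1 ... b_7, then register format wants them as b_7 b_6 ... b_0.
// Little-endian words are laid out exactly as the external format, so
// a straight byte reversal (`vrev64.8') does the job; big-endian words
// already have each four-byte group reversed, so swapping the two
// 32-bit words in each lane (`vrev64.32') completes the reversal.
// That's all the `vrev64' fiddling in the functions below is doing.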

FUNC(gcm_mulk_128b_arm_crypto)
	// On entry, r0 points to a 128-bit field element A in big-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {q0}, [r0]
	vld1.8 {q1}, [r1]
	vrev64.32 q0, q0
	mul128
	vrev64.32 q0, q0
	vst1.8 {q0}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_128l_arm_crypto)
	// On entry, r0 points to a 128-bit field element A in little-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {q0}, [r0]
	vld1.8 {q1}, [r1]
	vrev64.8 q0, q0
	mul128
	vrev64.8 q0, q0
	vst1.8 {q0}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_64b_arm_crypto)
	// On entry, r0 points to a 64-bit field element A in big-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {d0}, [r0]
	vld1.8 {d1}, [r1]
	vrev64.32 d0, d0
	mul64
	vrev64.32 d0, d0
	vst1.8 {d0}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_64l_arm_crypto)
	// On entry, r0 points to a 64-bit field element A in little-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {d0}, [r0]
	vld1.8 {d1}, [r1]
	vrev64.8 d0, d0
	vzero
	mul64
	vrev64.8 d0, d0
	vst1.8 {d0}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_96b_arm_crypto)
	// On entry, r0 points to a 96-bit field element A in big-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	ldr r3, [r0, #8]
	mov r12, #0
	vld1.8 {d0}, [r0]
	vld1.8 {q1}, [r1]
	vrev64.32 d0, d0
	vmov d1, r12, r3
	vzero
	mul96
	vrev64.32 d0, d0
	vmov r3, d1[1]
	vst1.8 {d0}, [r0]
	str r3, [r0, #8]
	bx r14
ENDFUNC

FUNC(gcm_mulk_96l_arm_crypto)
	// On entry, r0 points to a 96-bit field element A in little-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	ldr r3, [r0, #8]
	mov r12, #0
	vld1.8 {d0}, [r0]
	vld1.8 {q1}, [r1]
	vmov d1, r3, r12
	vrev64.8 q0, q0
	mul96
	vrev64.8 q0, q0
	vmov r3, d1[0]
	vst1.8 {d0}, [r0]
	str r3, [r0, #8]
	bx r14
ENDFUNC

FUNC(gcm_mulk_192b_arm_crypto)
	// On entry, r0 points to a 192-bit field element A in big-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {d0-d2}, [r0]
	vld1.8 {d3-d5}, [r1]
	vrev64.32 q0, q0
	vrev64.32 d2, d2
	mul192
	vrev64.32 q0, q0
	vrev64.32 d2, d2
	vst1.8 {d0-d2}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_192l_arm_crypto)
	// On entry, r0 points to a 192-bit field element A in little-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {d0-d2}, [r0]
	vld1.8 {d3-d5}, [r1]
	vrev64.8 q0, q0
	vrev64.8 d2, d2
	mul192
	vrev64.8 q0, q0
	vrev64.8 d2, d2
	vst1.8 {d0-d2}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_256b_arm_crypto)
	// On entry, r0 points to a 256-bit field element A in big-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {q0, q1}, [r0]
	vld1.8 {q2, q3}, [r1]
	vrev64.32 q0, q0
	vrev64.32 q1, q1
	mul256
	vrev64.32 q0, q0
	vrev64.32 q1, q1
	vst1.8 {q0, q1}, [r0]
	bx r14
ENDFUNC

FUNC(gcm_mulk_256l_arm_crypto)
	// On entry, r0 points to a 256-bit field element A in little-endian
	// words format; r1 points to a field-element K in register format.
	// On exit, A is updated with the product A K.

	vld1.8 {q0, q1}, [r0]
	vld1.8 {q2, q3}, [r1]
	vrev64.8 q0, q0
	vrev64.8 q1, q1
	mul256
	vrev64.8 q0, q0
	vrev64.8 q1, q1
	vst1.8 {q0, q1}, [r0]
	bx r14
ENDFUNC

///----- That's all, folks --------------------------------------------------