/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM processors
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb. If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   armv8-a
        .fpu    crypto-neon-fp-armv8

        .text

///--------------------------------------------------------------------------
/// Multiplication macros.

// The good news is that we have a fancy instruction to do the
// multiplications. The bad news is that it's not particularly well-
// suited to the job.
//
// For one thing, it only does a 64-bit multiplication, so in general
// we'll need to synthesize the full-width multiply by hand. For
// another thing, it doesn't help with the reduction, so we have to
// do that by hand too. And, finally, GCM has crazy bit ordering,
// and the instruction does nothing useful for that at all.
//
// Focusing on that last problem first: the bits aren't in monotonic
// significance order unless we permute them. If we reverse the byte
// order, then we'll have the bits in monotonic order, but backwards,
// so the degree-0 coefficient will be in the most-significant bit.
//
// This is less of a difficulty than it seems at first, because
// algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
// SUM_{0<=j<n} v_j t^j; then
//
//      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
//
// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
// Then
//
//      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
//          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
//
// which is almost the bit-reversal of u v, only it's shifted right
// by one place. Oh, well: we'll have to shift it back later.
//
// That was important to think about, but there's not a great deal to
// do about it yet other than to convert what we've got from the
// blockcipher's byte-ordering convention to our big-endian
// convention. Since this depends on the blockcipher convention,
// we'll leave the caller to cope with this: the macros here will
// assume that the operands are in `register' format, which is the
// same as the external representation, except that the bytes within
// each 64-bit piece are reversed. In the commentary, pieces of
// polynomial are numbered according to the degree of the
// coefficients, so the unit coefficient of some polynomial a is in
// a_0.
//
// The commentary for `mul128' is the most detailed. The other
// macros assume that you've already read and understood that.

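// As a quick sanity check on the algebra above, here is a minimal
// plain-Python sketch of the identity (for reference only; the helpers
// `clmul' and `bitrev' are illustrative and not part of this code):
// multiplying bit-reversed n-bit operands gives the (2n-1)-bit
// bit-reversal of the true product, i.e., the reversed product shifted
// right by one place.
//
//      def clmul(u, v):                # carry-free multiply over GF(2)
//          z = 0
//          while v:
//              if v & 1: z ^= u
//              u <<= 1; v >>= 1
//          return z
//
//      def bitrev(x, n):               # reverse an n-bit value
//          return int(format(x, '0%db' % n)[::-1], 2)
//
//      u, v, n = 0x87, 0xe1, 8
//      assert clmul(bitrev(u, n), bitrev(v, n)) == \
//          bitrev(clmul(u, v), 2*n - 1)
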
.macro  mul128
        // Enter with u and v in q0 and q1 respectively; leave with z = u v
        // in q0. Clobbers q1--q3, q8, q9.

        // First for the double-precision multiplication. It's tempting to
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains that it gains
        // in saving a multiplication which otherwise pipelines well.
        // q0 =                         // (u_0; u_1)
        // q1 =                         // (v_0; v_1)
        vmull.p64 q2, d1, d2            // u_1 v_0
        vmull.p64 q3, d0, d3            // u_0 v_1
        vmull.p64 q8, d1, d3            // (x_3; t_1) = u_1 v_1
        vmull.p64 q9, d0, d2            // (t_0; x_0) = u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        veor q2, q2, q3                 // (m_1; m_0) = u_0 v_1 + u_1 v_0
        veor d17, d17, d4               // x_2 = t_1 + m_1
        veor d18, d18, d5               // x_1 = t_0 + m_0
        // q8 =                         // (x_3; x_2)
        // q9 =                         // (x_1; x_0)

        // Two-and-a-half problems remain. The first is that this product is
        // shifted left by one place, which is annoying. Let's take care of
        // that now. Once this is done, we'll be properly in GCM's backwards
        // bit-ordering.
        //
        // The half a problem is that the result wants to have its 64-bit
        // halves switched. Here turns out to be the best place to arrange
        // for that.
        //
        //                  q9                              q8
        //      ,-------------.-------------.  ,-------------.-------------.
        //      | 0  x_0-x_62 | x_63-x_126  |  | x_127-x_190 | x_191-x_254 |
        //      `-------------^-------------'  `-------------^-------------'
        //            d19           d18              d17           d16
        //
        // We start by shifting each 32-bit lane right (from GCM's point of
        // view -- physically, left) by one place, which gives us this:
        //
        //              low (q9)                       high (q8)
        //      ,-------------.-------------.  ,-------------.-------------.
        //      | x_0-x_62  0 | x_64-x_126 0|  |x_128-x_190 0|x_192-x_254 0|
        //      `-------------^-------------'  `-------------^-------------'
        //            d19           d18              d17           d16
        //
        // but we've lost a bunch of bits. We separately shift each lane
        // left by 31 places to give us the bits we lost.
        //
        //              low (q3)                       high (q2)
        //      ,-------------.-------------.  ,-------------.-------------.
        //      |    0...0    | 0...0  x_63 |  | 0...0 x_127 | 0...0 x_191 |
        //      `-------------^-------------'  `-------------^-------------'
        //                          d6               d5            d4
        //
        // Since we can address each of these pieces individually, putting
        // them together is relatively straightforward.


        vshr.u64 d6, d18, #63           // shifted left; just the carries
        vshl.u64 q9, q9, #1             // shifted right, but dropped carries
        vshr.u64 q2, q8, #63
        vshl.u64 q8, q8, #1
        vorr d0, d19, d6                // y_0
        vorr d1, d18, d5                // y_1
        vorr d2, d17, d4                // y_2
        vmov d3, d16                    // y_3

        // And the other one is that the result needs to be reduced modulo
        // p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + t^2 + t +
        // 1 in our field. So far, we've calculated z_0 and z_1 such that
        // z_0 + z_1 R = u v using the identity R = t^128: now we must
        // collapse the two halves of y together using the other identity R =
        // t^7 + t^2 + t + 1.
        //
        // We do this by working on y_2 and y_3 separately, so consider y_i
        // for i = 2 or 3. Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
        // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
        // directly without breaking up the 64-bit word structure. Instead,
        // we start by considering just y_i t^7 t^{64(i-2)}, which again
        // looks tricky. Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
        // then
        //
        //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
        // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
        // splits are different. This is lovely, with one small snag: when
        // we do this to y_3, we end up with a contribution back into the
        // t^128 coefficient word. But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^64 word. Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.

        // First, shift the high bits down.
        vshl.u64 q2, q1, #63            // the b_i for t
        vshl.u64 q3, q1, #62            // the b_i for t^2
        vshl.u64 q8, q1, #57            // the b_i for t^7
        veor q2, q2, q3                 // add them all together
        veor q2, q2, q8
        veor d2, d2, d5                 // contribution into low half
        veor d1, d1, d4                 // and high half

        // And then shift the low bits up.
        vshr.u64 q2, q1, #1
        vshr.u64 q3, q1, #2
        vshr.u64 q8, q1, #7
        veor q0, q0, q1                 // mix in the unit contribution
        veor q2, q2, q3                 // t and t^2 contribs
        veor q0, q0, q8                 // low, unit, and t^7 contribs
        veor q0, q0, q2                 // mix them together and we're done
.endm
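
// For reference, a minimal Python model of the field multiplication this
// macro performs, written in the ordinary bit ordering (a sketch only;
// `clmul' is the helper from the earlier sketch, and the macro works on
// bit-reversed operands held in register format, so its inputs and
// outputs are permuted relative to this).
//
//      P = (1 << 128) | 0x87           # p(t) = t^128 + t^7 + t^2 + t + 1
//
//      def gf128_mul(u, v):
//          z = clmul(u, v)             # 255-bit double-width product
//          for i in range(254, 127, -1):       # t^128 = t^7 + t^2 + t + 1
//              if (z >> i) & 1:
//                  z ^= P << (i - 128)
//          return z
//
//      assert gf128_mul(1 << 127, 2) == 0x87   # t^127 . t = t^128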

.macro  mul64
        // Enter with u and v in the low halves of d0 and d1 respectively;
        // leave with z = u v in d0. Clobbers d1--d5.

        // The multiplication is thankfully easy.
        vmull.p64 q0, d0, d1            // u v

        // Shift the product up by one place, and swap the two halves. After
        // this, we're in GCM bizarro-world.
        vshr.u64 d2, d0, #63            // shifted left; just the carries
        vshl.u64 d3, d1, #1             // low half shifted right
        vshl.u64 d1, d0, #1             // high half shifted right
        vorr d0, d3, d2                 // mix in the carries

        // Now we must reduce. This is essentially the same as the 128-bit
        // case above, but mostly simpler because everything is smaller. The
        // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

        // First, shift the high bits down.
        vshl.u64 d2, d1, #63            // b_i for t
        vshl.u64 d3, d1, #61            // b_i for t^3
        vshl.u64 d4, d1, #60            // b_i for t^4
        veor d2, d2, d3                 // add them all together
        veor d2, d2, d4
        veor d1, d1, d2                 // contribution back into high half

        // And then shift the low bits up.
        vshr.u64 d2, d1, #1
        vshr.u64 d3, d1, #3
        vshr.u64 d4, d1, #4
        veor d0, d0, d1                 // mix in the unit contribution
        veor d2, d2, d3                 // t and t^3 contribs
        veor d0, d0, d4                 // low, unit, and t^4
        veor d0, d0, d2                 // mix them together and we're done
.endm
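
// The same reference model, for the 64-bit polynomial (again just a
// sketch; `clmul' as in the earlier sketch).
//
//      P64 = (1 << 64) | 0x1b          # p(t) = t^64 + t^4 + t^3 + t + 1
//
//      def gf64_mul(u, v):
//          z = clmul(u, v)             # 127-bit double-width product
//          for i in range(126, 63, -1):        # t^64 = t^4 + t^3 + t + 1
//              if (z >> i) & 1:
//                  z ^= P64 << (i - 64)
//          return z
//
//      assert gf64_mul(1 << 63, 2) == 0x1b     # t^63 . t = t^64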

.macro  mul96
        // Enter with u and v in the most-significant three words of q0 and
        // q1 respectively, and zero in the low words, and zero in q15; leave
        // with z = u v in the high three words of q0, and /junk/ in the low
        // word. Clobbers q1--q3, q8, q9.

        // This is an inconvenient size. There's nothing for it but to do
        // four multiplications, as if for the 128-bit case. It's possible
        // that there's cruft in the top 32 bits of the input registers, so
        // shift both of them up by four bytes before we start. This will
        // mean that the high 64 bits of the result (from GCM's viewpoint)
        // will be zero.
        // q0 =                         // (u_0 + u_1 t^32; u_2)
        // q1 =                         // (v_0 + v_1 t^32; v_2)
        vmull.p64 q8, d1, d2            // u_2 (v_0 + v_1 t^32) = e_0
        vmull.p64 q9, d0, d3            // v_2 (u_0 + u_1 t^32) = e_1
        vmull.p64 q3, d1, d3            // u_2 v_2 t^64 = d = (0; d)
        vmull.p64 q0, d0, d2            // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result. The answer
        // we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96
        // bits of the answer will end up in q0, and the high 96 bits will
        // end up in q1; we'll need both of these to have zero in their
        // bottom 32 bits.
        //
        // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
        // in the low 96 bits of a SIMD register, with junk in the top 32
        // bits; and top(x) is the high 96 bits, also arranged in the low 96
        // bits of a register, with /zero/ in the top 32 bits.
        veor q8, q8, q9                 // e_0 + e_1 = e
        vshr128 q1, q3, 32              // top(d t^128)
        vext.8 d19, d16, d17, #4        // top(e t^64)
        vshl.u64 d16, d0, #32           // top(f), sort of
        veor d3, d3, d19                // q1 = top(d t^128 + e t^64)
        veor d0, d0, d17                // q0 = bot([d t^128] + e t^64 + f)
        veor d3, d3, d16                // q1 = top(d t^128 + e t^64 + f)

        // Shift the product right by one place (from GCM's point of view),
        // but, unusually, don't swap the halves, because we need to work on
        // the 32-bit pieces later. After this, we're in GCM bizarro-world.
        // q0 =                         // (?, x_2; x_1, x_0)
        // q1 =                         // (0, x_5; x_4, x_3)
        vshr.u64 d4, d0, #63            // carry from d0 to d1
        vshr.u64 d5, d2, #63            // carry from d2 to d3
        vshr.u32 d6, d3, #31            // carry from d3 to d0
        vshl.u64 q0, q0, #1             // shift low half
        vshl.u64 q1, q1, #1             // shift high half
        vorr d1, d1, d4
        vorr d0, d0, d6
        vorr d3, d3, d5

        // Finally, the reduction. This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1. The degrees are larger but not enough to cause
        // trouble for the general approach.

        // First, shift the high bits down.
        vshl.u32 q2, q1, #26            // b_i for t^6
        vshl.u32 q3, q1, #23            // b_i for t^9
        vshl.u32 q8, q1, #22            // b_i for t^10
        veor q2, q2, q3                 // add them all together
        veor q2, q2, q8
        vshl128 q3, q2, 64              // contribution into high half
        vshr128 q2, q2, 32              // and low half
        veor q1, q1, q3                 // mix them in
        veor q0, q0, q2

        // And then shift the low bits up.
        vshr.u32 q2, q1, #6
        vshr.u32 q3, q1, #9
        veor q0, q0, q1                 // mix in the unit contribution
        vshr.u32 q8, q1, #10
        veor q2, q2, q3                 // mix together t^6 and t^9
        veor q0, q0, q8                 // mix in t^10
        veor q0, q0, q2                 // and the rest

        // And finally swap the two halves.
        vswp d0, d1
.endm
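
// The four multiplications above compute the pieces e_0, e_1, d, and f
// described in the comments; here is a small Python check of that
// decomposition (a sketch only; `clmul' as in the earlier sketch, and
// `split96' is just illustrative).
//
//      import random
//
//      def split96(x):                 # low 64 bits, high 32 bits
//          return x & (1 << 64) - 1, x >> 64
//
//      u, v = random.getrandbits(96), random.getrandbits(96)
//      u01, u2 = split96(u); v01, v2 = split96(v)
//      f = clmul(u01, v01)
//      e = clmul(u2, v01) ^ clmul(v2, u01)
//      d = clmul(u2, v2)
//      assert clmul(u, v) == (d << 128) ^ (e << 64) ^ f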

.macro  mul192
        // Enter with u and v in d0--d2 and d3--d5 respectively; leave
        // with z = u v in d0--d2. Clobbers q8--q15.

        // Start multiplying and accumulating pieces of product.
        // (d0; d1; d2) =               // (u_0; u_1; u_2)
        // (d3; d4; d5) =               // (v_0; v_1; v_2)
        vmull.p64 q10, d0, d3           // e = u_0 v_0

        vmull.p64 q12, d0, d4           // u_0 v_1
        vmull.p64 q13, d1, d3           // u_1 v_0

        vmull.p64 q9, d0, d5            // u_0 v_2
        vmull.p64 q14, d1, d4           // u_1 v_1
        vmull.p64 q15, d2, d3           // u_2 v_0
        veor q12, q12, q13              // d = u_0 v_1 + u_1 v_0

        vmull.p64 q11, d1, d5           // u_1 v_2
        vmull.p64 q13, d2, d4           // u_2 v_1
        veor q9, q9, q14                // u_0 v_2 + u_1 v_1

        vmull.p64 q8, d2, d5            // a = u_2 v_2
        veor q9, q9, q15                // c = u_0 v_2 + u_1 v_1 + u_2 v_0
        veor q11, q11, q13              // b = u_1 v_2 + u_2 v_1

        // Piece the product together.
        veor d17, d17, d22              // q8 =  // (x_5; x_4)
        veor d18, d18, d23
        veor d19, d19, d24              // q9 =  // (x_3; x_2)
        veor d20, d20, d25              // q10 = // (x_1; x_0)

        // Shift the product right by one place (from GCM's point of view).
        vshr.u64 q11, q8, #63           // carry from d16/d17 to d17/d18
        vshr.u64 q12, q9, #63           // carry from d18/d19 to d19/d20
        vshr.u64 d26, d20, #63          // carry from d20 to d21
        vshl.u64 q8, q8, #1             // shift everything down
        vshl.u64 q9, q9, #1
        vshl.u64 q10, q10, #1
        vorr d17, d17, d22              // and mix in the carries
        vorr d18, d18, d23
        vorr d19, d19, d24
        vorr d20, d20, d25
        vorr d21, d21, d26

        // Next, the reduction. Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
        // 128-bit case. I don't know why.

        // First, shift the high bits down.
        // q8 =                         // (y_5; y_4)
        // q9 =                         // (y_3; y_2)
        // q10 =                        // (y_1; y_0)
        vshl.u64 q11, q8, #63           // (y_5; y_4) b_i for t
        vshl.u64 d28, d18, #63          // y_3 b_i for t
        vshl.u64 q12, q8, #62           // (y_5; y_4) b_i for t^2
        vshl.u64 d29, d18, #62          // y_3 b_i for t^2
        vshl.u64 q13, q8, #57           // (y_5; y_4) b_i for t^7
        vshl.u64 d30, d18, #57          // y_3 b_i for t^7
        veor q11, q11, q12              // mix them all together
        veor d28, d28, d29
        veor q11, q11, q13
        veor d28, d28, d30
        veor q9, q9, q11
        veor d20, d20, d28

        // And finally shift the low bits up. Also, switch the order of the
        // pieces for output.
        // q8 =                         // (y'_5; y'_4)
        // q9 =                         // (y'_3; y'_2)
        // q10 =                        // (y'_1; y'_0)
        vshr.u64 q11, q8, #1            // (y_5; y_4) a_i for t
        vshr.u64 d28, d18, #1           // y'_3 a_i for t
        vshr.u64 q12, q8, #2            // (y_5; y_4) a_i for t^2
        vshr.u64 d29, d18, #2           // y'_3 a_i for t^2
        vshr.u64 q13, q8, #7            // (y_5; y_4) a_i for t^7
        vshr.u64 d30, d18, #7           // y'_3 a_i for t^7
        veor q8, q8, q11
        veor d18, d18, d28
        veor q12, q12, q13
        veor d29, d29, d30
        veor q8, q8, q12
        veor d18, d18, d29
        veor d0, d21, d18
        veor d1, d20, d17
        veor d2, d19, d16
.endm
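
// Carrying on from the previous sketches, a quick Python check of the
// schoolbook accumulation into the pieces a, b, c, d, and e named above
// (`clmul' and `import random' as before; `limbs' is illustrative).
//
//      def limbs(x, n):                # split into n 64-bit limbs, low first
//          return [(x >> 64*i) & (1 << 64) - 1 for i in range(n)]
//
//      u, v = random.getrandbits(192), random.getrandbits(192)
//      (u0, u1, u2), (v0, v1, v2) = limbs(u, 3), limbs(v, 3)
//      e = clmul(u0, v0)
//      d = clmul(u0, v1) ^ clmul(u1, v0)
//      c = clmul(u0, v2) ^ clmul(u1, v1) ^ clmul(u2, v0)
//      b = clmul(u1, v2) ^ clmul(u2, v1)
//      a = clmul(u2, v2)
//      assert clmul(u, v) == \
//          (a << 256) ^ (b << 192) ^ (c << 128) ^ (d << 64) ^ e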

.macro  mul256
        // Enter with u and v in q0/q1 and q2/q3 respectively; leave
        // with z = u v in q0/q1. Clobbers q8--q15.

        // Now it's starting to look worthwhile to do Karatsuba. Suppose
        // u = u_0 + u_1 B and v = v_0 + v_1 B. Then
        //
        //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
        //
        // Name the coefficients of B^i a, b, and c, respectively, and
        // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that
        //
        //      q = r s = (u_0 + u_1) (v_0 + v_1)
        //              = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
        //              = a + c + b
        //
        // The first two terms we've already calculated; the last is the
        // remaining one we want. We'll set B = t^128. We know how to do
        // 128-bit multiplications already, and Karatsuba is too annoying
        // there, so there'll be 12 multiplications altogether, rather than
        // the 16 we'd have if we did this the naïve way.
        // q0 =                         // u_0 = (u_00; u_01)
        // q1 =                         // u_1 = (u_10; u_11)
        // q2 =                         // v_0 = (v_00; v_01)
        // q3 =                         // v_1 = (v_10; v_11)

        veor q8, q0, q1                 // u_* = (u_00 + u_10; u_01 + u_11)
        veor q9, q2, q3                 // v_* = (v_00 + v_10; v_01 + v_11)

        // Start by building the cross product, q = u_* v_*.
        vmull.p64 q14, d16, d19         // u_*0 v_*1
        vmull.p64 q15, d17, d18         // u_*1 v_*0
        vmull.p64 q12, d17, d19         // u_*1 v_*1
        vmull.p64 q13, d16, d18         // u_*0 v_*0
        veor q14, q14, q15              // u_*0 v_*1 + u_*1 v_*0
        veor d25, d25, d28              // q12 = // q_1
        veor d26, d26, d29              // q13 = // q_0

        // Next, work on the low half, a = u_0 v_0.
        vmull.p64 q14, d0, d5           // u_00 v_01
        vmull.p64 q15, d1, d4           // u_01 v_00
        vmull.p64 q10, d1, d5           // u_01 v_01
        vmull.p64 q11, d0, d4           // u_00 v_00
        veor q14, q14, q15              // u_00 v_01 + u_01 v_00
        veor d21, d21, d28              // q10 = // a_1
        veor d22, d22, d29              // q11 = // a_0

        // Mix the pieces we have so far.
        veor q12, q12, q10
        veor q13, q13, q11

        // Finally, the high half, c = u_1 v_1.
        vmull.p64 q14, d2, d7           // u_10 v_11
        vmull.p64 q15, d3, d6           // u_11 v_10
        vmull.p64 q8, d3, d7            // u_11 v_11
        vmull.p64 q9, d2, d6            // u_10 v_10
        veor q14, q14, q15              // u_10 v_11 + u_11 v_10
        veor d17, d17, d28              // q8 = // c_1
        veor d18, d18, d29              // q9 = // c_0

        // Finish mixing the product together.
        veor q12, q12, q8               // q12 = // b_1
        veor q13, q13, q9               // q13 = // b_0
        veor q9, q9, q12
        veor q10, q10, q13

        // Shift the product right by one place (from GCM's point of view).
        vshr.u64 q0, q8, #63            // carry from d16/d17 to d17/d18
        vshr.u64 q1, q9, #63            // carry from d18/d19 to d19/d20
        vshr.u64 q2, q10, #63           // carry from d20/d21 to d21/d22
        vshr.u64 d6, d22, #63           // carry from d22 to d23
        vshl.u64 q8, q8, #1             // shift everything down
        vshl.u64 q9, q9, #1
        vshl.u64 q10, q10, #1
        vshl.u64 q11, q11, #1
        vorr d17, d17, d0
        vorr d18, d18, d1
        vorr d19, d19, d2
        vorr d20, d20, d3
        vorr d21, d21, d4
        vorr d22, d22, d5
        vorr d23, d23, d6

        // Now we must reduce. This is essentially the same as the 192-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

        // First, shift the high bits down.
        // q8 =                         // (y_7; y_6)
        // q9 =                         // (y_5; y_4)
        // q10 =                        // (y_3; y_2)
        // q11 =                        // (y_1; y_0)
        vshl.u64 q0, q8, #62            // (y_7; y_6) b_i for t^2
        vshl.u64 q12, q9, #62           // (y_5; y_4) b_i for t^2
        vshl.u64 q1, q8, #59            // (y_7; y_6) b_i for t^5
        vshl.u64 q13, q9, #59           // (y_5; y_4) b_i for t^5
        vshl.u64 q2, q8, #54            // (y_7; y_6) b_i for t^10
        vshl.u64 q14, q9, #54           // (y_5; y_4) b_i for t^10
        veor q0, q0, q1                 // mix the contributions together
        veor q12, q12, q13
        veor q0, q0, q2
        veor q12, q12, q14
        veor d19, d19, d0               // and combine into the lower pieces
        veor d20, d20, d1
        veor d21, d21, d24
        veor d22, d22, d25

        // And then shift the low bits up. Also, switch the order of the
        // pieces for output.
        // q8 =                         // (y'_7; y'_6)
        // q9 =                         // (y'_5; y'_4)
        // q10 =                        // (y'_3; y'_2)
        // q11 =                        // (y'_1; y'_0)
        vshr.u64 q0, q8, #2             // (y_7; y_6) a_i for t^2
        vshr.u64 q12, q9, #2            // (y_5; y'_4) a_i for t^2
        vshr.u64 q1, q8, #5             // (y_7; y_6) a_i for t^5
        vshr.u64 q13, q9, #5            // (y_5; y_4) a_i for t^5
        vshr.u64 q2, q8, #10            // (y_7; y_6) a_i for t^10
        vshr.u64 q14, q9, #10           // (y_5; y_4) a_i for t^10

        veor q8, q8, q0                 // mix the contributions together
        veor q1, q1, q2
        veor q9, q9, q12
        veor q13, q13, q14
        veor q8, q8, q1
        veor q9, q9, q13
        veor d3, d20, d16               // and output
        veor d2, d21, d17
        veor d1, d22, d18
        veor d0, d23, d19
.endm
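
// A quick Python check of the Karatsuba identity used above, i.e. that
// the cross term b falls out of q, a, and c (a sketch only; `clmul' and
// `import random' as in the earlier sketches).
//
//      u0, u1 = random.getrandbits(128), random.getrandbits(128)
//      v0, v1 = random.getrandbits(128), random.getrandbits(128)
//      a = clmul(u0, v0)               # low product
//      c = clmul(u1, v1)               # high product
//      q = clmul(u0 ^ u1, v0 ^ v1)     # product of the sums
//      b = q ^ a ^ c                   # = u_0 v_1 + u_1 v_0
//      u, v = u0 ^ (u1 << 128), v0 ^ (v1 << 128)
//      assert clmul(u, v) == (c << 256) ^ (b << 128) ^ a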

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
// * The `external format' consists of a sequence of contiguous bytes in
//   memory called a `block'. The GCM spec explains how to interpret this
//   block as an element of a finite field. As discussed extensively, this
//   representation is very annoying for a number of reasons. On the other
//   hand, this code never actually deals with it directly.
//
// * The `register format' consists of one or more NEON registers,
//   depending on the block size. The bytes in each 64-bit lane of these
//   registers are in reverse order, compared to the external format.
//
// * The `words' format consists of a sequence of bytes, as in the
//   `external format', but, according to the blockcipher in use, the bytes
//   within each 32-bit word may be reversed (`big-endian') or not
//   (`little-endian'). Accordingly, there are separate entry points for
//   each variant, identified with `b' or `l'.
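//
// As a small illustrative sketch (Python; the names are not part of this
// code), here is how those byte orderings relate, mirroring the vrev64.8
// and vrev64.32 fixups in the entry points below:
//
//      def rev_each(b, n):             # reverse bytes within each n-byte group
//          return b''.join(b[i:i + n][::-1] for i in range(0, len(b), n))
//
//      ext = bytes(range(16))          # a block in external format
//      reg = rev_each(ext, 8)          # register format
//      words_l = ext                   # little-endian words format
//      words_b = rev_each(ext, 4)      # big-endian words format
//      assert rev_each(words_l, 8) == reg              # what vrev64.8 does
//      assert rev_each(rev_each(words_b, 4), 8) == reg # what vrev64.32 does
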

FUNC(gcm_mulk_128b_arm_crypto)
        // On entry, r0 points to a 128-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {q0}, [r0]
        vld1.8 {q1}, [r1]
        vrev64.32 q0, q0
        mul128
        vrev64.32 q0, q0
        vst1.8 {q0}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_128l_arm_crypto)
        // On entry, r0 points to a 128-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {q0}, [r0]
        vld1.8 {q1}, [r1]
        vrev64.8 q0, q0
        mul128
        vrev64.8 q0, q0
        vst1.8 {q0}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_64b_arm_crypto)
        // On entry, r0 points to a 64-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {d0}, [r0]
        vld1.8 {d1}, [r1]
        vrev64.32 d0, d0
        mul64
        vrev64.32 d0, d0
        vst1.8 {d0}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_64l_arm_crypto)
        // On entry, r0 points to a 64-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {d0}, [r0]
        vld1.8 {d1}, [r1]
        vrev64.8 d0, d0
        mul64
        vrev64.8 d0, d0
        vst1.8 {d0}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_96b_arm_crypto)
        // On entry, r0 points to a 96-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        ldr r3, [r0, #8]
        mov r12, #0
        vld1.8 {d0}, [r0]
        vld1.8 {q1}, [r1]
        vrev64.32 d0, d0
        vmov d1, r12, r3
        vzero
        mul96
        vrev64.32 d0, d0
        vmov r3, d1[1]
        vst1.8 {d0}, [r0]
        str r3, [r0, #8]
        bx r14
ENDFUNC

FUNC(gcm_mulk_96l_arm_crypto)
        // On entry, r0 points to a 96-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        ldr r3, [r0, #8]
        mov r12, #0
        vld1.8 {d0}, [r0]
        vld1.8 {q1}, [r1]
        vmov d1, r3, r12
        vrev64.8 q0, q0
        vzero
        mul96
        vrev64.8 q0, q0
        vmov r3, d1[0]
        vst1.8 {d0}, [r0]
        str r3, [r0, #8]
        bx r14
ENDFUNC

FUNC(gcm_mulk_192b_arm_crypto)
        // On entry, r0 points to a 192-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {d0-d2}, [r0]
        vld1.8 {d3-d5}, [r1]
        vrev64.32 q0, q0
        vrev64.32 d2, d2
        mul192
        vrev64.32 q0, q0
        vrev64.32 d2, d2
        vst1.8 {d0-d2}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_192l_arm_crypto)
        // On entry, r0 points to a 192-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {d0-d2}, [r0]
        vld1.8 {d3-d5}, [r1]
        vrev64.8 q0, q0
        vrev64.8 d2, d2
        mul192
        vrev64.8 q0, q0
        vrev64.8 d2, d2
        vst1.8 {d0-d2}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_256b_arm_crypto)
        // On entry, r0 points to a 256-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {q0, q1}, [r0]
        vld1.8 {q2, q3}, [r1]
        vrev64.32 q0, q0
        vrev64.32 q1, q1
        mul256
        vrev64.32 q0, q0
        vrev64.32 q1, q1
        vst1.8 {q0, q1}, [r0]
        bx r14
ENDFUNC

FUNC(gcm_mulk_256l_arm_crypto)
        // On entry, r0 points to a 256-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8 {q0, q1}, [r0]
        vld1.8 {q2, q3}, [r1]
        vrev64.8 q0, q0
        vrev64.8 q1, q1
        mul256
        vrev64.8 q0, q0
        vrev64.8 q1, q1
        vst1.8 {q0, q1}, [r0]
        bx r14
ENDFUNC

///----- That's all, folks --------------------------------------------------