/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM processors
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   armv8-a
        .fpu    crypto-neon-fp-armv8

        .text

///--------------------------------------------------------------------------
/// Multiplication macros.
        // The good news is that we have a fancy instruction to do the
        // multiplications.  The bad news is that it's not particularly well-
        // suited to the job.
        //
        // For one thing, it only does a 64-bit multiplication, so in general
        // we'll need to synthesize the full-width multiply by hand.  For
        // another thing, it doesn't help with the reduction, so we have to
        // do that by hand too.  And, finally, GCM has crazy bit ordering,
        // and the instruction does nothing useful for that at all.
        //
        // Focusing on that last problem first: the bits aren't in monotonic
        // significance order unless we permute them.  If we reverse the byte
        // order, then we'll have the bits in monotonic order, but backwards,
        // so the degree-0 coefficient will be in the most-significant bit.
        //
        // This is less of a difficulty than it seems at first, because
        // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
        // SUM_{0<=j<n} v_j t^j; then
        //
        //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
        //
        // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
        // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
        // Then
        //
        //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
        //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
        //
        // which is almost the bit-reversal of u v, only it's shifted right
        // by one place.  Oh, well: we'll have to shift it back later.
        //
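        // A tiny worked example, with n = 4 for the sake of exposition (an
        // illustrative width, not one the code below implements): let
        // u = t^2 and v = t^3, so that u v = t^5.  Then ũ = t and ṽ = 1, so
        // ũ ṽ = t; but the full eight-bit reversal of u v is t^{7-5} = t^2.
        // So ũ ṽ is indeed the reversal of u v, shifted one place.
        //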
        // That was important to think about, but there's not a great deal to
        // do about it yet other than to convert what we've got from the
        // blockcipher's byte-ordering convention to our big-endian
        // convention.  Since this depends on the blockcipher convention,
        // we'll leave the caller to cope with this: the macros here will
        // assume that the operands are in `register' format, which is the
        // same as the external representation, except that the bytes within
        // each 64-bit piece are reversed.  In the commentary, pieces of
        // polynomial are numbered according to the degree of the
        // coefficients, so the unit coefficient of some polynomial a is in
        // a_0.
        //
        // The commentary for `mul128' is the most detailed.  The other
        // macros assume that you've already read and understood that.
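        //
        // As a cross-check on all of the above, here is a bit-serial
        // reference implementation of the 128-bit multiply, as a portable
        // C99 sketch of ours (not part of Catacomb; needs <stdint.h>).  The
        // operands are taken in external format, with `hi' holding the
        // block's first eight bytes as a big-endian integer, so the
        // degree-0 coefficient lives in the most significant bit of `hi'.
        //
        //      typedef struct { uint64_t hi, lo; } f128;
        //
        //      static f128 f128_mul(f128 x, f128 y)
        //      {
        //        f128 z = { 0, 0 }, v = y;
        //        uint64_t c; unsigned i;
        //
        //        for (i = 0; i < 128; i++) {
        //          /* add v t^i into z if the t^i coefficient of x is set */
        //          if ((i < 64 ? x.hi >> (63 - i) : x.lo >> (127 - i)) & 1)
        //            { z.hi ^= v.hi; z.lo ^= v.lo; }
        //          /* multiply v by t, reducing by t^128 = t^7 + t^2 + t + 1:
        //           * in this representation that's a right shift, folding the
        //           * degree-127 coefficient back in as the byte 11100001 */
        //          c = v.lo & 1;
        //          v.lo = (v.lo >> 1) | (v.hi << 63);
        //          v.hi >>= 1;
        //          if (c) v.hi ^= 0xe100000000000000ULL;
        //        }
        //        return z;
        //      }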
86 | ||
87 | .macro mul128 | |
88 | // Enter with u and v in q0 and q1 respectively; leave with z = u v | |
89 | // in q0. Clobbers q1--q3, q8, q9. | |
90 | ||
91 | // First for the double-precision multiplication. It's tempting to | |
92 | // use Karatsuba's identity here, but I suspect that loses more in | |
93 | // the shifting, bit-twiddling, and dependency chains that it gains | |
94 | // in saving a multiplication which otherwise pipelines well. | |
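        // (Karatsuba would get away with three `vmull.p64's here instead of
        // the four below, in exchange for that extra fixing-up work.)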
        // q0 =                         // (u_0; u_1)
        // q1 =                         // (v_0; v_1)
        vmull.p64 q2, d1, d2            // u_1 v_0
        vmull.p64 q3, d0, d3            // u_0 v_1
        vmull.p64 q8, d1, d3            // (x_3; t_1) = u_1 v_1
        vmull.p64 q9, d0, d2            // (t_0; x_0) = u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        veor    q2, q2, q3              // (m_1; m_0) = u_0 v_1 + u_1 v_0
        veor    d17, d17, d4            // x_2 = t_1 + m_1
        veor    d18, d18, d5            // x_1 = t_0 + m_0
        // q8 =                         // (x_3; x_2)
        // q9 =                         // (x_1; x_0)

        // Two-and-a-half problems remain.  The first is that this product is
        // shifted left by one place (from GCM's point of view -- physically,
        // right), which is annoying.  Let's take care of that now.  Once
        // this is done, we'll be properly in GCM's backwards bit-ordering.
        //
        // The half a problem is that the result wants to have its 64-bit
        // halves switched.  Here turns out to be the best place to arrange
        // for that.
        //
        //                q9                             q8
        //      ,-------------.-------------.  ,-------------.-------------.
        //      | 0  x_0-x_62 | x_63-x_126  |  | x_127-x_190 | x_191-x_254 |
        //      `-------------^-------------'  `-------------^-------------'
        //            d19           d18              d17           d16
        //
        // We start by shifting each 64-bit lane right (from GCM's point of
        // view -- physically, left) by one place, which gives us this:
        //
        //                low (q9)                      high (q8)
        //      ,-------------.-------------.  ,-------------.-------------.
        //      | x_0-x_62  0 | x_64-x_126 0|  |x_128-x_190 0|x_192-x_254 0|
        //      `-------------^-------------'  `-------------^-------------'
        //            d19           d18              d17           d16
        //
        // but we've lost a bunch of bits.  We separately shift each lane
        // left by 63 places to give us the bits we lost.
        //
        //                low (q3)                      high (q2)
        //      ,-------------.-------------.  ,-------------.-------------.
        //      |    0...0    | 0...0  x_63 |  | 0...0 x_127 | 0...0 x_191 |
        //      `-------------^-------------'  `-------------^-------------'
        //                          d6               d5            d4
        //
        // Since we can address each of these pieces individually, putting
        // them together is relatively straightforward.

        vshr.u64 d6, d18, #63           // shifted left; just the carries
        vshl.u64 q9, q9, #1             // shifted right, but dropped carries
        vshr.u64 q2, q8, #63
        vshl.u64 q8, q8, #1
        vorr    d0, d19, d6             // y_0
        vorr    d1, d18, d5             // y_1
        vorr    d2, d17, d4             // y_2
        vmov    d3, d16                 // y_3

        // And the other one is that the result needs to be reduced modulo
        // p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 + t^2 + t +
        // 1 in our field.  So far, we've calculated z_0 and z_1 such that
        // z_0 + z_1 R = u v, using the identity R = t^128: now we must
        // collapse the two halves of z together using the other identity R =
        // t^7 + t^2 + t + 1.
        //
        // We do this by working on z_1's two pieces y_2 and y_3 separately,
        // so consider y_i for i = 2 or 3.  Certainly, y_i t^{64i} =
        // y_i R t^{64(i-2)} = (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we
        // can't use that directly without breaking up the 64-bit word
        // structure.  Instead, we start by considering just
        // y_i t^7 t^{64(i-2)}, which again looks tricky.  Now, split
        // y_i = a_i + t^57 b_i, with deg a_i < 57; then
        //
        //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
        // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
        // splits are different.  This is lovely, with one small snag: when
        // we do this to y_3, we end up with a contribution back into the
        // t^128 coefficient word.  But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^64 word.  Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.
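        //
        // Concretely, for a tap t^e of the polynomial (e = 1, 2, 7), b_i is
        // the top e coefficients of y_i, which a physical left shift by
        // 64 - e lines up as a contribution to the next word up, while
        // a_i t^e is a physical right shift by e: hence the shift immediates
        // #63/#62/#57 and #1/#2/#7 below.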
180 | ||
181 | // First, shift the high bits down. | |
182 | vshl.u64 q2, q1, #63 // the b_i for t | |
183 | vshl.u64 q3, q1, #62 // the b_i for t^2 | |
184 | vshl.u64 q8, q1, #57 // the b_i for t^7 | |
185 | veor q2, q2, q3 // add them all together | |
186 | veor q2, q2, q8 | |
187 | veor d2, d2, d5 // contribution into low half | |
188 | veor d1, d1, d4 // and high half | |
189 | ||
190 | // And then shift the low bits up. | |
191 | vshr.u64 q2, q1, #1 | |
192 | vshr.u64 q3, q1, #2 | |
193 | vshr.u64 q8, q1, #7 | |
194 | veor q0, q0, q1 // mix in the unit contribution | |
195 | veor q2, q2, q3 // t and t^2 contribs | |
196 | veor q0, q0, q8 // low, unit, and t^7 contribs | |
197 | veor q0, q0, q2 // mix them together and we're done | |
198 | .endm | |
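
        // For reference, the reduction step in isolation, as a C99 sketch
        // of our own (the function name and array layout are invented for
        // the purpose).  Each word is in GCM's backwards order, so y[i]
        // carries degrees 64 i, ..., 64 i + 63, with the degree-64 i
        // coefficient in its most significant bit; on exit the result is in
        // y[0] and y[1].
        //
        //      #include <stdint.h>
        //
        //      static void gcm128_reduce(uint64_t y[4])
        //      {
        //        /* the b_i pieces, spilling into the next word up; updating
        //         * y[2] first is safe precisely because the fold-back from
        //         * y[3] touches only bits that the shifts by 57 or more
        //         * never see */
        //        y[2] ^= (y[3] << 63) ^ (y[3] << 62) ^ (y[3] << 57);
        //        y[1] ^= (y[2] << 63) ^ (y[2] << 62) ^ (y[2] << 57);
        //
        //        /* the unit and a_i t^e parts, two words down */
        //        y[1] ^= y[3] ^ (y[3] >> 1) ^ (y[3] >> 2) ^ (y[3] >> 7);
        //        y[0] ^= y[2] ^ (y[2] >> 1) ^ (y[2] >> 2) ^ (y[2] >> 7);
        //      }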
199 | ||
200 | .macro mul64 | |
201 | // Enter with u and v in the low halves of d0 and d1 respectively; | |
202 | // leave with z = u v in d0. Clobbers d1--d5. | |
203 | ||
204 | // The multiplication is thankfully easy. | |
205 | vmull.p64 q0, d0, d1 // u v | |
206 | ||
207 | // Shift the product up by one place, and swap the two halves. After | |
208 | // this, we're in GCM bizarro-world. | |
209 | vshr.u64 d2, d0, #63 // shifted left; just the carries | |
210 | vshl.u64 d3, d1, #1 // low half right | |
211 | vshl.u64 d1, d0, #1 // high half shifted right | |
212 | vorr d0, d3, d2 // mix in the carries | |
213 | ||
214 | // Now we must reduce. This is essentially the same as the 128-bit | |
215 | // case above, but mostly simpler because everything is smaller. The | |
216 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. | |
217 | ||
218 | // First, shift the high bits down. | |
219 | vshl.u64 d2, d1, #63 // b_i for t | |
220 | vshl.u64 d3, d1, #61 // b_i for t^3 | |
221 | vshl.u64 d4, d1, #60 // b_i for t^4 | |
222 | veor d2, d2, d3 // add them all together | |
223 | veor d2, d2, d4 | |
224 | veor d1, d1, d2 // contribution back into high half | |
225 | ||
226 | // And then shift the low bits up. | |
227 | vshr.u64 d2, d1, #1 | |
228 | vshr.u64 d3, d1, #3 | |
229 | vshr.u64 d4, d1, #4 | |
230 | veor d0, d0, d1 // mix in the unit contribution | |
231 | veor d2, d2, d3 // t and t^3 contribs | |
232 | veor d0, d0, d4 // low, unit, and t^4 | |
233 | veor d0, d0, d2 // mix them together and we're done | |
234 | .endm | |
235 | ||
236 | .macro mul96 | |
237 | // Enter with u and v in the most-significant three words of q0 and | |
238 | // q1 respectively, and zero in the low words, and zero in q15; leave | |
239 | // with z = u v in the high three words of q0, and /junk/ in the low | |
240 | // word. Clobbers ???. | |
241 | ||
242 | // This is an inconvenient size. There's nothing for it but to do | |
243 | // four multiplications, as if for the 128-bit case. It's possible | |
244 | // that there's cruft in the top 32 bits of the input registers, so | |
245 | // shift both of them up by four bytes before we start. This will | |
246 | // mean that the high 64 bits of the result (from GCM's viewpoint) | |
247 | // will be zero. | |
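        //
        // Explicitly, with u = u' + u_2 t^64 and v = v' + v_2 t^64, where
        // u' = u_0 + u_1 t^32 and v' = v_0 + v_1 t^32, the four products
        // here combine as
        //
        //      u v = u_2 v_2 t^128 + (u_2 v' + v_2 u') t^64 + u' v'
        //          = d t^128 + (e_0 + e_1) t^64 + f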
        // q0 =                         // (u_0 + u_1 t^32; u_2)
        // q1 =                         // (v_0 + v_1 t^32; v_2)
        vmull.p64 q8, d1, d2            // u_2 (v_0 + v_1 t^32) = e_0
        vmull.p64 q9, d0, d3            // v_2 (u_0 + u_1 t^32) = e_1
        vmull.p64 q3, d1, d3            // u_2 v_2 t^64 = d = (0; d)
        vmull.p64 q0, d0, d2            // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result.  The answer
        // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
        // bits of the answer will end up in q0, and the high 96 bits will
        // end up in q1; we'll need both of these to have zero in their
        // bottom 32 bits.
        //
        // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
        // in the low 96 bits of a SIMD register, with junk in the top 32
        // bits; and top(x) is the high 96 bits, also arranged in the low 96
        // bits of a register, with /zero/ in the top 32 bits.
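        //
        // (The whole-register shift macros `vshl128' and `vshr128' used
        // below come from `asm-common.h', included above, and rely on q15
        // being zero -- which is why the entry conditions demand it; the
        // callers arrange this with the `vzero' macro.)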
        veor    q8, q8, q9              // e_0 + e_1 = e
        vshr128 q1, q3, 32              // top(d t^128)
        vext.8  d19, d16, d17, #4       // top(e t^64)
        vshl.u64 d16, d0, #32           // top(f), sort of
        veor    d3, d3, d19             // q1 = top(d t^128 + e t^64)
        veor    d0, d0, d17             // q0 = bot([d t^128] + e t^64 + f)
        veor    d3, d3, d16             // q1 = top(d t^128 + e t^64 + f)

        // Shift the product right by one place (from GCM's point of view),
        // but, unusually, don't swap the halves, because we need to work on
        // the 32-bit pieces later.  After this, we're in GCM bizarro-world.
        // q0 =                         // (?, x_2; x_1, x_0)
        // q1 =                         // (0, x_5; x_4, x_3)
        vshr.u64 d4, d0, #63            // carry from d0 to d1
        vshr.u64 d5, d2, #63            // carry from d2 to d3
        vshr.u32 d6, d3, #31            // carry from d3 to d0
        vshl.u64 q0, q0, #1             // shift low half
        vshl.u64 q1, q1, #1             // shift high half
        vorr    d1, d1, d4
        vorr    d0, d0, d6
        vorr    d3, d3, d5

        // Finally, the reduction.  This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
        // trouble for the general approach.
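        //
        // Since 96 isn't a multiple of 64, we work in 32-bit lanes here:
        // for a tap t^e the shift immediates are 32 - e and e, i.e.,
        // #26/#23/#22 on the way down and #6/#9/#10 back up.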
292 | ||
293 | // First, shift the high bits down. | |
294 | vshl.u32 q2, q1, #26 // b_i for t^6 | |
295 | vshl.u32 q3, q1, #23 // b_i for t^9 | |
296 | vshl.u32 q8, q1, #22 // b_i for t^10 | |
297 | veor q2, q2, q3 // add them all together | |
298 | veor q2, q2, q8 | |
299 | vshl128 q3, q2, 64 // contribution into high half | |
300 | vshr128 q2, q2, 32 // and low half | |
301 | veor q1, q1, q3 // mix them in | |
302 | veor q0, q0, q2 | |
303 | ||
304 | // And then shift the low bits up. | |
305 | vshr.u32 q2, q1, #6 | |
306 | vshr.u32 q3, q1, #9 | |
307 | veor q0, q0, q1 // mix in the unit contribution | |
308 | vshr.u32 q8, q1, #10 | |
309 | veor q2, q2, q3 // mix together t^6 and t^9 | |
310 | veor q0, q0, q8 // mix in t^10 | |
311 | veor q0, q0, q2 // and the rest | |
312 | ||
313 | // And finally swap the two halves. | |
314 | vswp d0, d1 | |
315 | .endm | |
316 | ||
317 | .macro mul192 | |
318 | // Enter with u and v in d0--d2 and d3--d5 respectively; leave | |
319 | // with z = u v in d0--d2. Clobbers q8--q15. | |
320 | ||
321 | // Start multiplying and accumulating pieces of product. | |
322 | // (d0; d1; d2) = // (u_0; u_1; u_2) | |
323 | // (d3; d4; d5) = // (v_0; v_1; v_2) | |
324 | vmull.p64 q10, d0, d3 // e = u_0 v_0 | |
325 | ||
326 | vmull.p64 q12, d0, d4 // u_0 v_1 | |
327 | vmull.p64 q13, d1, d3 // u_1 v_0 | |
328 | ||
329 | vmull.p64 q9, d0, d5 // u_0 v_2 | |
330 | vmull.p64 q14, d1, d4 // u_1 v_1 | |
331 | vmull.p64 q15, d2, d3 // u_2 v_0 | |
332 | veor q12, q12, q13 // d = u_0 v_1 + u_1 v_0 | |
333 | ||
334 | vmull.p64 q11, d1, d5 // u_1 v_2 | |
335 | vmull.p64 q13, d2, d4 // u_2 v_1 | |
336 | veor q9, q9, q14 // u_0 v_2 + u_1 v_1 | |
337 | ||
338 | vmull.p64 q8, d2, d5 // a = u_2 v_2 | |
339 | veor q9, q9, q15 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 | |
340 | veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1 | |
341 | ||
342 | // Piece the product together. | |
343 | veor d17, d17, d22 // q8 = // (x_5; x_4) | |
344 | veor d18, d18, d23 | |
345 | veor d19, d19, d24 // q9 = // (x_3; x_2) | |
346 | veor d20, d20, d25 // q10 = // (x_1; x_0) | |
347 | ||
348 | // Shift the product right by one place (from GCM's point of view). | |
349 | vshr.u64 q11, q8, #63 // carry from d16/d17 to d17/d18 | |
350 | vshr.u64 q12, q9, #63 // carry from d18/d19 to d19/d20 | |
351 | vshr.u64 d26, d20, #63 // carry from d20 to d21 | |
352 | vshl.u64 q8, q8, #1 // shift everything down | |
353 | vshl.u64 q9, q9, #1 | |
354 | vshl.u64 q10, q10, #1 | |
355 | vorr d17, d17, d22 // and mix in the carries | |
356 | vorr d18, d18, d23 | |
357 | vorr d19, d19, d24 | |
358 | vorr d20, d20, d25 | |
359 | vorr d21, d21, d26 | |
360 | ||

        // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.
364 | ||
365 | // First, shift the high bits down. | |
366 | // q8 = // (y_5; y_4) | |
367 | // q9 = // (y_3; y_2) | |
368 | // q10 = // (y_1; y_0) | |
369 | vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t | |
370 | vshl.u64 d28, d18, #63 // y_3 b_i for t | |
371 | vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2 | |
372 | vshl.u64 d29, d18, #62 // y_3 b_i for t^2 | |
373 | vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7 | |
374 | vshl.u64 d30, d18, #57 // y_3 b_i for t^7 | |
375 | veor q11, q11, q12 // mix them all together | |
376 | veor d28, d28, d29 | |
377 | veor q11, q11, q13 | |
378 | veor d28, d28, d30 | |
379 | veor q9, q9, q11 | |
380 | veor d20, d20, d28 | |
381 | ||

        // And finally shift the low bits up.  Also, switch the order of the
        // pieces for output.
        // q8 =                         // (y'_5; y'_4)
        // q9 =                         // (y'_3; y'_2)
        // q10 =                        // (y'_1; y'_0)
        vshr.u64 q11, q8, #1            // (y'_5; y'_4) a_i for t
        vshr.u64 d28, d18, #1           // y'_3 a_i for t
        vshr.u64 q12, q8, #2            // (y'_5; y'_4) a_i for t^2
        vshr.u64 d29, d18, #2           // y'_3 a_i for t^2
        vshr.u64 q13, q8, #7            // (y'_5; y'_4) a_i for t^7
        vshr.u64 d30, d18, #7           // y'_3 a_i for t^7
        veor    q8, q8, q11
        veor    d18, d18, d28
        veor    q12, q12, q13
        veor    d29, d29, d30
        veor    q8, q8, q12
        veor    d18, d18, d29
        veor    d0, d21, d18
        veor    d1, d20, d17
        veor    d2, d19, d16
.endm
403 | ||
404 | .macro mul256 | |
405 | // Enter with u and v in q0/q1 and q2/q3 respectively; leave | |
406 | // with z = u v in q0/q1. Clobbers q8--q15. | |
407 | ||
408 | // Now it's starting to look worthwhile to do Karatsuba. Suppose | |
409 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then | |
410 | // | |
411 | // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 | |
412 | // | |
413 | // Name these coefficients of B^i be a, b, and c, respectively, and | |
414 | // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that | |
415 | // | |
416 | // q = r s = (u_0 + u_1) (v_0 + v_1) | |
417 | // = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0) | |
418 | // = a + d + c | |
419 | // | |
420 | // The first two terms we've already calculated; the last is the | |
421 | // remaining one we want. We'll set B = t^128. We know how to do | |
422 | // 128-bit multiplications already, and Karatsuba is too annoying | |
423 | // there, so there'll be 12 multiplications altogether, rather than | |
424 | // the 16 we'd have if we did this the naïve way. | |
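        //
        // Since we're in characteristic 2, addition and subtraction are the
        // same thing, so we can recover the remaining coefficient as
        //
        //      b = q + a + c
        //
        // which is what the mixing below works out.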
        // q0 =                         // u_0 = (u_00; u_01)
        // q1 =                         // u_1 = (u_10; u_11)
        // q2 =                         // v_0 = (v_00; v_01)
        // q3 =                         // v_1 = (v_10; v_11)

        veor    q8, q0, q1              // u_* = (u_00 + u_10; u_01 + u_11)
        veor    q9, q2, q3              // v_* = (v_00 + v_10; v_01 + v_11)

        // Start by building the cross product, q = u_* v_*.
        vmull.p64 q14, d16, d19         // u_*0 v_*1
        vmull.p64 q15, d17, d18         // u_*1 v_*0
        vmull.p64 q12, d17, d19         // u_*1 v_*1
        vmull.p64 q13, d16, d18         // u_*0 v_*0
        veor    q14, q14, q15           // u_*0 v_*1 + u_*1 v_*0
        veor    d25, d25, d28           // q12 =        // q_1
        veor    d26, d26, d29           // q13 =        // q_0

        // Next, work on the low half, a = u_0 v_0.
        vmull.p64 q14, d0, d5           // u_00 v_01
        vmull.p64 q15, d1, d4           // u_01 v_00
        vmull.p64 q10, d1, d5           // u_01 v_01
        vmull.p64 q11, d0, d4           // u_00 v_00
        veor    q14, q14, q15           // u_00 v_01 + u_01 v_00
        veor    d21, d21, d28           // q10 =        // a_1
        veor    d22, d22, d29           // q11 =        // a_0

        // Mix the pieces we have so far.
        veor    q12, q12, q10
        veor    q13, q13, q11

        // Finally, the high half, c = u_1 v_1.
        vmull.p64 q14, d2, d7           // u_10 v_11
        vmull.p64 q15, d3, d6           // u_11 v_10
        vmull.p64 q8, d3, d7            // u_11 v_11
        vmull.p64 q9, d2, d6            // u_10 v_10
        veor    q14, q14, q15           // u_10 v_11 + u_11 v_10
        veor    d17, d17, d28           // q8 =         // c_1
        veor    d18, d18, d29           // q9 =         // c_0

        // Finish mixing the product together.
        veor    q12, q12, q8            // q12 =        // b_1
        veor    q13, q13, q9            // q13 =        // b_0
        veor    q9, q9, q12
        veor    q10, q10, q13

        // Shift the product right by one place (from GCM's point of view).
        vshr.u64 q0, q8, #63            // carry from d16/d17 to d17/d18
        vshr.u64 q1, q9, #63            // carry from d18/d19 to d19/d20
        vshr.u64 q2, q10, #63           // carry from d20/d21 to d21/d22
        vshr.u64 d6, d22, #63           // carry from d22 to d23
        vshl.u64 q8, q8, #1             // shift everything down
        vshl.u64 q9, q9, #1
        vshl.u64 q10, q10, #1
        vshl.u64 q11, q11, #1
        vorr    d17, d17, d0
        vorr    d18, d18, d1
        vorr    d19, d19, d2
        vorr    d20, d20, d3
        vorr    d21, d21, d4
        vorr    d22, d22, d5
        vorr    d23, d23, d6
487 | // Now we must reduce. This is essentially the same as the 192-bit | |
488 | // case above, but more complicated because everything is bigger. | |
489 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. | |
490 | ||
491 | // First, shift the high bits down. | |
492 | // q8 = // (y_7; y_6) | |
493 | // q9 = // (y_5; y_4) | |
494 | // q10 = // (y_3; y_2) | |
495 | // q11 = // (y_1; y_0) | |
496 | vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2 | |
497 | vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2 | |
498 | vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5 | |
499 | vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5 | |
500 | vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10 | |
501 | vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10 | |
502 | veor q0, q0, q1 // mix the contributions together | |
503 | veor q12, q12, q13 | |
504 | veor q0, q0, q2 | |
505 | veor q12, q12, q14 | |
506 | veor d19, d19, d0 // and combine into the lower pieces | |
507 | veor d20, d20, d1 | |
508 | veor d21, d21, d24 | |
509 | veor d22, d22, d25 | |
510 | ||
511 | // And then shift the low bits up. Also, switch the order of the | |
512 | // pieces for output. | |
513 | // q8 = // (y'_7; y'_6) | |
514 | // q9 = // (y'_5; y'_4) | |
515 | // q10 = // (y'_3; y'_2) | |
516 | // q11 = // (y'_1; y'_0) | |
517 | vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2 | |
518 | vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2 | |
519 | vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5 | |
520 | vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5 | |
521 | vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10 | |
522 | vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10 | |
523 | ||
524 | veor q8, q8, q0 // mix the contributions together | |
525 | veor q1, q1, q2 | |
526 | veor q9, q9, q12 | |
527 | veor q13, q13, q14 | |
528 | veor q8, q8, q1 | |
529 | veor q9, q9, q13 | |
530 | veor d3, d20, d16 // and output | |
531 | veor d2, d21, d17 | |
532 | veor d1, d22, d18 | |
533 | veor d0, d23, d19 | |
534 | .endm | |
535 | ||
536 | ///-------------------------------------------------------------------------- | |
537 | /// Main code. | |
538 | ||
539 | // There are a number of representations of field elements in this code and | |
540 | // it can be confusing. | |
541 | // | |
542 | // * The `external format' consists of a sequence of contiguous bytes in | |
543 | // memory called a `block'. The GCM spec explains how to interpret this | |
544 | // block as an element of a finite field. As discussed extensively, this | |
545 | // representation is very annoying for a number of reasons. On the other | |
546 | // hand, this code never actually deals with it directly. | |
547 | // | |
548 | // * The `register format' consists of one or more NEON registers, | |
549 | // depending on the block size. The bytes in each 64-bit lane of these | |
550 | // registers are in reverse order, compared to the external format. | |
551 | // | |
552 | // * The `words' format consists of a sequence of bytes, as in the | |
553 | // `external format', but, according to the blockcipher in use, the bytes | |
554 | // within each 32-bit word may be reversed (`big-endian') or not | |
555 | // (`little-endian'). Accordingly, there are separate entry points for | |
556 | // each variant, identified with `b' or `l'. | |
557 | ||
FUNC(gcm_mulk_128b_arm_crypto)
        // On entry, r0 points to a 128-bit field element A in big-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.

        vld1.8  {q0}, [r0]
        vld1.8  {q1}, [r1]
        vrev64.32 q0, q0
        mul128
        vrev64.32 q0, q0
        vst1.8  {q0}, [r0]
        bx      r14
ENDFUNC
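
        // By way of illustration, a C caller for the function above looks
        // something like this (a sketch only: the precise prototype and
        // integer types are declared elsewhere in Catacomb, and setting up
        // the key in register format is out of scope here):
        //
        //      extern void gcm_mulk_128b_arm_crypto(uint32_t *a,
        //                                           const uint32_t *k);
        //
        //      /* multiply the accumulator A, held as four big-endian
        //       * 32-bit words, by the precomputed key K in place */
        //      gcm_mulk_128b_arm_crypto(a, k);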
571 | ||
572 | FUNC(gcm_mulk_128l_arm_crypto) | |
573 | // On entry, r0 points to a 128-bit field element A in little-endian | |
574 | // words format; r1 points to a field-element K in register format. | |
575 | // On exit, A is updated with the product A K. | |
576 | ||
577 | vld1.8 {q0}, [r0] | |
578 | vld1.8 {q1}, [r1] | |
579 | vrev64.8 q0, q0 | |
580 | mul128 | |
581 | vrev64.8 q0, q0 | |
582 | vst1.8 {q0}, [r0] | |
583 | bx r14 | |
584 | ENDFUNC | |
585 | ||
586 | FUNC(gcm_mulk_64b_arm_crypto) | |
587 | // On entry, r0 points to a 64-bit field element A in big-endian | |
588 | // words format; r1 points to a field-element K in register format. | |
589 | // On exit, A is updated with the product A K. | |
590 | ||
591 | vld1.8 {d0}, [r0] | |
592 | vld1.8 {d1}, [r1] | |
593 | vrev64.32 d0, d0 | |
594 | mul64 | |
595 | vrev64.32 d0, d0 | |
596 | vst1.8 {d0}, [r0] | |
597 | bx r14 | |
598 | ENDFUNC | |
599 | ||
600 | FUNC(gcm_mulk_64l_arm_crypto) | |
601 | // On entry, r0 points to a 64-bit field element A in little-endian | |
602 | // words format; r1 points to a field-element K in register format. | |
603 | // On exit, A is updated with the product A K. | |
604 | ||
605 | vld1.8 {d0}, [r0] | |
606 | vld1.8 {d1}, [r1] | |
607 | vrev64.8 d0, d0 | |
608 | vzero | |
609 | mul64 | |
610 | vrev64.8 d0, d0 | |
611 | vst1.8 {d0}, [r0] | |
612 | bx r14 | |
613 | ENDFUNC | |
614 | ||
615 | FUNC(gcm_mulk_96b_arm_crypto) | |
616 | // On entry, r0 points to a 96-bit field element A in big-endian | |
617 | // words format; r1 points to a field-element K in register format. | |
618 | // On exit, A is updated with the product A K. | |
619 | ||
620 | ldr r3, [r0, #8] | |
621 | mov r12, #0 | |
622 | vld1.8 {d0}, [r0] | |
623 | vld1.8 {q1}, [r1] | |
624 | vrev64.32 d0, d0 | |
625 | vmov d1, r12, r3 | |
626 | vzero | |
627 | mul96 | |
628 | vrev64.32 d0, d0 | |
629 | vmov r3, d1[1] | |
630 | vst1.8 {d0}, [r0] | |
631 | str r3, [r0, #8] | |
632 | bx r14 | |
633 | ENDFUNC | |
634 | ||
FUNC(gcm_mulk_96l_arm_crypto)
        // On entry, r0 points to a 96-bit field element A in little-endian
        // words format; r1 points to a field-element K in register format.
        // On exit, A is updated with the product A K.
639 | ||
640 | ldr r3, [r0, #8] | |
641 | mov r12, #0 | |
642 | vld1.8 {d0}, [r0] | |
643 | vld1.8 {q1}, [r1] | |
644 | vmov d1, r3, r12 | |
645 | vrev64.8 q0, q0 | |
646 | mul96 | |
647 | vrev64.8 q0, q0 | |
648 | vmov r3, d1[0] | |
649 | vst1.8 {d0}, [r0] | |
650 | str r3, [r0, #8] | |
651 | bx r14 | |
652 | ENDFUNC | |
653 | ||
654 | FUNC(gcm_mulk_192b_arm_crypto) | |
655 | // On entry, r0 points to a 192-bit field element A in big-endian | |
656 | // words format; r1 points to a field-element K in register format. | |
657 | // On exit, A is updated with the product A K. | |
658 | ||
659 | vld1.8 {d0-d2}, [r0] | |
660 | vld1.8 {d3-d5}, [r1] | |
661 | vrev64.32 q0, q0 | |
662 | vrev64.32 d2, d2 | |
663 | mul192 | |
664 | vrev64.32 q0, q0 | |
665 | vrev64.32 d2, d2 | |
666 | vst1.8 {d0-d2}, [r0] | |
667 | bx r14 | |
668 | ENDFUNC | |
669 | ||
670 | FUNC(gcm_mulk_192l_arm_crypto) | |
671 | // On entry, r0 points to a 192-bit field element A in little-endian | |
672 | // words format; r1 points to a field-element K in register format. | |
673 | // On exit, A is updated with the product A K. | |
674 | ||
675 | vld1.8 {d0-d2}, [r0] | |
676 | vld1.8 {d3-d5}, [r1] | |
677 | vrev64.8 q0, q0 | |
678 | vrev64.8 d2, d2 | |
679 | mul192 | |
680 | vrev64.8 q0, q0 | |
681 | vrev64.8 d2, d2 | |
682 | vst1.8 {d0-d2}, [r0] | |
683 | bx r14 | |
684 | ENDFUNC | |
685 | ||
686 | FUNC(gcm_mulk_256b_arm_crypto) | |
687 | // On entry, r0 points to a 256-bit field element A in big-endian | |
688 | // words format; r1 points to a field-element K in register format. | |
689 | // On exit, A is updated with the product A K. | |
690 | ||
691 | vld1.8 {q0, q1}, [r0] | |
692 | vld1.8 {q2, q3}, [r1] | |
693 | vrev64.32 q0, q0 | |
694 | vrev64.32 q1, q1 | |
695 | mul256 | |
696 | vrev64.32 q0, q0 | |
697 | vrev64.32 q1, q1 | |
698 | vst1.8 {q0, q1}, [r0] | |
699 | bx r14 | |
700 | ENDFUNC | |
701 | ||
702 | FUNC(gcm_mulk_256l_arm_crypto) | |
703 | // On entry, r0 points to a 256-bit field element A in little-endian | |
704 | // words format; r1 points to a field-element K in register format. | |
705 | // On exit, A is updated with the product A K. | |
706 | ||
707 | vld1.8 {q0, q1}, [r0] | |
708 | vld1.8 {q2, q3}, [r1] | |
709 | vrev64.8 q0, q0 | |
710 | vrev64.8 q1, q1 | |
711 | mul256 | |
712 | vrev64.8 q0, q0 | |
713 | vrev64.8 q1, q1 | |
714 | vst1.8 {q0, q1}, [r0] | |
715 | bx r14 | |
716 | ENDFUNC | |
717 | ||
718 | ///----- That's all, folks -------------------------------------------------- |