/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM64 processors
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a+crypto

	.text

///--------------------------------------------------------------------------
/// Multiplication macros.

	// The good news is that we have a fancy instruction to do the
	// multiplications.  The bad news is that it's not particularly well-
	// suited to the job.
	//
	// For one thing, it only does a 64-bit multiplication, so in general
	// we'll need to synthesize the full-width multiply by hand.  For
	// another thing, it doesn't help with the reduction, so we have to
	// do that by hand too.  And, finally, GCM has crazy bit ordering,
	// and the instruction does nothing useful for that at all.
	//
	// Focusing on that last problem first: the bits aren't in monotonic
	// significance order unless we permute them.  Fortunately, ARM64 has
	// an instruction which will just permute the bits in each byte for
	// us, so we don't have to worry about this very much.
	//
	// Our main weapons, the `pmull' and `pmull2' instructions, work on
	// 64-bit operands, in half of a vector register, and produce 128-bit
	// results.  But neither of them will multiply the high half of one
	// vector by the low half of a second one, so we have a problem,
	// which we solve by representing one of the operands redundantly:
	// rather than packing the 64-bit pieces together, we duplicate each
	// 64-bit piece across both halves of a register.
	//
	// The commentary for `mul128' is the most detailed.  The other
	// macros assume that you've already read and understood that.

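	// As an illustrative aside (not part of the original commentary):
	// with that redundant representation, `pmull' always multiplies the
	// low halves of its operands and `pmull2' the high halves, so
	// loading v1 = (v_0; v_0) and v2 = (v_1; v_1) makes all four
	// subproducts reachable:
	//
	//	pmull	u, v1  -> u_0 v_0	pmull2	u, v1  -> u_1 v_0
	//	pmull	u, v2  -> u_0 v_1	pmull2	u, v2  -> u_1 v_1
	//
	// which is exactly the pattern `mul128' below uses.
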
.macro mul128
	// Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
	// leave with z = u v in v0.  Clobbers v1--v6.

	// First for the double-precision multiplication.  It's tempting to
	// use Karatsuba's identity here, but I suspect that loses more in
	// the shifting, bit-twiddling, and dependency chains that it gains
	// in saving a multiplication which otherwise pipelines well.
	// v0 =					// (u_1; u_0)
	// v1/v2 =				// (v_1; v_0)
	pmull2	v3.1q, v0.2d, v1.2d		// u_1 v_0
	pmull	v4.1q, v0.1d, v2.1d		// u_0 v_1
	pmull2	v5.1q, v0.2d, v2.2d		// (x_3; t_1) = u_1 v_1
	pmull	v6.1q, v0.1d, v1.1d		// (t_0; x_0) = u_0 v_0

	// Arrange the pieces to form a double-precision polynomial.
	eor	v3.16b, v3.16b, v4.16b		// (m_1; m_0) = u_0 v_1 + u_1 v_0
	vshr128	v4, v3, 64			// (0; m_1)
	vshl128	v3, v3, 64			// (m_0; 0)
	eor	v1.16b, v5.16b, v4.16b		// (x_3; x_2)
	eor	v0.16b, v6.16b, v3.16b		// (x_1; x_0)

	// And now the only remaining difficulty is that the result needs to
	// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
	// = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
	// and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
	// now we must collapse the two halves of the product y = u v
	// together using the other identity R = t^7 + t^2 + t + 1.
	//
	// We do this by working on the two 64-bit words y_2 and y_3 of z_1
	// separately, so consider y_i for i = 2 or 3.  Certainly,
	// y_i t^{64i} = y_i R t^{64(i-2)} =
	// (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
	// directly without breaking up the 64-bit word structure.  Instead,
	// we start by considering just y_i t^7 t^{64(i-2)}, which again
	// looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
	// then
	//
	//	y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
	//
	// We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
	// contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
	// splits are different.  This is lovely, with one small snag: when
	// we do this to y_3, we end up with a contribution back into the
	// t^128 coefficient word.  But notice that only the low seven bits
	// of this word are affected, so there's no knock-on contribution
	// into the t^64 word.  Therefore, if we handle the high bits of each
	// word together, and then the low bits, everything will be fine.
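	//
	// (Worked check, added for illustration: for the t^7 term, b_i is
	// the top seven bits of y_i, so `ushr #57' extracts b_i, the carry
	// into the next word up, while `shl #7' computes a_i t^7, the piece
	// that stays in this word.  The t and t^2 terms decompose the same
	// way, with shift pairs 63/1 and 62/2, which are precisely the
	// shift counts used below.)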

	// First, shift the high bits down.
	ushr	v2.2d, v1.2d, #63		// the b_i for t
	ushr	v3.2d, v1.2d, #62		// the b_i for t^2
	ushr	v4.2d, v1.2d, #57		// the b_i for t^7
	eor	v2.16b, v2.16b, v3.16b		// add them all together
	eor	v2.16b, v2.16b, v4.16b
	vshr128	v3, v2, 64
	vshl128	v4, v2, 64
	eor	v1.16b, v1.16b, v3.16b		// contribution into high half
	eor	v0.16b, v0.16b, v4.16b		// and low half

	// And then shift the low bits up.
	shl	v2.2d, v1.2d, #1
	shl	v3.2d, v1.2d, #2
	shl	v4.2d, v1.2d, #7
	eor	v1.16b, v1.16b, v2.16b		// unit and t contribs
	eor	v3.16b, v3.16b, v4.16b		// t^2 and t^7 contribs
	eor	v0.16b, v0.16b, v1.16b		// mix everything together
	eor	v0.16b, v0.16b, v3.16b		// ... and we're done
.endm

.macro mul64
	// Enter with u and v in the low halves of v0 and v1, respectively;
	// leave with z = u v in x2.  Clobbers x2--x4.

	// The multiplication is thankfully easy.
	// v0 =					// (?; u)
	// v1 =					// (?; v)
	pmull	v0.1q, v0.1d, v1.1d		// u v

	// Now we must reduce.  This is essentially the same as the 128-bit
	// case above, but mostly simpler because everything is smaller.  The
	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

	// Before we get stuck in, transfer the product to general-purpose
	// registers.
	mov	x3, v0.d[1]
	mov	x2, v0.d[0]

	// First, shift the high bits down.
	eor	x4, x3, x3, lsr #1		// pre-mix t^3 and t^4
	eor	x3, x3, x3, lsr #63		// mix in t contribution
	eor	x3, x3, x4, lsr #60		// shift and mix in t^3 and t^4

	// And then shift the low bits up.
	eor	x3, x3, x3, lsl #1		// mix unit and t; pre-mix t^3, t^4
	eor	x2, x2, x3			// fold them in
	eor	x2, x2, x3, lsl #3		// and t^3 and t^4
.endm

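	// The reduction in `mul64' above, written out in C as a reference
	// sketch (kept in a comment, so none of this is assembled; the
	// function name and calling convention are invented for
	// illustration).  Here `hi' and `lo' play the roles of x3 and x2:
	// the two 64-bit halves of the carry-less product, in the
	// bit-reversed register format.
	//
	//	#include <stdint.h>
	//
	//	/* Reduce the 128-bit carry-less product (hi, lo) modulo
	//	 * p(t) = t^64 + t^4 + t^3 + t + 1, i.e., using the identity
	//	 * t^64 = t^4 + t^3 + t + 1. */
	//	static uint64_t gcm64_reduce(uint64_t hi, uint64_t lo)
	//	{
	//		/* First, shift the high bits down: fold in the bits
	//		 * that would fall off the top when multiplying by t,
	//		 * t^3, and t^4. */
	//		uint64_t h = hi ^ (hi >> 63) ^ (hi >> 61) ^ (hi >> 60);
	//
	//		/* And then shift the low bits up, mixing the reduced
	//		 * high word into the low one. */
	//		return lo ^ h ^ (h << 1) ^ (h << 3) ^ (h << 4);
	//	}
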
.macro mul96
	// Enter with u in the least-significant 96 bits of v0, with zero in
	// the upper 32 bits, and with the least-significant 64 bits of v in
	// both halves of v1, and the upper 32 bits of v in the low 32 bits
	// of each half of v2, with zero in the upper 32 bits; and with zero
	// in v31.  Yes, that's a bit hairy.  Leave with the product u v in
	// the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
	// v1--v6.

	// This is an inconvenient size.  There's nothing for it but to do
	// four multiplications, as if for the 128-bit case.  It's possible
	// that there's cruft in the top 32 bits of the input registers, so
	// shift both of them up by four bytes before we start.  This will
	// mean that the high 64 bits of the result (from GCM's viewpoint)
	// will be zero.
	// v0 =					// (u_2; u_0 + u_1 t^32)
	// v1 =					// (v_0 + v_1 t^32; v_0 + v_1 t^32)
	// v2 =					// (v_2; v_2)
	pmull2	v5.1q, v0.2d, v1.2d		// u_2 (v_0 + v_1 t^32) t^32 = e_0
	pmull	v4.1q, v0.1d, v2.1d		// v_2 (u_0 + u_1 t^32) t^32 = e_1
	pmull2	v6.1q, v0.2d, v2.2d		// u_2 v_2 = d = (0; d)
	pmull	v3.1q, v0.1d, v1.1d		// u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
						//   + u_1 v_1 t^64 = f

	// Extract the high and low halves of the 192-bit result.  The answer
	// we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
	// bits of the answer will end up in v0, with junk in the top 32
	// bits; the high 96 bits will end up in v1, which must have zero in
	// its top 32 bits.
	//
	// Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
	// in the low 96 bits of a SIMD register, with junk in the top 32
	// bits; and top(x) is the high 96 bits, also arranged in the low 96
	// bits of a register, with /zero/ in the top 32 bits.
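	//
	// (Illustrative recap, not in the original commentary: in these
	// terms, the code below assembles
	//
	//	v0 = bot(e t^64) + bot(f)
	//	v1 = top(d t^128) + top(e t^64) + top(f)
	//
	// and bot(d t^128) is zero, because d t^128 has no bits below
	// bit 128.)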
	eor	v4.16b, v4.16b, v5.16b		// e_0 + e_1 = e
	vshl128	v6, v6, 32			// top(d t^128)
	vshr128	v5, v4, 32			// top(e t^64)
	vshl128	v4, v4, 64			// bot(e t^64)
	vshr128	v1, v3, 96			// top(f)
	eor	v6.16b, v6.16b, v5.16b		// top(d t^128 + e t^64)
	eor	v0.16b, v3.16b, v4.16b		// bot([d t^128] + e t^64 + f)
	eor	v1.16b, v1.16b, v6.16b		// top(d t^128 + e t^64 + f)

	// Finally, the reduction.  This is essentially the same as the
	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	// t^9 + t^6 + 1.  The degrees are larger but not enough to cause
	// trouble for the general approach.  Unfortunately, we have to do
	// this in 32-bit pieces rather than 64.

	// First, shift the high bits down.
	ushr	v2.4s, v1.4s, #26		// the b_i for t^6
	ushr	v3.4s, v1.4s, #23		// the b_i for t^9
	ushr	v4.4s, v1.4s, #22		// the b_i for t^10
	eor	v2.16b, v2.16b, v3.16b		// add them all together
	eor	v2.16b, v2.16b, v4.16b
	vshr128	v3, v2, 64			// contribution for high half
	vshl128	v2, v2, 32			// contribution for low half
	eor	v1.16b, v1.16b, v3.16b		// apply to high half
	eor	v0.16b, v0.16b, v2.16b		// and low half

	// And then shift the low bits up.
	shl	v2.4s, v1.4s, #6
	shl	v3.4s, v1.4s, #9
	shl	v4.4s, v1.4s, #10
	eor	v1.16b, v1.16b, v2.16b		// unit and t^6 contribs
	eor	v3.16b, v3.16b, v4.16b		// t^9 and t^10 contribs
	eor	v0.16b, v0.16b, v1.16b		// mix everything together
	eor	v0.16b, v0.16b, v3.16b		// ... and we're done
.endm

.macro mul192
	// Enter with u in v0 and the less-significant half of v1, with v
	// duplicated across both halves of v2/v3/v4, and with zero in v31.
	// Leave with the product u v in v0 and the bottom half of v1.
	// Clobbers v16--v25.

	// Start multiplying and accumulating pieces of product.
	// v0 =					// (u_1; u_0)
	// v1 =					// (?; u_2)
	// v2 =					// (v_0; v_0)
	// v3 =					// (v_1; v_1)
	// v4 =					// (v_2; v_2)
	pmull	v16.1q, v0.1d, v2.1d		// a = u_0 v_0

	pmull	v19.1q, v0.1d, v3.1d		// u_0 v_1
	pmull2	v21.1q, v0.2d, v2.2d		// u_1 v_0

	pmull	v17.1q, v0.1d, v4.1d		// u_0 v_2
	pmull2	v22.1q, v0.2d, v3.2d		// u_1 v_1
	pmull	v23.1q, v1.1d, v2.1d		// u_2 v_0
	eor	v19.16b, v19.16b, v21.16b	// b = u_0 v_1 + u_1 v_0

	pmull2	v20.1q, v0.2d, v4.2d		// u_1 v_2
	pmull	v24.1q, v1.1d, v3.1d		// u_2 v_1
	eor	v17.16b, v17.16b, v22.16b	// u_0 v_2 + u_1 v_1

	pmull	v18.1q, v1.1d, v4.1d		// e = u_2 v_2
	eor	v17.16b, v17.16b, v23.16b	// c = u_0 v_2 + u_1 v_1 + u_2 v_0
	eor	v20.16b, v20.16b, v24.16b	// d = u_1 v_2 + u_2 v_1

	// Piece the product together.
	// v16 =				// (a_1; a_0)
	// v19 =				// (b_1; b_0)
	// v17 =				// (c_1; c_0)
	// v20 =				// (d_1; d_0)
	// v18 =				// (e_1; e_0)
	vshl128	v21, v19, 64			// (b_0; 0)
	ext	v22.16b, v19.16b, v20.16b, #8	// (d_0; b_1)
	vshr128	v23, v20, 64			// (0; d_1)
	eor	v16.16b, v16.16b, v21.16b	// (x_1; x_0)
	eor	v17.16b, v17.16b, v22.16b	// (x_3; x_2)
	eor	v18.16b, v18.16b, v23.16b	// (x_5; x_4)

	// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
	// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
	// 128-bit case.  I don't know why.

	// First, shift the high bits down.
	// v16 =				// (y_1; y_0)
	// v17 =				// (y_3; y_2)
	// v18 =				// (y_5; y_4)
	mov	v19.d[0], v17.d[1]		// (?; y_3)

	ushr	v23.2d, v18.2d, #63		// hi b_i for t
	ushr	d20, d19, #63			// lo b_i for t
	ushr	v24.2d, v18.2d, #62		// hi b_i for t^2
	ushr	d21, d19, #62			// lo b_i for t^2
	ushr	v25.2d, v18.2d, #57		// hi b_i for t^7
	ushr	d22, d19, #57			// lo b_i for t^7
	eor	v23.16b, v23.16b, v24.16b	// mix them all together
	eor	v20.8b, v20.8b, v21.8b
	eor	v23.16b, v23.16b, v25.16b
	eor	v20.8b, v20.8b, v22.8b

	// Permute the high pieces while we fold in the b_i.
	eor	v17.16b, v17.16b, v23.16b
	vshl128	v20, v20, 64
	mov	v19.d[0], v18.d[1]		// (?; y_5)
	ext	v18.16b, v17.16b, v18.16b, #8	// (y_4; y_3)
	eor	v16.16b, v16.16b, v20.16b

	// And finally shift the low bits up.
	// v16 =				// (y'_1; y'_0)
	// v17 =				// (?; y'_2)
	// v18 =				// (y'_4; y'_3)
	// v19 =				// (?; y'_5)
	shl	v20.2d, v18.2d, #1
	shl	d23, d19, #1
	shl	v21.2d, v18.2d, #2
	shl	d24, d19, #2
	shl	v22.2d, v18.2d, #7
	shl	d25, d19, #7
	eor	v18.16b, v18.16b, v20.16b	// unit and t contribs
	eor	v19.8b, v19.8b, v23.8b
	eor	v21.16b, v21.16b, v22.16b	// t^2 and t^7 contribs
	eor	v24.8b, v24.8b, v25.8b
	eor	v18.16b, v18.16b, v21.16b	// all contribs
	eor	v19.8b, v19.8b, v24.8b
	eor	v0.16b, v16.16b, v18.16b	// mix them into the low half
	eor	v1.8b, v17.8b, v19.8b
.endm

.macro mul256
	// Enter with u in v0/v1, with v duplicated across both halves of
	// v2--v5, and with zero in v31.  Leave with the product u v in
	// v0/v1.  Clobbers v16--v21 and v24--v30.

	// Now it's starting to look worthwhile to do Karatsuba.  Suppose
	// u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
	//
	//	u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	//
	// Name the coefficients of B^i here a, b, and c, respectively, and
	// let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
	//
	//	q = r s = (u_0 + u_1) (v_0 + v_1)
	//	    = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	//	    = a + c + b
	//
	// The first two terms we've already calculated; the last is the
	// remaining one we want.  We'll set B = t^128.  We know how to do
	// 128-bit multiplications already, and Karatsuba is too annoying
	// there, so there'll be 12 multiplications altogether, rather than
	// the 16 we'd have if we did this the naïve way.
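	//
	// (To spell the count out, as an added aside: each 128-bit multiply
	// costs four `pmull'/`pmull2' instructions, and we need three of
	// them -- a = u_0 v_0, c = u_1 v_1, and the cross product
	// q = u_* v_* -- for twelve in total, where the schoolbook method
	// needs four 128-bit multiplies, i.e., sixteen instructions.)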
	// v0 =					// u_0 = (u_01; u_00)
	// v1 =					// u_1 = (u_11; u_10)
	// v2 =					// (v_00; v_00)
	// v3 =					// (v_01; v_01)
	// v4 =					// (v_10; v_10)
	// v5 =					// (v_11; v_11)

	eor	v28.16b, v0.16b, v1.16b		// u_* = (u_01 + u_11; u_00 + u_10)
	eor	v29.16b, v2.16b, v4.16b		// v_*0 = v_00 + v_10
	eor	v30.16b, v3.16b, v5.16b		// v_*1 = v_01 + v_11

	// Start by building the cross product, q = u_* v_*.
	pmull	v24.1q, v28.1d, v30.1d		// u_*0 v_*1
	pmull2	v25.1q, v28.2d, v29.2d		// u_*1 v_*0
	pmull	v20.1q, v28.1d, v29.1d		// u_*0 v_*0
	pmull2	v21.1q, v28.2d, v30.2d		// u_*1 v_*1
	eor	v24.16b, v24.16b, v25.16b	// u_*0 v_*1 + u_*1 v_*0
	vshr128	v25, v24, 64
	vshl128	v24, v24, 64
	eor	v20.16b, v20.16b, v24.16b	// q_0
	eor	v21.16b, v21.16b, v25.16b	// q_1

	// Next, work on the low half, a = u_0 v_0.
	pmull	v24.1q, v0.1d, v3.1d		// u_00 v_01
	pmull2	v25.1q, v0.2d, v2.2d		// u_01 v_00
	pmull	v16.1q, v0.1d, v2.1d		// u_00 v_00
	pmull2	v17.1q, v0.2d, v3.2d		// u_01 v_01
	eor	v24.16b, v24.16b, v25.16b	// u_00 v_01 + u_01 v_00
	vshr128	v25, v24, 64
	vshl128	v24, v24, 64
	eor	v16.16b, v16.16b, v24.16b	// a_0
	eor	v17.16b, v17.16b, v25.16b	// a_1

	// Mix the pieces we have so far.
	eor	v20.16b, v20.16b, v16.16b
	eor	v21.16b, v21.16b, v17.16b

	// Finally, work on the high half, c = u_1 v_1.
	pmull	v24.1q, v1.1d, v5.1d		// u_10 v_11
	pmull2	v25.1q, v1.2d, v4.2d		// u_11 v_10
	pmull	v18.1q, v1.1d, v4.1d		// u_10 v_10
	pmull2	v19.1q, v1.2d, v5.2d		// u_11 v_11
	eor	v24.16b, v24.16b, v25.16b	// u_10 v_11 + u_11 v_10
	vshr128	v25, v24, 64
	vshl128	v24, v24, 64
	eor	v18.16b, v18.16b, v24.16b	// c_0
	eor	v19.16b, v19.16b, v25.16b	// c_1

	// Finish mixing the product together.
	eor	v20.16b, v20.16b, v18.16b
	eor	v21.16b, v21.16b, v19.16b
	eor	v17.16b, v17.16b, v20.16b
	eor	v18.16b, v18.16b, v21.16b

	// Now we must reduce.  This is essentially the same as the 192-bit
	// case above, but more complicated because everything is bigger.
	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
	// v16 =				// (y_1; y_0)
	// v17 =				// (y_3; y_2)
	// v18 =				// (y_5; y_4)
	// v19 =				// (y_7; y_6)
	ushr	v24.2d, v18.2d, #62		// (y_5; y_4) b_i for t^2
	ushr	v25.2d, v19.2d, #62		// (y_7; y_6) b_i for t^2
	ushr	v26.2d, v18.2d, #59		// (y_5; y_4) b_i for t^5
	ushr	v27.2d, v19.2d, #59		// (y_7; y_6) b_i for t^5
	ushr	v28.2d, v18.2d, #54		// (y_5; y_4) b_i for t^10
	ushr	v29.2d, v19.2d, #54		// (y_7; y_6) b_i for t^10
	eor	v24.16b, v24.16b, v26.16b	// mix the contributions together
	eor	v25.16b, v25.16b, v27.16b
	eor	v24.16b, v24.16b, v28.16b
	eor	v25.16b, v25.16b, v29.16b
	vshr128	v26, v25, 64			// slide contribs into position
	ext	v25.16b, v24.16b, v25.16b, #8
	vshl128	v24, v24, 64
	eor	v18.16b, v18.16b, v26.16b
	eor	v17.16b, v17.16b, v25.16b
	eor	v16.16b, v16.16b, v24.16b

	// And then shift the low bits up.
	// v16 =				// (y'_1; y'_0)
	// v17 =				// (y'_3; y'_2)
	// v18 =				// (y'_5; y'_4)
	// v19 =				// (y'_7; y'_6)
	shl	v24.2d, v18.2d, #2		// (y'_5; y'_4) a_i for t^2
	shl	v25.2d, v19.2d, #2		// (y'_7; y'_6) a_i for t^2
	shl	v26.2d, v18.2d, #5		// (y'_5; y'_4) a_i for t^5
	shl	v27.2d, v19.2d, #5		// (y'_7; y'_6) a_i for t^5
	shl	v28.2d, v18.2d, #10		// (y'_5; y'_4) a_i for t^10
	shl	v29.2d, v19.2d, #10		// (y'_7; y'_6) a_i for t^10
	eor	v18.16b, v18.16b, v24.16b	// mix the contributions together
	eor	v19.16b, v19.16b, v25.16b
	eor	v26.16b, v26.16b, v28.16b
	eor	v27.16b, v27.16b, v29.16b
	eor	v18.16b, v18.16b, v26.16b
	eor	v19.16b, v19.16b, v27.16b
	eor	v0.16b, v16.16b, v18.16b
	eor	v1.16b, v17.16b, v19.16b
.endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more SIMD registers,
//     depending on the block size.  The bits in each byte are reversed,
//     compared to the external format, which makes the polynomials
//     completely vanilla, unlike all of the other GCM implementations.
//
//   * The `table format' is just like the `register format', only the two
//     halves of each 128-bit SIMD register are the same, so we need twice
//     as many registers.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.

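// (A worked example, added for orientation: the 128-bit big-endian entry
// point below loads A with `ldr q0, [x0]', then applies `rev32' to undo
// the byte order within each 32-bit word and `rbit' to reverse the bits
// within each byte, yielding register format; the little-endian variant
// needs only the `rbit'.  K needs no conversion at all, since it is
// stored ready-to-use in table format.)
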
FUNC(gcm_mulk_128b_arm64_pmull)
	// On entry, x0 points to a 128-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0]
	ldp	q1, q2, [x1]
	rev32	v0.16b, v0.16b
	vzero
	rbit	v0.16b, v0.16b
	mul128
	rbit	v0.16b, v0.16b
	rev32	v0.16b, v0.16b
	str	q0, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_128l_arm64_pmull)
	// On entry, x0 points to a 128-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0]
	ldp	q1, q2, [x1]
	vzero
	rbit	v0.16b, v0.16b
	mul128
	rbit	v0.16b, v0.16b
	str	q0, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_64b_arm64_pmull)
	// On entry, x0 points to a 64-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	d0, [x0]
	ldr	q1, [x1]
	rev32	v0.8b, v0.8b
	rbit	v0.8b, v0.8b
	mul64
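	// Convert back to big-endian words format.  (Added note: a full
	// 64-bit `rbit' reverses the bits within each byte *and* reverses
	// the order of all eight bytes; rotating by 32 bits afterwards
	// restores the word order, leaving exactly the per-byte bit
	// reversal and per-word byte swap that the entry conversion above
	// performed.)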
	rbit	x2, x2
	ror	x2, x2, #32
	str	x2, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_64l_arm64_pmull)
	// On entry, x0 points to a 64-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	d0, [x0]
	ldr	q1, [x1]
	rbit	v0.8b, v0.8b
	mul64
	rbit	x2, x2
	rev	x2, x2
	str	x2, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_96b_arm64_pmull)
	// On entry, x0 points to a 96-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	w2, [x0, #8]
	ldr	d0, [x0, #0]
	mov	v0.d[1], x2
	ldp	q1, q2, [x1]
	rev32	v0.16b, v0.16b
	vzero
	rbit	v0.16b, v0.16b
	mul96
	rbit	v0.16b, v0.16b
	rev32	v0.16b, v0.16b
	mov	w2, v0.s[2]
	str	d0, [x0, #0]
	str	w2, [x0, #8]
	ret
ENDFUNC

FUNC(gcm_mulk_96l_arm64_pmull)
	// On entry, x0 points to a 96-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	d0, [x0, #0]
	ldr	w2, [x0, #8]
	mov	v0.d[1], x2
	ldp	q1, q2, [x1]
	rbit	v0.16b, v0.16b
	vzero
	mul96
	rbit	v0.16b, v0.16b
	mov	w2, v0.s[2]
	str	d0, [x0, #0]
	str	w2, [x0, #8]
	ret
ENDFUNC

FUNC(gcm_mulk_192b_arm64_pmull)
	// On entry, x0 points to a 192-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0, #0]
	ldr	d1, [x0, #16]
	ldp	q2, q3, [x1, #0]
	ldr	q4, [x1, #32]
	rev32	v0.16b, v0.16b
	rev32	v1.8b, v1.8b
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	vzero
	mul192
	rev32	v0.16b, v0.16b
	rev32	v1.8b, v1.8b
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	str	q0, [x0, #0]
	str	d1, [x0, #16]
	ret
ENDFUNC

FUNC(gcm_mulk_192l_arm64_pmull)
	// On entry, x0 points to a 192-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldr	q0, [x0, #0]
	ldr	d1, [x0, #16]
	ldp	q2, q3, [x1, #0]
	ldr	q4, [x1, #32]
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	vzero
	mul192
	rbit	v0.16b, v0.16b
	rbit	v1.8b, v1.8b
	str	q0, [x0, #0]
	str	d1, [x0, #16]
	ret
ENDFUNC

FUNC(gcm_mulk_256b_arm64_pmull)
	// On entry, x0 points to a 256-bit field element A in big-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldp	q0, q1, [x0]
	ldp	q2, q3, [x1, #0]
	ldp	q4, q5, [x1, #32]
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	vzero
	mul256
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	stp	q0, q1, [x0]
	ret
ENDFUNC

FUNC(gcm_mulk_256l_arm64_pmull)
	// On entry, x0 points to a 256-bit field element A in little-endian
	// words format; x1 points to a field-element K in table format.  On
	// exit, A is updated with the product A K.

	ldp	q0, q1, [x0]
	ldp	q2, q3, [x1, #0]
	ldp	q4, q5, [x1, #32]
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	vzero
	mul256
	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	stp	q0, q1, [x0]
	ret
ENDFUNC

///----- That's all, folks --------------------------------------------------