base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs.
catacomb / symm/gcm-x86ish-pclmul.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for x86 processors
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb. If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   .pclmul

        .text

///--------------------------------------------------------------------------
/// Common register allocation.

#if CPUFAM_X86
# define A eax
# define K edx
#elif CPUFAM_AMD64 && ABI_SYSV
# define A rdi
# define K rsi
#elif CPUFAM_AMD64 && ABI_WIN
# define A rcx
# define K rdx
#endif

///--------------------------------------------------------------------------
/// Multiplication macros.

        // The good news is that we have a fancy instruction to do the
        // multiplications. The bad news is that it's not particularly well-
        // suited to the job.
        //
        // For one thing, it only does a 64-bit multiplication, so in general
        // we'll need to synthesize the full-width multiply by hand. For
        // another thing, it doesn't help with the reduction, so we have to
        // do that by hand too. And, finally, GCM has crazy bit ordering,
        // and the instruction does nothing useful for that at all.
        //
        // Focusing on that last problem first: the bits aren't in monotonic
        // significance order unless we permute them. If we reverse the byte
        // order, then we'll have the bits in monotonic order, but backwards,
        // so the degree-0 coefficient will be in the most-significant bit.
        //
        // This is less of a difficulty than it seems at first, because
        // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
        // SUM_{0<=j<n} v_j t^j; then
        //
        //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
        //
        // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
        // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
        // Then
        //
        //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
        //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
        //
        // which is almost the bit-reversal of u v, only it's shifted right
        // by one place. Putting this another way, what we have is actually
        // the bit reversal of the product u v t. We could get the correct
        // answer (modulo p(t)) if we'd sneakily divided one of the operands
        // by t before we started. Conveniently, v is actually the secret
        // value k set up by the GCM `mktable' function, so we can arrange to
        // actually store k/t (mod p(t)) and then the product will come out
        // correct (modulo p(t)) and we won't have anything more to worry
        // about here.
        //
        // That was important to think about, but there's not a great deal to
        // do about it yet other than to convert what we've got from the
        // blockcipher's byte-ordering convention to our big-endian
        // convention. Since this depends on the blockcipher convention,
        // we'll leave the caller to cope with this: the macros here will
        // assume that the operands are in `register' format, which is the
        // byte-reversal of the external representation, padded at the
        // most-significant end except for 96-bit blocks, which are
        // zero-padded at the least-significant end (see `mul96' for the
        // details). In the commentary, pieces of polynomial are numbered
        // according to the degree of the coefficients, so the unit
        // coefficient of some polynomial a is in a_0.
        //
        // The commentary for `mul128' is the most detailed. The other
        // macros assume that you've already read and understood that.

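The algebra above is easy to check numerically. The following Python sketch is an editorial addition, not part of the original file: `clmul` is a plain bitwise carry-less multiply standing in for PCLMULQDQ, `rev` reverses the low w bits of x, and the assertion confirms that the product of the bit-reversed operands is the 2n-bit reversal of u v t.

def clmul(u, v):
    # Carry-less (GF(2) polynomial) multiplication, bit by bit.
    z = 0
    while v:
        if v & 1: z ^= u
        u, v = u << 1, v >> 1
    return z

def rev(x, w):
    # Reverse the low w bits of x (x must fit in w bits).
    return int(format(x, '0%db' % w)[::-1], 2)

n = 64
u, v = 0x0123456789abcdef, 0xfedcba9876543210
# rev(u) rev(v) = rev(u v t), taken over 2n bits.
assert clmul(rev(u, n), rev(v, n)) == rev(clmul(u, v) << 1, 2*n)
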
.macro mul128
        // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
        // u v in xmm0. Clobbers xmm1--xmm4.

        // First for the double-precision multiplication. It's tempting to
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains that it gains
        // in saving a multiplication which otherwise pipelines well.
        //   xmm0 =                      // (u_0; u_1)
        //   xmm1 =                      // (v_0; v_1)
        movdqa  xmm2, xmm1              // (v_0; v_1) again
        movdqa  xmm3, xmm0              // (u_0; u_1) again
        movdqa  xmm4, xmm0              // (u_0; u_1) yet again
        pclmulhqlqdq xmm2, xmm0         // u_1 v_0
        pclmullqlqdq xmm0, xmm1         // u_1 v_1
        pclmulhqlqdq xmm3, xmm1         // u_0 v_1
        pclmulhqhqdq xmm4, xmm1         // u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        pxor    xmm2, xmm3              // (m_0; m_1) = u_1 v_0 + u_0 v_1
        movdqa  xmm1, xmm2              // (m_0; m_1) again
        pslldq  xmm2, 8                 // (m_1; 0)
        psrldq  xmm1, 8                 // (0; m_0)
        pxor    xmm0, xmm2              // z_1 = u_1 v_1 + m_1
        pxor    xmm1, xmm4              // z_0 = u_0 v_0 + t^64 m_0

        // The remaining problem is that the result needs to be reduced
        // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
        // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
        // such that z_0 + z_1 R = u v using the identity R = t^128: now we
        // must collapse the two halves of z together using the other
        // identity R = t^7 + t^2 + t + 1.
        //
        // We do this by working on each 32-bit word of the high half of z
        // separately, so consider x_i, for some 4 <= i < 8. Certainly, x_i
        // t^{32i} = x_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) x_i t^{32(i-4)},
        // but we can't use that directly without breaking up the 32-bit word
        // structure. Instead, we start by considering just x_i t^7
        // t^{32(i-4)}, which again looks tricky. Now, split x_i = a_i +
        // t^25 b_i, with deg a_i < 25; then
        //
        //      x_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
        //
        // We can similarly decompose x_i t^2 and x_i t into a pair of 32-bit
        // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
        // splits are different. This is lovely, with one small snag: when
        // we do this to x_7, we end up with a contribution back into the
        // t^128 coefficient word. But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^32 word. Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.

        // First, shift the high bits down.
        movdqa  xmm2, xmm0              // (x_4, x_5; x_6, x_7) again
        movdqa  xmm3, xmm0              // (x_4, x_5; x_6, x_7) yet again
        movdqa  xmm4, xmm0              // (x_4, x_5; x_6, x_7) again again
        pslld   xmm2, 31                // the b_i for t
        pslld   xmm3, 30                // the b_i for t^2
        pslld   xmm4, 25                // the b_i for t^7
        pxor    xmm2, xmm3              // add them all together
        pxor    xmm2, xmm4
        movdqa  xmm3, xmm2              // and a copy for later
        psrldq  xmm2, 4                 // contribution into low half
        pslldq  xmm3, 12                // and high half
        pxor    xmm1, xmm2
        pxor    xmm0, xmm3

        // And then shift the low bits up.
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pxor    xmm1, xmm0              // mix in the unit contribution
        psrld   xmm0, 1
        psrld   xmm2, 2
        psrld   xmm3, 7
        pxor    xmm1, xmm2              // low half, unit, and t^2 contribs
        pxor    xmm0, xmm3              // t and t^7 contribs
        pxor    xmm0, xmm1              // mix them together and we're done
.endm
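
As a cross-check on the strategy, the same fold can be written in ordinary (non-reversed) bit order in a few lines of Python. This is an editorial sketch of the reduction identity only; it does not model the word-by-word SIMD shuffling above.

MASK128 = (1 << 128) - 1

def reduce128(z):
    # Fold a (<= 255)-bit product modulo p(t) = t^128 + t^7 + t^2 + t + 1,
    # using t^128 = t^7 + t^2 + t + 1. The first fold can spill up to
    # seven bits past t^128, so a second, smaller fold finishes the job.
    for _ in range(2):
        hi, z = z >> 128, z & MASK128
        z ^= hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7)
    return z

def polymod(z, p):
    # Naive GF(2) polynomial remainder, for comparison.
    while z.bit_length() >= p.bit_length():
        z ^= p << (z.bit_length() - p.bit_length())
    return z

P128 = (1 << 128) | 0x87                # t^128 + t^7 + t^2 + t + 1
z = ((0xdeadbeefcafef00d0123456789abcdef << 128)
     | 0xfedcba98765432100f1e2d3c4b5a6978) >> 1
assert reduce128(z) == polymod(z, P128)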

.macro mul64
        // Enter with u and v in the low halves of xmm0 and xmm1
        // respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4.

        // The multiplication is thankfully easy.
        pclmullqlqdq xmm1, xmm0         // u v

        // Now we must reduce. This is essentially the same as the 128-bit
        // case above, but mostly simpler because everything is smaller. The
        // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

        // First, we must detach the top (`low'!) half of the result.
        movdqa  xmm0, xmm1              // (x_0, x_1; x_2, x_3) again
        psrldq  xmm1, 8                 // (0, 0; x_0, x_1)

        // Next, shift the high bits down.
        movdqa  xmm2, xmm0              // (?, ?; x_2, x_3) again
        movdqa  xmm3, xmm0              // (?, ?; x_2, x_3) yet again
        movdqa  xmm4, xmm0              // (?, ?; x_2, x_3) again again
        pslld   xmm2, 31                // b_i for t
        pslld   xmm3, 29                // b_i for t^3
        pslld   xmm4, 28                // b_i for t^4
        pxor    xmm2, xmm3              // add them all together
        pxor    xmm2, xmm4
        movdqa  xmm3, xmm2              // and a copy for later
        movq    xmm2, xmm2              // zap high half
        pslldq  xmm3, 4                 // contribution into high half
        psrldq  xmm2, 4                 // and low half
        pxor    xmm0, xmm3
        pxor    xmm1, xmm2

        // And then shift the low bits up.
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pxor    xmm1, xmm0              // mix in the unit contribution
        psrld   xmm0, 1
        psrld   xmm2, 3
        psrld   xmm3, 4
        pxor    xmm1, xmm2              // low half, unit, and t^3 contribs
        pxor    xmm0, xmm3              // t and t^4 contribs
        pxor    xmm0, xmm1              // mix them together and we're done
.endm
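
The other sizes here (64-, 96-, 192- and 256-bit) reuse exactly this folding pattern with different feedback taps, i.e. the nonzero low-degree terms of their respective p(t). A generic editorial sketch in Python, again of the identity rather than of the word-level shifts:

# Nonzero low-degree terms of p(t) for each block size handled here.
TAPS = {64: (4, 3, 1, 0), 96: (10, 9, 6, 0), 128: (7, 2, 1, 0),
        192: (7, 2, 1, 0), 256: (10, 5, 2, 0)}

def reduce(z, n):
    # Repeatedly fold everything at or above t^n back down, using
    # t^n = SUM_k t^k over the taps; terminates because each fold
    # drops the excess degree by at least n - 10.
    mask = (1 << n) - 1
    while z >> n:
        hi, z = z >> n, z & mask
        for k in TAPS[n]:
            z ^= hi << k
    return z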

.macro mul96
        // Enter with u and v in the /high/ three words of xmm0 and xmm1
        // respectively (and zero in the low word); leave with z = u v in the
        // high three words of xmm0, and /junk/ in the low word. Clobbers
        // xmm1--xmm4.

        // This is an inconvenient size. There's nothing for it but to do
        // four multiplications, as if for the 128-bit case. It's possible
        // that there's cruft in the top 32 bits of the input registers, so
        // shift both of them up by four bytes before we start. This will
        // mean that the high 64 bits of the result (from GCM's viewpoint)
        // will be zero.
        //   xmm0 =                      // (u_0, u_1; u_2, 0)
        //   xmm1 =                      // (v_0, v_1; v_2, 0)
        movdqa  xmm2, xmm1              // (v_0, v_1; v_2, 0) again
        movdqa  xmm3, xmm0              // (u_0, u_1; u_2, 0) again
        movdqa  xmm4, xmm0              // (u_0, u_1; u_2, 0) yet again
        pclmulhqlqdq xmm2, xmm0         // u_2 (v_1 t^32 + v_0) = e_0
        pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (0; d)
        pclmulhqlqdq xmm3, xmm1         // v_2 (u_1 t^32 + u_0) = e_1
        pclmulhqhqdq xmm4, xmm1         // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result. We don't
        // need to be too picky about the unused high words of the result
        // registers. The answer we want is d t^128 + e t^64 + f, where e =
        // e_0 + e_1.
        //
        // The place values for the two halves are (?, t^96; t^128, t^160)
        // and (1, t^32; t^64, ?). But we also want to shift the high part
        // left by a word, for symmetry's sake.
        psrldq  xmm0, 8                 // (0; d) = d t^128
        pxor    xmm2, xmm3              // e = (e_0 + e_1)
        movdqa  xmm1, xmm4              // f again
        pxor    xmm0, xmm2              // d t^128 + e t^64
        psrldq  xmm2, 12                // e[31..0] t^64
        psrldq  xmm1, 4                 // f[95..0]
        pslldq  xmm4, 12                // f[127..96], shifted
        pslldq  xmm0, 4                 // shift high 96 bits
        pxor    xmm1, xmm2              // low 96 bits of result
        pxor    xmm0, xmm4              // high 96 bits of result

        // Finally, the reduction. This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1. The degrees are larger but not enough to cause
        // trouble for the general approach.

        // First, shift the high bits down.
        movdqa  xmm2, xmm0              // copies of the high part
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm0
        pslld   xmm2, 26                // b_i for t^6
        pslld   xmm3, 23                // b_i for t^9
        pslld   xmm4, 22                // b_i for t^10
        pxor    xmm2, xmm3              // add them all together
        pslldq  xmm1, 4                 // shift low part up to match
        pxor    xmm2, xmm4
        movdqa  xmm3, xmm2              // and a copy for later
        pslldq  xmm2, 8                 // contribution to high half
        psrldq  xmm3, 4                 // contribution to low half
        pxor    xmm1, xmm3
        pxor    xmm0, xmm2

        // And then shift the low bits up.
        movdqa  xmm2, xmm0              // copies of the high part
        movdqa  xmm3, xmm0
        pxor    xmm1, xmm0              // mix in the unit contribution
        psrld   xmm0, 6
        psrld   xmm2, 9
        psrld   xmm3, 10
        pxor    xmm1, xmm2              // low half, unit, and t^9 contribs
        pxor    xmm0, xmm3              // t^6 and t^10 contribs
        pxor    xmm0, xmm1              // mix them together and we're done
.endm
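
In scalar terms, the four multiplications and the d/e/f pieces above amount to the following split. An editorial Python sketch (`mul96_pieces` is a hypothetical name; `clmul` is the bitwise carry-less multiply from the earlier sketches):

def clmul(u, v):
    # Bitwise carry-less multiply, as in the earlier sketches.
    z = 0
    while v:
        if v & 1: z ^= u
        u, v = u << 1, v >> 1
    return z

def mul96_pieces(u, v):
    # Hypothetical scalar model: split u = u_2 t^64 + ul and
    # v = v_2 t^64 + vl, with u_2 and v_2 of at most 32 bits.
    M64 = (1 << 64) - 1
    u2, ul = u >> 64, u & M64
    v2, vl = v >> 64, v & M64
    d = clmul(u2, v2)                   # the t^128 piece
    e = clmul(u2, vl) ^ clmul(v2, ul)   # e = e_0 + e_1
    f = clmul(ul, vl)                   # the low piece
    return (d << 128) ^ (e << 64) ^ f   # = u v, before reduction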

.macro mul192
        // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
        // with z = u v in xmm0/xmm1 -- the top halves of the high registers
        // are unimportant. Clobbers xmm2--xmm7.

        // Start multiplying and accumulating pieces of product.
        //   xmm0 =                      // (u_1; u_2)
        //   xmm1 =                      // (?; u_0)
        //   xmm2 =                      // (v_1; v_2)
        //   xmm3 =                      // (?; v_0)
        movdqa  xmm4, xmm0              // (u_1; u_2) again
        movdqa  xmm5, xmm0              // (u_1; u_2) yet again
        movdqa  xmm6, xmm0              // (u_1; u_2) again again
        movdqa  xmm7, xmm3              // (?; v_0) again
        punpcklqdq xmm3, xmm1           // (u_0; v_0)
        pclmulhqhqdq xmm4, xmm2         // u_1 v_1
        pclmullqlqdq xmm1, xmm2         // u_0 v_2
        pclmullqhqdq xmm5, xmm2         // u_2 v_1
        pclmulhqlqdq xmm6, xmm2         // u_1 v_2
        pxor    xmm1, xmm4              // u_0 v_2 + u_1 v_1
        pclmullqlqdq xmm7, xmm0         // u_2 v_0
        pxor    xmm5, xmm6              // b = u_2 v_1 + u_1 v_2
        movdqa  xmm6, xmm0              // (u_1; u_2) like a bad penny
        pxor    xmm1, xmm7              // c = u_0 v_2 + u_1 v_1 + u_2 v_0
        pclmullqlqdq xmm0, xmm2         // a = u_2 v_2
        pclmulhqlqdq xmm6, xmm3         // u_1 v_0
        pclmulhqhqdq xmm2, xmm3         // u_0 v_1
        pclmullqhqdq xmm3, xmm3         // e = u_0 v_0
        pxor    xmm6, xmm2              // d = u_1 v_0 + u_0 v_1

        // Next, the piecing together of the product. There's significant
        // work here to leave the completed pieces in sensible registers.
        //   xmm0 =                      // (a_0; a_1) = a = u_2 v_2
        //   xmm5 =                      // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
        //   xmm1 =                      // (c_0; c_1) = c = u_0 v_2 +
        //                               //       u_1 v_1 + u_2 v_0
        //   xmm6 =                      // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
        //   xmm3 =                      // (e_0; e_1) = e = u_0 v_0
        //   xmm2, xmm4, xmm7 spare
        movdqa  xmm2, xmm6              // (d_0; d_1) again
        movdqa  xmm4, xmm5              // (b_0; b_1) again
        pslldq  xmm6, 8                 // (d_1; 0)
        psrldq  xmm5, 8                 // (0; b_0)
        psrldq  xmm2, 8                 // (0; d_0)
        pslldq  xmm4, 8                 // (b_1; 0)
        pxor    xmm5, xmm6              // (d_1; b_0)
        pxor    xmm0, xmm4              // (x_4; x_5) = (a_0 + b_1; a_1)
        pxor    xmm2, xmm3              // (x_0; x_1) = (e_0; e_1 + d_0)
        pxor    xmm1, xmm5              // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)

        // Next, the reduction. Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
        // 128-bit case. I don't know why.

        // First, shift the high bits down.
        //   xmm0 =                      // (x_4; x_5)
        //   xmm1 =                      // (x_2; x_3)
        //   xmm2 =                      // (x_0; x_1)
        //   xmm3--xmm7 spare
        movdqa  xmm3, xmm0              // (x_4; x_5) copy
        movdqa  xmm4, xmm0              // (x_4; x_5) copy
        movdqa  xmm5, xmm0              // (x_4; x_5) copy
        pslld   xmm3, 31                // (x_4; x_5) b_i for t
        pslld   xmm4, 30                // (x_4; x_5) b_i for t^2
        pslld   xmm5, 25                // (x_4; x_5) b_i for t^7
        movq    xmm6, xmm1              // (0; x_3) copy
        pxor    xmm3, xmm4
        movq    xmm7, xmm1              // (0; x_3) copy
        pxor    xmm3, xmm5
        movq    xmm5, xmm1              // (0; x_3) copy
        movdqa  xmm4, xmm3              // (x_4; x_5) b_i combined
        pslld   xmm6, 31                // (0; x_3) b_i for t
        pslld   xmm7, 30                // (0; x_3) b_i for t^2
        pslld   xmm5, 25                // (0; x_3) b_i for t^7
        psrldq  xmm3, 12                // (x_4; x_5) low contrib
        pslldq  xmm4, 4                 // (x_4; x_5) high contrib
        pxor    xmm6, xmm7
        pxor    xmm2, xmm3
        pxor    xmm6, xmm5
        pxor    xmm1, xmm4
        pslldq  xmm6, 4
        pxor    xmm2, xmm6

        // And finally shift the low bits up. Unfortunately, we also have to
        // split the low bits out.
        //   xmm0 =                      // (x'_4; x'_5)
        //   xmm1 =                      // (x'_2; x'_3)
        //   xmm2 =                      // (x'_0; x'_1)
        movdqa  xmm5, xmm1              // copies of (x'_2; x'_3)
        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm1
        psrldq  xmm1, 8                 // bring down (?; x'_2)
        movdqa  xmm3, xmm0              // copies of (x'_4; x'_5)
        movdqa  xmm4, xmm0
        punpcklqdq xmm1, xmm2           // (x'_1; x'_2)
        psrldq  xmm2, 8                 // (?; x'_0)
        pxor    xmm2, xmm5              // low half and unit contrib
        pxor    xmm1, xmm0
        psrld   xmm5, 1
        psrld   xmm0, 1
        psrld   xmm6, 2
        psrld   xmm3, 2
        psrld   xmm7, 7
        psrld   xmm4, 7
        pxor    xmm2, xmm6              // low half, unit, t^2 contribs
        pxor    xmm1, xmm3
        pxor    xmm5, xmm7              // t and t^7 contribs
        pxor    xmm0, xmm4
        pxor    xmm5, xmm2              // mix everything together
        pxor    xmm0, xmm1
        movq    xmm1, xmm5              // shunt (?; z_0) into proper place
.endm
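
The nine multiplications and the pieces a--e correspond to a plain three-limb schoolbook product. An editorial Python sketch (hypothetical helper name; `clmul` as before):

def clmul(u, v):
    # Bitwise carry-less multiply, as in the earlier sketches.
    z = 0
    while v:
        if v & 1: z ^= u
        u, v = u << 1, v >> 1
    return z

def mul192_pieces(u, v):
    # Hypothetical scalar model: u and v are 192-bit polynomials, taken
    # as three 64-bit limbs each; a--e are the same pieces as above.
    M64 = (1 << 64) - 1
    u0, u1, u2 = u & M64, (u >> 64) & M64, u >> 128
    v0, v1, v2 = v & M64, (v >> 64) & M64, v >> 128
    a = clmul(u2, v2)
    b = clmul(u2, v1) ^ clmul(u1, v2)
    c = clmul(u2, v0) ^ clmul(u1, v1) ^ clmul(u0, v2)
    d = clmul(u1, v0) ^ clmul(u0, v1)
    e = clmul(u0, v0)
    return (a << 256) ^ (b << 192) ^ (c << 128) ^ (d << 64) ^ e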

.macro mul256
        // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
        // with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86,
        // requires 16 bytes aligned space at SP; on amd64, also clobbers
        // xmm8.

        // Now it's starting to look worthwhile to do Karatsuba. Suppose
        // u = u_0 + u_1 B and v = v_0 + v_1 B. Then
        //
        //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
        //
        // Name the B^2, B, and unit coefficients a, b, and c,
        // respectively, and let r = u_0 + u_1 and s = v_0 + v_1. Then
        // observe that
        //
        //      q = r s = (u_0 + u_1) (v_0 + v_1)
        //              = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
        //              = c + a + b
        //
        // The first two terms we've already calculated; the last is the
        // remaining one we want. We'll set B = t^128. We know how to do
        // 128-bit multiplications already, and Karatsuba is too annoying
        // there, so there'll be 12 multiplications altogether, rather than
        // the 16 we'd have if we did this the naïve way.
        //
        // On x86, there aren't quite enough registers, so spill one for a
        // bit. On AMD64, we can keep on going, so it's all good.

        //   xmm0 =                      // u_1 = (u_10; u_11)
        //   xmm1 =                      // u_0 = (u_00; u_01)
        //   xmm2 =                      // v_1 = (v_10; v_11)
        //   xmm3 =                      // v_0 = (v_00; v_01)
        movdqa  xmm4, xmm0              // u_1 again
#if CPUFAM_X86
        movdqa  [SP + 0], xmm3
#elif CPUFAM_AMD64
        movdqa  xmm8, xmm3
# define V0 xmm8
#endif
        pxor    xmm4, xmm1              // u_* = (u_00 + u_10; u_01 + u_11)
        pxor    xmm3, xmm2              // v_* = (v_00 + v_10; v_01 + v_11)

        // Start by building the cross product, q = u_* v_*.
        movdqa  xmm7, xmm4              // more copies of u_*
        movdqa  xmm5, xmm4
        movdqa  xmm6, xmm4
        pclmullqhqdq xmm4, xmm3         // u_*1 v_*0
        pclmulhqlqdq xmm7, xmm3         // u_*0 v_*1
        pclmullqlqdq xmm5, xmm3         // u_*1 v_*1
        pclmulhqhqdq xmm6, xmm3         // u_*0 v_*0
        pxor    xmm4, xmm7              // u_*1 v_*0 + u_*0 v_*1
        movdqa  xmm7, xmm4
        pslldq  xmm4, 8
        psrldq  xmm7, 8
        pxor    xmm5, xmm4              // q_1
        pxor    xmm6, xmm7              // q_0

        // Next, work on the high half, a = u_1 v_1.
        movdqa  xmm3, xmm0              // more copies of u_1
        movdqa  xmm4, xmm0
        movdqa  xmm7, xmm0
        pclmullqhqdq xmm0, xmm2         // u_11 v_10
        pclmulhqlqdq xmm3, xmm2         // u_10 v_11
        pclmullqlqdq xmm4, xmm2         // u_11 v_11
        pclmulhqhqdq xmm7, xmm2         // u_10 v_10
#if CPUFAM_X86
        movdqa  xmm2, [SP + 0]
# define V0 xmm2
#endif
        pxor    xmm0, xmm3              // u_10 v_11 + u_11 v_10
        movdqa  xmm3, xmm0
        pslldq  xmm0, 8
        psrldq  xmm3, 8
        pxor    xmm4, xmm0              // x_3 = a_1
        pxor    xmm7, xmm3              // a_0

        // Mix that into the product now forming in xmm4--xmm7.
        pxor    xmm5, xmm4              // a_1 + q_1
        pxor    xmm6, xmm7              // a_0 + q_0
        pxor    xmm5, xmm7              // a_0 + (a_1 + q_1)

        // Finally, the low half, c = u_0 v_0.
        movdqa  xmm0, xmm1              // more copies of u_0
        movdqa  xmm3, xmm1
        movdqa  xmm7, xmm1
        pclmullqhqdq xmm1, V0           // u_01 v_00
        pclmulhqlqdq xmm0, V0           // u_00 v_01
        pclmullqlqdq xmm3, V0           // u_01 v_01
        pclmulhqhqdq xmm7, V0           // u_00 v_00
        pxor    xmm0, xmm1              // u_00 v_01 + u_01 v_00
        movdqa  xmm1, xmm0
        pslldq  xmm0, 8
        psrldq  xmm1, 8
        pxor    xmm3, xmm0              // c_1
        pxor    xmm7, xmm1              // x_0 = c_0

        // And mix that in to complete the product.
        pxor    xmm6, xmm3              // (a_0 + q_0) + c_1
        pxor    xmm5, xmm3              // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
        pxor    xmm6, xmm7              // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1

#undef V0

        // Now we must reduce. This is essentially the same as the 128-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

        // First, shift the high bits down.
        movdqa  xmm0, xmm4              // x_3 again
        movdqa  xmm1, xmm4              // x_3 yet again
        movdqa  xmm2, xmm4              // x_3 again again
        pslld   xmm0, 30                // x_3: b_i for t^2
        pslld   xmm1, 27                // x_3: b_i for t^5
        pslld   xmm2, 22                // x_3: b_i for t^10
        movdqa  xmm3, xmm5              // x_2 again
        pxor    xmm0, xmm1
        movdqa  xmm1, xmm5              // x_2 again
        pxor    xmm0, xmm2              // b_3
        movdqa  xmm2, xmm5              // x_2 again
        pslld   xmm3, 30                // x_2: b_i for t^2
        pslld   xmm1, 27                // x_2: b_i for t^5
        pslld   xmm2, 22                // x_2: b_i for t^10
        pxor    xmm3, xmm1
        movdqa  xmm1, xmm0
        pxor    xmm3, xmm2              // b_2
        psrldq  xmm0, 4
        movdqa  xmm2, xmm3
        pslldq  xmm1, 12
        psrldq  xmm3, 4
        pxor    xmm6, xmm0
        pslldq  xmm2, 12
        pxor    xmm7, xmm3
        pxor    xmm5, xmm1
        pxor    xmm6, xmm2

        // And then shift the low bits up.
        movdqa  xmm0, xmm4              // x_3 again
        movdqa  xmm1, xmm5              // x_2 again
        movdqa  xmm2, xmm4              // x_3 yet again
        movdqa  xmm3, xmm5              // x_2 yet again
        pxor    xmm6, xmm4              // x_1 and unit contrib from x_3
        pxor    xmm7, xmm5              // x_0 and unit contrib from x_2
        psrld   xmm4, 2
        psrld   xmm5, 2
        psrld   xmm0, 5
        psrld   xmm1, 5
        psrld   xmm2, 10
        psrld   xmm3, 10
        pxor    xmm4, xmm6              // x_1, with x_3 units and t^2
        pxor    xmm5, xmm7              // x_0, with x_2 units and t^2
        pxor    xmm0, xmm2              // x_3 t^5 and t^10 contribs
        pxor    xmm1, xmm3              // x_2 t^5 and t^10 contribs
        pxor    xmm0, xmm4              // high half of reduced result
        pxor    xmm1, xmm5              // low half; all done
.endm
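
The Karatsuba bookkeeping is easier to see in scalar form. An editorial Python sketch, with a bitwise `clmul` standing in for the four-PCLMULQDQ 128-bit multiplications, checked against the naive four-product expansion:

def clmul(u, v):
    # Bitwise carry-less multiply, as in the earlier sketches.
    z = 0
    while v:
        if v & 1: z ^= u
        u, v = u << 1, v >> 1
    return z

def mul256_karatsuba(u, v):
    # Split into 128-bit halves: u = u_0 + u_1 B, v = v_0 + v_1 B, B = t^128.
    M128 = (1 << 128) - 1
    u0, u1 = u & M128, u >> 128
    v0, v1 = v & M128, v >> 128
    a = clmul(u1, v1)                   # B^2 coefficient
    c = clmul(u0, v0)                   # unit coefficient
    q = clmul(u0 ^ u1, v0 ^ v1)         # (u_0 + u_1) (v_0 + v_1)
    b = q ^ a ^ c                       # the middle term, for free
    return (a << 256) ^ (b << 128) ^ c

# Sanity check against the naive four-product expansion.
M128 = (1 << 128) - 1
u = 0x0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef
v = u ^ (1 << 200) ^ 0xf00f
naive = ((clmul(u >> 128, v >> 128) << 256)
         ^ ((clmul(u >> 128, v & M128) ^ clmul(u & M128, v >> 128)) << 128)
         ^ clmul(u & M128, v & M128))
assert mul256_karatsuba(u, v) == naive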

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'. The GCM spec explains how to interpret this
//     block as an element of a finite field. As discussed extensively, this
//     representation is very annoying for a number of reasons. On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more XMM registers, depending
//     on the block size. The bytes in these registers are in reverse order
//     -- so the least-significant byte of the lowest-numbered register holds
//     the /last/ byte in the block. If the block size is not a multiple of
//     16 bytes, then there must be padding. 96-bit blocks are weird: the
//     padding is inserted at the /least/ significant end, so the register
//     holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
//     significant end.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian'). Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.

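For the 128-bit case, the relationship between these formats can be modelled in a few lines of Python. This is an editorial sketch; in particular it assumes, as the code's usage suggests, that pshufd with SHUF(0, 1, 2, 3) selects the four 32-bit words in reverse order:

def register_value(block):
    # Byte-reversed block, read as the little-endian integer an XMM
    # register holds: equivalently, the block read big-endian.
    return int.from_bytes(block, 'big')

def words_b(block):
    # `b' words format: each 32-bit word stored byte-swapped in memory.
    return b''.join(block[i:i + 4][::-1] for i in range(0, 16, 4))

def load_b(mem):
    # What movdqu plus the word-reversing pshufd produce from `b' format.
    w = [int.from_bytes(mem[4*i:4*i + 4], 'little') for i in range(4)]
    return sum(x << 32*i for i, x in enumerate(reversed(w)))

block = bytes(range(16))
assert load_b(words_b(block)) == register_value(block)
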
#define SSEFUNC(f) \
        FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC; \
        FUNC(f)

SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
        // On entry, A points to a 128-bit field element in big-endian words
        // format; K points to a field-element in register format. On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
#endif
        endprologue
        movdqu  xmm0, [A]
        movdqu  xmm1, [K]
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        mul128
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        movdqu  [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
        // On entry, A points to a 128-bit field element in little-endian
        // words format; K points to a field-element in register format. On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
        ldgot   ecx
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        movdqu  xmm0, [A]
        movdqu  xmm1, [K]
        pshufb  xmm0, xmm7
        mul128
        pshufb  xmm0, xmm7
        movdqu  [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
        // On entry, A points to a 64-bit field element in big-endian words
        // format; K points to a field-element in register format. On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
#endif
        endprologue
        movq    xmm0, [A]
        movq    xmm1, [K]
        pshufd  xmm0, xmm0, SHUF(3, 3, 0, 1)
        mul64
        pshufd  xmm0, xmm0, SHUF(3, 3, 0, 1)
        movq    [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
        // On entry, A points to a 64-bit field element in little-endian
        // words format; K points to a field-element in register format. On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
        ldgot   ecx
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_64l, ecx)]
        movq    xmm0, [A]
        movq    xmm1, [K]
        pshufb  xmm0, xmm7
        mul64
        pshufb  xmm0, xmm7
        movq    [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
        // On entry, A points to a 96-bit field element in big-endian words
        // format; K points to a field-element in register format (i.e., 16
        // bytes, with the first four bytes zero). On exit, A is updated
        // with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
#endif
        endprologue
        movq    xmm0, [A + 0]
        movd    xmm2, [A + 8]
        movdqu  xmm1, [K]
        punpcklqdq xmm0, xmm2
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        mul96
        pshufd  xmm1, xmm0, SHUF(0, 1, 2, 3)
        psrldq  xmm0, 4
        movq    [A + 0], xmm1
        movd    [A + 8], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
        // On entry, A points to a 96-bit field element in little-endian
        // words format; K points to a field-element in register format
        // (i.e., 16 bytes, with the first four bytes zero). On exit, A is
        // updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
        ldgot   ecx
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        movq    xmm0, [A + 0]
        movd    xmm2, [A + 8]
        movdqu  xmm1, [K]
        punpcklqdq xmm0, xmm2
        pshufb  xmm0, xmm7
        mul96
        pshufb  xmm0, xmm7
        movq    [A + 0], xmm0
        psrldq  xmm0, 8
        movd    [A + 8], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
        // On entry, A points to a 192-bit field element in big-endian words
        // format; K points to a field-element in register format. On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 2*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
#endif
        endprologue
        movdqu  xmm0, [A + 8]
        movq    xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movq    xmm3, [K + 16]
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        pshufd  xmm1, xmm1, SHUF(3, 3, 0, 1)
        mul192
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        pshufd  xmm1, xmm1, SHUF(3, 3, 0, 1)
        movdqu  [A + 8], xmm0
        movq    [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        stfree  2*16 + 8
#endif
        ret
ENDFUNC

SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
        // On entry, A points to a 192-bit field element in little-endian
        // words format; K points to a field-element in register format. On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [SP + 4]
        mov     K, [SP + 8]
        ldgot   ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 2*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
#endif
        endprologue
        movdqu  xmm0, [A + 8]
        movq    xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movq    xmm3, [K + 16]
        pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
        mul192
        pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
        movdqu  [A + 8], xmm0
        movq    [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        stfree  2*16 + 8
#endif
        ret
ENDFUNC

SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
        // On entry, A points to a 256-bit field element in big-endian words
        // format; K points to a field-element in register format. On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        pushreg BP
        setfp
        mov     A, [SP + 8]
        mov     K, [SP + 12]
        stalloc 16
        and     SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 3*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
        savexmm xmm8, 32
#endif
        endprologue
        movdqu  xmm0, [A + 16]
        movdqu  xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movdqu  xmm3, [K + 16]
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        pshufd  xmm1, xmm1, SHUF(0, 1, 2, 3)
        mul256
        pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
        pshufd  xmm1, xmm1, SHUF(0, 1, 2, 3)
        movdqu  [A + 16], xmm0
        movdqu  [A + 0], xmm1
#if CPUFAM_X86
        dropfp
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        rstrxmm xmm8, 32
        stfree  3*16 + 8
#endif
        ret
ENDFUNC

SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
        // On entry, A points to a 256-bit field element in little-endian
        // words format; K points to a field-element in register format. On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        pushreg BP
        setfp
        mov     A, [SP + 8]
        mov     K, [SP + 12]
        stalloc 16
        ldgot   ecx
        and     SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 3*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
        savexmm xmm8, 32
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        movdqu  xmm0, [A + 16]
        movdqu  xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movdqu  xmm3, [K + 16]
        pshufb  xmm0, xmm7
        pshufb  xmm1, xmm7
        mul256
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm0, xmm7
        pshufb  xmm1, xmm7
        movdqu  [A + 16], xmm0
        movdqu  [A + 0], xmm1
#if CPUFAM_X86
        dropfp
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        rstrxmm xmm8, 32
        stfree  3*16 + 8
#endif
        ret
ENDFUNC

        RODATA

        .balign 16
swaptab_128l:
        // Table for byte-swapping little-endian words-format blocks larger
        // than 64 bits.
        .byte   15, 14, 13, 12, 11, 10, 9, 8
        .byte   7, 6, 5, 4, 3, 2, 1, 0

        .balign 16
swaptab_64l:
        // Table for byte-swapping 64-bit little-endian words-format blocks.
        .byte   7, 6, 5, 4, 3, 2, 1, 0
        .byte   255, 255, 255, 255, 255, 255, 255, 255
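
For reference, an editorial one-line Python model of the pshufb semantics these tables rely on: destination byte i becomes source byte tab[i], or zero when the table entry has its top bit set -- which is how the 255s in swaptab_64l clear the high qword:

def pshufb(src, tab):
    # src and tab are 16-byte sequences; top-bit-set entries select zero.
    return bytes(0 if t & 0x80 else src[t & 0x0f] for t in tab)

swaptab_64l = bytes([7, 6, 5, 4, 3, 2, 1, 0] + [255]*8)
x = bytes(range(16))
assert pshufb(x, swaptab_64l) == bytes([7, 6, 5, 4, 3, 2, 1, 0] + [0]*8)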

///----- That's all, folks --------------------------------------------------