Commit | Line | Data |
---|---|---|
9e6a4409 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// GCM acceleration for x86 processors | |
4 | /// | |
5 | /// (c) 2018 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software: you can redistribute it and/or modify it | |
13 | /// under the terms of the GNU Library General Public License as published | |
14 | /// by the Free Software Foundation; either version 2 of the License, or | |
15 | /// (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, but | |
18 | /// WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | /// Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb. If not, write to the Free Software | |
24 | /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | /// USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// Preliminaries. | |
29 | ||
30 | #include "config.h" | |
31 | #include "asm-common.h" | |
32 | ||
33 | .arch .pclmul | |
34 | ||
35 | .text | |
36 | ||
37 | ///-------------------------------------------------------------------------- | |
38 | /// Common register allocation: `A' and `K' are per-CPU-family/ABI aliases for general-purpose registers. | |
39 | ||
40 | #if CPUFAM_X86 | |
41 | # define A eax | |
42 | # define K edx | |
43 | #elif CPUFAM_AMD64 && ABI_SYSV | |
44 | # define A rdi | |
45 | # define K rsi | |
46 | #elif CPUFAM_AMD64 && ABI_WIN | |
47 | # define A rcx | |
48 | # define K rdx | |
49 | #endif | |
50 | ||
51 | ///-------------------------------------------------------------------------- | |
52 | /// Multiplication macros. | |
53 | ||
54 | // The good news is that we have a fancy instruction to do the | |
55 | // multiplications. The bad news is that it's not particularly well- | |
56 | // suited to the job. | |
57 | // | |
58 | // For one thing, it only does a 64-bit multiplication, so in general | |
59 | // we'll need to synthesize the full-width multiply by hand. For | |
60 | // another thing, it doesn't help with the reduction, so we have to | |
61 | // do that by hand too. And, finally, GCM has crazy bit ordering, | |
62 | // and the instruction does nothing useful for that at all. | |
63 | // | |
64 | // Focusing on that last problem first: the bits aren't in monotonic | |
65 | // significance order unless we permute them. If we reverse the byte | |
66 | // order, then we'll have the bits in monotonic order, but backwards, | |
67 | // so the degree-0 coefficient will be in the most-significant bit. | |
68 | // | |
69 | // This is less of a difficulty than it seems at first, because | |
70 | // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v = | |
71 | // SUM_{0<=j<n} v_j t^j; then | |
72 | // | |
73 | // u v = SUM_{0<=i,j<n} u_i v_j t^{i+j} | |
74 | // | |
75 | // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i | |
76 | // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards. | |
77 | // Then | |
78 | // | |
79 | // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j} | |
80 | // = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)} | |
81 | // | |
82 | // which is almost the bit-reversal of u v, only it's shifted right | |
83 | // by one place. Oh, well: we'll have to shift it back later. | |
84 | // | |
85 | // That was important to think about, but there's not a great deal to | |
86 | // do about it yet other than to convert what we've got from the | |
87 | // blockcipher's byte-ordering convention to our big-endian | |
88 | // convention. Since this depends on the blockcipher convention, | |
89 | // we'll leave the caller to cope with this: the macros here will | |
90 | // assume that the operands are in `register' format, which is the | |
91 | // byte-reversal of the external representation, padded at the | |
92 | // most-significant end except for 96-bit blocks, which are | |
93 | // zero-padded at the least-significant end (see `mul96' for the | |
94 | // details). In the commentary, pieces of polynomial are numbered | |
95 | // according to the degree of the coefficients, so the unit | |
96 | // coefficient of some polynomial a is in a_0. | |
97 | // | |
98 | // The commentary for `mul128' is the most detailed. The other | |
99 | // macros assume that you've already read and understood that. | |
100 | ||
101 | .macro mul128 | |
102 | // Enter with u and v (in register format) in xmm0 and xmm1 | |
103 | // respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4. | |
104 | ||
105 | // First for the double-precision multiplication. It's tempting to | |
106 | // use Karatsuba's identity here, but I suspect that loses more in | |
107 | // the shifting, bit-twiddling, and dependency chains than it gains | |
108 | // in saving a multiplication which otherwise pipelines well. | |
109 | // xmm0 = // (u_1; u_0) | |
110 | // xmm1 = // (v_1; v_0) | |
111 | movdqa xmm2, xmm1 // (v_1; v_0) again | |
112 | movdqa xmm3, xmm0 // (u_1; u_0) again | |
113 | movdqa xmm4, xmm0 // (u_1; u_0) yet again | |
114 | pclmulhqlqdq xmm2, xmm0 // u_1 v_0 | |
115 | pclmullqlqdq xmm0, xmm1 // u_1 v_1 | |
116 | pclmulhqlqdq xmm3, xmm1 // u_0 v_1 | |
117 | pclmulhqhqdq xmm4, xmm1 // u_0 v_0 | |
118 | ||
119 | // Arrange the pieces to form a double-precision polynomial. | |
120 | pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 | |
121 | movdqa xmm1, xmm2 // (m_1; m_0) again | |
122 | pslldq xmm2, 8 // (0; m_1) | |
123 | psrldq xmm1, 8 // (m_0; 0) | |
124 | pxor xmm0, xmm2 // x_1 = u_1 v_1 + m_1 | |
125 | pxor xmm1, xmm4 // x_0 = u_0 v_0 + t^64 m_0 | |
126 | ||
127 | // Two problems remain. The first is that this product is shifted | |
128 | // left (from GCM's backwards perspective) by one place, which is | |
129 | // annoying. Let's take care of that now. Once this is done, we'll | |
130 | // be properly in GCM's backwards bit-ordering, so xmm1 will hold the | |
131 | // low half of the product and xmm0 the high half. (The following | |
132 | // diagrams show bit 0 consistently on the right.) | |
133 | // | |
134 | // xmm1 | |
135 | // ,-------------.-------------.-------------.-------------. | |
136 | // | 0 x_0-x_30 | x_31-x_62 | x_63-x_94 | x_95-x_126 | | |
137 | // `-------------^-------------^-------------^-------------' | |
138 | // | |
139 | // xmm0 | |
140 | // ,-------------.-------------.-------------.-------------. | |
141 | // | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 | | |
142 | // `-------------^-------------^-------------^-------------' | |
143 | // | |
144 | // We start by shifting each 32-bit lane right (from GCM's point of | |
145 | // view -- physically, left) by one place, which gives us this: | |
146 | // | |
147 | // low (xmm3) | |
148 | // ,-------------.-------------.-------------.-------------. | |
149 | // | x_0-x_30 0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0| | |
150 | // `-------------^-------------^-------------^-------------' | |
151 | // | |
152 | // high (xmm2) | |
153 | // ,-------------.-------------.-------------.-------------. | |
154 | // |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0| | |
155 | // `-------------^-------------^-------------^-------------' | |
156 | // | |
157 | // but we've lost a bunch of bits. We separately shift each lane | |
158 | // left by 31 places to give us the bits we lost. | |
159 | // | |
160 | // low (xmm1) | |
161 | // ,-------------.-------------.-------------.-------------. | |
162 | // | 0...0 | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 | | |
163 | // `-------------^-------------^-------------^-------------' | |
164 | // | |
165 | // high (xmm0) | |
166 | // ,-------------.-------------.-------------.-------------. | |
167 | // | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 | | |
168 | // `-------------^-------------^-------------^-------------' | |
169 | // | |
170 | // Which is close, but we don't get a cigar yet. To get the missing | |
171 | // bits into position, we shift each of these right by a lane, but, | |
172 | // alas, the x_127 falls off, so, separately, we shift the high | |
173 | // register left by three lanes, so that everything is lined up | |
174 | // properly when we OR them all together: | |
175 | // | |
176 | // low (xmm1) | |
177 | // ,-------------.-------------.-------------.-------------. | |
178 | // | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 | 0...0 | | |
179 | // `-------------^-------------^-------------^-------------' | |
180 | // | |
181 | // wrap (xmm4) | |
182 | // ,-------------.-------------.-------------.-------------. | |
183 | // | 0...0 | 0...0 | 0...0 | 0...0 x_127 | | |
184 | // `-------------^-------------^-------------^-------------' | |
185 | // | |
186 | // high (xmm0) | |
187 | // ,-------------.-------------.-------------.-------------. | |
188 | // | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 | 0...0 | | |
189 | // `-------------^-------------^-------------^-------------' | |
190 | // | |
191 | // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the | |
192 | // low 128 coefficients, while the `high' registers (xmm0, xmm2) | |
193 | // collect the high 127 coefficients, leaving a zero bit at the most | |
194 | // significant end as we expect. | |
195 | ||
196 | // xmm0 = // (x_7, x_6; x_5, x_4) | |
197 | // xmm1 = // (x_3, x_2; x_1, x_0) | |
198 | movdqa xmm3, xmm1 // (x_3, x_2; x_1, x_0) again | |
199 | movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again | |
200 | psrld xmm1, 31 // shifted left; just the carries | |
201 | psrld xmm0, 31 | |
202 | pslld xmm3, 1 // shifted right, but dropped carries | |
203 | pslld xmm2, 1 | |
204 | movdqa xmm4, xmm0 // another copy for the carry around | |
205 | pslldq xmm1, 4 // move carries over | |
206 | pslldq xmm0, 4 | |
207 | psrldq xmm4, 12 // the big carry wraps around | |
208 | por xmm1, xmm3 | |
209 | por xmm0, xmm2 // (y_7, y_6; y_5, y_4) | |
210 | por xmm1, xmm4 // (y_3, y_2; y_1, y_0) | |
211 | ||
212 | // And the other problem is that the result needs to be reduced | |
213 | // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + | |
214 | // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1 | |
215 | // such that z_0 + z_1 R = u v using the identity R = t^128: now we | |
216 | // must collapse the two halves of z together using the other | |
217 | // identity R = t^7 + t^2 + t + 1. | |
218 | // | |
219 | // We do this by working on each 32-bit word of the high half of z | |
220 | // separately, so consider y_i, for some 4 <= i < 8. Certainly, y_i | |
221 | // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)}, | |
222 | // but we can't use that directly without breaking up the 32-bit word | |
223 | // structure. Instead, we start by considering just y_i t^7 | |
224 | // t^{32(i-4)}, which again looks tricky. Now, split y_i = a_i + | |
225 | // t^25 b_i, with deg a_i < 25; then | |
226 | // | |
227 | // y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)} | |
228 | // | |
229 | // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit | |
230 | // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the | |
231 | // splits are different. This is lovely, with one small snag: when | |
232 | // we do this to y_7, we end up with a contribution back into the | |
233 | // t^128 coefficient word. But notice that only the low seven bits | |
234 | // of this word are affected, so there's no knock-on contribution | |
235 | // into the t^32 word. Therefore, if we handle the high bits of each | |
236 | // word together, and then the low bits, everything will be fine. | |
237 | ||
238 | // First, shift the high bits down. | |
239 | movdqa xmm2, xmm0 // (y_7, y_6; y_5, y_4) again | |
240 | movdqa xmm3, xmm0 // (y_7, y_6; y_5, y_4) yet again | |
241 | movdqa xmm4, xmm0 // (y_7, y_6; y_5, y_4) again again | |
242 | pslld xmm2, 31 // the b_i for t | |
243 | pslld xmm3, 30 // the b_i for t^2 | |
244 | pslld xmm4, 25 // the b_i for t^7 | |
245 | pxor xmm2, xmm3 // add them all together | |
246 | pxor xmm2, xmm4 | |
247 | movdqa xmm3, xmm2 // and a copy for later | |
248 | psrldq xmm2, 4 // contribution into low half | |
249 | pslldq xmm3, 12 // and high half | |
250 | pxor xmm1, xmm2 | |
251 | pxor xmm0, xmm3 | |
252 | ||
253 | // And then shift the low bits up. | |
254 | movdqa xmm2, xmm0 | |
255 | movdqa xmm3, xmm0 | |
256 | pxor xmm1, xmm0 // mix in the unit contribution | |
257 | psrld xmm0, 1 | |
258 | psrld xmm2, 2 | |
259 | psrld xmm3, 7 | |
260 | pxor xmm1, xmm2 // low half, unit, and t^2 contribs | |
261 | pxor xmm0, xmm3 // t and t^7 contribs | |
262 | pxor xmm0, xmm1 // mix them together and we're done | |
263 | .endm | |
264 | ||
265 | .macro mul64 | |
266 | // Enter with u and v (in register format) in the low halves of xmm0 | |
267 | // and xmm1 respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4. | |
268 | ||
269 | // The multiplication is thankfully easy. | |
270 | pclmullqlqdq xmm0, xmm1 // u v | |
271 | ||
272 | // Shift the product up by one place. After this, we're in GCM | |
273 | // bizarro-world. | |
274 | movdqa xmm1, xmm0 // u v again | |
275 | psrld xmm0, 31 // shifted left; just the carries | |
276 | pslld xmm1, 1 // shifted right, but dropped carries | |
277 | pslldq xmm0, 4 // move carries over | |
278 | por xmm1, xmm0 // (y_3, y_2; y_1, y_0) | |
279 | ||
280 | // Now we must reduce. This is essentially the same as the 128-bit | |
281 | // case above, but mostly simpler because everything is smaller. The | |
282 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. | |
283 | ||
284 | // First, we must detach the top (`low'!) half of the result. | |
285 | movdqa xmm0, xmm1 // (y_3, y_2; y_1, y_0) again | |
286 | psrldq xmm1, 8 // (y_1, y_0; 0, 0) | |
287 | ||
288 | // Next, shift the high bits down. | |
289 | movdqa xmm2, xmm0 // (y_3, y_2; ?, ?) again | |
290 | movdqa xmm3, xmm0 // (y_3, y_2; ?, ?) yet again | |
291 | movdqa xmm4, xmm0 // (y_3, y_2; ?, ?) again again | |
292 | pslld xmm2, 31 // b_i for t | |
293 | pslld xmm3, 29 // b_i for t^3 | |
294 | pslld xmm4, 28 // b_i for t^4 | |
295 | pxor xmm2, xmm3 // add them all together | |
296 | pxor xmm2, xmm4 | |
297 | movdqa xmm3, xmm2 // and a copy for later | |
298 | movq xmm2, xmm2 // clear the high 64 bits, keeping the low qword | |
299 | pslldq xmm3, 4 // contribution into high half | |
300 | psrldq xmm2, 4 // and low half | |
301 | pxor xmm0, xmm3 | |
302 | pxor xmm1, xmm2 | |
303 | ||
304 | // And then shift the low bits up. | |
305 | movdqa xmm2, xmm0 | |
306 | movdqa xmm3, xmm0 | |
307 | pxor xmm1, xmm0 // mix in the unit contribution | |
308 | psrld xmm0, 1 | |
309 | psrld xmm2, 3 | |
310 | psrld xmm3, 4 | |
311 | pxor xmm1, xmm2 // low half, unit, and t^3 contribs | |
312 | pxor xmm0, xmm3 // t and t^4 contribs | |
313 | pxor xmm0, xmm1 // mix them together and we're done | |
314 | .endm | |
315 | ||
316 | .macro mul96 | |
317 | // Enter with u and v in the /high/ three words of xmm0 and xmm1 | |
318 | // respectively (and zero in the low word); leave with z = u v in the | |
319 | // high three words of xmm0, and /junk/ in the low word. Clobbers | |
320 | // xmm1--xmm4. | |
321 | ||
322 | // This is an inconvenient size. There's nothing for it but to do | |
323 | // four multiplications, as if for the 128-bit case. Note that no | |
324 | // shifting is done here: we rely on the caller having left zero in | |
325 | // the low word of both inputs, as required above. This will | |
326 | // mean that the high 64 bits of the result (from GCM's viewpoint) | |
327 | // will be zero. | |
328 | // xmm0 = // (0, u_2; u_1, u_0) | |
329 | // xmm1 = // (0, v_2; v_1, v_0) | |
330 | movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again | |
331 | movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again | |
332 | movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again | |
333 | pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 | |
334 | pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) | |
335 | pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 | |
336 | pclmulhqhqdq xmm4, xmm1 // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32 | |
337 | // + u_1 v_1 t^64 = f | |
338 | ||
339 | // Extract the high and low halves of the 192-bit result. We don't | |
340 | // need be too picky about the unused high words of the result | |
341 | // registers. The answer we want is d t^128 + e t^64 + f, where e = | |
342 | // e_0 + e_1. | |
343 | // | |
344 | // The place values for the two halves are (t^160, t^128; t^96, ?) | |
345 | // and (?, t^64; t^32, 1). | |
346 | psrldq xmm0, 8 // (d; 0) = d t^128 | |
347 | pxor xmm2, xmm3 // e = (e_0 + e_1) | |
348 | movdqa xmm1, xmm4 // f again | |
349 | pxor xmm0, xmm2 // d t^128 + e t^64 | |
350 | psrldq xmm2, 12 // e[31..0] t^64 | |
351 | psrldq xmm1, 4 // f[95..0] | |
352 | pslldq xmm4, 8 // f[127..96] | |
353 | pxor xmm1, xmm2 // low 96 bits of result | |
354 | pxor xmm0, xmm4 // high 96 bits of result | |
355 | ||
356 | // Next, shift everything one bit to the left to compensate for GCM's | |
357 | // strange ordering. This will be easier if we shift up the high | |
358 | // half by a word before we start. After this we're in GCM bizarro- | |
359 | // world. | |
360 | movdqa xmm3, xmm1 // low half again | |
361 | pslldq xmm0, 4 // shift high half | |
362 | psrld xmm1, 31 // low half: just the carries | |
363 | movdqa xmm2, xmm0 // copy high half | |
364 | pslld xmm3, 1 // low half shifted one place: carries dropped | |
365 | psrld xmm0, 31 // high half: just the carries | |
366 | pslld xmm2, 1 // high half shifted one place: carries dropped | |
367 | movdqa xmm4, xmm0 // copy high carries for carry-around | |
368 | pslldq xmm0, 4 // move carries over to the next lane | |
369 | pslldq xmm1, 4 | |
370 | psrldq xmm4, 12 // the big carry wraps around | |
371 | por xmm1, xmm3 | |
372 | por xmm0, xmm2 | |
373 | por xmm1, xmm4 | |
374 | ||
375 | // Finally, the reduction. This is essentially the same as the | |
376 | // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 + | |
377 | // t^9 + t^6 + 1. The degrees are larger but not enough to cause | |
378 | // trouble for the general approach. | |
379 | ||
380 | // First, shift the high bits down. | |
381 | movdqa xmm2, xmm0 // copies of the high part | |
382 | movdqa xmm3, xmm0 | |
383 | movdqa xmm4, xmm0 | |
384 | pslld xmm2, 26 // b_i for t^6 | |
385 | pslld xmm3, 23 // b_i for t^9 | |
386 | pslld xmm4, 22 // b_i for t^10 | |
387 | pxor xmm2, xmm3 // add them all together | |
388 | pslldq xmm1, 4 // shift low part up to match | |
389 | pxor xmm2, xmm4 | |
390 | movdqa xmm3, xmm2 // and a copy for later | |
391 | pslldq xmm2, 8 // contribution to high half | |
392 | psrldq xmm3, 4 // contribution to low half | |
393 | pxor xmm1, xmm3 | |
394 | pxor xmm0, xmm2 | |
395 | ||
396 | // And then shift the low bits up. | |
397 | movdqa xmm2, xmm0 // copies of the high part | |
398 | movdqa xmm3, xmm0 | |
399 | pxor xmm1, xmm0 // mix in the unit contribution | |
400 | psrld xmm0, 6 | |
401 | psrld xmm2, 9 | |
402 | psrld xmm3, 10 | |
403 | pxor xmm1, xmm2 // low half, unit, and t^9 contribs | |
404 | pxor xmm0, xmm3 // t^6 and t^10 contribs | |
405 | pxor xmm0, xmm1 // mix them together and we're done | |
406 | .endm | |
407 | ||
408 | .macro mul192 | |
409 | // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave | |
410 | // with z = u v in xmm0/xmm1 -- the top halves of the high registers | |
411 | // are unimportant. Clobbers xmm2--xmm7. | |
412 | ||
413 | // Start multiplying and accumulating pieces of product. | |
414 | // xmm0 = // (u_2; u_1) | |
415 | // xmm1 = // (u_0; ?) | |
416 | // xmm2 = // (v_2; v_1) | |
417 | // xmm3 = // (v_0; ?) | |
418 | movdqa xmm4, xmm0 // (u_2; u_1) again | |
419 | movdqa xmm5, xmm0 // (u_2; u_1) yet again | |
420 | movdqa xmm6, xmm0 // (u_2; u_1) again again | |
421 | movdqa xmm7, xmm1 // (u_0; ?) again | |
422 | punpcklqdq xmm1, xmm3 // (u_0; v_0) | |
423 | pclmulhqhqdq xmm4, xmm2 // u_1 v_1 | |
424 | pclmullqlqdq xmm3, xmm0 // u_2 v_0 | |
425 | pclmullqhqdq xmm5, xmm2 // u_2 v_1 | |
426 | pclmulhqlqdq xmm6, xmm2 // u_1 v_2 | |
427 | pxor xmm4, xmm3 // u_2 v_0 + u_1 v_1 | |
428 | pclmullqlqdq xmm7, xmm2 // u_0 v_2 | |
429 | pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 | |
430 | movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny | |
431 | pxor xmm4, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 | |
432 | pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 | |
433 | pclmulhqhqdq xmm6, xmm1 // u_1 v_0 | |
434 | pclmulhqlqdq xmm2, xmm1 // u_0 v_1 | |
435 | pclmullqhqdq xmm1, xmm1 // e = u_0 v_0 | |
436 | pxor xmm2, xmm6 // d = u_1 v_0 + u_0 v_1 | |
437 | ||
438 | // Next, the piecing together of the product. | |
439 | // xmm0 = // (a_1; a_0) = a = u_2 v_2 | |
440 | // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 | |
441 | // xmm4 = // (c_1; c_0) = c = u_0 v_2 + | |
442 | // u_1 v_1 + u_2 v_0 | |
443 | // xmm2 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 | |
444 | // xmm1 = // (e_1; e_0) = e = u_0 v_0 | |
445 | // xmm3, xmm6, xmm7 spare | |
446 | movdqa xmm3, xmm2 // (d_1; d_0) again | |
447 | movdqa xmm6, xmm5 // (b_1; b_0) again | |
448 | pslldq xmm2, 8 // (0; d_1) | |
449 | psrldq xmm5, 8 // (b_0; 0) | |
450 | psrldq xmm3, 8 // (d_0; 0) | |
451 | pslldq xmm6, 8 // (0; b_1) | |
452 | pxor xmm5, xmm2 // (b_0; d_1) | |
453 | pxor xmm0, xmm6 // x_2 = (a_1; a_0 + b_1) | |
454 | pxor xmm3, xmm1 // x_0 = (e_1 + d_0; e_0) | |
455 | pxor xmm4, xmm5 // x_1 = (b_0 + c_1; c_0 + d_1) | |
456 | ||
457 | // Now, shift it right (from GCM's point of view) by one bit, and try | |
458 | // to leave the result in less random registers. After this, we'll | |
459 | // be in GCM bizarro-world. | |
460 | // xmm1, xmm2, xmm5, xmm6, xmm7 spare | |
461 | movdqa xmm5, xmm0 // copy x_2 | |
462 | movdqa xmm1, xmm4 // copy x_1 | |
463 | movdqa xmm2, xmm3 // copy x_0 | |
464 | psrld xmm0, 31 // x_2 carries | |
465 | psrld xmm4, 31 // x_1 carries | |
466 | psrld xmm3, 31 // x_0 carries | |
467 | pslld xmm5, 1 // x_2 shifted | |
468 | pslld xmm1, 1 // x_1 shifted | |
469 | pslld xmm2, 1 // x_0 shifted | |
470 | movdqa xmm6, xmm0 // x_2 carry copy | |
471 | movdqa xmm7, xmm4 // x_1 carry copy | |
472 | pslldq xmm0, 4 // x_2 carry shifted | |
473 | pslldq xmm4, 4 // x_1 carry shifted | |
474 | pslldq xmm3, 4 // x_0 carry shifted | |
475 | psrldq xmm6, 12 // x_2 carry out | |
476 | psrldq xmm7, 12 // x_1 carry out | |
477 | por xmm0, xmm5 // (y_5; y_4) | |
478 | por xmm1, xmm4 | |
479 | por xmm2, xmm3 | |
480 | por xmm1, xmm6 // (y_3; y_2) | |
481 | por xmm2, xmm7 // (y_1; y_0) | |
482 | ||
483 | // Next, the reduction. Our polynomial this time is p(t) = t^192 + | |
484 | // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the | |
485 | // 128-bit case. I don't know why. | |
486 | ||
487 | // First, shift the high bits down. | |
488 | // xmm0 = // (y_5; y_4) | |
489 | // xmm1 = // (y_3; y_2) | |
490 | // xmm2 = // (y_1; y_0) | |
491 | // xmm3--xmm7 spare | |
492 | movdqa xmm3, xmm0 // (y_5; y_4) copy | |
493 | movdqa xmm4, xmm0 // (y_5; y_4) copy | |
494 | movdqa xmm5, xmm0 // (y_5; y_4) copy | |
495 | pslld xmm3, 31 // (y_5; y_4) b_i for t | |
496 | pslld xmm4, 30 // (y_5; y_4) b_i for t^2 | |
497 | pslld xmm5, 25 // (y_5; y_4) b_i for t^7 | |
498 | movq xmm6, xmm1 // (y_3; 0) copy | |
499 | pxor xmm3, xmm4 | |
500 | movq xmm7, xmm1 // (y_3; 0) copy | |
501 | pxor xmm3, xmm5 | |
502 | movq xmm5, xmm1 // (y_3; 0) copy | |
503 | movdqa xmm4, xmm3 // (y_5; y_4) b_i combined | |
504 | pslld xmm6, 31 // (y_3; 0) b_i for t | |
505 | pslld xmm7, 30 // (y_3; 0) b_i for t^2 | |
506 | pslld xmm5, 25 // (y_3; 0) b_i for t^7 | |
507 | psrldq xmm3, 12 // (y_5; y_4) low contrib | |
508 | pslldq xmm4, 4 // (y_5; y_4) high contrib | |
509 | pxor xmm6, xmm7 | |
510 | pxor xmm2, xmm3 | |
511 | pxor xmm6, xmm5 | |
512 | pxor xmm1, xmm4 | |
513 | pslldq xmm6, 4 | |
514 | pxor xmm2, xmm6 | |
515 | ||
516 | // And finally shift the low bits up. Unfortunately, we also have to | |
517 | // split the low bits out. | |
518 | // xmm0 = // (y'_5; y'_4) | |
519 | // xmm1 = // (y'_3; y'_2) | |
520 | // xmm2 = // (y'_1; y'_0) | |
521 | movdqa xmm5, xmm1 // copies of (y'_3; y'_2) | |
522 | movdqa xmm6, xmm1 | |
523 | movdqa xmm7, xmm1 | |
524 | psrldq xmm1, 8 // bring down (y'_2; ?) | |
525 | movdqa xmm3, xmm0 // copies of (y'_5; y'_4) | |
526 | movdqa xmm4, xmm0 | |
527 | punpcklqdq xmm1, xmm2 // (y'_2; y'_1) | |
528 | psrldq xmm2, 8 // (y'_0; ?) | |
529 | pxor xmm2, xmm5 // low half and unit contrib | |
530 | pxor xmm1, xmm0 | |
531 | psrld xmm5, 1 | |
532 | psrld xmm0, 1 | |
533 | psrld xmm6, 2 | |
534 | psrld xmm3, 2 | |
535 | psrld xmm7, 7 | |
536 | psrld xmm4, 7 | |
537 | pxor xmm2, xmm6 // low half, unit, t^2 contribs | |
538 | pxor xmm1, xmm3 | |
539 | pxor xmm5, xmm7 // t and t^7 contribs | |
540 | pxor xmm0, xmm4 | |
541 | pxor xmm5, xmm2 // mix everything together | |
542 | pxor xmm0, xmm1 | |
543 | movq xmm1, xmm5 // shunt (z_0; ?) into proper place | |
544 | .endm | |
545 | ||
546 | .macro mul256 | |
547 | // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave | |
548 | // with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86, | |
549 | // requires 16 bytes of aligned scratch space at SP; on amd64, also clobbers | |
550 | // xmm8. | |
551 | ||
552 | // Now it's starting to look worthwhile to do Karatsuba. Suppose | |
553 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then | |
554 | // | |
555 | // u v = (u_1 v_1) B^2 + (u_0 v_1 + u_1 v_0) B + (u_0 v_0) | |
556 | // | |
557 | // Let these coefficients of B^2, B, and 1 be a, b, and c, respectively, | |
558 | // and let r = u_0 + u_1 and s = v_0 + v_1. Then observe that | |
559 | // | |
560 | // q = r s = (u_0 + u_1) (v_0 + v_1) | |
561 | // = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0) | |
562 | // = c + a + b | |
563 | // | |
564 | // The first two terms we calculate directly; the last is the | |
565 | // remaining one we want. We'll set B = t^128. We know how to do | |
566 | // 128-bit multiplications already, and Karatsuba is too annoying | |
567 | // there, so there'll be 12 multiplications altogether, rather than | |
568 | // the 16 we'd have if we did this the naïve way. | |
569 | // | |
570 | // On x86, there aren't quite enough registers, so spill one for a | |
571 | // bit. On AMD64, we can keep on going, so it's all good. | |
572 | ||
573 | // xmm0 = // u_1 = (u_11; u_10) | |
574 | // xmm1 = // u_0 = (u_01; u_00) | |
575 | // xmm2 = // v_1 = (v_11; v_10) | |
576 | // xmm3 = // v_0 = (v_01; v_00) | |
577 | movdqa xmm4, xmm0 // u_1 again | |
578 | #if CPUFAM_X86 | |
579 | movdqa [esp + 0], xmm3 | |
580 | #elif CPUFAM_AMD64 | |
581 | movdqa xmm8, xmm3 | |
582 | # define V0 xmm8 | |
583 | #endif | |
584 | pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) | |
585 | pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) | |
586 | ||
587 | // Start by building the cross product, q = u_* v_*. | |
588 | movdqa xmm7, xmm4 // more copies of u_* | |
589 | movdqa xmm5, xmm4 | |
590 | movdqa xmm6, xmm4 | |
591 | pclmullqhqdq xmm4, xmm3 // u_*1 v_*0 | |
592 | pclmulhqlqdq xmm7, xmm3 // u_*0 v_*1 | |
593 | pclmullqlqdq xmm5, xmm3 // u_*1 v_*1 | |
594 | pclmulhqhqdq xmm6, xmm3 // u_*0 v_*0 | |
595 | pxor xmm4, xmm7 // u_*1 v_*0 + u_*0 v_*1 | |
596 | movdqa xmm7, xmm4 | |
597 | pslldq xmm4, 8 | |
598 | psrldq xmm7, 8 | |
599 | pxor xmm5, xmm4 // q_1 | |
600 | pxor xmm6, xmm7 // q_0 | |
601 | ||
602 | // Next, work on the high half, a = u_1 v_1. | |
603 | movdqa xmm3, xmm0 // more copies of u_1 | |
604 | movdqa xmm4, xmm0 | |
605 | movdqa xmm7, xmm0 | |
606 | pclmullqhqdq xmm0, xmm2 // u_11 v_10 | |
607 | pclmulhqlqdq xmm3, xmm2 // u_10 v_11 | |
608 | pclmullqlqdq xmm4, xmm2 // u_11 v_11 | |
609 | pclmulhqhqdq xmm7, xmm2 // u_10 v_10 | |
610 | #if CPUFAM_X86 | |
611 | movdqa xmm2, [esp + 0] | |
612 | # define V0 xmm2 | |
613 | #endif | |
614 | pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10 | |
615 | movdqa xmm3, xmm0 | |
616 | pslldq xmm0, 8 | |
617 | psrldq xmm3, 8 | |
618 | pxor xmm4, xmm0 // x_3 = a_1 | |
619 | pxor xmm7, xmm3 // a_0 | |
620 | ||
621 | // Mix that into the product now forming in xmm4--xmm7. | |
622 | pxor xmm5, xmm4 // a_1 + q_1 | |
623 | pxor xmm6, xmm7 // a_0 + q_0 | |
624 | pxor xmm5, xmm7 // a_0 + (a_1 + q_1) | |
625 | ||
626 | // Finally, the low half, c = u_0 v_0. | |
627 | movdqa xmm0, xmm1 // more copies of u_0 | |
628 | movdqa xmm3, xmm1 | |
629 | movdqa xmm7, xmm1 | |
630 | pclmullqhqdq xmm1, V0 // u_01 v_00 | |
631 | pclmulhqlqdq xmm0, V0 // u_00 v_01 | |
632 | pclmullqlqdq xmm3, V0 // u_01 v_01 | |
633 | pclmulhqhqdq xmm7, V0 // u_00 v_00 | |
634 | pxor xmm0, xmm1 // u_00 v_01 + u_01 v_00 | |
635 | movdqa xmm1, xmm0 | |
636 | pslldq xmm0, 8 | |
637 | psrldq xmm1, 8 | |
638 | pxor xmm3, xmm0 // c_1 | |
639 | pxor xmm7, xmm1 // x_0 = c_0 | |
640 | ||
641 | // And mix that in to complete the product. | |
642 | pxor xmm6, xmm3 // (a_0 + q_0) + c_1 | |
643 | pxor xmm5, xmm3 // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1 | |
644 | pxor xmm6, xmm7 // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1 | |
645 | ||
646 | #undef V0 | |
647 | ||
648 | // Now we need to shift that whole lot one bit to the left. This | |
649 | // will also give us an opportunity to put the product back in | |
650 | // xmm0--xmm3. This is a slightly merry dance because it's nearly | |
651 | // pipelined but we don't have enough registers. | |
652 | // | |
653 | // After this, we'll be in GCM bizarro-world. | |
654 | movdqa xmm0, xmm4 // x_3 again | |
655 | psrld xmm4, 31 // x_3 carries | |
656 | pslld xmm0, 1 // x_3 shifted left | |
657 | movdqa xmm3, xmm4 // x_3 copy carries | |
658 | movdqa xmm1, xmm5 // x_2 again | |
659 | pslldq xmm4, 4 // x_3 carries shifted up | |
660 | psrld xmm5, 31 // x_2 carries | |
661 | psrldq xmm3, 12 // x_3 big carry out | |
662 | pslld xmm1, 1 // x_2 shifted left | |
663 | por xmm0, xmm4 // x_3 mixed together | |
664 | movdqa xmm4, xmm5 // x_2 copy carries | |
665 | movdqa xmm2, xmm6 // x_1 again | |
666 | pslldq xmm5, 4 // x_2 carries shifted up | |
667 | psrld xmm6, 31 // x_1 carries | |
668 | psrldq xmm4, 12 // x_2 big carry out | |
669 | pslld xmm2, 1 // x_1 shifted | |
670 | por xmm1, xmm5 // x_2 mixed together | |
671 | movdqa xmm5, xmm6 // x_1 copy carries | |
672 | por xmm1, xmm3 // x_2 with carry from x_3 | |
673 | movdqa xmm3, xmm7 // x_0 again | |
674 | pslldq xmm6, 4 // x_1 carries shifted up | |
675 | psrld xmm7, 31 // x_0 carries | |
676 | psrldq xmm5, 12 // x_1 big carry out | |
677 | pslld xmm3, 1 // x_0 shifted | |
678 | por xmm2, xmm6 // x_1 mixed together | |
679 | pslldq xmm7, 4 // x_0 carries shifted up | |
680 | por xmm2, xmm4 // x_1 with carry from x_2 | |
681 | por xmm3, xmm7 // x_0 mixed together | |
682 | por xmm3, xmm5 // x_0 with carry from x_1 | |
683 | ||
684 | // Now we must reduce. This is essentially the same as the 128-bit | |
685 | // case above, but more complicated because everything is bigger. | |
686 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. | |
687 | ||
688 | // First, shift the high bits down. | |
689 | movdqa xmm4, xmm0 // y_3 again | |
690 | movdqa xmm5, xmm0 // y_3 yet again | |
691 | movdqa xmm6, xmm0 // y_3 again again | |
692 | pslld xmm4, 30 // y_3: b_i for t^2 | |
693 | pslld xmm5, 27 // y_3: b_i for t^5 | |
694 | pslld xmm6, 22 // y_3: b_i for t^10 | |
695 | movdqa xmm7, xmm1 // y_2 again | |
696 | pxor xmm4, xmm5 | |
697 | movdqa xmm5, xmm1 // y_2 again | |
698 | pxor xmm4, xmm6 | |
699 | movdqa xmm6, xmm1 // y_2 again | |
700 | pslld xmm7, 30 // y_2: b_i for t^2 | |
701 | pslld xmm5, 27 // y_2: b_i for t^5 | |
702 | pslld xmm6, 22 // y_2: b_i for t^10 | |
703 | pxor xmm7, xmm5 | |
704 | movdqa xmm5, xmm4 | |
705 | pxor xmm7, xmm6 | |
706 | psrldq xmm4, 4 | |
707 | movdqa xmm6, xmm7 | |
708 | pslldq xmm5, 12 | |
709 | psrldq xmm7, 4 | |
710 | pxor xmm2, xmm4 | |
711 | pslldq xmm6, 12 | |
712 | pxor xmm3, xmm7 | |
713 | pxor xmm1, xmm5 | |
714 | pxor xmm2, xmm6 | |
715 | ||
716 | // And then shift the low bits up. | |
717 | movdqa xmm4, xmm0 // y_3 again | |
718 | movdqa xmm5, xmm1 // y_2 again | |
719 | movdqa xmm6, xmm0 // y_3 yet again | |
720 | movdqa xmm7, xmm1 // y_2 yet again | |
721 | pxor xmm2, xmm0 // y_1 and unit contrib from y_3 | |
722 | pxor xmm3, xmm1 // y_0 and unit contrib from y_2 | |
723 | psrld xmm0, 2 | |
724 | psrld xmm1, 2 | |
725 | psrld xmm4, 5 | |
726 | psrld xmm5, 5 | |
727 | psrld xmm6, 10 | |
728 | psrld xmm7, 10 | |
729 | pxor xmm0, xmm2 // y_1, with y_3 units and t^2 | |
730 | pxor xmm1, xmm3 // y_0, with y_2 units and t^2 | |
731 | pxor xmm4, xmm6 // y_3 t^5 and t^10 contribs | |
732 | pxor xmm5, xmm7 // y_2 t^5 and t^10 contribs | |
733 | pxor xmm0, xmm4 // high half of reduced result | |
734 | pxor xmm1, xmm5 // low half; all done | |
735 | .endm | |
736 | ||
737 | ///-------------------------------------------------------------------------- | |
738 | /// Main code. | |
739 | ||
740 | // There are a number of representations of field elements in this code and | |
741 | // it can be confusing. | |
742 | // | |
743 | // * The `external format' consists of a sequence of contiguous bytes in | |
744 | // memory called a `block'. The GCM spec explains how to interpret this | |
745 | // block as an element of a finite field. As discussed extensively, this | |
746 | // representation is very annoying for a number of reasons. On the other | |
747 | // hand, this code never actually deals with it directly. | |
748 | // | |
749 | // * The `register format' consists of one or more XMM registers, depending | |
750 | // on the block size. The bytes in these registers are in reverse order | |
751 | // -- so the least-significant byte of the lowest-numbered register holds | |
752 | // the /last/ byte in the block. If the block size is not a multiple of | |
753 | // 16 bytes, then there must be padding. 96-bit blocks are weird: the | |
754 | // padding is inserted at the /least/ significant end, so the register | |
755 | // holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most | |
756 | // significant end. | |
757 | // | |
758 | // * The `words' format consists of a sequence of bytes, as in the | |
759 | // `external format', but, according to the blockcipher in use, the bytes | |
760 | // within each 32-bit word may be reversed (`big-endian') or not | |
761 | // (`little-endian'). Accordingly, there are separate entry points for | |
762 | // each variant, identified with `b' or `l'. | |
763 | ||
// SSEFUNC(f) opens two entry points for a single routine.  The symbol
// f_avx contains only a `vzeroupper'; no `ret' is emitted before FUNC(f),
// so execution entering at f_avx runs straight on into f.  The body of f
// is written by the user of this macro, who must close it with ENDFUNC.
// NOTE(review): FUNC, ENDFUNC and endprologue are not defined in this part
// of the file (presumably they come from asm-common.h) -- confirm that
// ENDFUNC emits no padding that would break the fall-through.
#define SSEFUNC(f)							\
	FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC;		\
	FUNC(f)
767 | ||
SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
	// In-place multiply of a 128-bit field element by a prepared key.
	//
	// In:	A = address of the element, big-endian words format
	//	K = address of the multiplier, register format
	// Out:	the 16 bytes at A hold the product A K, in the same format
	//	as they arrived.

#if CPUFAM_X86
	// 32-bit x86 passes both pointers on the stack.
	mov	K, [esp + 8]
	mov	A, [esp + 4]
#endif
	endprologue

	// Fetch both operands; only the one at A needs converting.
	movdqu	xmm1, [K]
	movdqu	xmm0, [A]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)	// words fmt -> register fmt

	mul128

	// The product is taken from xmm0: convert it back and store it.
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)	// register fmt -> words fmt
	movdqu	[A], xmm0
	ret
ENDFUNC
786 | ||
SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
	// On entry, A points to a 128-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.
	//
	// xmm7 holds the byte-reversal table for the whole routine.  Under
	// the Windows x64 calling convention xmm6--xmm15 must be preserved
	// for the caller, so that build saves and restores xmm7, following
	// the pattern the 192- and 256-bit routines in this file use.
	// NOTE(review): `mul128' is defined outside this part of the file;
	// if it writes xmm6 that register needs the same treatment.

#if CPUFAM_X86
	mov	A, [esp + 4]
	mov	K, [esp + 8]
	ldgot	ecx			// base for INTADDR table lookups
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	16 + 8			// one save slot, plus 8 so that the
					// slot is 16-byte aligned
	savexmm	xmm7, 0
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movdqu	xmm0, [A]
	movdqu	xmm1, [K]
	pshufb	xmm0, xmm7		// reverse all 16 bytes
	mul128
	pshufb	xmm0, xmm7		// and reverse them back again
	movdqu	[A], xmm0
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm7, 0
	stfree	16 + 8
#endif
	ret
ENDFUNC
807 | ||
SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
	// In-place multiply of a 64-bit field element by a prepared key.
	//
	// In:	A = address of the element, big-endian words format
	//	K = address of the multiplier, register format
	// Out:	the 8 bytes at A hold the product A K, in the same format
	//	as they arrived.

#if CPUFAM_X86
	// 32-bit x86 passes both pointers on the stack.
	mov	K, [esp + 8]
	mov	A, [esp + 4]
#endif
	endprologue

	// Only eight bytes are read from each pointer.
	movq	xmm0, [A]
	pshufd	xmm0, xmm0, SHUF(1, 0, 3, 3)	// words fmt -> register fmt
	movq	xmm1, [K]

	mul64

	// The product is taken from xmm0: convert it back and store the low
	// eight bytes.
	pshufd	xmm0, xmm0, SHUF(1, 0, 3, 3)	// register fmt -> words fmt
	movq	[A], xmm0
	ret
ENDFUNC
826 | ||
SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
	// On entry, A points to a 64-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.
	//
	// xmm7 holds the byte-swapping table for the whole routine.  Under
	// the Windows x64 calling convention xmm6--xmm15 must be preserved
	// for the caller, so that build saves and restores xmm7, following
	// the pattern the 192- and 256-bit routines in this file use.
	// NOTE(review): `mul64' is defined outside this part of the file;
	// if it writes xmm6 that register needs the same treatment.

#if CPUFAM_X86
	mov	A, [esp + 4]
	mov	K, [esp + 8]
	ldgot	ecx			// base for INTADDR table lookups
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	16 + 8			// one save slot, plus 8 so that the
					// slot is 16-byte aligned
	savexmm	xmm7, 0
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_64l, ecx)]
	movq	xmm0, [A]
	movq	xmm1, [K]
	pshufb	xmm0, xmm7		// reverse low 8 bytes, clear the rest
	mul64
	pshufb	xmm0, xmm7		// same shuffle to convert back
	movq	[A], xmm0
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm7, 0
	stfree	16 + 8
#endif
	ret
ENDFUNC
847 | ||
SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
	// In-place multiply of a 96-bit field element by a prepared key.
	//
	// In:	A = address of the element, big-endian words format
	//	K = address of the multiplier, register format: 16 bytes
	//	    of which the first four are zero
	// Out:	the 12 bytes at A hold the product A K, in the same format
	//	as they arrived.

#if CPUFAM_X86
	// 32-bit x86 passes both pointers on the stack.
	mov	K, [esp + 8]
	mov	A, [esp + 4]
#endif
	endprologue

	// Gather the twelve input bytes with an 8-byte and a 4-byte load,
	// so that nothing beyond A + 12 is touched.
	movdqu	xmm1, [K]
	movd	xmm2, [A + 8]
	movq	xmm0, [A + 0]
	punpcklqdq xmm0, xmm2			// splice the two pieces
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)	// words fmt -> register fmt

	mul96

	// The product is taken from xmm0.  The shuffled copy in xmm1
	// supplies the first eight output bytes; shifting xmm0 itself down
	// by four bytes brings the remaining word to the bottom for `movd'.
	pshufd	xmm1, xmm0, SHUF(3, 2, 1, 0)
	psrldq	xmm0, 4
	movq	[A + 0], xmm1
	movd	[A + 8], xmm0
	ret
ENDFUNC
871 | ||
SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
	// On entry, A points to a 96-bit field element in little-endian
	// words format; K points to a field-element in register format
	// (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
	// updated with the product A K.
	//
	// xmm7 holds the byte-reversal table for the whole routine.  Under
	// the Windows x64 calling convention xmm6--xmm15 must be preserved
	// for the caller, so that build saves and restores xmm7, following
	// the pattern the 192- and 256-bit routines in this file use.
	// NOTE(review): `mul96' is defined outside this part of the file;
	// if it writes xmm6 that register needs the same treatment.

#if CPUFAM_X86
	mov	A, [esp + 4]
	mov	K, [esp + 8]
	ldgot	ecx			// base for INTADDR table lookups
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	16 + 8			// one save slot, plus 8 so that the
					// slot is 16-byte aligned
	savexmm	xmm7, 0
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movq	xmm0, [A + 0]		// twelve bytes, read as 8 + 4
	movd	xmm2, [A + 8]
	movdqu	xmm1, [K]
	punpcklqdq xmm0, xmm2
	pshufb	xmm0, xmm7		// reverse all 16 bytes
	mul96
	pshufb	xmm0, xmm7		// and reverse them back again
	movq	[A + 0], xmm0		// first eight bytes of the result
	psrldq	xmm0, 8			// bring the next four to the bottom
	movd	[A + 8], xmm0
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm7, 0
	stfree	16 + 8
#endif
	ret
ENDFUNC
897 | ||
SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
	// In-place multiply of a 192-bit field element by a prepared key.
	//
	// In:	A = address of the element, big-endian words format
	//	K = address of the multiplier, register format
	// Out:	the 24 bytes at A hold the product A K, in the same format
	//	as they arrived.

#if CPUFAM_X86
	// 32-bit x86 passes both pointers on the stack.
	mov	K, [esp + 8]
	mov	A, [esp + 4]
#endif
#if CPUFAM_AMD64 && ABI_WIN
	// xmm6 and xmm7 are preserved for the caller on Windows.
	stalloc	2*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif
	endprologue

	// Multiplier: 16 bytes into xmm2, the remaining 8 into xmm3.
	movdqu	xmm2, [K + 0]
	movq	xmm3, [K + 16]

	// Element: the last 16 bytes go to xmm0 and the first 8 to xmm1;
	// each piece is converted to register format as it arrives.
	movdqu	xmm0, [A + 8]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	movq	xmm1, [A + 0]
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 3)

	mul192

	// The product is taken from xmm0 and xmm1: convert both back and
	// store them; the two stores cover disjoint parts of the block.
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 3)
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	movq	[A + 0], xmm1
	movdqu	[A + 8], xmm0
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	2*16 + 8
#endif
	ret
ENDFUNC
931 | ||
SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
	// In-place multiply of a 192-bit field element by a prepared key.
	//
	// In:	A = address of the element, little-endian words format
	//	K = address of the multiplier, register format
	// Out:	the 24 bytes at A hold the product A K, in the same format
	//	as they arrived.
	//
	// The shuffle tables are used as memory operands each time, so no
	// register has to hold them across `mul192'.

#if CPUFAM_X86
	// 32-bit x86 passes both pointers on the stack.
	mov	K, [esp + 8]
	mov	A, [esp + 4]
	ldgot	ecx			// base for INTADDR table lookups
#endif
#if CPUFAM_AMD64 && ABI_WIN
	// xmm6 and xmm7 are preserved for the caller on Windows.
	stalloc	2*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif
	endprologue

	// Multiplier: 16 bytes into xmm2, the remaining 8 into xmm3.
	movdqu	xmm2, [K + 0]
	movq	xmm3, [K + 16]

	// Element: the last 16 bytes go to xmm0 and the first 8 to xmm1;
	// each piece is byte-swapped as it arrives.
	movdqu	xmm0, [A + 8]
	pshufb	xmm0, [INTADDR(swaptab_128l, ecx)]
	movq	xmm1, [A + 0]
	pshufb	xmm1, [INTADDR(swaptab_64l, ecx)]

	mul192

	// The product is taken from xmm0 and xmm1: swap both back and
	// store them; the two stores cover disjoint parts of the block.
	pshufb	xmm1, [INTADDR(swaptab_64l, ecx)]
	pshufb	xmm0, [INTADDR(swaptab_128l, ecx)]
	movq	[A + 0], xmm1
	movdqu	[A + 8], xmm0
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	2*16 + 8
#endif
	ret
ENDFUNC
966 | ||
SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
	// In-place multiply of a 256-bit field element by a prepared key.
	//
	// In:	A = address of the element, big-endian words format
	//	K = address of the multiplier, register format
	// Out:	the 32 bytes at A hold the product A K, in the same format
	//	as they arrived.

#if CPUFAM_X86
	// Pick up the stack arguments before realigning: afterwards esp
	// sits on a 16-byte boundary with 16 bytes reserved below it.
	// NOTE(review): the reserved space is presumably scratch for
	// `mul256' -- confirm against the macro definition.
	pushreg	ebp
	setfp
	mov	K, [esp + 12]
	mov	A, [esp + 8]
	and	esp, ~15
	sub	esp, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
	// xmm6, xmm7 and xmm8 are preserved for the caller on Windows.
	stalloc	3*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
	savexmm	xmm8, 32
#endif
	endprologue

	// Multiplier goes into xmm2 and xmm3 untouched.
	movdqu	xmm2, [K + 0]
	movdqu	xmm3, [K + 16]

	// Element: the second 16 bytes go to xmm0 and the first to xmm1;
	// each half is converted to register format as it arrives.
	movdqu	xmm0, [A + 16]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	movdqu	xmm1, [A + 0]
	pshufd	xmm1, xmm1, SHUF(3, 2, 1, 0)

	mul256

	// The product is taken from xmm0 and xmm1: convert both halves
	// back and store them; the two stores do not overlap.
	pshufd	xmm1, xmm1, SHUF(3, 2, 1, 0)
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	movdqu	[A + 0], xmm1
	movdqu	[A + 16], xmm0
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	rstrxmm	xmm8, 32
	stfree	3*16 + 8
#endif
	ret
ENDFUNC
1010 | ||
SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
	// In-place multiply of a 256-bit field element by a prepared key.
	//
	// In:	A = address of the element, little-endian words format
	//	K = address of the multiplier, register format
	// Out:	the 32 bytes at A hold the product A K, in the same format
	//	as they arrived.

#if CPUFAM_X86
	// Pick up the stack arguments before realigning: afterwards esp
	// sits on a 16-byte boundary with 16 bytes reserved below it.
	// NOTE(review): the reserved space is presumably scratch for
	// `mul256' -- confirm against the macro definition.
	pushreg	ebp
	setfp
	mov	K, [esp + 12]
	mov	A, [esp + 8]
	and	esp, ~15
	ldgot	ecx			// base for INTADDR table lookups
	sub	esp, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
	// xmm6, xmm7 and xmm8 are preserved for the caller on Windows.
	stalloc	3*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
	savexmm	xmm8, 32
#endif
	endprologue

	// Multiplier goes into xmm2 and xmm3 untouched.
	movdqu	xmm2, [K + 0]
	movdqu	xmm3, [K + 16]

	// Element: the second 16 bytes go to xmm0 and the first to xmm1;
	// both halves are byte-reversed through the table held in xmm7.
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movdqu	xmm0, [A + 16]
	movdqu	xmm1, [A + 0]
	pshufb	xmm0, xmm7
	pshufb	xmm1, xmm7

	mul256

	// `mul256' uses xmm7 as a temporary, so fetch the table afresh
	// before reversing the product back.
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm1, xmm7
	pshufb	xmm0, xmm7
	movdqu	[A + 0], xmm1
	movdqu	[A + 16], xmm0
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	rstrxmm	xmm8, 32
	stfree	3*16 + 8
#endif
	ret
ENDFUNC
1057 | ||
RODATA

	// Both tables are `pshufb' control vectors.  They are 16-byte
	// aligned because the code reads them with `movdqa' and as direct
	// `pshufb' memory operands.

	.balign	16
swaptab_128l:
	// For little-endian words-format blocks larger than 64 bits:
	// output byte i is input byte 15 - i.
	.byte	15, 14, 13, 12
	.byte	11, 10,  9,  8
	.byte	 7,  6,  5,  4
	.byte	 3,  2,  1,  0

	.balign	16
swaptab_64l:
	// For 64-bit little-endian words-format blocks: the low eight
	// bytes are reversed; the 0xff selectors make `pshufb' write zero
	// into the high eight.
	.byte	 7,  6,  5,  4
	.byte	 3,  2,  1,  0
	.byte	0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff
1072 | ||
1073 | ///----- That's all, folks -------------------------------------------------- |