Commit | Line | Data |
---|---|---|
9e6a4409 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// GCM acceleration for x86 processors | |
4 | /// | |
5 | /// (c) 2018 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software: you can redistribute it and/or modify it | |
13 | /// under the terms of the GNU Library General Public License as published | |
14 | /// by the Free Software Foundation; either version 2 of the License, or | |
15 | /// (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, but | |
18 | /// WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | /// Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb. If not, write to the Free Software | |
24 | /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | /// USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// Preliminaries. | |
29 | ||
30 | #include "config.h" | |
31 | #include "asm-common.h" | |
32 | ||
33 | .arch .pclmul | |
34 | ||
35 | .text | |
36 | ||
37 | ///-------------------------------------------------------------------------- | |
38 | /// Common register allocation. | |
39 | ||
40 | #if CPUFAM_X86 | |
41 | # define A eax | |
42 | # define K edx | |
43 | #elif CPUFAM_AMD64 && ABI_SYSV | |
44 | # define A rdi | |
45 | # define K rsi | |
46 | #elif CPUFAM_AMD64 && ABI_WIN | |
47 | # define A rcx | |
48 | # define K rdx | |
49 | #endif | |
50 | ||
51 | ///-------------------------------------------------------------------------- | |
52 | /// Multiplication macros. | |
53 | ||
54 | // The good news is that we have a fancy instruction to do the | |
55 | // multiplications. The bad news is that it's not particularly well- | |
56 | // suited to the job. | |
57 | // | |
58 | // For one thing, it only does a 64-bit multiplication, so in general | |
59 | // we'll need to synthesize the full-width multiply by hand. For | |
60 | // another thing, it doesn't help with the reduction, so we have to | |
61 | // do that by hand too. And, finally, GCM has crazy bit ordering, | |
62 | // and the instruction does nothing useful for that at all. | |
63 | // | |
64 | // Focusing on that last problem first: the bits aren't in monotonic | |
65 | // significance order unless we permute them. If we reverse the byte | |
66 | // order, then we'll have the bits in monotonic order, but backwards, | |
67 | // so the degree-0 coefficient will be in the most-significant bit. | |
68 | // | |
69 | // This is less of a difficulty than it seems at first, because | |
70 | // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v = | |
71 | // SUM_{0<=j<n} v_j t^j; then | |
72 | // | |
73 | // u v = SUM_{0<=i,j<n} u_i v_j t^{i+j} | |
74 | // | |
75 | // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i | |
d7c0f9a7 | 76 | // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards. |
9e6a4409 MW |
77 | // Then |
78 | // | |
d7c0f9a7 | 79 | // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j} |
9e6a4409 MW |
80 | // = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)} |
81 | // | |
82 | // which is almost the bit-reversal of u v, only it's shifted right | |
ac4a43c1 MW |
83 | // by one place. Putting this another way, what we have is actually |
84 | // the bit reversal of the product u v t. We could get the correct | |
85 | // answer (modulo p(t)) if we'd sneakily divided one of the operands | |
86 | // by t before we started. Conveniently, v is actually the secret | |
87 | // value k set up by the GCM `mktable' function, so we can arrange to | |
88 | // actually store k/t (mod p(t)) and then the product will come out | |
89 | // correct (modulo p(t)) and we won't have anything more to worry | |
90 | // about here. | |
9e6a4409 MW |
91 | // |
92 | // That was important to think about, but there's not a great deal to | |
93 | // do about it yet other than to convert what we've got from the | |
94 | // blockcipher's byte-ordering convention to our big-endian | |
95 | // convention. Since this depends on the blockcipher convention, | |
96 | // we'll leave the caller to cope with this: the macros here will | |
97 | // assume that the operands are in `register' format, which is the | |
98 | // byte-reversal of the external representation, padded at the | |
99 | // most-significant end except for 96-bit blocks, which are | |
100 | // zero-padded at the least-significant end (see `mul96' for the | |
101 | // details). In the commentary, pieces of polynomial are numbered | |
102 | // according to the degree of the coefficients, so the unit | |
103 | // coefficient of some polynomial a is in a_0. | |
104 | // | |
105 | // The commentary for `mul128' is the most detailed. The other | |
106 | // macros assume that you've already read and understood that. | |
107 | ||
108 | .macro mul128 | |
109 | // Enter with u and v in xmm0 and xmm1 respectively; leave with z = | |
110 | // u v in xmm0. Clobbers xmm1--xmm4. | |
111 | ||
112 | // First for the double-precision multiplication. It's tempting to | |
113 | // use Karatsuba's identity here, but I suspect that loses more in | |
114 | // the shifting, bit-twiddling, and dependency chains that it gains | |
115 | // in saving a multiplication which otherwise pipelines well. | |
116 | // xmm0 = // (u_1; u_0) | |
117 | // xmm1 = // (v_1; v_0) | |
118 | movdqa xmm2, xmm1 // (v_1; v_0) again | |
119 | movdqa xmm3, xmm0 // (u_1; u_0) again | |
120 | movdqa xmm4, xmm0 // (u_1; u_0) yet again | |
121 | pclmulhqlqdq xmm2, xmm0 // u_1 v_0 | |
122 | pclmullqlqdq xmm0, xmm1 // u_1 v_1 | |
123 | pclmulhqlqdq xmm3, xmm1 // u_0 v_1 | |
124 | pclmulhqhqdq xmm4, xmm1 // u_0 v_0 | |
125 | ||
126 | // Arrange the pieces to form a double-precision polynomial. | |
127 | pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 | |
128 | movdqa xmm1, xmm2 // (m_1; m_0) again | |
129 | pslldq xmm2, 8 // (0; m_1) | |
130 | psrldq xmm1, 8 // (m_0; 0) | |
ac4a43c1 MW |
131 | pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1 |
132 | pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0 | |
133 | ||
134 | // The remaining problem is that the result needs to be reduced | |
9e6a4409 MW |
135 | // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + |
136 | // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1 | |
137 | // such that z_0 + z_1 R = u v using the identity R = t^128: now we | |
138 | // must collapse the two halves of z together using the other | |
139 | // identity R = t^7 + t^2 + t + 1. | |
140 | // | |
141 | // We do this by working on each 32-bit word of the high half of z | |
ac4a43c1 MW |
142 | // separately, so consider x_i, for some 4 <= i < 8. Certainly, x_i |
143 | // t^{32i} = x_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) x_i t^{32(i-4)}, | |
9e6a4409 | 144 | // but we can't use that directly without breaking up the 32-bit word |
ac4a43c1 MW |
145 | // structure. Instead, we start by considering just x_i t^7 |
146 | // t^{32(i-4)}, which again looks tricky. Now, split x_i = a_i + | |
9e6a4409 MW |
147 | // t^25 b_i, with deg a_i < 25; then |
148 | // | |
ac4a43c1 | 149 | // x_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)} |
9e6a4409 | 150 | // |
ac4a43c1 | 151 | // We can similarly decompose x_i t^2 and x_i t into a pair of 32-bit |
9e6a4409 MW |
152 | // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the |
153 | // splits are different. This is lovely, with one small snag: when | |
ac4a43c1 | 154 | // we do this to x_7, we end up with a contribution back into the |
9e6a4409 MW |
155 | // t^128 coefficient word. But notice that only the low seven bits |
156 | // of this word are affected, so there's no knock-on contribution | |
157 | // into the t^32 word. Therefore, if we handle the high bits of each | |
158 | // word together, and then the low bits, everything will be fine. | |
159 | ||
160 | // First, shift the high bits down. | |
ac4a43c1 MW |
161 | movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again |
162 | movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again | |
163 | movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again | |
9e6a4409 MW |
164 | pslld xmm2, 31 // the b_i for t |
165 | pslld xmm3, 30 // the b_i for t^2 | |
166 | pslld xmm4, 25 // the b_i for t^7 | |
167 | pxor xmm2, xmm3 // add them all together | |
168 | pxor xmm2, xmm4 | |
169 | movdqa xmm3, xmm2 // and a copy for later | |
170 | psrldq xmm2, 4 // contribution into low half | |
171 | pslldq xmm3, 12 // and high half | |
172 | pxor xmm1, xmm2 | |
173 | pxor xmm0, xmm3 | |
174 | ||
175 | // And then shift the low bits up. | |
176 | movdqa xmm2, xmm0 | |
177 | movdqa xmm3, xmm0 | |
178 | pxor xmm1, xmm0 // mix in the unit contribution | |
179 | psrld xmm0, 1 | |
180 | psrld xmm2, 2 | |
181 | psrld xmm3, 7 | |
182 | pxor xmm1, xmm2 // low half, unit, and t^2 contribs | |
183 | pxor xmm0, xmm3 // t and t^7 contribs | |
184 | pxor xmm0, xmm1 // mix them together and we're done | |
185 | .endm | |
186 | ||
187 | .macro mul64 | |
188 | // Enter with u and v in the low halves of xmm0 and xmm1 | |
189 | // respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4. | |
190 | ||
191 | // The multiplication is thankfully easy. | |
ac4a43c1 | 192 | pclmullqlqdq xmm1, xmm0 // u v |
9e6a4409 MW |
193 | |
194 | // Now we must reduce. This is essentially the same as the 128-bit | |
195 | // case above, but mostly simpler because everything is smaller. The | |
196 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. | |
197 | ||
198 | // First, we must detach the top (`low'!) half of the result. | |
ac4a43c1 MW |
199 | movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again |
200 | psrldq xmm1, 8 // (x_1, x_0; 0, 0) | |
9e6a4409 MW |
201 | |
202 | // Next, shift the high bits down. | |
ac4a43c1 MW |
203 | movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again |
204 | movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again | |
205 | movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again | |
9e6a4409 MW |
206 | pslld xmm2, 31 // b_i for t |
207 | pslld xmm3, 29 // b_i for t^3 | |
208 | pslld xmm4, 28 // b_i for t^4 | |
209 | pxor xmm2, xmm3 // add them all together | |
210 | pxor xmm2, xmm4 | |
211 | movdqa xmm3, xmm2 // and a copy for later | |
212 | movq xmm2, xmm2 // zap high half | |
213 | pslldq xmm3, 4 // contribution into high half | |
214 | psrldq xmm2, 4 // and low half | |
215 | pxor xmm0, xmm3 | |
216 | pxor xmm1, xmm2 | |
217 | ||
218 | // And then shift the low bits up. | |
219 | movdqa xmm2, xmm0 | |
220 | movdqa xmm3, xmm0 | |
221 | pxor xmm1, xmm0 // mix in the unit contribution | |
222 | psrld xmm0, 1 | |
223 | psrld xmm2, 3 | |
224 | psrld xmm3, 4 | |
225 | pxor xmm1, xmm2 // low half, unit, and t^3 contribs | |
226 | pxor xmm0, xmm3 // t and t^4 contribs | |
227 | pxor xmm0, xmm1 // mix them together and we're done | |
228 | .endm | |
229 | ||
230 | .macro mul96 | |
231 | // Enter with u and v in the /high/ three words of xmm0 and xmm1 | |
232 | // respectively (and zero in the low word); leave with z = u v in the | |
233 | // high three words of xmm0, and /junk/ in the low word. Clobbers | |
234 | // xmm1--xmm4. | |
235 | ||
236 | // This is an inconvenient size. There's nothing for it but to do | |
237 | // four multiplications, as if for the 128-bit case. It's possible | |
238 | // that there's cruft in the top 32 bits of the input registers, so | |
239 | // shift both of them up by four bytes before we start. This will | |
240 | // mean that the high 64 bits of the result (from GCM's viewpoint) | |
241 | // will be zero. | |
242 | // xmm0 = // (0, u_2; u_1, u_0) | |
243 | // xmm1 = // (0, v_2; v_1, v_0) | |
244 | movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again | |
245 | movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again | |
246 | movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again | |
247 | pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 | |
248 | pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) | |
249 | pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 | |
250 | pclmulhqhqdq xmm4, xmm1 // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32 | |
251 | // + u_1 v_1 t^64 = f | |
252 | ||
253 | // Extract the high and low halves of the 192-bit result. We don't | |
254 | // need be too picky about the unused high words of the result | |
255 | // registers. The answer we want is d t^128 + e t^64 + f, where e = | |
256 | // e_0 + e_1. | |
257 | // | |
258 | // The place values for the two halves are (t^160, t^128; t^96, ?) | |
ac4a43c1 MW |
259 | // and (?, t^64; t^32, 1). But we also want to shift the high part |
260 | // left by a word, for symmetry's sake. | |
9e6a4409 MW |
261 | psrldq xmm0, 8 // (d; 0) = d t^128 |
262 | pxor xmm2, xmm3 // e = (e_0 + e_1) | |
263 | movdqa xmm1, xmm4 // f again | |
264 | pxor xmm0, xmm2 // d t^128 + e t^64 | |
265 | psrldq xmm2, 12 // e[31..0] t^64 | |
266 | psrldq xmm1, 4 // f[95..0] | |
ac4a43c1 MW |
267 | pslldq xmm4, 12 // f[127..96], shifted |
268 | pslldq xmm0, 4 // shift high 96 bits | |
9e6a4409 MW |
269 | pxor xmm1, xmm2 // low 96 bits of result |
270 | pxor xmm0, xmm4 // high 96 bits of result | |
271 | ||
9e6a4409 MW |
272 | // Finally, the reduction. This is essentially the same as the |
273 | // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 + | |
274 | // t^9 + t^6 + 1. The degrees are larger but not enough to cause | |
275 | // trouble for the general approach. | |
276 | ||
277 | // First, shift the high bits down. | |
278 | movdqa xmm2, xmm0 // copies of the high part | |
279 | movdqa xmm3, xmm0 | |
280 | movdqa xmm4, xmm0 | |
281 | pslld xmm2, 26 // b_i for t^6 | |
282 | pslld xmm3, 23 // b_i for t^9 | |
283 | pslld xmm4, 22 // b_i for t^10 | |
284 | pxor xmm2, xmm3 // add them all together | |
285 | pslldq xmm1, 4 // shift low part up to match | |
286 | pxor xmm2, xmm4 | |
287 | movdqa xmm3, xmm2 // and a copy for later | |
288 | pslldq xmm2, 8 // contribution to high half | |
289 | psrldq xmm3, 4 // contribution to low half | |
290 | pxor xmm1, xmm3 | |
291 | pxor xmm0, xmm2 | |
292 | ||
293 | // And then shift the low bits up. | |
294 | movdqa xmm2, xmm0 // copies of the high part | |
295 | movdqa xmm3, xmm0 | |
296 | pxor xmm1, xmm0 // mix in the unit contribution | |
297 | psrld xmm0, 6 | |
298 | psrld xmm2, 9 | |
299 | psrld xmm3, 10 | |
300 | pxor xmm1, xmm2 // low half, unit, and t^9 contribs | |
301 | pxor xmm0, xmm3 // t^6 and t^10 contribs | |
302 | pxor xmm0, xmm1 // mix them together and we're done | |
303 | .endm | |
304 | ||
305 | .macro mul192 | |
306 | // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave | |
307 | // with z = u v in xmm0/xmm1 -- the top halves of the high registers | |
308 | // are unimportant. Clobbers xmm2--xmm7. | |
309 | ||
310 | // Start multiplying and accumulating pieces of product. | |
311 | // xmm0 = // (u_2; u_1) | |
312 | // xmm1 = // (u_0; ?) | |
313 | // xmm2 = // (v_2; v_1) | |
314 | // xmm3 = // (v_0; ?) | |
315 | movdqa xmm4, xmm0 // (u_2; u_1) again | |
316 | movdqa xmm5, xmm0 // (u_2; u_1) yet again | |
317 | movdqa xmm6, xmm0 // (u_2; u_1) again again | |
ac4a43c1 MW |
318 | movdqa xmm7, xmm3 // (v_0; ?) again |
319 | punpcklqdq xmm3, xmm1 // (v_0; u_0) | |
9e6a4409 | 320 | pclmulhqhqdq xmm4, xmm2 // u_1 v_1 |
ac4a43c1 | 321 | pclmullqlqdq xmm1, xmm2 // u_0 v_2 |
9e6a4409 MW |
322 | pclmullqhqdq xmm5, xmm2 // u_2 v_1 |
323 | pclmulhqlqdq xmm6, xmm2 // u_1 v_2 | |
ac4a43c1 MW |
324 | pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1 |
325 | pclmullqlqdq xmm7, xmm0 // u_2 v_0 | |
9e6a4409 MW |
326 | pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 |
327 | movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny | |
ac4a43c1 | 328 | pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 |
9e6a4409 | 329 | pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 |
ac4a43c1 MW |
330 | pclmulhqlqdq xmm6, xmm3 // u_1 v_0 |
331 | pclmulhqhqdq xmm2, xmm3 // u_0 v_1 | |
332 | pclmullqhqdq xmm3, xmm3 // e = u_0 v_0 | |
333 | pxor xmm6, xmm2 // d = u_1 v_0 + u_0 v_1 | |
9e6a4409 | 334 | |
ac4a43c1 MW |
335 | // Next, the piecing together of the product. There's significant |
336 | // work here to leave the completed pieces in sensible registers. | |
9e6a4409 MW |
337 | // xmm0 = // (a_1; a_0) = a = u_2 v_2 |
338 | // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 | |
ac4a43c1 | 339 | // xmm1 = // (c_1; c_0) = c = u_0 v_2 + |
9e6a4409 | 340 | // u_1 v_1 + u_2 v_0 |
ac4a43c1 MW |
341 | // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 |
342 | // xmm3 = // (e_1; e_0) = e = u_0 v_0 | |
343 | // xmm2, xmm4, xmm7 spare | |
344 | movdqa xmm2, xmm6 // (d_1; d_0) again | |
345 | movdqa xmm4, xmm5 // (b_1; b_0) again | |
346 | pslldq xmm6, 8 // (0; d_1) | |
9e6a4409 | 347 | psrldq xmm5, 8 // (b_0; 0) |
ac4a43c1 MW |
348 | psrldq xmm2, 8 // (d_0; 0) |
349 | pslldq xmm4, 8 // (0; b_1) | |
350 | pxor xmm5, xmm6 // (b_0; d_1) | |
351 | pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1) | |
352 | pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0) | |
353 | pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1) | |
9e6a4409 MW |
354 | |
355 | // Next, the reduction. Our polynomial this time is p(t) = t^192 + | |
356 | // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the | |
357 | // 128-bit case. I don't know why. | |
358 | ||
359 | // First, shift the high bits down. | |
ac4a43c1 MW |
360 | // xmm0 = // (x_5; x_4) |
361 | // xmm1 = // (x_3; x_2) | |
362 | // xmm2 = // (x_1; x_0) | |
9e6a4409 | 363 | // xmm3--xmm7 spare |
ac4a43c1 MW |
364 | movdqa xmm3, xmm0 // (x_5; x_4) copy |
365 | movdqa xmm4, xmm0 // (x_5; x_4) copy | |
366 | movdqa xmm5, xmm0 // (x_5; x_4) copy | |
367 | pslld xmm3, 31 // (x_5; x_4) b_i for t | |
368 | pslld xmm4, 30 // (x_5; x_4) b_i for t^2 | |
369 | pslld xmm5, 25 // (x_5; x_4) b_i for t^7 | |
370 | movq xmm6, xmm1 // (x_3; 0) copy | |
9e6a4409 | 371 | pxor xmm3, xmm4 |
ac4a43c1 | 372 | movq xmm7, xmm1 // (x_3; 0) copy |
9e6a4409 | 373 | pxor xmm3, xmm5 |
ac4a43c1 MW |
374 | movq xmm5, xmm1 // (x_3; 0) copy |
375 | movdqa xmm4, xmm3 // (x_5; x_4) b_i combined | |
376 | pslld xmm6, 31 // (x_3; 0) b_i for t | |
377 | pslld xmm7, 30 // (x_3; 0) b_i for t^2 | |
378 | pslld xmm5, 25 // (x_3; 0) b_i for t^7 | |
379 | psrldq xmm3, 12 // (x_5; x_4) low contrib | |
380 | pslldq xmm4, 4 // (x_5; x_4) high contrib | |
9e6a4409 MW |
381 | pxor xmm6, xmm7 |
382 | pxor xmm2, xmm3 | |
383 | pxor xmm6, xmm5 | |
384 | pxor xmm1, xmm4 | |
385 | pslldq xmm6, 4 | |
386 | pxor xmm2, xmm6 | |
387 | ||
388 | // And finally shift the low bits up. Unfortunately, we also have to | |
389 | // split the low bits out. | |
ac4a43c1 MW |
390 | // xmm0 = // (x'_5; x'_4) |
391 | // xmm1 = // (x'_3; x'_2) | |
392 | // xmm2 = // (x'_1; x'_0) | |
393 | movdqa xmm5, xmm1 // copies of (x'_3; x'_2) | |
9e6a4409 MW |
394 | movdqa xmm6, xmm1 |
395 | movdqa xmm7, xmm1 | |
ac4a43c1 MW |
396 | psrldq xmm1, 8 // bring down (x'_2; ?) |
397 | movdqa xmm3, xmm0 // copies of (x'_5; x'_4) | |
9e6a4409 | 398 | movdqa xmm4, xmm0 |
ac4a43c1 MW |
399 | punpcklqdq xmm1, xmm2 // (x'_2; x'_1) |
400 | psrldq xmm2, 8 // (x'_0; ?) | |
9e6a4409 MW |
401 | pxor xmm2, xmm5 // low half and unit contrib |
402 | pxor xmm1, xmm0 | |
403 | psrld xmm5, 1 | |
404 | psrld xmm0, 1 | |
405 | psrld xmm6, 2 | |
406 | psrld xmm3, 2 | |
407 | psrld xmm7, 7 | |
408 | psrld xmm4, 7 | |
409 | pxor xmm2, xmm6 // low half, unit, t^2 contribs | |
410 | pxor xmm1, xmm3 | |
411 | pxor xmm5, xmm7 // t and t^7 contribs | |
412 | pxor xmm0, xmm4 | |
413 | pxor xmm5, xmm2 // mix everything together | |
414 | pxor xmm0, xmm1 | |
415 | movq xmm1, xmm5 // shunt (z_0; ?) into proper place | |
416 | .endm | |
417 | ||
418 | .macro mul256 | |
419 | // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave | |
420 | // with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86, | |
421 | // requires 16 bytes aligned space at SP; on amd64, also clobbers | |
422 | // xmm8. | |
423 | ||
424 | // Now it's starting to look worthwhile to do Karatsuba. Suppose | |
425 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then | |
426 | // | |
427 | // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 | |
428 | // | |
429 | // Let the coefficients of B^i be a, b, and c, respectively, and | |
430 | // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that | |
431 | // | |
432 | // q = r s = (u_0 + u_1) (v_0 + v_1) | |
433 | // = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0) | |
9f4db500 | 434 | // = a + c + b |
9e6a4409 MW |
435 | // |
436 | // The first two terms we've already calculated; the last is the | |
437 | // remaining one we want. We'll set B = t^128. We know how to do | |
438 | // 128-bit multiplications already, and Karatsuba is too annoying | |
439 | // there, so there'll be 12 multiplications altogether, rather than | |
440 | // the 16 we'd have if we did this the naïve way. | |
441 | // | |
442 | // On x86, there aren't quite enough registers, so spill one for a | |
443 | // bit. On AMD64, we can keep on going, so it's all good. | |
444 | ||
445 | // xmm0 = // u_1 = (u_11; u_10) | |
446 | // xmm1 = // u_0 = (u_01; u_00) | |
447 | // xmm2 = // v_1 = (v_11; v_10) | |
448 | // xmm3 = // v_0 = (v_01; v_00) | |
449 | movdqa xmm4, xmm0 // u_1 again | |
450 | #if CPUFAM_X86 | |
a90d420c | 451 | movdqa [SP + 0], xmm3 |
9e6a4409 MW |
452 | #elif CPUFAM_AMD64 |
453 | movdqa xmm8, xmm3 | |
454 | # define V0 xmm8 | |
455 | #endif | |
456 | pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) | |
457 | pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) | |
458 | ||
459 | // Start by building the cross product, q = u_* v_*. | |
460 | movdqa xmm7, xmm4 // more copies of u_* | |
461 | movdqa xmm5, xmm4 | |
462 | movdqa xmm6, xmm4 | |
463 | pclmullqhqdq xmm4, xmm3 // u_*1 v_*0 | |
464 | pclmulhqlqdq xmm7, xmm3 // u_*0 v_*1 | |
465 | pclmullqlqdq xmm5, xmm3 // u_*1 v_*1 | |
466 | pclmulhqhqdq xmm6, xmm3 // u_*0 v_*0 | |
467 | pxor xmm4, xmm7 // u_*1 v_*0 + u_*0 v_*1 | |
468 | movdqa xmm7, xmm4 | |
469 | pslldq xmm4, 8 | |
470 | psrldq xmm7, 8 | |
471 | pxor xmm5, xmm4 // q_1 | |
472 | pxor xmm6, xmm7 // q_0 | |
473 | ||
474 | // Next, work on the high half, a = u_1 v_1. | |
475 | movdqa xmm3, xmm0 // more copies of u_1 | |
476 | movdqa xmm4, xmm0 | |
477 | movdqa xmm7, xmm0 | |
478 | pclmullqhqdq xmm0, xmm2 // u_11 v_10 | |
479 | pclmulhqlqdq xmm3, xmm2 // u_10 v_11 | |
480 | pclmullqlqdq xmm4, xmm2 // u_11 v_11 | |
481 | pclmulhqhqdq xmm7, xmm2 // u_10 v_10 | |
482 | #if CPUFAM_X86 | |
a90d420c | 483 | movdqa xmm2, [SP + 0] |
9e6a4409 MW |
484 | # define V0 xmm2 |
485 | #endif | |
486 | pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10 | |
487 | movdqa xmm3, xmm0 | |
488 | pslldq xmm0, 8 | |
489 | psrldq xmm3, 8 | |
ac4a43c1 | 490 | pxor xmm4, xmm0 // x_3 = a_1 |
9e6a4409 MW |
491 | pxor xmm7, xmm3 // a_0 |
492 | ||
493 | // Mix that into the product now forming in xmm4--xmm7. | |
494 | pxor xmm5, xmm4 // a_1 + q_1 | |
495 | pxor xmm6, xmm7 // a_0 + q_0 | |
496 | pxor xmm5, xmm7 // a_0 + (a_1 + q_1) | |
497 | ||
498 | // Finally, the low half, c = u_0 v_0. | |
499 | movdqa xmm0, xmm1 // more copies of u_0 | |
500 | movdqa xmm3, xmm1 | |
501 | movdqa xmm7, xmm1 | |
502 | pclmullqhqdq xmm1, V0 // u_01 v_00 | |
503 | pclmulhqlqdq xmm0, V0 // u_00 v_01 | |
504 | pclmullqlqdq xmm3, V0 // u_01 v_01 | |
505 | pclmulhqhqdq xmm7, V0 // u_00 v_00 | |
506 | pxor xmm0, xmm1 // u_01 v_00 + u_00 v_01 | |
507 | movdqa xmm1, xmm0 | |
508 | pslldq xmm0, 8 | |
509 | psrldq xmm1, 8 | |
510 | pxor xmm3, xmm0 // c_1 | |
511 | pxor xmm7, xmm1 // x_0 = c_0 | |
512 | ||
513 | // And mix that in to complete the product. | |
514 | pxor xmm6, xmm3 // (a_0 + q_0) + c_1 | |
515 | pxor xmm5, xmm3 // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1 | |
516 | pxor xmm6, xmm7 // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1 | |
517 | ||
518 | #undef V0 | |
519 | ||
9e6a4409 MW |
520 | // Now we must reduce. This is essentially the same as the 128-bit |
521 | // case above, but more complicated because everything is bigger. | |
522 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. | |
523 | ||
524 | // First, shift the high bits down. | |
ac4a43c1 MW |
525 | movdqa xmm0, xmm4 // x_3 again |
526 | movdqa xmm1, xmm4 // x_3 yet again | |
527 | movdqa xmm2, xmm4 // x_3 again again | |
528 | pslld xmm0, 30 // x_3: b_i for t^2 | |
529 | pslld xmm1, 27 // x_3: b_i for t^5 | |
530 | pslld xmm2, 22 // x_3: b_i for t^10 | |
531 | movdqa xmm3, xmm5 // x_2 again | |
532 | pxor xmm0, xmm1 | |
533 | movdqa xmm1, xmm5 // x_2 again | |
534 | pxor xmm0, xmm2 // b_3 | |
535 | movdqa xmm2, xmm5 // x_2 again | |
536 | pslld xmm3, 30 // x_2: b_i for t^2 | |
537 | pslld xmm1, 27 // x_2: b_i for t^5 | |
538 | pslld xmm2, 22 // x_2: b_i for t^10 | |
539 | pxor xmm3, xmm1 | |
540 | movdqa xmm1, xmm0 | |
541 | pxor xmm3, xmm2 // b_2 | |
542 | psrldq xmm0, 4 | |
543 | movdqa xmm2, xmm3 | |
544 | pslldq xmm1, 12 | |
545 | psrldq xmm3, 4 | |
546 | pxor xmm6, xmm0 | |
547 | pslldq xmm2, 12 | |
548 | pxor xmm7, xmm3 | |
549 | pxor xmm5, xmm1 | |
550 | pxor xmm6, xmm2 | |
9e6a4409 MW |
551 | |
552 | // And then shift the low bits up. | |
ac4a43c1 MW |
553 | movdqa xmm0, xmm4 // x_3 again |
554 | movdqa xmm1, xmm5 // x_2 again | |
555 | movdqa xmm2, xmm4 // x_3 yet again | |
556 | movdqa xmm3, xmm5 // x_2 yet again | |
557 | pxor xmm6, xmm4 // x_1 and unit contrib from x_3 | |
558 | pxor xmm7, xmm5 // x_0 and unit contrib from x_2 | |
559 | psrld xmm4, 2 | |
560 | psrld xmm5, 2 | |
561 | psrld xmm0, 5 | |
562 | psrld xmm1, 5 | |
563 | psrld xmm2, 10 | |
564 | psrld xmm3, 10 | |
565 | pxor xmm4, xmm6 // x_1, with x_3 units and t^2 | |
566 | pxor xmm5, xmm7 // x_0, with x_2 units and t^2 | |
567 | pxor xmm0, xmm2 // x_3 t^5 and t^10 contribs | |
568 | pxor xmm1, xmm3 // x_2 t^5 and t^10 contribs | |
9e6a4409 MW |
569 | pxor xmm0, xmm4 // high half of reduced result |
570 | pxor xmm1, xmm5 // low half; all done | |
571 | .endm | |
572 | ||
573 | ///-------------------------------------------------------------------------- | |
574 | /// Main code. | |
575 | ||
576 | // There are a number of representations of field elements in this code and | |
577 | // it can be confusing. | |
578 | // | |
579 | // * The `external format' consists of a sequence of contiguous bytes in | |
580 | // memory called a `block'. The GCM spec explains how to interpret this | |
581 | // block as an element of a finite field. As discussed extensively, this | |
582 | // representation is very annoying for a number of reasons. On the other | |
583 | // hand, this code never actually deals with it directly. | |
584 | // | |
585 | // * The `register format' consists of one or more XMM registers, depending | |
586 | // on the block size. The bytes in these registers are in reverse order | |
587 | // -- so the least-significant byte of the lowest-numbered register holds | |
588 | // the /last/ byte in the block. If the block size is not a multiple of | |
589 | // 16 bytes, then there must be padding. 96-bit blocks are weird: the | |
590 | // padding is inserted at the /least/ significant end, so the register | |
591 | // holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most | |
592 | // significant end. | |
593 | // | |
594 | // * The `words' format consists of a sequence of bytes, as in the | |
595 | // `external format', but, according to the blockcipher in use, the bytes | |
596 | // within each 32-bit word may be reversed (`big-endian') or not | |
597 | // (`little-endian'). Accordingly, there are separate entry points for | |
598 | // each variant, identified with `b' or `l'. | |
599 | ||
600 | #define SSEFUNC(f) \ | |
601 | FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC; \ | |
602 | FUNC(f) | |
603 | ||
604 | SSEFUNC(gcm_mulk_128b_x86ish_pclmul) | |
605 | // On entry, A points to a 128-bit field element in big-endian words | |
606 | // format; K points to a field-element in register format. On exit, | |
607 | // A is updated with the product A K. | |
608 | ||
609 | #if CPUFAM_X86 | |
a90d420c MW |
610 | mov A, [SP + 4] |
611 | mov K, [SP + 8] | |
9e6a4409 MW |
612 | #endif |
613 | endprologue | |
614 | movdqu xmm0, [A] | |
615 | movdqu xmm1, [K] | |
616 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | |
617 | mul128 | |
618 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | |
619 | movdqu [A], xmm0 | |
620 | ret | |
621 | ENDFUNC | |
622 | ||
623 | SSEFUNC(gcm_mulk_128l_x86ish_pclmul) | |
624 | // On entry, A points to a 128-bit field element in little-endian | |
625 | // words format; K points to a field-element in register format. On | |
626 | // exit, A is updated with the product A K. | |
627 | ||
628 | #if CPUFAM_X86 | |
a90d420c MW |
629 | mov A, [SP + 4] |
630 | mov K, [SP + 8] | |
9e6a4409 MW |
631 | ldgot ecx |
632 | #endif | |
633 | endprologue | |
634 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | |
635 | movdqu xmm0, [A] | |
636 | movdqu xmm1, [K] | |
637 | pshufb xmm0, xmm7 | |
638 | mul128 | |
639 | pshufb xmm0, xmm7 | |
640 | movdqu [A], xmm0 | |
641 | ret | |
642 | ENDFUNC | |
643 | ||
644 | SSEFUNC(gcm_mulk_64b_x86ish_pclmul) | |
645 | // On entry, A points to a 64-bit field element in big-endian words | |
646 | // format; K points to a field-element in register format. On exit, | |
647 | // A is updated with the product A K. | |
648 | ||
649 | #if CPUFAM_X86 | |
a90d420c MW |
650 | mov A, [SP + 4] |
651 | mov K, [SP + 8] | |
9e6a4409 MW |
652 | #endif |
653 | endprologue | |
654 | movq xmm0, [A] | |
655 | movq xmm1, [K] | |
656 | pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) | |
657 | mul64 | |
658 | pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) | |
659 | movq [A], xmm0 | |
660 | ret | |
661 | ENDFUNC | |
662 | ||
663 | SSEFUNC(gcm_mulk_64l_x86ish_pclmul) | |
664 | // On entry, A points to a 64-bit field element in little-endian | |
665 | // words format; K points to a field-element in register format. On | |
666 | // exit, A is updated with the product A K. | |
667 | ||
668 | #if CPUFAM_X86 | |
a90d420c MW |
669 | mov A, [SP + 4] |
670 | mov K, [SP + 8] | |
9e6a4409 MW |
671 | ldgot ecx |
672 | #endif | |
673 | endprologue | |
674 | movdqa xmm7, [INTADDR(swaptab_64l, ecx)] | |
675 | movq xmm0, [A] | |
676 | movq xmm1, [K] | |
677 | pshufb xmm0, xmm7 | |
678 | mul64 | |
679 | pshufb xmm0, xmm7 | |
680 | movq [A], xmm0 | |
681 | ret | |
682 | ENDFUNC | |
683 | ||
684 | SSEFUNC(gcm_mulk_96b_x86ish_pclmul) | |
685 | // On entry, A points to a 96-bit field element in big-endian words | |
686 | // format; K points to a field-element in register format (i.e., 16 | |
687 | // bytes, with the first four bytes zero). On exit, A is updated | |
688 | // with the product A K. | |
689 | ||
690 | #if CPUFAM_X86 | |
a90d420c MW |
691 | mov A, [SP + 4] |
692 | mov K, [SP + 8] | |
9e6a4409 MW |
693 | #endif |
694 | endprologue | |
695 | movq xmm0, [A + 0] | |
696 | movd xmm2, [A + 8] | |
697 | movdqu xmm1, [K] | |
698 | punpcklqdq xmm0, xmm2 | |
699 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | |
700 | mul96 | |
701 | pshufd xmm1, xmm0, SHUF(3, 2, 1, 0) | |
702 | psrldq xmm0, 4 | |
703 | movq [A + 0], xmm1 | |
704 | movd [A + 8], xmm0 | |
705 | ret | |
706 | ENDFUNC | |
707 | ||
708 | SSEFUNC(gcm_mulk_96l_x86ish_pclmul) | 
709 | // On entry, A points to a 96-bit field element in little-endian | 
710 | // words format; K points to a field-element in register format | 
711 | // (i.e., 16 bytes, with the first four bytes zero). On exit, A is | 
712 | // updated with the product A K. | 
713 | ||
714 | #if CPUFAM_X86 | 
	// On plain x86, fetch the stack arguments, and set up ecx as the
	// GOT pointer so that INTADDR below can find the swap table in
	// position-independent code.
a90d420c MW |
715 | mov A, [SP + 4] |
716 | mov K, [SP + 8] | 
9e6a4409 MW |
717 | ldgot ecx |
718 | #endif | 
719 | endprologue | 
	// Keep the byte-swap table in xmm7: it's needed both before and
	// after the multiplication.
720 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | 
	// Fetch the operand as a 64-bit piece plus a 32-bit piece, merged
	// into the low three words of xmm0; fetch all 16 bytes of K.
721 | movq xmm0, [A + 0] | 
722 | movd xmm2, [A + 8] | 
723 | movdqu xmm1, [K] | 
724 | punpcklqdq xmm0, xmm2 | 
	// Byte-swap little-endian words format into the internal register
	// format, multiply, and swap back.
725 | pshufb xmm0, xmm7 | 
726 | mul96 | 
727 | pshufb xmm0, xmm7 | 
	// Store the 96-bit product as a 64-bit piece plus a 32-bit piece.
728 | movq [A + 0], xmm0 | 
729 | psrldq xmm0, 8 | 
730 | movd [A + 8], xmm0 | 
731 | ret | 
732 | ENDFUNC | 
733 | ||
734 | SSEFUNC(gcm_mulk_192b_x86ish_pclmul) | 
735 | // On entry, A points to a 192-bit field element in big-endian words | 
736 | // format; K points to a field-element in register format. On exit, | 
737 | // A is updated with the product A K. | 
738 | ||
739 | #if CPUFAM_X86 | 
	// On plain x86, the arguments arrive on the stack.
a90d420c MW |
740 | mov A, [SP + 4] |
741 | mov K, [SP + 8] | 
9e6a4409 MW |
742 | #endif |
743 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6 and xmm7 are callee-saved in the Windows ABI and are
	// evidently used by mul192, so spill them; the extra 8 bytes keep
	// the stack 16-byte aligned.
744 | stalloc 2*16 + 8 | 
745 | savexmm xmm6, 0 | 
746 | savexmm xmm7, 16 | 
747 | #endif | 
748 | endprologue | 
	// Fetch the operand as a 128-bit piece (high) and a 64-bit piece
	// (low), and K as 16 + 8 bytes.
749 | movdqu xmm0, [A + 8] | 
750 | movq xmm1, [A + 0] | 
751 | movdqu xmm2, [K + 0] | 
752 | movq xmm3, [K + 16] | 
	// Shuffle the 32-bit words of both pieces from big-endian words
	// format into the internal register format for mul192.
753 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
754 | pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) | 
755 | mul192 | 
	// Shuffle the product back, and store it in the same 16 + 8 split.
756 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
757 | pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) | 
758 | movdqu [A + 8], xmm0 | 
759 | movq [A + 0], xmm1 | 
760 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
761 | rstrxmm xmm6, 0 | 
762 | rstrxmm xmm7, 16 | 
763 | stfree 2*16 + 8 | 
764 | #endif | 
765 | ret | 
766 | ENDFUNC | 
767 | ||
768 | SSEFUNC(gcm_mulk_192l_x86ish_pclmul) | 
769 | // On entry, A points to a 192-bit field element in little-endian | 
770 | // words format; K points to a field-element in register format. On | 
771 | // exit, A is updated with the product A K. | 
772 | ||
773 | #if CPUFAM_X86 | 
	// On plain x86, fetch the stack arguments and set up ecx as the GOT
	// pointer for the INTADDR table references below.
a90d420c MW |
774 | mov A, [SP + 4] |
775 | mov K, [SP + 8] | 
9e6a4409 MW |
776 | ldgot ecx |
777 | #endif | 
778 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6 and xmm7 are callee-saved in the Windows ABI and are
	// evidently used by mul192, so spill them; the extra 8 bytes keep
	// the stack 16-byte aligned.
779 | stalloc 2*16 + 8 | 
780 | savexmm xmm6, 0 | 
781 | savexmm xmm7, 16 | 
782 | #endif | 
783 | endprologue | 
	// Fetch the operand as a 128-bit piece (high) and a 64-bit piece
	// (low), and K as 16 + 8 bytes.
784 | movdqu xmm0, [A + 8] | 
785 | movq xmm1, [A + 0] | 
786 | movdqu xmm2, [K + 0] | 
787 | movq xmm3, [K + 16] | 
	// Byte-swap both pieces from little-endian words format into the
	// internal register format: the 128-bit piece uses the full-width
	// table, the 64-bit piece the 64-bit one.  (The tables are fetched
	// from memory each time rather than cached in a register.)
788 | pshufb xmm0, [INTADDR(swaptab_128l, ecx)] | 
789 | pshufb xmm1, [INTADDR(swaptab_64l, ecx)] | 
790 | mul192 | 
	// Swap the product back, and store it in the same 16 + 8 split.
791 | pshufb xmm0, [INTADDR(swaptab_128l, ecx)] | 
792 | pshufb xmm1, [INTADDR(swaptab_64l, ecx)] | 
793 | movdqu [A + 8], xmm0 | 
794 | movq [A + 0], xmm1 | 
795 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
796 | rstrxmm xmm6, 0 | 
797 | rstrxmm xmm7, 16 | 
798 | stfree 2*16 + 8 | 
799 | #endif | 
800 | ret | 
801 | ENDFUNC | 
802 | ||
803 | SSEFUNC(gcm_mulk_256b_x86ish_pclmul) | 
804 | // On entry, A points to a 256-bit field element in big-endian words | 
805 | // format; K points to a field-element in register format. On exit, | 
806 | // A is updated with the product A K. | 
807 | ||
808 | #if CPUFAM_X86 | 
	// On plain x86, establish a frame pointer so that SP can be
	// realigned: the arguments are now at [SP + 8] and [SP + 12]
	// because of the pushed BP.  The 16 bytes of 16-aligned stack
	// scratch are presumably needed by mul256 -- TODO confirm against
	// the macro definition earlier in the file.
a90d420c | 809 | pushreg BP |
9e6a4409 | 810 | setfp |
a90d420c MW |
811 | mov A, [SP + 8] |
812 | mov K, [SP + 12] | 
6d2bd7f1 | 813 | stalloc 16 |
a90d420c | 814 | and SP, ~15 |
9e6a4409 MW |
815 | #endif |
816 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6--xmm8 are callee-saved in the Windows ABI and are evidently
	// used by mul256, so spill them; the extra 8 bytes keep the stack
	// 16-byte aligned.
817 | stalloc 3*16 + 8 | 
818 | savexmm xmm6, 0 | 
819 | savexmm xmm7, 16 | 
820 | savexmm xmm8, 32 | 
821 | #endif | 
822 | endprologue | 
	// Fetch the operand and K, each as two 128-bit pieces (high piece
	// first into xmm0/xmm2).
823 | movdqu xmm0, [A + 16] | 
824 | movdqu xmm1, [A + 0] | 
825 | movdqu xmm2, [K + 0] | 
826 | movdqu xmm3, [K + 16] | 
	// Reverse the 32-bit words of both halves to convert big-endian
	// words format into the internal register format, multiply, and
	// convert back.
827 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
828 | pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) | 
829 | mul256 | 
830 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
831 | pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) | 
	// Store the 256-bit product as two 128-bit pieces.
832 | movdqu [A + 16], xmm0 | 
833 | movdqu [A + 0], xmm1 | 
834 | #if CPUFAM_X86 | 
	// Tear down the x86 frame.
835 | dropfp | 
a90d420c | 836 | popreg BP |
9e6a4409 MW |
837 | #endif |
838 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
839 | rstrxmm xmm6, 0 | 
840 | rstrxmm xmm7, 16 | 
841 | rstrxmm xmm8, 32 | 
842 | stfree 3*16 + 8 | 
843 | #endif | 
844 | ret | 
845 | ENDFUNC | 
846 | ||
847 | SSEFUNC(gcm_mulk_256l_x86ish_pclmul) | 
848 | // On entry, A points to a 256-bit field element in little-endian | 
849 | // words format; K points to a field-element in register format. On | 
850 | // exit, A is updated with the product A K. | 
851 | ||
852 | #if CPUFAM_X86 | 
	// On plain x86, establish a frame pointer so that SP can be
	// realigned (arguments then at [SP + 8] and [SP + 12]), load the
	// GOT pointer into ecx for INTADDR, and leave 16 bytes of
	// 16-aligned stack scratch -- presumably for mul256; TODO confirm
	// against the macro definition earlier in the file.
a90d420c | 853 | pushreg BP |
9e6a4409 | 854 | setfp |
a90d420c MW |
855 | mov A, [SP + 8] |
856 | mov K, [SP + 12] | 
6d2bd7f1 | 857 | stalloc 16 |
9e6a4409 | 858 | ldgot ecx |
6d2bd7f1 | 859 | and SP, ~15 |
9e6a4409 MW |
860 | #endif |
861 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6--xmm8 are callee-saved in the Windows ABI and are evidently
	// used by mul256, so spill them; the extra 8 bytes keep the stack
	// 16-byte aligned.
862 | stalloc 3*16 + 8 | 
863 | savexmm xmm6, 0 | 
864 | savexmm xmm7, 16 | 
865 | savexmm xmm8, 32 | 
866 | #endif | 
867 | endprologue | 
	// Load the byte-swap table, and fetch the operand and K, each as
	// two 128-bit pieces (high piece first into xmm0/xmm2).
868 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | 
869 | movdqu xmm0, [A + 16] | 
870 | movdqu xmm1, [A + 0] | 
871 | movdqu xmm2, [K + 0] | 
872 | movdqu xmm3, [K + 16] | 
	// Byte-swap both halves into the internal register format and
	// multiply.
873 | pshufb xmm0, xmm7 | 
874 | pshufb xmm1, xmm7 | 
875 | mul256 | 
	// mul256 overwrites xmm7 (hence the save on Win64 above), so the
	// swap table must be reloaded before converting the product back.
876 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | 
877 | pshufb xmm0, xmm7 | 
878 | pshufb xmm1, xmm7 | 
	// Store the 256-bit product as two 128-bit pieces.
879 | movdqu [A + 16], xmm0 | 
880 | movdqu [A + 0], xmm1 | 
881 | #if CPUFAM_X86 | 
	// Tear down the x86 frame.
882 | dropfp | 
a90d420c | 883 | popreg BP |
9e6a4409 MW |
884 | #endif |
885 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
886 | rstrxmm xmm6, 0 | 
887 | rstrxmm xmm7, 16 | 
888 | rstrxmm xmm8, 32 | 
889 | stfree 3*16 + 8 | 
890 | #endif | 
891 | ret | 
892 | ENDFUNC | 
893 | ||
894 | RODATA | 
895 | ||
	// PSHUFB shuffle-control tables for converting between
	// little-endian words format and the internal register format.
896 | .balign 16 | 
897 | swaptab_128l: | 
898 | // Table for byte-swapping little-endian words-format blocks larger | 
899 | // than 64 bits. | 
900 | .byte 15, 14, 13, 12, 11, 10, 9, 8 | 
901 | .byte 7, 6, 5, 4, 3, 2, 1, 0 | 
902 | ||
903 | .balign 16 | 
904 | swaptab_64l: | 
905 | // Table for byte-swapping 64-bit little-endian words-format blocks. | 
	// A PSHUFB index with its top bit set writes a zero byte, so the
	// 255 entries clear the top eight bytes of the destination.
906 | .byte 7, 6, 5, 4, 3, 2, 1, 0 | 
907 | .byte 255, 255, 255, 255, 255, 255, 255, 255 | 
909 | ///----- That's all, folks -------------------------------------------------- |