x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
[catacomb] / symm / gcm-x86ish-pclmul.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for x86 processors
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	.pclmul

	.text

///--------------------------------------------------------------------------
/// Common register allocation.

#if CPUFAM_X86
# define A eax
# define K edx
#elif CPUFAM_AMD64 && ABI_SYSV
# define A rdi
# define K rsi
#elif CPUFAM_AMD64 && ABI_WIN
# define A rcx
# define K rdx
#endif

///--------------------------------------------------------------------------
/// Multiplication macros.

	// The good news is that we have a fancy instruction to do the
	// multiplications.  The bad news is that it's not particularly well-
	// suited to the job.
	//
	// For one thing, it only does a 64-bit multiplication, so in general
	// we'll need to synthesize the full-width multiply by hand.  For
	// another thing, it doesn't help with the reduction, so we have to
	// do that by hand too.  And, finally, GCM has crazy bit ordering,
	// and the instruction does nothing useful for that at all.
	//
	// Focusing on that last problem first: the bits aren't in monotonic
	// significance order unless we permute them.  If we reverse the byte
	// order, then we'll have the bits in monotonic order, but backwards,
	// so the degree-0 coefficient will be in the most-significant bit.
	//
	// This is less of a difficulty than it seems at first, because
	// algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
	// SUM_{0<=j<n} v_j t^j; then
	//
	//	u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
	//
	// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
	// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
	// Then
	//
	//	ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
	//	    = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
	//
	// which is almost the bit-reversal of u v, only it's shifted right
	// by one place.  Oh, well: we'll have to shift it back later.
	//
	// That was important to think about, but there's not a great deal to
	// do about it yet other than to convert what we've got from the
	// blockcipher's byte-ordering convention to our big-endian
	// convention.  Since this depends on the blockcipher convention,
	// we'll leave the caller to cope with this: the macros here will
	// assume that the operands are in `register' format, which is the
	// byte-reversal of the external representation, padded at the
	// most-significant end except for 96-bit blocks, which are
	// zero-padded at the least-significant end (see `mul96' for the
	// details).  In the commentary, pieces of polynomial are numbered
	// according to the degree of their coefficients, so the unit
	// coefficient of some polynomial a is in a_0.
	//
	// The commentary for `mul128' is the most detailed.  The other
	// macros assume that you've already read and understood that.

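	// If you want to convince yourself of the shifted-reversal
	// identity above, it's quick to check exhaustively on tiny
	// operands.  The following standalone C program (a sketch for
	// exposition -- it's not part of Catacomb) multiplies 8-bit
	// polynomials both ways and compares:
	//
	//	#include <assert.h>
	//	#include <stdint.h>
	//
	//	/* Carryless (GF(2)[t]) product of 8-bit polynomials. */
	//	static uint16_t clmul8(uint8_t u, uint8_t v)
	//	{
	//	  uint16_t z = 0;
	//	  for (int i = 0; i < 8; i++)
	//	    if ((u >> i)&1) z ^= (uint16_t)v << i;
	//	  return (z);
	//	}
	//
	//	static uint8_t rev8(uint8_t x)	/* bit-reverse a byte */
	//	{
	//	  uint8_t y = 0;
	//	  for (int i = 0; i < 8; i++) y = (y << 1) | ((x >> i)&1);
	//	  return (y);
	//	}
	//
	//	static uint16_t rev16(uint16_t x) /* bit-reverse 16 bits */
	//	  { return ((uint16_t)(rev8(x&0xff) << 8) | rev8(x >> 8)); }
	//
	//	int main(void)
	//	{
	//	  for (unsigned u = 0; u < 256; u++)
	//	    for (unsigned v = 0; v < 256; v++)
	//	      assert(clmul8(rev8(u), rev8(v)) ==
	//		     rev16(clmul8(u, v)) >> 1);
	//	  return (0);
	//	}
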
.macro mul128
	// Enter with u and v in xmm0 and xmm1 respectively; leave with z =
	// u v in xmm0.  Clobbers xmm1--xmm4.

	// First for the double-precision multiplication.  It's tempting to
	// use Karatsuba's identity here, but I suspect that loses more in
	// the shifting, bit-twiddling, and dependency chains that it gains
	// in saving a multiplication which otherwise pipelines well.
	//   xmm0 =			// (u_1; u_0)
	//   xmm1 =			// (v_1; v_0)
	movdqa	xmm2, xmm1		// (v_1; v_0) again
	movdqa	xmm3, xmm0		// (u_1; u_0) again
	movdqa	xmm4, xmm0		// (u_1; u_0) yet again
	pclmulhqlqdq xmm2, xmm0		// u_1 v_0
	pclmullqlqdq xmm0, xmm1		// u_1 v_1
	pclmulhqlqdq xmm3, xmm1		// u_0 v_1
	pclmulhqhqdq xmm4, xmm1		// u_0 v_0

	// Arrange the pieces to form a double-precision polynomial.
	pxor	xmm2, xmm3		// (m_1; m_0) = u_1 v_0 + u_0 v_1
	movdqa	xmm1, xmm2		// (m_1; m_0) again
	pslldq	xmm2, 8			// (0; m_1)
	psrldq	xmm1, 8			// (m_0; 0)
	pxor	xmm0, xmm2		// x_1 = u_1 v_1 + m_1
	pxor	xmm1, xmm4		// x_0 = u_0 v_0 + t^64 m_0

	// Two problems remain.  The first is that this product is shifted
	// left (from GCM's backwards perspective) by one place, which is
	// annoying.  Let's take care of that now.  Once this is done, we'll
	// be properly in GCM's backwards bit-ordering, so xmm1 will hold the
	// low half of the product and xmm0 the high half.  (The following
	// diagrams show bit 0 consistently on the right.)
	//
	//	                           xmm1
	//	,-------------.-------------.-------------.-------------.
	//	| 0  x_0-x_30 |  x_31-x_62  |  x_63-x_94  | x_95-x_126  |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                           xmm0
	//	,-------------.-------------.-------------.-------------.
	//	| x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
	//	`-------------^-------------^-------------^-------------'
	//
	// We start by shifting each 32-bit lane right (from GCM's point of
	// view -- physically, left) by one place, which gives us this:
	//
	//	                        low (xmm3)
	//	,-------------.-------------.-------------.-------------.
	//	| x_0-x_30  0 | x_32-x_62 0 | x_64-x_94 0 |x_96-x_126 0 |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        high (xmm2)
	//	,-------------.-------------.-------------.-------------.
	//	|x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
	//	`-------------^-------------^-------------^-------------'
	//
	// but we've lost a bunch of bits.  We separately shift each lane
	// left by 31 places to give us the bits we lost.
	//
	//	                        low (xmm1)
	//	,-------------.-------------.-------------.-------------.
	//	|    0...0    | 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        high (xmm0)
	//	,-------------.-------------.-------------.-------------.
	//	| 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
	//	`-------------^-------------^-------------^-------------'
	//
	// Which is close, but we don't get a cigar yet.  To get the missing
	// bits into position, we shift each of these right by a lane, but,
	// alas, the x_127 falls off, so, separately, we shift the high
	// register left by three lanes, so that everything is lined up
	// properly when we OR them all together:
	//
	//	                        low (xmm1)
	//	,-------------.-------------.-------------.-------------.
	//	| 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |    0...0    |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        wrap (xmm4)
	//	,-------------.-------------.-------------.-------------.
	//	|    0...0    |    0...0    |    0...0    | 0...0 x_127 |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        high (xmm0)
	//	,-------------.-------------.-------------.-------------.
	//	| 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |    0...0    |
	//	`-------------^-------------^-------------^-------------'
	//
	// The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
	// low 128 coefficients, while the `high' registers (xmm0, xmm2)
	// collect the high 127 coefficients, leaving a zero bit at the most
	// significant end as we expect.

	//   xmm0 =			// (x_7, x_6; x_5, x_4)
	//   xmm1 =			// (x_3, x_2; x_1, x_0)
	movdqa	xmm3, xmm1		// (x_3, x_2; x_1, x_0) again
	movdqa	xmm2, xmm0		// (x_7, x_6; x_5, x_4) again
	psrld	xmm1, 31		// shifted left; just the carries
	psrld	xmm0, 31
	pslld	xmm3, 1			// shifted right, but dropped carries
	pslld	xmm2, 1
	movdqa	xmm4, xmm0		// another copy for the carry around
	pslldq	xmm1, 4			// move carries over
	pslldq	xmm0, 4
	psrldq	xmm4, 12		// the big carry wraps around
	por	xmm1, xmm3
	por	xmm0, xmm2		// (y_7, y_6; y_5, y_4)
	por	xmm1, xmm4		// (y_3, y_2; y_1, y_0)

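	// In portable C, this lane-shift-with-carry trick looks like the
	// following sketch (one 128-bit half, held as four 32-bit lanes
	// with lane 0 least significant; a hypothetical helper, for
	// exposition only):
	//
	//	#include <stdint.h>
	//
	//	/* y = x << 1, across lanes; return the bit shifted out. */
	//	static uint32_t shl128_1(uint32_t y[4], const uint32_t x[4])
	//	{
	//	  uint32_t c = 0, t;
	//	  for (int i = 0; i < 4; i++)
	//	    { t = x[i] >> 31; y[i] = (x[i] << 1) | c; c = t; }
	//	  return (c);
	//	}
	//
	// The vectorized version above computes all four `x[i] << 1' with
	// one PSLLD and all four carries with one PSRLD, uses PSLLDQ to
	// move each carry into the next lane up, and PSRLDQ to catch the
	// carry out of the top lane so that it can wrap into the other
	// register.
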
	// And the other problem is that the result needs to be reduced
	// modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
	// t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
	// such that z_0 + z_1 R = u v using the identity R = t^128: now we
	// must collapse the two halves of z together using the other
	// identity R = t^7 + t^2 + t + 1.
	//
	// We do this by working on each 32-bit word of the high half of z
	// separately, so consider y_i, for some 4 <= i < 8.  Certainly, y_i
	// t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
	// but we can't use that directly without breaking up the 32-bit word
	// structure.  Instead, we start by considering just y_i t^7
	// t^{32(i-4)}, which again looks tricky.  Now, split y_i = a_i +
	// t^25 b_i, with deg a_i < 25; then
	//
	//	y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
	//
	// We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
	// contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
	// splits are different.  This is lovely, with one small snag: when
	// we do this to y_7, we end up with a contribution back into the
	// t^128 coefficient word.  But notice that only the low seven bits
	// of this word are affected, so there's no knock-on contribution
	// into the t^32 word.  Therefore, if we handle the high bits of each
	// word together, and then the low bits, everything will be fine.

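	// For a concrete model of the two phases, here is the same
	// reduction written in C over 64-bit words in the natural bit
	// order (a sketch for exposition: the code below works on 32-bit
	// lanes in the byte-reversed representation instead).
	//
	//	#include <stdint.h>
	//
	//	/* Reduce the 256-bit product z (z[0] least significant)
	//	 * modulo p(t) = t^128 + t^7 + t^2 + t + 1, leaving the
	//	 * result in z[0] and z[1]. */
	//	static void gf128_reduce(uint64_t z[4])
	//	{
	//	  for (int i = 3; i >= 2; i--) {
	//	    uint64_t y = z[i];
	//	    z[i] = 0;
	//	    z[i - 2] ^= y ^ (y << 1) ^ (y << 2) ^ (y << 7);
	//	    z[i - 1] ^= (y >> 63) ^ (y >> 62) ^ (y >> 57);
	//	  }
	//	}
	//
	// Note how the i = 3 step leaves only seven low bits in z[2], so
	// the i = 2 step's carry term for them vanishes: that's the `no
	// knock-on contribution' property mentioned above.
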
	// First, shift the high bits down.
	movdqa	xmm2, xmm0		// (y_7, y_6; y_5, y_4) again
	movdqa	xmm3, xmm0		// (y_7, y_6; y_5, y_4) yet again
	movdqa	xmm4, xmm0		// (y_7, y_6; y_5, y_4) again again
	pslld	xmm2, 31		// the b_i for t
	pslld	xmm3, 30		// the b_i for t^2
	pslld	xmm4, 25		// the b_i for t^7
	pxor	xmm2, xmm3		// add them all together
	pxor	xmm2, xmm4
	movdqa	xmm3, xmm2		// and a copy for later
	psrldq	xmm2, 4			// contribution into low half
	pslldq	xmm3, 12		// and high half
	pxor	xmm1, xmm2
	pxor	xmm0, xmm3

	// And then shift the low bits up.
	movdqa	xmm2, xmm0
	movdqa	xmm3, xmm0
	pxor	xmm1, xmm0		// mix in the unit contribution
	psrld	xmm0, 1
	psrld	xmm2, 2
	psrld	xmm3, 7
	pxor	xmm1, xmm2		// low half, unit, and t^2 contribs
	pxor	xmm0, xmm3		// t and t^7 contribs
	pxor	xmm0, xmm1		// mix them together and we're done
.endm

.macro mul64
	// Enter with u and v in the low halves of xmm0 and xmm1
	// respectively; leave with z = u v in xmm0.  Clobbers xmm1--xmm4.

	// The multiplication is thankfully easy.
	pclmullqlqdq xmm0, xmm1		// u v

	// Shift the product up by one place.  After this, we're in GCM
	// bizarro-world.
	movdqa	xmm1, xmm0		// u v again
	psrld	xmm0, 31		// shifted left; just the carries
	pslld	xmm1, 1			// shifted right, but dropped carries
	pslldq	xmm0, 4			// move carries over
	por	xmm1, xmm0		// (y_3, y_2; y_1, y_0)

	// Now we must reduce.  This is essentially the same as the 128-bit
	// case above, but mostly simpler because everything is smaller.  The
	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

	// First, we must detach the top (`low'!) half of the result.
	movdqa	xmm0, xmm1		// (y_3, y_2; y_1, y_0) again
	psrldq	xmm1, 8			// (y_1, y_0; 0, 0)

	// Next, shift the high bits down.
	movdqa	xmm2, xmm0		// (y_3, y_2; ?, ?) again
	movdqa	xmm3, xmm0		// (y_3, y_2; ?, ?) yet again
	movdqa	xmm4, xmm0		// (y_3, y_2; ?, ?) again again
	pslld	xmm2, 31		// b_i for t
	pslld	xmm3, 29		// b_i for t^3
	pslld	xmm4, 28		// b_i for t^4
	pxor	xmm2, xmm3		// add them all together
	pxor	xmm2, xmm4
	movdqa	xmm3, xmm2		// and a copy for later
	movq	xmm2, xmm2		// zap high half
	pslldq	xmm3, 4			// contribution into high half
	psrldq	xmm2, 4			// and low half
	pxor	xmm0, xmm3
	pxor	xmm1, xmm2

	// And then shift the low bits up.
	movdqa	xmm2, xmm0
	movdqa	xmm3, xmm0
	pxor	xmm1, xmm0		// mix in the unit contribution
	psrld	xmm0, 1
	psrld	xmm2, 3
	psrld	xmm3, 4
	pxor	xmm1, xmm2		// low half, unit, and t^3 contribs
	pxor	xmm0, xmm3		// t and t^4 contribs
	pxor	xmm0, xmm1		// mix them together and we're done
.endm

.macro mul96
	// Enter with u and v in the /high/ three words of xmm0 and xmm1
	// respectively (and zero in the low word); leave with z = u v in the
	// high three words of xmm0, and /junk/ in the low word.  Clobbers
	// xmm1--xmm4.

	// This is an inconvenient size.  There's nothing for it but to do
	// four multiplications, as if for the 128-bit case.  It's possible
	// that there's cruft in the top 32 bits of the input registers, so
	// shift both of them up by four bytes before we start.  This will
	// mean that the high 64 bits of the result (from GCM's viewpoint)
	// will be zero.
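	//
	// Explicitly: write u = u_2 t^64 + u' and v = v_2 t^64 + v', where
	// u' = u_1 t^32 + u_0 and v' = v_1 t^32 + v_0.  Then (this is just
	// the schoolbook expansion, spelled out for clarity)
	//
	//	u v = (u_2 v_2) t^128 + (u_2 v' + v_2 u') t^64 + u' v'
	//	    = d t^128 + (e_0 + e_1) t^64 + f
	//
	// which accounts for the four multiplications issued below.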
	//   xmm0 =			// (0, u_2; u_1, u_0)
	//   xmm1 =			// (0, v_2; v_1, v_0)
	movdqa	xmm2, xmm1		// (0, v_2; v_1, v_0) again
	movdqa	xmm3, xmm0		// (0, u_2; u_1, u_0) again
	movdqa	xmm4, xmm0		// (0, u_2; u_1, u_0) yet again
	pclmulhqlqdq xmm2, xmm0		// u_2 (v_1 t^32 + v_0) = e_0
	pclmullqlqdq xmm0, xmm1		// u_2 v_2 = d = (0; d)
	pclmulhqlqdq xmm3, xmm1		// v_2 (u_1 t^32 + u_0) = e_1
	pclmulhqhqdq xmm4, xmm1		// u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
					//   + u_1 v_1 t^64 = f

	// Extract the high and low halves of the 192-bit result.  We don't
	// need to be too picky about the unused high words of the result
	// registers.  The answer we want is d t^128 + e t^64 + f, where e =
	// e_0 + e_1.
	//
	// The place values for the two halves are (t^160, t^128; t^96, ?)
	// and (?, t^64; t^32, 1).
	psrldq	xmm0, 8			// (d; 0) = d t^128
	pxor	xmm2, xmm3		// e = (e_0 + e_1)
	movdqa	xmm1, xmm4		// f again
	pxor	xmm0, xmm2		// d t^128 + e t^64
	psrldq	xmm2, 12		// e[31..0] t^64
	psrldq	xmm1, 4			// f[95..0]
	pslldq	xmm4, 8			// f[127..96]
	pxor	xmm1, xmm2		// low 96 bits of result
	pxor	xmm0, xmm4		// high 96 bits of result

	// Next, shift everything one bit to the left to compensate for GCM's
	// strange ordering.  This will be easier if we shift up the high
	// half by a word before we start.  After this we're in GCM bizarro-
	// world.
	movdqa	xmm3, xmm1		// low half again
	pslldq	xmm0, 4			// shift high half up by a word
	psrld	xmm1, 31		// shift low half: just the carries
	movdqa	xmm2, xmm0		// copy high half
	pslld	xmm3, 1			// shift low half: drop the carries
	psrld	xmm0, 31		// shift high half: just the carries
	pslld	xmm2, 1			// shift high half: drop the carries
	movdqa	xmm4, xmm0		// copy high carries for carry-around
	pslldq	xmm0, 4			// shift carries down
	pslldq	xmm1, 4
	psrldq	xmm4, 12		// the big carry wraps around
	por	xmm1, xmm3
	por	xmm0, xmm2
	por	xmm1, xmm4

	// Finally, the reduction.  This is essentially the same as the
	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	// t^9 + t^6 + 1.  The degrees are larger but not enough to cause
	// trouble for the general approach.

	// First, shift the high bits down.
	movdqa	xmm2, xmm0		// copies of the high part
	movdqa	xmm3, xmm0
	movdqa	xmm4, xmm0
	pslld	xmm2, 26		// b_i for t^6
	pslld	xmm3, 23		// b_i for t^9
	pslld	xmm4, 22		// b_i for t^10
	pxor	xmm2, xmm3		// add them all together
	pslldq	xmm1, 4			// shift low part up to match
	pxor	xmm2, xmm4
	movdqa	xmm3, xmm2		// and a copy for later
	pslldq	xmm2, 8			// contribution to high half
	psrldq	xmm3, 4			// contribution to low half
	pxor	xmm1, xmm3
	pxor	xmm0, xmm2

	// And then shift the low bits up.
	movdqa	xmm2, xmm0		// copies of the high part
	movdqa	xmm3, xmm0
	pxor	xmm1, xmm0		// mix in the unit contribution
	psrld	xmm0, 6
	psrld	xmm2, 9
	psrld	xmm3, 10
	pxor	xmm1, xmm2		// low half, unit, and t^9 contribs
	pxor	xmm0, xmm3		// t^6 and t^10 contribs
	pxor	xmm0, xmm1		// mix them together and we're done
.endm

.macro mul192
	// Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
	// with z = u v in xmm0/xmm1 -- the top halves of the high registers
	// are unimportant.  Clobbers xmm2--xmm7.

	// Start multiplying and accumulating pieces of product.
	//   xmm0 =			// (u_2; u_1)
	//   xmm1 =			// (u_0; ?)
	//   xmm2 =			// (v_2; v_1)
	//   xmm3 =			// (v_0; ?)
	movdqa	xmm4, xmm0		// (u_2; u_1) again
	movdqa	xmm5, xmm0		// (u_2; u_1) yet again
	movdqa	xmm6, xmm0		// (u_2; u_1) again again
	movdqa	xmm7, xmm1		// (u_0; ?) again
	punpcklqdq xmm1, xmm3		// (u_0; v_0)
	pclmulhqhqdq xmm4, xmm2		// u_1 v_1
	pclmullqlqdq xmm3, xmm0		// u_2 v_0
	pclmullqhqdq xmm5, xmm2		// u_2 v_1
	pclmulhqlqdq xmm6, xmm2		// u_1 v_2
	pxor	xmm4, xmm3		// u_2 v_0 + u_1 v_1
	pclmullqlqdq xmm7, xmm2		// u_0 v_2
	pxor	xmm5, xmm6		// b = u_2 v_1 + u_1 v_2
	movdqa	xmm6, xmm0		// (u_2; u_1) like a bad penny
	pxor	xmm4, xmm7		// c = u_0 v_2 + u_1 v_1 + u_2 v_0
	pclmullqlqdq xmm0, xmm2		// a = u_2 v_2
	pclmulhqhqdq xmm6, xmm1		// u_1 v_0
	pclmulhqlqdq xmm2, xmm1		// u_0 v_1
	pclmullqhqdq xmm1, xmm1		// e = u_0 v_0
	pxor	xmm2, xmm6		// d = u_1 v_0 + u_0 v_1

	// Next, the piecing together of the product.
	//   xmm0 =			// (a_1; a_0) = a = u_2 v_2
	//   xmm5 =			// (b_1; b_0) = b = u_1 v_2 + u_2 v_1
	//   xmm4 =			// (c_1; c_0) = c = u_0 v_2 +
	//				//	u_1 v_1 + u_2 v_0
	//   xmm2 =			// (d_1; d_0) = d = u_0 v_1 + u_1 v_0
	//   xmm1 =			// (e_1; e_0) = e = u_0 v_0
	//   xmm3, xmm6, xmm7 spare
	movdqa	xmm3, xmm2		// (d_1; d_0) again
	movdqa	xmm6, xmm5		// (b_1; b_0) again
	pslldq	xmm2, 8			// (0; d_1)
	psrldq	xmm5, 8			// (b_0; 0)
	psrldq	xmm3, 8			// (d_0; 0)
	pslldq	xmm6, 8			// (0; b_1)
	pxor	xmm5, xmm2		// (b_0; d_1)
	pxor	xmm0, xmm6		// x_2 = (a_1; a_0 + b_1)
	pxor	xmm3, xmm1		// x_0 = (e_1 + d_0; e_0)
	pxor	xmm4, xmm5		// x_1 = (b_0 + c_1; c_0 + d_1)

	// Now, shift it right (from GCM's point of view) by one bit, and try
	// to leave the result in less random registers.  After this, we'll
	// be in GCM bizarro-world.
	//   xmm1, xmm2, xmm5, xmm6, xmm7 spare
	movdqa	xmm5, xmm0		// copy x_2
	movdqa	xmm1, xmm4		// copy x_1
	movdqa	xmm2, xmm3		// copy x_0
	psrld	xmm0, 31		// x_2 carries
	psrld	xmm4, 31		// x_1 carries
	psrld	xmm3, 31		// x_0 carries
	pslld	xmm5, 1			// x_2 shifted
	pslld	xmm1, 1			// x_1 shifted
	pslld	xmm2, 1			// x_0 shifted
	movdqa	xmm6, xmm0		// x_2 carry copy
	movdqa	xmm7, xmm4		// x_1 carry copy
	pslldq	xmm0, 4			// x_2 carry shifted
	pslldq	xmm4, 4			// x_1 carry shifted
	pslldq	xmm3, 4			// x_0 carry shifted
	psrldq	xmm6, 12		// x_2 carry out
	psrldq	xmm7, 12		// x_1 carry out
	por	xmm0, xmm5		// (y_5; y_4)
	por	xmm1, xmm4
	por	xmm2, xmm3
	por	xmm1, xmm6		// (y_3; y_2)
	por	xmm2, xmm7		// (y_1; y_0)

	// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
	// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
	// 128-bit case.  I don't know why.

	// First, shift the high bits down.
	//   xmm0 =			// (y_5; y_4)
	//   xmm1 =			// (y_3; y_2)
	//   xmm2 =			// (y_1; y_0)
	//   xmm3--xmm7 spare
	movdqa	xmm3, xmm0		// (y_5; y_4) copy
	movdqa	xmm4, xmm0		// (y_5; y_4) copy
	movdqa	xmm5, xmm0		// (y_5; y_4) copy
	pslld	xmm3, 31		// (y_5; y_4) b_i for t
	pslld	xmm4, 30		// (y_5; y_4) b_i for t^2
	pslld	xmm5, 25		// (y_5; y_4) b_i for t^7
	movq	xmm6, xmm1		// (y_3; 0) copy
	pxor	xmm3, xmm4
	movq	xmm7, xmm1		// (y_3; 0) copy
	pxor	xmm3, xmm5
	movq	xmm5, xmm1		// (y_3; 0) copy
	movdqa	xmm4, xmm3		// (y_5; y_4) b_i combined
	pslld	xmm6, 31		// (y_3; 0) b_i for t
	pslld	xmm7, 30		// (y_3; 0) b_i for t^2
	pslld	xmm5, 25		// (y_3; 0) b_i for t^7
	psrldq	xmm3, 12		// (y_5; y_4) low contrib
	pslldq	xmm4, 4			// (y_5; y_4) high contrib
	pxor	xmm6, xmm7
	pxor	xmm2, xmm3
	pxor	xmm6, xmm5
	pxor	xmm1, xmm4
	pslldq	xmm6, 4
	pxor	xmm2, xmm6

	// And finally shift the low bits up.  Unfortunately, we also have to
	// split the low bits out.
	//   xmm0 =			// (y'_5; y'_4)
	//   xmm1 =			// (y'_3; y'_2)
	//   xmm2 =			// (y'_1; y'_0)
	movdqa	xmm5, xmm1		// copies of (y'_3; y'_2)
	movdqa	xmm6, xmm1
	movdqa	xmm7, xmm1
	psrldq	xmm1, 8			// bring down (y'_2; ?)
	movdqa	xmm3, xmm0		// copies of (y'_5; y'_4)
	movdqa	xmm4, xmm0
	punpcklqdq xmm1, xmm2		// (y'_2; y'_1)
	psrldq	xmm2, 8			// (y'_0; ?)
	pxor	xmm2, xmm5		// low half and unit contrib
	pxor	xmm1, xmm0
	psrld	xmm5, 1
	psrld	xmm0, 1
	psrld	xmm6, 2
	psrld	xmm3, 2
	psrld	xmm7, 7
	psrld	xmm4, 7
	pxor	xmm2, xmm6		// low half, unit, t^2 contribs
	pxor	xmm1, xmm3
	pxor	xmm5, xmm7		// t and t^7 contribs
	pxor	xmm0, xmm4
	pxor	xmm5, xmm2		// mix everything together
	pxor	xmm0, xmm1
	movq	xmm1, xmm5		// shunt (z_0; ?) into proper place
.endm

.macro mul256
	// Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
	// with z = u v in xmm0/xmm1.  Clobbers xmm2--xmm7.  On 32-bit x86,
	// requires 16 bytes aligned space at SP; on amd64, also clobbers
	// xmm8.

	// Now it's starting to look worthwhile to do Karatsuba.  Suppose
	// u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
	//
	//	u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	//
	// Write a, b, and c for the coefficients of B^2, B^1, and B^0,
	// respectively, and let r = u_0 + u_1 and s = v_0 + v_1.  Then
	// observe that
	//
	//	q = r s = (u_0 + u_1) (v_0 + v_1)
	//	        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	//	        = c + a + b
	//
	// The first two terms we've already calculated; the last is the
	// remaining one we want.  We'll set B = t^128.  We know how to do
	// 128-bit multiplications already, and Karatsuba is too annoying
	// there, so there'll be 12 multiplications altogether, rather than
	// the 16 we'd have if we did this the naïve way.
	//
	// On x86, there aren't quite enough registers, so spill one for a
	// bit.  On AMD64, we can keep on going, so it's all good.

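	// As a sketch of the plan in C-flavoured pseudocode, using a
	// hypothetical clmul128 for the three 128-bit carryless
	// multiplications (for exposition only -- the code below
	// interleaves these steps):
	//
	//	a = clmul128(u1, v1);		/* high half */
	//	c = clmul128(u0, v0);		/* low half */
	//	q = clmul128(u1 ^ u0, v1 ^ v0);	/* cross product */
	//	b = q ^ a ^ c;			/* = u_0 v_1 + u_1 v_0 */
	//	z = (a << 256) ^ (b << 128) ^ c;
	//
	// Each clmul128 costs four PCLMULQDQs, which is where the count of
	// twelve multiplications comes from.
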
	//   xmm0 =			// u_1 = (u_11; u_10)
	//   xmm1 =			// u_0 = (u_01; u_00)
	//   xmm2 =			// v_1 = (v_11; v_10)
	//   xmm3 =			// v_0 = (v_01; v_00)
	movdqa	xmm4, xmm0		// u_1 again
#if CPUFAM_X86
	movdqa	[SP + 0], xmm3
#elif CPUFAM_AMD64
	movdqa	xmm8, xmm3
# define V0 xmm8
#endif
	pxor	xmm4, xmm1		// u_* = (u_01 + u_11; u_00 + u_10)
	pxor	xmm3, xmm2		// v_* = (v_01 + v_11; v_00 + v_10)

	// Start by building the cross product, q = u_* v_*.
	movdqa	xmm7, xmm4		// more copies of u_*
	movdqa	xmm5, xmm4
	movdqa	xmm6, xmm4
	pclmullqhqdq xmm4, xmm3		// u_*1 v_*0
	pclmulhqlqdq xmm7, xmm3		// u_*0 v_*1
	pclmullqlqdq xmm5, xmm3		// u_*1 v_*1
	pclmulhqhqdq xmm6, xmm3		// u_*0 v_*0
	pxor	xmm4, xmm7		// u_*1 v_*0 + u_*0 v_*1
	movdqa	xmm7, xmm4
	pslldq	xmm4, 8
	psrldq	xmm7, 8
	pxor	xmm5, xmm4		// q_1
	pxor	xmm6, xmm7		// q_0

	// Next, work on the high half, a = u_1 v_1.
	movdqa	xmm3, xmm0		// more copies of u_1
	movdqa	xmm4, xmm0
	movdqa	xmm7, xmm0
	pclmullqhqdq xmm0, xmm2		// u_11 v_10
	pclmulhqlqdq xmm3, xmm2		// u_10 v_11
	pclmullqlqdq xmm4, xmm2		// u_11 v_11
	pclmulhqhqdq xmm7, xmm2		// u_10 v_10
#if CPUFAM_X86
	movdqa	xmm2, [SP + 0]
# define V0 xmm2
#endif
	pxor	xmm0, xmm3		// u_10 v_11 + u_11 v_10
	movdqa	xmm3, xmm0
	pslldq	xmm0, 8
	psrldq	xmm3, 8
	pxor	xmm4, xmm0		// x_3 = a_1
	pxor	xmm7, xmm3		// a_0

	// Mix that into the product now forming in xmm4--xmm7.
	pxor	xmm5, xmm4		// a_1 + q_1
	pxor	xmm6, xmm7		// a_0 + q_0
	pxor	xmm5, xmm7		// a_0 + (a_1 + q_1)

	// Finally, the low half, c = u_0 v_0.
	movdqa	xmm0, xmm1		// more copies of u_0
	movdqa	xmm3, xmm1
	movdqa	xmm7, xmm1
	pclmullqhqdq xmm1, V0		// u_01 v_00
	pclmulhqlqdq xmm0, V0		// u_00 v_01
	pclmullqlqdq xmm3, V0		// u_01 v_01
	pclmulhqhqdq xmm7, V0		// u_00 v_00
	pxor	xmm0, xmm1		// u_01 v_00 + u_00 v_01
	movdqa	xmm1, xmm0
	pslldq	xmm0, 8
	psrldq	xmm1, 8
	pxor	xmm3, xmm0		// c_1
	pxor	xmm7, xmm1		// x_0 = c_0

	// And mix that in to complete the product.
	pxor	xmm6, xmm3		// (a_0 + q_0) + c_1
	pxor	xmm5, xmm3		// x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
	pxor	xmm6, xmm7		// x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1

#undef V0

	// Now we need to shift that whole lot one bit to the left.  This
	// will also give us an opportunity to put the product back in
	// xmm0--xmm3.  This is a slightly merry dance because it's nearly
	// pipelined but we don't have enough registers.
	//
	// After this, we'll be in GCM bizarro-world.
	movdqa	xmm0, xmm4		// x_3 again
	psrld	xmm4, 31		// x_3 carries
	pslld	xmm0, 1			// x_3 shifted left
	movdqa	xmm3, xmm4		// x_3 copy carries
	movdqa	xmm1, xmm5		// x_2 again
	pslldq	xmm4, 4			// x_3 carries shifted up
	psrld	xmm5, 31		// x_2 carries
	psrldq	xmm3, 12		// x_3 big carry out
	pslld	xmm1, 1			// x_2 shifted left
	por	xmm0, xmm4		// x_3 mixed together
	movdqa	xmm4, xmm5		// x_2 copy carries
	movdqa	xmm2, xmm6		// x_1 again
	pslldq	xmm5, 4			// x_2 carries shifted up
	psrld	xmm6, 31		// x_1 carries
	psrldq	xmm4, 12		// x_2 big carry out
	pslld	xmm2, 1			// x_1 shifted
	por	xmm1, xmm5		// x_2 mixed together
	movdqa	xmm5, xmm6		// x_1 copy carries
	por	xmm1, xmm3		// x_2 with carry from x_3
	movdqa	xmm3, xmm7		// x_0 again
	pslldq	xmm6, 4			// x_1 carries shifted up
	psrld	xmm7, 31		// x_0 carries
	psrldq	xmm5, 12		// x_1 big carry out
	pslld	xmm3, 1			// x_0 shifted
	por	xmm2, xmm6		// x_1 mixed together
	pslldq	xmm7, 4			// x_0 carries shifted up
	por	xmm2, xmm4		// x_1 with carry from x_2
	por	xmm3, xmm7		// x_0 mixed together
	por	xmm3, xmm5		// x_0 with carry from x_1

	// Now we must reduce.  This is essentially the same as the 128-bit
	// case above, but more complicated because everything is bigger.
	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

	// First, shift the high bits down.
	movdqa	xmm4, xmm0		// y_3 again
	movdqa	xmm5, xmm0		// y_3 yet again
	movdqa	xmm6, xmm0		// y_3 again again
	pslld	xmm4, 30		// y_3: b_i for t^2
	pslld	xmm5, 27		// y_3: b_i for t^5
	pslld	xmm6, 22		// y_3: b_i for t^10
	movdqa	xmm7, xmm1		// y_2 again
	pxor	xmm4, xmm5
	movdqa	xmm5, xmm1		// y_2 again
	pxor	xmm4, xmm6
	movdqa	xmm6, xmm1		// y_2 again
	pslld	xmm7, 30		// y_2: b_i for t^2
	pslld	xmm5, 27		// y_2: b_i for t^5
	pslld	xmm6, 22		// y_2: b_i for t^10
	pxor	xmm7, xmm5
	movdqa	xmm5, xmm4
	pxor	xmm7, xmm6
	psrldq	xmm4, 4
	movdqa	xmm6, xmm7
	pslldq	xmm5, 12
	psrldq	xmm7, 4
	pxor	xmm2, xmm4
	pslldq	xmm6, 12
	pxor	xmm3, xmm7
	pxor	xmm1, xmm5
	pxor	xmm2, xmm6

	// And then shift the low bits up.
	movdqa	xmm4, xmm0		// y_3 again
	movdqa	xmm5, xmm1		// y_2 again
	movdqa	xmm6, xmm0		// y_3 yet again
	movdqa	xmm7, xmm1		// y_2 yet again
	pxor	xmm2, xmm0		// y_1 and unit contrib from y_3
	pxor	xmm3, xmm1		// y_0 and unit contrib from y_2
	psrld	xmm0, 2
	psrld	xmm1, 2
	psrld	xmm4, 5
	psrld	xmm5, 5
	psrld	xmm6, 10
	psrld	xmm7, 10
	pxor	xmm0, xmm2		// y_1, with y_3 units and t^2
	pxor	xmm1, xmm3		// y_0, with y_2 units and t^2
	pxor	xmm4, xmm6		// y_3 t^5 and t^10 contribs
	pxor	xmm5, xmm7		// y_2 t^5 and t^10 contribs
	pxor	xmm0, xmm4		// high half of reduced result
	pxor	xmm1, xmm5		// low half; all done
.endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more XMM registers, depending
//     on the block size.  The bytes in these registers are in reverse order
//     -- so the least-significant byte of the lowest-numbered register holds
//     the /last/ byte in the block.  If the block size is not a multiple of
//     16 bytes, then there must be padding.  96-bit blocks are weird: the
//     padding is inserted at the /least/ significant end, so the register
//     holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
//     significant end.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.

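// Since the register format is just the byte-reversal of the block, a
// plain C model of loading it is simply the following (a hypothetical
// helper for exposition; the entry points below do the equivalent with
// PSHUFB or PSHUFD, according to the words format in use):
//
//	#include <stdint.h>
//
//	/* Load a 16-byte block b into register format r. */
//	static void load_reg(uint8_t r[16], const uint8_t b[16])
//	  { for (int i = 0; i < 16; i++) r[i] = b[15 - i]; }
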
#define SSEFUNC(f)							\
	FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC;		\
	FUNC(f)

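// `SSEFUNC(f)' defines two entry points: `f_avx', which issues VZEROUPPER
// (avoiding the SSE/AVX transition penalty on AVX-capable CPUs) and then
// falls straight through into the plain SSE implementation `f'.
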
SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
	// On entry, A points to a 128-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
	endprologue
	movdqu	xmm0, [A]
	movdqu	xmm1, [K]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	mul128
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	movdqu	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
	// On entry, A points to a 128-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movdqu	xmm0, [A]
	movdqu	xmm1, [K]
	pshufb	xmm0, xmm7
	mul128
	pshufb	xmm0, xmm7
	movdqu	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
	// On entry, A points to a 64-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
	endprologue
	movq	xmm0, [A]
	movq	xmm1, [K]
	pshufd	xmm0, xmm0, SHUF(1, 0, 3, 3)
	mul64
	pshufd	xmm0, xmm0, SHUF(1, 0, 3, 3)
	movq	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
	// On entry, A points to a 64-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_64l, ecx)]
	movq	xmm0, [A]
	movq	xmm1, [K]
	pshufb	xmm0, xmm7
	mul64
	pshufb	xmm0, xmm7
	movq	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
	// On entry, A points to a 96-bit field element in big-endian words
	// format; K points to a field-element in register format (i.e., 16
	// bytes, with the first four bytes zero).  On exit, A is updated
	// with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
	endprologue
	movq	xmm0, [A + 0]
	movd	xmm2, [A + 8]
	movdqu	xmm1, [K]
	punpcklqdq xmm0, xmm2
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	mul96
	pshufd	xmm1, xmm0, SHUF(3, 2, 1, 0)
	psrldq	xmm0, 4
	movq	[A + 0], xmm1
	movd	[A + 8], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
	// On entry, A points to a 96-bit field element in little-endian
	// words format; K points to a field-element in register format
	// (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
	// updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movq	xmm0, [A + 0]
	movd	xmm2, [A + 8]
	movdqu	xmm1, [K]
	punpcklqdq xmm0, xmm2
	pshufb	xmm0, xmm7
	mul96
	pshufb	xmm0, xmm7
	movq	[A + 0], xmm0
	psrldq	xmm0, 8
	movd	[A + 8], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
	// On entry, A points to a 192-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	2*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif
	endprologue
	movdqu	xmm0, [A + 8]
	movq	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movq	xmm3, [K + 16]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 3)
	mul192
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 3)
	movdqu	[A + 8], xmm0
	movq	[A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	2*16 + 8
#endif
	ret
ENDFUNC

SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
	// On entry, A points to a 192-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	2*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif
	endprologue
	movdqu	xmm0, [A + 8]
	movq	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movq	xmm3, [K + 16]
	pshufb	xmm0, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm1, [INTADDR(swaptab_64l, ecx)]
	mul192
	pshufb	xmm0, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm1, [INTADDR(swaptab_64l, ecx)]
	movdqu	[A + 8], xmm0
	movq	[A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	2*16 + 8
#endif
	ret
ENDFUNC

SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
	// On entry, A points to a 256-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	pushreg	BP
	setfp
	mov	A, [SP + 8]
	mov	K, [SP + 12]
	stalloc	16
	and	SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	3*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
	savexmm	xmm8, 32
#endif
	endprologue
	movdqu	xmm0, [A + 16]
	movdqu	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movdqu	xmm3, [K + 16]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(3, 2, 1, 0)
	mul256
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(3, 2, 1, 0)
	movdqu	[A + 16], xmm0
	movdqu	[A + 0], xmm1
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	rstrxmm	xmm8, 32
	stfree	3*16 + 8
#endif
	ret
ENDFUNC

SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
	// On entry, A points to a 256-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	pushreg	BP
	setfp
	mov	A, [SP + 8]
	mov	K, [SP + 12]
	stalloc	16
	ldgot	ecx
	and	SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	3*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
	savexmm	xmm8, 32
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movdqu	xmm0, [A + 16]
	movdqu	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movdqu	xmm3, [K + 16]
	pshufb	xmm0, xmm7
	pshufb	xmm1, xmm7
	mul256
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm0, xmm7
	pshufb	xmm1, xmm7
	movdqu	[A + 16], xmm0
	movdqu	[A + 0], xmm1
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	rstrxmm	xmm8, 32
	stfree	3*16 + 8
#endif
	ret
ENDFUNC

	RODATA

	.balign	16
swaptab_128l:
	// Table for byte-swapping little-endian words-format blocks larger
	// than 64 bits.
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7, 6, 5, 4, 3, 2, 1, 0

	.balign	16
swaptab_64l:
	// Table for byte-swapping 64-bit little-endian words-format blocks.
	.byte	7, 6, 5, 4, 3, 2, 1, 0
	.byte	255, 255, 255, 255, 255, 255, 255, 255

///----- That's all, folks --------------------------------------------------