symm/rijndael-x86ish-aesni.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   .aes

        .extern F(abort)
        .extern F(rijndael_rcon)

        .text

///--------------------------------------------------------------------------
/// Main code.

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.
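///
/// Concretely, `endswap_tab' below is a PSHUFB control that reverses the
/// bytes within each 32-bit word; it's applied to each block on the way in
/// and again on the way out.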

// Useful constants.
        .equ    maxrounds, 16           // maximum number of rounds
        .equ    maxblksz, 32            // maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

// Context structure.
        .equ    nr, 0                   // number of rounds
        .equ    w, nr + 4               // encryption key words
        .equ    wi, w + kbufsz          // decryption key words
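
// For reference, these offsets amount to a context laid out like the C
// structure sketched below.  (This is only a sketch for orientation; the
// authoritative definition lives in Catacomb's C headers.)
//
//      struct rijndael_ctx {
//              uint32 nr;              // number of rounds, at offset `nr'
//              uint32 w[kbufsz/4];     // encryption keys, at offset `w'
//              uint32 wi[kbufsz/4];    // decryption keys, at offset `wi'
//      };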

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86ish_aesni_avx)
        vzeroupper                      // avoid penalty on `legacy' XMM access
        endprologue
        // and drop through...
ENDFUNC
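
// The AVX entry point exists only to issue VZEROUPPER before falling into
// the plain SSE code: mixing dirty upper-YMM state with legacy SSE
// instructions incurs a state-transition penalty on many CPUs.  The
// fall-through depends on the two functions being assembled adjacently.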

FUNC(rijndael_setup_x86ish_aesni)

#if CPUFAM_X86
        // Arguments are on the stack.  We'll need to stack the caller's
        // register variables, but we'll manage.

# define CTX BP                         // context pointer
# define BLKSZ [SP + 24]                // block size

# define KSZ ebx                        // key size
# define NKW edx                        // total number of key words
# define NKW_NEEDS_REFRESH 1            // ... needs recalculating
# define RCON ecx                       // round constants table
# define LIM edx                        // limit pointer
# define CYIX edi                       // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF edx                     // block size in bytes

        // Stack the caller's registers.
        pushreg BP
        pushreg ebx
        pushreg esi
        pushreg edi

        // Set up our own variables.
        mov     CTX, [SP + 20]          // context base pointer
        mov     SI, [SP + 28]           // key material
        mov     KSZ, [SP + 32]          // key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // Arguments are in registers.  We have plenty, but, to be honest,
        // the initial register allocation is a bit annoying.

# define CTX r8                         // context pointer
# define BLKSZ r9d                      // block size

# define KSZ edx                        // key size
# define NKW r10d                       // total number of key words
# define RCON rdi                       // round constants table
# define LIM rcx                        // limit pointer
# define CYIX r11d                      // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF r9d                     // block size in bytes

        // Move arguments to more useful places.
        mov     CTX, rdi                // context base pointer
        mov     BLKSZ, esi              // block size in words
        mov     SI, rdx                 // key material
        mov     KSZ, ecx                // key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments are in different registers, and they're a little tight.

# define CTX r8                         // context pointer
# define BLKSZ edx                      // block size

# define KSZ r9d                        // key size
# define NKW r10d                       // total number of key words
# define RCON rdi                       // round constants table
# define LIM rcx                        // limit pointer
# define CYIX r11d                      // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF edx                     // block size in bytes

        // We'll need the index registers, which belong to the caller in this
        // ABI.
        pushreg rsi
        pushreg rdi

        // Move arguments to more useful places.
        mov     rsi, r8                 // key material
        mov     CTX, rcx                // context base pointer
#endif

        endprologue

        // The initial round key material is taken directly from the input
        // key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
        // We've been lucky.  We already have a copy of the context pointer
        // in rdi, and the key size in ecx.
        add     rdi, w
#else
        lea     DI, [CTX + w]
        mov     ecx, KSZ
#endif
        rep     movsd                   // copy ecx key words to [DI]

        // Find out other useful things.
        mov     NKW, [CTX + nr]         // number of rounds
        add     NKW, 1
        imul    NKW, BLKSZ              // total key size in words
#if !NKW_NEEDS_REFRESH
        // If we can't keep NKW for later, then we use the same register for
        // it and LIM, so this move is unnecessary.
        mov     DWORD(LIM), NKW
#endif
        sub     DWORD(LIM), KSZ         // offset by the key size

        // Find the round constants.
        ldgot   WHOLE(c)
        leaext  RCON, F(rijndael_rcon), WHOLE(c)

        // Prepare for the main loop.
        lea     SI, [CTX + w]
        mov     eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
        lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
        xor     CYIX, CYIX              // start of new cycle

        // Main key expansion loop.  The first word of each key-length chunk
        // needs special treatment.
        //
        // This is rather tedious because the Intel `AESKEYGENASSIST'
        // instruction is very strangely shaped.  Firstly, it wants to
        // operate on vast SSE registers, even though we're data-blocked from
        // doing more than one operation at a time unless we're doing two key
        // schedules simultaneously -- and even then we can't do more than
        // two, because the instruction ignores two of its input words
        // entirely, and produces two different outputs for each of the other
        // two.  And secondly it insists on taking the magic round constant
        // as an immediate, so it's kind of annoying if you're not
        // open-coding the whole thing.  It's much easier to leave that as
        // zero and XOR in the round constant by hand.
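        //
        // In FIPS-197 terms, the loop below computes w[i] = w[i-Nk] ^ t,
        // where t is w[i-1] fed through RotWord and SubWord (plus the round
        // constant) at the start of each Nk-word cycle, through SubWord
        // alone at word four of the cycle when Nk > 6, and used unchanged
        // otherwise.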
0:      cmp     CYIX, 0                 // first word of the cycle?
        je      1f
        cmp     CYIX, 4                 // fourth word of the cycle?
        jne     2f
        cmp     KSZ, 7                  // and a large key?
        jb      2f

        // Fourth word of the cycle, and seven or eight words of key.  Do a
        // byte substitution.
        movd    xmm0, eax
        pshufd  xmm0, xmm0, SHUF(2, 1, 0, 3)
        aeskeygenassist xmm1, xmm0, 0
        movd    eax, xmm1
        jmp     2f

        // First word of the cycle.  This is the complicated piece.
1:      movd    xmm0, eax
        pshufd  xmm0, xmm0, SHUF(0, 3, 2, 1)
        aeskeygenassist xmm1, xmm0, 0
        pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
        movd    eax, xmm1
        xor     al, [RCON]
        inc     RCON

        // Common tail.  Mix in the corresponding word from the previous
        // cycle and prepare for the next loop.
2:      xor     eax, [SI]
        mov     [SI + 4*WHOLE(KSZ)], eax
        add     SI, 4
        inc     CYIX
        cmp     SI, LIM
        jae     9f
        cmp     CYIX, KSZ
        jb      0b
        xor     CYIX, CYIX
        jmp     0b

        // Next job is to construct the decryption keys.  The keys for the
        // first and last rounds don't need to be mangled, but the remaining
        // ones do -- and they all need to be reordered too.
        //
        // The plan of action, then, is to copy the final encryption round's
        // keys into place first, then to do each of the intermediate rounds
        // in reverse order, and finally do the first round.
        //
        // Do all of the heavy lifting with SSE registers.  The order we're
        // doing this in means that it's OK if we read or write too much, and
        // there's easily enough buffer space for the over-enthusiastic reads
        // and writes because the context has space for 32-byte blocks, which
        // is our maximum and an exact fit for two SSE registers.
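        //
        // (The middle rounds go through AESIMC, which applies the
        // InvMixColumns transformation to each key: AESDEC implements
        // FIPS-197's `equivalent inverse cipher', which wants its round
        // keys premangled in exactly this way.)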
9:      mov     NR, [CTX + nr]          // number of rounds
#if NKW_NEEDS_REFRESH
        mov     BLKOFF, BLKSZ
        mov     LRK, NR
        imul    LRK, BLKOFF
#else
        // If we retain NKW, then BLKSZ and BLKOFF are the same register
        // because we won't need the former again.
        mov     LRK, NKW
        sub     LRK, BLKSZ
#endif
        lea     DI, [CTX + wi]
        lea     SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
        shl     BLKOFF, 2               // block size (in bytes now)

        // Copy the last encryption round's keys.
        movdqu  xmm0, [SI]
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     0f
        movdqu  xmm0, [SI + 16]
        movdqu  [DI + 16], xmm0

        // Update the loop variables and stop if we've finished.
0:      add     DI, WHOLE(BLKOFF)
        sub     SI, WHOLE(BLKOFF)
        sub     NR, 1
        jbe     9f

        // Do another middle round's keys...
        movdqu  xmm0, [SI]
        aesimc  xmm0, xmm0
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     0b
        movdqu  xmm0, [SI + 16]
        aesimc  xmm0, xmm0
        movdqu  [DI + 16], xmm0
        jmp     0b

        // Finally do the first encryption round.
9:      movdqu  xmm0, [SI]
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     1f
        movdqu  xmm0, [SI + 16]
        movdqu  [DI + 16], xmm0

        // If the block size is not exactly four words then we must end-swap
        // everything.  We can use fancy SSE toys for this.
1:      cmp     BLKOFF, 16
        je      9f

        // Find the byte-reordering table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
        // Calculate the number of subkey words again.  (It's a good job
        // we've got a fast multiplier.)
        mov     NKW, [CTX + nr]
        add     NKW, 1
        imul    NKW, BLKSZ
#endif

        // End-swap the encryption keys.
        lea     SI, [CTX + w]
        call    endswap_block

        // And the decryption keys.
        lea     SI, [CTX + wi]
        call    endswap_block

9:      // All done.
#if CPUFAM_X86
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        popreg  rdi
        popreg  rsi
#endif
        ret

ENDFUNC

INTFUNC(endswap_block)
        // End-swap NKW words starting at SI.  The end-swapping table is
        // already loaded into XMM5; and it's OK to work in 16-byte chunks.
        endprologue

        mov     ecx, NKW
0:      movdqu  xmm1, [SI]
        pshufb  xmm1, xmm5
        movdqu  [SI], xmm1
        add     SI, 16
        sub     ecx, 4
        ja      0b

        ret

ENDFUNC

#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef RCON
#undef LIM
#undef NR
#undef LRK
#undef BLKOFF

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

        .macro  encdec  op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni_avx)
        vzeroupper                      // avoid XMM penalties
        endprologue
        // and drop through...
ENDFUNC

FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
        // Arguments come in on the stack, and need to be collected.  We
        // don't have a shortage of registers.

# define K eax
# define SRC edx
# define DST edx
# define NR ecx

        mov     K, [SP + 4]
        mov     SRC, [SP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // Arguments come in registers.  All is good.

# define K rdi
# define SRC rsi
# define DST rdx
# define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in different registers.

# define K rcx
# define SRC rdx
# define DST r8
# define NR eax
#endif

        endprologue

        // Find the magic endianness-swapping table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]

        // Initial setup.
        movdqu  xmm0, [SRC]
        pshufb  xmm0, xmm5
        mov     NR, [K + nr]
        add     K, \koff

        // Initial whitening.
        movdqu  xmm1, [K]
        add     K, 16
        pxor    xmm0, xmm1
#if CPUFAM_X86
        mov     DST, [SP + 12]
#endif

        // Dispatch to the correct code.
        cmp     NR, 10
        je      10f
        jb      bogus
        cmp     NR, 14
        je      14f
        ja      bogus
        cmp     NR, 12
        je      12f
        jb      11f
        jmp     13f
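
        // (Rijndael uses max(Nk, Nb) + 6 rounds, where Nk and Nb are the
        // key and block sizes in 32-bit words, each between 4 and 8; since
        // this path only handles four-word blocks, NR is 10, 12, or 14 for
        // AES-128, -192, or -256, and 11 or 13 for Rijndael's nonstandard
        // 160- and 224-bit keys.  Each label below falls through into the
        // next, so entering at, say, `14' runs all fourteen rounds off the
        // contiguous key schedule.)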

        .align  2

        // 14 rounds...
14:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 13 rounds...
13:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 12 rounds...
12:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 11 rounds...
11:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 10 rounds...
10:     movdqu  xmm1, [K]
        \aes    xmm0, xmm1

        // 9 rounds...
        movdqu  xmm1, [K + 16]
        \aes    xmm0, xmm1

        // 8 rounds...
        movdqu  xmm1, [K + 32]
        \aes    xmm0, xmm1

        // 7 rounds...
        movdqu  xmm1, [K + 48]
        \aes    xmm0, xmm1

        // 6 rounds...
        movdqu  xmm1, [K + 64]
        \aes    xmm0, xmm1

        // 5 rounds...
        movdqu  xmm1, [K + 80]
        \aes    xmm0, xmm1

        // 4 rounds...
        movdqu  xmm1, [K + 96]
        \aes    xmm0, xmm1

        // 3 rounds...
        movdqu  xmm1, [K + 112]
        \aes    xmm0, xmm1

        // 2 rounds...
        movdqu  xmm1, [K + 128]
        \aes    xmm0, xmm1

        // Final round...
        movdqu  xmm1, [K + 144]
        \aes\()last xmm0, xmm1

        // Unpermute the ciphertext block and store it.
        pshufb  xmm0, xmm5
        movdqu  [DST], xmm0

        // And we're done.
        ret

#undef K
#undef SRC
#undef DST
#undef NR

ENDFUNC
        .endm

        encdec  eblk, aesenc, w
        encdec  dblk, aesdec, wi
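
// The macro therefore defines `rijndael_eblk_x86ish_aesni' and
// `rijndael_dblk_x86ish_aesni' (and their `..._avx' variants).  As a
// sketch -- the authoritative prototypes live in the C headers -- each
// behaves like
//
//      void rijndael_eblk(const rijndael_ctx *k,
//                         const uint32 *src, uint32 *dst);
//
// reading one block at `src' and writing the processed block at `dst'.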

///--------------------------------------------------------------------------
/// Random utilities.

INTFUNC(bogus)
        // Abort the process because of a programming error.  Indirecting
        // through this point serves several purposes: (a) by CALLing, rather
        // than branching to, `abort', we can save the return address, which
        // might at least provide a hint as to what went wrong; (b) we don't
        // have conditional CALLs (and they'd be big anyway); and (c) we can
        // write a HLT here as a backstop against `abort' being mad.
        endprologue

        callext F(abort)
0:      hlt
        jmp     0b

ENDFUNC

///--------------------------------------------------------------------------
/// Data tables.

        RODATA

        .align  16
endswap_tab:
        .byte    3,  2,  1,  0
        .byte    7,  6,  5,  4
        .byte   11, 10,  9,  8
        .byte   15, 14, 13, 12
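
// PSHUFB takes output byte i of its destination from input byte
// endswap_tab[i], so this table reverses the bytes within each aligned
// 32-bit word.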

///----- That's all, folks --------------------------------------------------