progs/perftest.c: Use from Glibc syscall numbers.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
df07f2c0 28/// Preliminaries.
1a0c09c4
MW
29
30#include "config.h"
31#include "asm-common.h"
32
df07f2c0
MW
33 .arch .aes
34
81bc2bb8
MW
35 .extern F(abort)
36 .extern F(rijndael_rcon)
1a0c09c4 37
df07f2c0
MW
38 .text
39
1a0c09c4
MW
40///--------------------------------------------------------------------------
41/// Main code.
42
1a0c09c4
MW
43/// The AESNI instructions implement a little-endian version of AES, but
44/// Catacomb's internal interface presents as big-endian so as to work better
45/// with things like GCM. We therefore maintain the round keys in
46/// little-endian form, and have to end-swap blocks in and out.
47///
48/// For added amusement, the AESNI instructions don't implement the
49/// larger-block versions of Rijndael, so we have to end-swap the keys if
50/// we're preparing for one of those.
51
52 // Useful constants.
// A key-schedule buffer has room for (maxrounds + 1) round keys, each sized
// for the largest block: 32*(16 + 1) = 544 bytes.
53 .equ maxrounds, 16 // maximum number of rounds
54 .equ maxblksz, 32 // maximum block size, in bytes
55 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
56
57 // Context structure.
// These are byte offsets from the context base pointer: a four-byte round
// count at offset 0, the encryption key buffer (`kbufsz' bytes) starting at
// offset 4, and the decryption key buffer immediately after that.
// NOTE(review): presumably this mirrors a context structure declared on the
// C side -- verify against the corresponding header.
58 .equ nr, 0 // number of rounds
59 .equ w, nr + 4 // encryption key words
60 .equ wi, w + kbufsz // decryption key words
61
62///--------------------------------------------------------------------------
63/// Key setup.
64
b9b279b4
MW
// AVX-aware entry point for key setup. There is deliberately no `ret' here:
// after `vzeroupper', execution runs off the end of this stub and into
// `rijndael_setup_x86ish_aesni', which follows immediately, so both entry
// points take the same arguments.
65FUNC(rijndael_setup_x86ish_aesni_avx)
66 vzeroupper // avoid penalty on `legacy' XMM access
67 endprologue
68 // and drop through...
69ENDFUNC
70
/// Key setup using the AESNI instructions.
///
/// Arguments, in order (as collected by the per-ABI prologues below): the
/// context base pointer; the block size, in words; the address of the key
/// material; and the key size, in words.
///
/// The context's `nr' field is only read here, never written, so the number
/// of rounds is presumably filled in by the caller beforehand -- TODO
/// confirm against the C caller.
///
/// On exit, the expanded encryption keys are stored at offset `w' within
/// the context, and the decryption keys (reordered, with the middle rounds
/// passed through AESIMC) at offset `wi'. If the block size is not exactly
/// four words, both schedules are additionally end-swapped in place.
0f23f75f 71FUNC(rijndael_setup_x86ish_aesni)
1a0c09c4 72
0f23f75f
MW
73#if CPUFAM_X86
74 // Arguments are on the stack. We'll need to stack the caller's
75 // register variables, but we'll manage.
1a0c09c4 76
a90d420c
MW
// Note that the [SP + ...] offsets here and below already account for the
// four registers pushed by this prologue.
77# define CTX BP // context pointer
78# define BLKSZ [SP + 24] // block size
0f23f75f 79
0f23f75f 80# define KSZ ebx // key size
0f23f75f
MW
81# define NKW edx // total number of key words
82# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
83# define RCON ecx // round constants table
84# define LIM edx // limit pointer
16021451 85# define CYIX edi // index in shift-register cycle
0f23f75f
MW
86
87# define NR ecx // number of rounds
88# define LRK eax // distance to last key
0f23f75f 89# define BLKOFF edx // block size in bytes
0f23f75f
MW
90
91 // Stack the caller's registers.
a90d420c 92 pushreg BP
0923a413
MW
93 pushreg ebx
94 pushreg esi
95 pushreg edi
1a0c09c4 96
0f23f75f 97 // Set up our own variables.
a90d420c
MW
98 mov CTX, [SP + 20] // context base pointer
99 mov SI, [SP + 28] // key material
100 mov KSZ, [SP + 32] // key size, in words
0f23f75f
MW
101#endif
102
103#if CPUFAM_AMD64 && ABI_SYSV
104 // Arguments are in registers. We have plenty, but, to be honest,
105 // the initial register allocation is a bit annoying.
106
107# define CTX r8 // context pointer
108# define BLKSZ r9d // block size
109
0f23f75f 110# define KSZ edx // key size
0f23f75f
MW
111# define NKW r10d // total number of key words
112# define RCON rdi // round constants table
43ea7558 113# define LIM rcx // limit pointer
16021451 114# define CYIX r11d // index in shift-register cycle
0f23f75f
MW
115
116# define NR ecx // number of rounds
117# define LRK eax // distance to last key
0f23f75f 118# define BLKOFF r9d // block size in bytes
0f23f75f
MW
119
120 // Move arguments to more useful places.
// The order matters: each incoming register is read before the move which
// overwrites it (the key pointer in rdx is copied out before KSZ, which
// lives in edx, is written).
121 mov CTX, rdi // context base pointer
122 mov BLKSZ, esi // block size in words
123 mov SI, rdx // key material
124 mov KSZ, ecx // key size, in words
125#endif
126
127#if CPUFAM_AMD64 && ABI_WIN
128 // Arguments are in different registers, and they're a little tight.
129
130# define CTX r8 // context pointer
131# define BLKSZ edx // block size
132
0f23f75f 133# define KSZ r9d // key size
0f23f75f
MW
134# define NKW r10d // total number of key words
135# define RCON rdi // round constants table
43ea7558 136# define LIM rcx // limit pointer
16021451 137# define CYIX r11d // index in shift-register cycle
0f23f75f
MW
138
139# define NR ecx // number of rounds
140# define LRK eax // distance to last key
0f23f75f 141# define BLKOFF edx // block size in bytes
0f23f75f
MW
142
143 // We'll need the index registers, which belong to the caller in this
144 // ABI.
0923a413
MW
145 pushreg rsi
146 pushreg rdi
0f23f75f
MW
147
148 // Move arguments to more useful places.
// The key pointer must be copied out of r8 first, because CTX is r8.
43ea7558 149 mov rsi, r8 // key material
0f23f75f
MW
150 mov CTX, rcx // context base pointer
151#endif
152
0923a413
MW
153 endprologue
154
1a0c09c4
MW
155 // The initial round key material is taken directly from the input
156 // key, so copy it over.
0f23f75f
MW
157#if CPUFAM_AMD64 && ABI_SYSV
158 // We've been lucky. We already have a copy of the context pointer
159 // in rdi, and the key size in ecx.
43ea7558 160 add rdi, w
0f23f75f
MW
161#else
162 lea DI, [CTX + w]
163 mov ecx, KSZ
164#endif
1a0c09c4
MW
// Copy ecx 32-bit words from the key material at SI to the start of the
// encryption key buffer at DI.
165 rep movsd
166
167 // Find out other useful things.
0f23f75f
MW
168 mov NKW, [CTX + nr] // number of rounds
169 add NKW, 1
170 imul NKW, BLKSZ // total key size in words
171#if !NKW_NEEDS_REFRESH
172 // If we can't keep NKW for later, then we use the same register for
173 // it and LIM, so this move is unnecessary.
43ea7558 174 mov DWORD(LIM), NKW
0f23f75f 175#endif
43ea7558 176 sub DWORD(LIM), KSZ // offset by the key size
1a0c09c4
MW
177
178 // Find the round constants.
43ea7558
MW
179 ldgot WHOLE(c)
180 leaext RCON, F(rijndael_rcon), WHOLE(c)
1a0c09c4
MW
181
182 // Prepare for the main loop.
0f23f75f 183 lea SI, [CTX + w]
43ea7558 184 mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
0f23f75f 185 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
16021451 186 xor CYIX, CYIX // start of new cycle
1a0c09c4
MW
187
188 // Main key expansion loop. The first word of each key-length chunk
189 // needs special treatment.
190 //
191 // This is rather tedious because the Intel `AESKEYGENASSIST'
192 // instruction is very strangely shaped. Firstly, it wants to
193 // operate on vast SSE registers, even though we're data-blocked from
194 // doing more than one operation at a time unless we're doing two key
195 // schedules simultaneously -- and even then we can't do more than
196 // two, because the instruction ignores two of its input words
197 // entirely, and produces two different outputs for each of the other
198 // two. And secondly it insists on taking the magic round constant
199 // as an immediate, so it's kind of annoying if you're not
200 // open-coding the whole thing. It's much easier to leave that as
201 // zero and XOR in the round constant by hand.
//
// Loop invariants: eax holds the most recently generated key word; SI
// points at the word KSZ positions before the one about to be written;
// CYIX counts words within the current key-length chunk.
16021451
MW
2020: cmp CYIX, 0 // first word of the cycle?
203 je 1f
204 cmp CYIX, 4 // fourth word of the cycle?
205 jne 2f
206 cmp KSZ, 7 // and a large key?
207 jb 2f
208
209 // Fourth word of the cycle, and seven or eight words of key. Do a
210 // byte substitution.
211 movd xmm0, eax
981a9e5d 212 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
16021451
MW
213 aeskeygenassist xmm1, xmm0, 0
214 movd eax, xmm1
215 jmp 2f
216
217 // First word of the cycle. This is the complicated piece.
2181: movd xmm0, eax
981a9e5d 219 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
1a0c09c4 220 aeskeygenassist xmm1, xmm0, 0
981a9e5d 221 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
1a0c09c4 222 movd eax, xmm1
0f23f75f
MW
// The round constants are consumed one byte at a time, one per cycle.
223 xor al, [RCON]
224 inc RCON
1a0c09c4 225
16021451
MW
226 // Common tail. Mix in the corresponding word from the previous
227 // cycle and prepare for the next loop.
2282: xor eax, [SI]
43ea7558 229 mov [SI + 4*WHOLE(KSZ)], eax
0f23f75f 230 add SI, 4
16021451 231 inc CYIX
0f23f75f 232 cmp SI, LIM
89b34050 233 jae 9f
16021451 234 cmp CYIX, KSZ
89b34050 235 jb 0b
16021451 236 xor CYIX, CYIX
89b34050 237 jmp 0b
1a0c09c4
MW
238
239 // Next job is to construct the decryption keys. The keys for the
240 // first and last rounds don't need to be mangled, but the remaining
241 // ones do -- and they all need to be reordered too.
242 //
243 // The plan of action, then, is to copy the final encryption round's
244 // keys into place first, then to do each of the intermediate rounds
245 // in reverse order, and finally do the first round.
246 //
247 // Do all of the heavy lifting with SSE registers. The order we're
248 // doing this in means that it's OK if we read or write too much, and
249 // there's easily enough buffer space for the over-enthusiastic reads
250 // and writes because the context has space for 32-byte blocks, which
251 // is our maximum and an exact fit for two SSE registers.
89b34050 2529: mov NR, [CTX + nr] // number of rounds
0f23f75f
MW
253#if NKW_NEEDS_REFRESH
254 mov BLKOFF, BLKSZ
255 mov LRK, NR
256 imul LRK, BLKOFF
257#else
258 // If we retain NKW, then BLKSZ and BLKOFF are the same register
259 // because we won't need the former again.
260 mov LRK, NKW
261 sub LRK, BLKSZ
262#endif
263 lea DI, [CTX + wi]
43ea7558 264 lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
0f23f75f 265 shl BLKOFF, 2 // block size (in bytes now)
1a0c09c4
MW
266
267 // Copy the last encryption round's keys.
0f23f75f
MW
268 movdqu xmm0, [SI]
269 movdqu [DI], xmm0
270 cmp BLKOFF, 16
89b34050 271 jbe 0f
0f23f75f
MW
272 movdqu xmm0, [SI + 16]
273 movdqu [DI + 16], xmm0
1a0c09c4
MW
274
275 // Update the loop variables and stop if we've finished.
// DI walks forwards through the decryption keys while SI walks backwards
// through the encryption keys, one block's worth at a time.
43ea7558
MW
2760: add DI, WHOLE(BLKOFF)
277 sub SI, WHOLE(BLKOFF)
0f23f75f 278 sub NR, 1
89b34050 279 jbe 9f
1a0c09c4
MW
280
281 // Do another middle round's keys...
0f23f75f 282 movdqu xmm0, [SI]
1a0c09c4 283 aesimc xmm0, xmm0
0f23f75f
MW
284 movdqu [DI], xmm0
285 cmp BLKOFF, 16
89b34050 286 jbe 0b
0f23f75f 287 movdqu xmm0, [SI + 16]
1a0c09c4 288 aesimc xmm0, xmm0
0f23f75f 289 movdqu [DI + 16], xmm0
89b34050 290 jmp 0b
1a0c09c4
MW
291
292 // Finally do the first encryption round.
89b34050 2939: movdqu xmm0, [SI]
0f23f75f
MW
294 movdqu [DI], xmm0
295 cmp BLKOFF, 16
89b34050 296 jbe 1f
0f23f75f
MW
297 movdqu xmm0, [SI + 16]
298 movdqu [DI + 16], xmm0
1a0c09c4
MW
299
300 // If the block size is not exactly four words then we must end-swap
301 // everything. We can use fancy SSE toys for this.
89b34050
MW
3021: cmp BLKOFF, 16
303 je 9f
1a0c09c4
MW
304
305 // Find the byte-reordering table.
306 ldgot ecx
8d6ca554 307 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 308
0f23f75f 309#if NKW_NEEDS_REFRESH
1a0c09c4
MW
310 // Calculate the number of subkey words again. (It's a good job
311 // we've got a fast multiplier.)
0f23f75f
MW
312 mov NKW, [CTX + nr]
313 add NKW, 1
314 imul NKW, BLKSZ
315#endif
1a0c09c4
MW
316
317 // End-swap the encryption keys.
// `endswap_block' takes its word count from NKW and its mask from XMM5, and
// advances SI as it goes, so SI is reloaded before each call.
0f23f75f 318 lea SI, [CTX + w]
1a0c09c4
MW
319 call endswap_block
320
321 // And the decryption keys.
0f23f75f 322 lea SI, [CTX + wi]
1a0c09c4
MW
323 call endswap_block
324
89b34050 3259: // All done.
0f23f75f 326#if CPUFAM_X86
0923a413
MW
327 popreg edi
328 popreg esi
329 popreg ebx
a90d420c 330 popreg BP
0f23f75f
MW
331#endif
332#if CPUFAM_AMD64 && ABI_WIN
0923a413
MW
333 popreg rdi
334 popreg rsi
0f23f75f 335#endif
1a0c09c4
MW
336 ret
337
1a517bb3
MW
338ENDFUNC
339
INTFUNC(endswap_block)
	// Reverse the byte order of each 32-bit word in a run of key words,
	// in place.
	//
	// On entry: SI points at the first word; NKW holds the number of
	// words; XMM5 holds the `pshufb' mask from `endswap_tab'.
	//
	// The words are handled four at a time, and at least one group is
	// always processed, so the count is effectively rounded up to a
	// multiple of four; the caller must make sure that touching the
	// extra words is harmless.
	//
	// Clobbers ecx, SI, XMM1 and the flags.
	endprologue

	mov	ecx, NKW		// words still to be swapped
1:	movdqu	xmm1, [SI]		// fetch four words...
	pshufb	xmm1, xmm5		// ... reverse the bytes of each...
	movdqu	[SI], xmm1		// ... and put them back
	sub	ecx, 4			// four fewer to do
	lea	SI, [SI + 16]		// step on (leaves the flags alone)
	ja	1b			// loop while any words remain

	ret

ENDFUNC
356
0f23f75f
MW
// Retire the register-allocation macros used by the key-setup code above,
// so that none of them can leak into the rest of the file. This includes
// `NKW', `NKW_NEEDS_REFRESH' and `CYIX', which the setup code also defines.
// (`#undef' on a name which isn't currently defined is harmless, so there's
// no need to guard `NKW_NEEDS_REFRESH', which only exists on 32-bit x86.)
#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef NKW
#undef NKW_NEEDS_REFRESH
#undef RCON
#undef LIM
#undef CYIX
#undef NR
#undef LRK
#undef BLKOFF
0f23f75f 367
1a0c09c4
MW
368///--------------------------------------------------------------------------
369/// Encrypting and decrypting blocks.
370
/// encdec OP, AES, KOFF
///
/// Define the single-block functions `rijndael_OP_x86ish_aesni' and its
/// AVX-aware twin `rijndael_OP_x86ish_aesni_avx' (which does `vzeroupper'
/// and then drops through into the former). AES is the base name of the
/// round instruction: it is used as written for all but the last round, and
/// with `last' appended for the final one. KOFF is the offset within the
/// context of the key schedule to use.
///
/// The generated functions take, in order, the context pointer, the source
/// block address and the destination block address. One 16-byte block is
/// loaded, end-swapped, whitened with the first round key, put through the
/// rounds, end-swapped back, and stored.
///
/// Round counts from 10 to 14 inclusive are handled; any other value found
/// in the context's `nr' field branches to `bogus'.
8a1aa284 371.macro encdec op, aes, koff
b9b279b4
MW
372 FUNC(rijndael_\op\()_x86ish_aesni_avx)
373 vzeroupper // avoid XMM penalties
374 endprologue
375 // and drop through...
376 ENDFUNC
377
8a1aa284 378 FUNC(rijndael_\op\()_x86ish_aesni)
1a0c09c4 379
0f23f75f
MW
380#if CPUFAM_X86
381 // Arguments come in on the stack, and need to be collected. We
382 // don't have a shortage of registers.
383
// SRC and DST deliberately share a register: the destination pointer is
// only fetched once the source block has been read.
c410f911 384# define K eax
0f23f75f
MW
385# define SRC edx
386# define DST edx
c410f911 387# define NR ecx
0f23f75f 388
a90d420c
MW
389 mov K, [SP + 4]
390 mov SRC, [SP + 8]
0f23f75f
MW
391#endif
392
393#if CPUFAM_AMD64 && ABI_SYSV
394 // Arguments come in registers. All is good.
395
396# define K rdi
397# define SRC rsi
398# define DST rdx
399# define NR eax
400#endif
401
402#if CPUFAM_AMD64 && ABI_WIN
403 // Arguments come in different registers.
404
405# define K rcx
406# define SRC rdx
407# define DST r8
408# define NR eax
409#endif
410
0923a413
MW
411 endprologue
412
28321c96
MW
413 // Find the magic endianness-swapping table.
414 ldgot ecx
415 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
416
0f23f75f
MW
417 // Initial setup.
418 movdqu xmm0, [SRC]
8d6ca554 419 pshufb xmm0, xmm5
0f23f75f
MW
420 mov NR, [K + nr]
421 add K, \koff
1a0c09c4
MW
422
423 // Initial whitening.
0f23f75f
MW
424 movdqu xmm1, [K]
425 add K, 16
1a0c09c4 426 pxor xmm0, xmm1
1d63fee4 427#if CPUFAM_X86
// The source block is safely in XMM0 by now, so the register it shared
// with DST can be reused.
a90d420c 428 mov DST, [SP + 12]
1d63fee4 429#endif
1a0c09c4
MW
430
431 // Dispatch to the correct code.
// The rounds are fully unrolled below; we enter the ladder at the label
// matching the round count and fall through all of the later rounds.
0f23f75f 432 cmp NR, 10
e297526c 433 je 10f
1a0c09c4 434 jb bogus
0f23f75f 435 cmp NR, 14
e297526c 436 je 14f
1a0c09c4 437 ja bogus
0f23f75f 438 cmp NR, 12
e297526c
MW
439 je 12f
440 jb 11f
441 jmp 13f
1a0c09c4
MW
442
443 .align 2
444
445 // 14 rounds...
0f23f75f
MW
44614: movdqu xmm1, [K]
447 add K, 16
e297526c 448 \aes xmm0, xmm1
1a0c09c4
MW
449
450 // 13 rounds...
0f23f75f
MW
45113: movdqu xmm1, [K]
452 add K, 16
e297526c 453 \aes xmm0, xmm1
1a0c09c4
MW
454
455 // 12 rounds...
0f23f75f
MW
45612: movdqu xmm1, [K]
457 add K, 16
e297526c 458 \aes xmm0, xmm1
1a0c09c4
MW
459
460 // 11 rounds...
0f23f75f
MW
46111: movdqu xmm1, [K]
462 add K, 16
e297526c 463 \aes xmm0, xmm1
1a0c09c4
MW
464
465 // 10 rounds...
// From here on K no longer moves: the remaining ten round keys are
// addressed at fixed offsets from it.
0f23f75f 46610: movdqu xmm1, [K]
e297526c 467 \aes xmm0, xmm1
1a0c09c4
MW
468
469 // 9 rounds...
0f23f75f 470 movdqu xmm1, [K + 16]
e297526c 471 \aes xmm0, xmm1
1a0c09c4
MW
472
473 // 8 rounds...
0f23f75f 474 movdqu xmm1, [K + 32]
e297526c 475 \aes xmm0, xmm1
1a0c09c4
MW
476
477 // 7 rounds...
0f23f75f 478 movdqu xmm1, [K + 48]
e297526c 479 \aes xmm0, xmm1
1a0c09c4
MW
480
481 // 6 rounds...
0f23f75f 482 movdqu xmm1, [K + 64]
e297526c 483 \aes xmm0, xmm1
1a0c09c4
MW
484
485 // 5 rounds...
0f23f75f 486 movdqu xmm1, [K + 80]
e297526c 487 \aes xmm0, xmm1
1a0c09c4
MW
488
489 // 4 rounds...
0f23f75f 490 movdqu xmm1, [K + 96]
e297526c 491 \aes xmm0, xmm1
1a0c09c4
MW
492
493 // 3 rounds...
0f23f75f 494 movdqu xmm1, [K + 112]
e297526c 495 \aes xmm0, xmm1
1a0c09c4
MW
496
497 // 2 rounds...
0f23f75f 498 movdqu xmm1, [K + 128]
e297526c 499 \aes xmm0, xmm1
1a0c09c4
MW
500
501 // Final round...
0f23f75f 502 movdqu xmm1, [K + 144]
e297526c 503 \aes\()last xmm0, xmm1
1a0c09c4
MW
504
505 // Unpermute the ciphertext block and store it.
8d6ca554 506 pshufb xmm0, xmm5
0f23f75f 507 movdqu [DST], xmm0
1a0c09c4
MW
508
509 // And we're done.
510 ret
511
0f23f75f
MW
512#undef K
513#undef SRC
514#undef DST
515#undef NR
516
8a1aa284
MW
517 ENDFUNC
518.endm
1a0c09c4 519
e297526c
MW
// Instantiate the block functions: `eblk' applies AESENC using the key
// schedule at offset `w'; `dblk' applies AESDEC using the one at `wi'.
520 encdec eblk, aesenc, w
521 encdec dblk, aesdec, wi
1a0c09c4
MW
522
523///--------------------------------------------------------------------------
524/// Random utilities.
525
INTFUNC(bogus)
	// Common landing pad for `can't happen' conditions: kill the
	// process. Coming through here, rather than having each site deal
	// with `abort' itself, is useful because:
	//
	//   * `abort' is CALLed rather than jumped to, so a return address
	//     is left behind as a clue to what went wrong;
	//
	//   * there are no conditional CALL instructions (and they'd be big
	//     if there were), whereas a conditional branch to here is
	//     cheap; and
	//
	//   * if `abort' is mad enough to return, we can catch it with a
	//     HLT loop.
	endprologue

	callext	F(abort)

	// Backstop: we should never get here.
1:	hlt
	jmp	1b

ENDFUNC
540
1a0c09c4
MW
541///--------------------------------------------------------------------------
542/// Data tables.
543
645fcce0
MW
// Read-only data follows.
	RODATA

	// Mask for `pshufb' which reverses the bytes within each of the four
	// 32-bit words of an XMM register. It's fetched with `movdqa', so
	// the 16-byte alignment is essential.
	.align	16
endswap_tab:
	.byte	3, 2, 1, 0, 7, 6, 5, 4
	.byte	11, 10, 9, 8, 15, 14, 13, 12
552
553///----- That's all, folks --------------------------------------------------