Merge branch '2.5.x'
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
df07f2c0 28/// Preliminaries.
1a0c09c4
MW
29
30#include "config.h"
31#include "asm-common.h"
32
df07f2c0
MW
33 .arch .aes
34
81bc2bb8
MW
35 .extern F(abort)
36 .extern F(rijndael_rcon)
1a0c09c4 37
df07f2c0
MW
38 .text
39
1a0c09c4
MW
///--------------------------------------------------------------------------
/// Main code.

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.  A key-schedule buffer has room for one
	// maximum-size block of key material for the initial whitening plus
	// one more for each of the maximum number of rounds.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure.  These are byte offsets from the context base
	// pointer: a four-byte round count, followed by the encryption and
	// then the decryption key schedules, each `kbufsz' bytes long.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words
61
///--------------------------------------------------------------------------
/// Key setup.
64
b9b279b4
MW
FUNC(rijndael_setup_x86ish_aesni_avx)
	// Key-setup entry point for use when AVX is available.  It only
	// clears the upper halves of the vector registers and then runs
	// straight into the function that follows it in this file, so the
	// plain AESNI key-setup function must stay immediately below.
	vzeroupper			// avoid penalty on `legacy' XMM access
	endprologue
	// and drop through...
ENDFUNC
70
FUNC(rijndael_setup_x86ish_aesni)
	// Build the encryption and decryption key schedules in a context.
	//
	// Arguments, in the order the per-ABI code below collects them:
	// context pointer; block size, in words; pointer to the key
	// material; key size, in words.  The number of rounds is read from
	// the context's `nr' field and never written here, so the caller
	// must have filled it in beforehand.
	//
	// On exit, `w' holds the expanded encryption keys and `wi' holds
	// the decryption keys, in the order in which decryption consumes
	// them.  If the block size isn't four words then both schedules are
	// end-swapped word by word.
	//
	// Scratch vector registers: xmm0, xmm1, and (only when end-swapping)
	// xmm5.

#define SI WHOLE(si)
#define DI WHOLE(di)

#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.

#  define CTX ebp			// context pointer
#  define BLKSZ [esp + 24]		// block size

#  define KSZ ebx			// key size
#  define NKW edx			// total number of key words
#  define NKW_NEEDS_REFRESH 1		// ... needs recalculating
#  define RCON ecx			// round constants table
#  define LIM edx			// limit pointer
#  define CYIX edi			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define BLKOFF edx			// block size in bytes

	// Stack the caller's registers.
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi

	// Set up our own variables.  Four saved registers and the return
	// address sit below the arguments, so the first one is at offset 20.
	mov	CTX, [esp + 20]		// context base pointer
	mov	SI, [esp + 28]		// key material
	mov	KSZ, [esp + 32]		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments are in registers.  We have plenty, but, to be honest,
	// the initial register allocation is a bit annoying.

#  define CTX r8			// context pointer
#  define BLKSZ r9d			// block size

#  define KSZ edx			// key size
#  define NKW r10d			// total number of key words
#  define RCON rdi			// round constants table
#  define LIM rcx			// limit pointer
#  define CYIX r11d			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define BLKOFF r9d			// block size in bytes

	// Move arguments to more useful places.  The key pointer must leave
	// rdx before the key size lands in edx.  These 32-bit moves also
	// zero-extend, which matters because BLKOFF and KSZ are later used
	// as full-width address components.
	mov	CTX, rdi		// context base pointer
	mov	BLKSZ, esi		// block size in words
	mov	SI, rdx			// key material
	mov	KSZ, ecx		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments are in different registers, and they're a little tight.

#  define CTX r8			// context pointer
#  define BLKSZ edx			// block size

#  define KSZ r9d			// key size
#  define NKW r10d			// total number of key words
#  define RCON rdi			// round constants table
#  define LIM rcx			// limit pointer
#  define CYIX r11d			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define BLKOFF edx			// block size in bytes

	// We'll need the index registers, which belong to the caller in this
	// ABI.
	pushreg	rsi
	pushreg	rdi

	// Move arguments to more useful places.  The key pointer must leave
	// r8 before the context pointer lands there.
	mov	rsi, r8			// key material
	mov	CTX, rcx		// context base pointer

	// The key size arrives as a 32-bit value, and the calling convention
	// makes no promise about the upper half of its register.  We use the
	// whole register as an index below, so clear the upper half now.
	mov	KSZ, KSZ
#endif

	endprologue

	// The initial round key material is taken directly from the input
	// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
	// We've been lucky.  We already have a copy of the context pointer
	// in rdi.
	add	rdi, w
#else
	lea	DI, [CTX + w]
#endif
	// Always load the count from KSZ.  On the SysV ABI the key size did
	// arrive in ecx, but the string instruction counts with the whole of
	// rcx, whose upper half the caller needn't have cleared; the 32-bit
	// move zero-extends.
	mov	ecx, KSZ
	rep	movsd

	// Find out other useful things.
	mov	NKW, [CTX + nr]		// number of rounds
	add	NKW, 1
	imul	NKW, BLKSZ		// total key size in words
#if !NKW_NEEDS_REFRESH
	// If we can't keep NKW for later, then we use the same register for
	// it and LIM, so this move is unnecessary.
	mov	DWORD(LIM), NKW
#endif
	sub	DWORD(LIM), KSZ		// offset by the key size

	// Find the round constants.
	ldgot	WHOLE(c)
	leaext	RCON, F(rijndael_rcon), WHOLE(c)

	// Prepare for the main loop.  Throughout the loop, SI points at the
	// word one key-length behind the word being generated, and eax holds
	// the most recently generated word.
	lea	SI, [CTX + w]
	mov	eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
	xor	CYIX, CYIX		// start of new cycle

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
0:	cmp	CYIX, 0			// first word of the cycle?
	je	1f
	cmp	CYIX, 4			// fourth word of the cycle?
	jne	2f
	cmp	KSZ, 7			// and a large key?
	jb	2f

	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(3, 0, 1, 2)
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f

	// First word of the cycle.  This is the complicated piece.
1:	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(1, 2, 3, 0)
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movd	eax, xmm1
	xor	al, [RCON]		// mix in this cycle's round constant
	inc	RCON			// and step on to the next one

	// Common tail.  Mix in the corresponding word from the previous
	// cycle and prepare for the next loop.
2:	xor	eax, [SI]
	mov	[SI + 4*WHOLE(KSZ)], eax
	add	SI, 4
	inc	CYIX
	cmp	SI, LIM
	jae	9f
	cmp	CYIX, KSZ
	jb	0b
	xor	CYIX, CYIX
	jmp	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
9:	mov	NR, [CTX + nr]		// number of rounds
#if NKW_NEEDS_REFRESH
	mov	BLKOFF, BLKSZ
	mov	LRK, NR
	imul	LRK, BLKOFF
#else
	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	// because we won't need the former again.
	mov	LRK, NKW
	sub	LRK, BLKSZ
#endif
	lea	DI, [CTX + wi]
	lea	SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
	shl	BLKOFF, 2		// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// Update the loop variables and stop if we've finished.  DI walks
	// forwards through the decryption keys while SI walks backwards
	// through the encryption keys.
0:	add	DI, WHOLE(BLKOFF)
	sub	SI, WHOLE(BLKOFF)
	sub	NR, 1
	jbe	9f

	// Do another middle round's keys...
	movdqu	xmm0, [SI]
	aesimc	xmm0, xmm0
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0b
	movdqu	xmm0, [SI + 16]
	aesimc	xmm0, xmm0
	movdqu	[DI + 16], xmm0
	jmp	0b

	// Finally do the first encryption round.
9:	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	1f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
1:	cmp	BLKOFF, 16
	je	9f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	NKW, [CTX + nr]
	add	NKW, 1
	imul	NKW, BLKSZ
#endif

	// End-swap the encryption keys.
	lea	SI, [CTX + w]
	call	endswap_block

	// And the decryption keys.
	lea	SI, [CTX + wi]
	call	endswap_block

9:	// All done.
#if CPUFAM_X86
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	popreg	rdi
	popreg	rsi
#endif
	ret

ENDFUNC
342
INTFUNC(endswap_block)
	// End-swap NKW words starting at SI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	//
	// This uses the key-setup register aliases, so it may only be
	// called from the key-setup code.  The word count is rounded up to
	// a multiple of four, so up to three words beyond the end may be
	// swapped as well.  On exit, SI points just past the last chunk
	// processed; ecx, xmm1 and the flags are clobbered.
	endprologue

	mov	ecx, NKW		// words still to do
0:	movdqu	xmm1, [SI]
	pshufb	xmm1, xmm5		// reverse the bytes within each word
	movdqu	[SI], xmm1
	add	SI, 16
	sub	ecx, 4
	ja	0b			// continue while words remain

	ret

ENDFUNC
359
0f23f75f
MW
// Retire the key-setup register aliases, so that they can't leak into the
// code below (which defines its own `NR', for one).
#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef NKW
#undef NKW_NEEDS_REFRESH
#undef RCON
#undef LIM
#undef CYIX
#undef NR
#undef LRK
#undef BLKOFF
0f23f75f 370
1a0c09c4
MW
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
373
.macro	encdec	op, aes, koff
	// Define a pair of single-block functions named after OP: an AVX
	// entry point, which just clears the upper vector-register halves
	// and drops through, and the plain AESNI one which does the work.
	//
	// AES is the round instruction to use; its `last' variant is used
	// for the final round.  KOFF is the offset within the context of
	// the key schedule to use.
	//
	// The functions' arguments are, in order: context pointer, source
	// block pointer, destination block pointer.  Exactly 16 bytes are
	// read and written, using unaligned accesses.  The round count is
	// read from the context; anything outside 10 to 14 inclusive ends
	// up at `bogus'.  xmm0, xmm1 and xmm5 are used as scratch.

	FUNC(rijndael_\op\()_x86ish_aesni_avx)
	vzeroupper			// avoid XMM penalties
	endprologue
	// and drop through...
	ENDFUNC

	FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
	// Arguments come in on the stack, and need to be collected.  We
	// don't have a shortage of registers.

#  define K eax
#  define SRC edx
#  define DST edx
#  define NR ecx

	mov	K, [esp + 4]
	mov	SRC, [esp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments come in registers.  All is good.

#  define K rdi
#  define SRC rsi
#  define DST rdx
#  define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in different registers.

#  define K rcx
#  define SRC rdx
#  define DST r8
#  define NR eax
#endif

	endprologue

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Initial setup.
	movdqu	xmm0, [SRC]
	pshufb	xmm0, xmm5
	mov	NR, [K + nr]
	add	K, \koff

	// Initial whitening.
	movdqu	xmm1, [K]
	add	K, 16
	pxor	xmm0, xmm1
#if CPUFAM_X86
	// The source and destination pointers share a register here, and
	// we've finished with the source now.
	mov	DST, [esp + 12]
#endif

	// Dispatch to the correct code.
	cmp	NR, 10
	je	10f
	jb	bogus
	cmp	NR, 14
	je	14f
	ja	bogus
	cmp	NR, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// The entry points below form a ladder: each of the first four
	// does one round and steps K on, so that by the time we reach the
	// ten-round code K always points at the tenth-from-last round key
	// and the remaining offsets are fixed.

	// 14 rounds...
14:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 10 rounds...
10:	movdqu	xmm1, [K]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [K + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [K + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [K + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [K + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [K + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [K + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [K + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [K + 128]
	\aes	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [K + 144]
	\aes\()last xmm0, xmm1

	// Unpermute the output block and store it.
	pshufb	xmm0, xmm5
	movdqu	[DST], xmm0

	// And we're done.
	ret

#undef K
#undef SRC
#undef DST
#undef NR

	ENDFUNC
.endm

	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi
1a0c09c4
MW
525
///--------------------------------------------------------------------------
/// Random utilities.
528
INTFUNC(bogus)
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// This is reached by a branch, not a call, and never returns.
	endprologue

	callext	F(abort)
0:	hlt				// should `abort' return, stop here
	jmp	0b			// ... and keep stopping

ENDFUNC
543
1a0c09c4
MW
///--------------------------------------------------------------------------
/// Data tables.

	RODATA

	// Byte-shuffle control for `pshufb': reverses the order of the
	// bytes within each of the four 32-bit words of a vector.  It is
	// loaded using aligned moves, so the 16-byte alignment is required.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12
555
556///----- That's all, folks --------------------------------------------------