symm/blkc.h: Add explicitly big- and little-endian `STEP', `ADD' and `SET'.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
81bc2bb8
MW
33 .extern F(abort)
34 .extern F(rijndael_rcon)
1a0c09c4
MW
35
36///--------------------------------------------------------------------------
37/// Main code.
38
/// Let the assembler accept the AES instructions used below, and assemble
/// what follows into the text section.
	.arch	.aes
	.text
1a0c09c4
MW
41
42/// The AESNI instructions implement a little-endian version of AES, but
43/// Catacomb's internal interface presents as big-endian so as to work better
44/// with things like GCM. We therefore maintain the round keys in
45/// little-endian form, and have to end-swap blocks in and out.
46///
47/// For added amusement, the AESNI instructions don't implement the
48/// larger-block versions of Rijndael, so we have to end-swap the keys if
49/// we're preparing for one of those.
50
/// Sizes and context-structure offsets used by the key-setup and block
/// functions in this file.

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words
60
61///--------------------------------------------------------------------------
62/// Key setup.
63
b9b279b4
MW
FUNC(rijndael_setup_x86ish_aesni_avx)
	// Alternative entry point to the key-setup code.  It only issues
	// `vzeroupper' and then falls off the end into the function which
	// follows, so it takes the same arguments as that function and must
	// stay immediately in front of it.
	vzeroupper			// avoid penalty on `legacy' XMM access
	endprologue
	// and drop through...
ENDFUNC
69
FUNC(rijndael_setup_x86ish_aesni)
	// Expand a Rijndael key using the AESNI instructions.
	//
	// Arguments, in C order (see the per-ABI argument handling below):
	// the context pointer; the block size, in words; a pointer to the
	// key material; and the key size, in words.  The number of rounds is
	// read from the context's `nr' slot, so it must have been stored
	// there already.  The encryption round keys are written at offset
	// `w' in the context, and the decryption round keys at offset `wi'.

#define SI WHOLE(si)
#define DI WHOLE(di)

#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.

# define CTX ebp			// context pointer
# define BLKSZ [esp + 24]		// block size

# define KSZ ebx			// key size
# define NKW edx			// total number of key words
# define NKW_NEEDS_REFRESH 1		// ... needs recalculating
# define RCON ecx			// round constants table
# define LIM edx			// limit pointer
# define CYIX edi			// index in shift-register cycle

# define NR ecx				// number of rounds
# define LRK eax			// distance to last key
# define BLKOFF edx			// block size in bytes

	// Stack the caller's registers.
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi

	// Set up our own variables.  (Four registers and the return address
	// now sit below the arguments, so the first argument is at
	// [esp + 20].)
	mov	CTX, [esp + 20]		// context base pointer
	mov	SI, [esp + 28]		// key material
	mov	KSZ, [esp + 32]		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments are in registers.  We have plenty, but, to be honest,
	// the initial register allocation is a bit annoying.

# define CTX r8				// context pointer
# define BLKSZ r9d			// block size

# define KSZ edx			// key size
# define NKW r10d			// total number of key words
# define RCON rdi			// round constants table
# define LIM rcx			// limit pointer
# define CYIX r11d			// index in shift-register cycle

# define NR ecx				// number of rounds
# define LRK eax			// distance to last key
# define BLKOFF r9d			// block size in bytes

	// Move arguments to more useful places.  The 32-bit moves also
	// clear the top halves of the destination registers.
	mov	CTX, rdi		// context base pointer
	mov	BLKSZ, esi		// block size in words
	mov	SI, rdx			// key material
	mov	KSZ, ecx		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments are in different registers, and they're a little tight.

# define CTX r8				// context pointer
# define BLKSZ edx			// block size

# define KSZ r9d			// key size
# define NKW r10d			// total number of key words
# define RCON rdi			// round constants table
# define LIM rcx			// limit pointer
# define CYIX r11d			// index in shift-register cycle

# define NR ecx				// number of rounds
# define LRK eax			// distance to last key
# define BLKOFF edx			// block size in bytes

	// We'll need the index registers, which belong to the caller in this
	// ABI.
	pushreg	rsi
	pushreg	rdi

	// Move arguments to more useful places.  The key pointer must be
	// rescued from r8 before the context pointer is moved in on top of
	// it.
	mov	rsi, r8			// key material
	mov	CTX, rcx		// context base pointer

	// The key size is used as a 64-bit index below, but the caller need
	// only have set the low half of its register: clear the top half.
	mov	KSZ, KSZ
#endif

	endprologue

	// The initial round key material is taken directly from the input
	// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
	// We've been lucky.  We already have a copy of the context pointer
	// in rdi.
	add	rdi, w
#else
	lea	DI, [CTX + w]
#endif
	// `rep movsd' counts in the full-width count register, so load the
	// count with a 32-bit move, which leaves no junk in the top half.
	mov	ecx, KSZ
	rep movsd

	// Find out other useful things.
	mov	NKW, [CTX + nr]		// number of rounds
	add	NKW, 1
	imul	NKW, BLKSZ		// total key size in words
#if !NKW_NEEDS_REFRESH
	// If we can't keep NKW for later, then we use the same register for
	// it and LIM, so this move is unnecessary.
	mov	DWORD(LIM), NKW
#endif
	sub	DWORD(LIM), KSZ		// offset by the key size

	// Find the round constants.
	ldgot	WHOLE(c)
	leaext	RCON, F(rijndael_rcon), WHOLE(c)

	// Prepare for the main loop.
	lea	SI, [CTX + w]
	mov	eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
	xor	CYIX, CYIX		// start of new cycle

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// Throughout the loop, eax holds the most recently generated key
	// word, SI points at the word one key-length behind the word about
	// to be written, and CYIX counts words within the current chunk.
0:	cmp	CYIX, 0			// first word of the cycle?
	je	1f
	cmp	CYIX, 4			// fourth word of the cycle?
	jne	2f
	cmp	KSZ, 7			// and a large key?
	jb	2f

	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(3, 0, 1, 2)
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f

	// First word of the cycle.  This is the complicated piece.
1:	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(1, 2, 3, 0)
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movd	eax, xmm1
	xor	al, [RCON]		// mix in this cycle's round constant
	inc	RCON

	// Common tail.  Mix in the corresponding word from the previous
	// cycle and prepare for the next loop.
2:	xor	eax, [SI]
	mov	[SI + 4*WHOLE(KSZ)], eax
	add	SI, 4
	inc	CYIX
	cmp	SI, LIM
	jae	9f
	cmp	CYIX, KSZ
	jb	0b
	xor	CYIX, CYIX
	jmp	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
9:	mov	NR, [CTX + nr]		// number of rounds
#if NKW_NEEDS_REFRESH
	mov	BLKOFF, BLKSZ
	mov	LRK, NR
	imul	LRK, BLKOFF
#else
	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	// because we won't need the former again.
	mov	LRK, NKW
	sub	LRK, BLKSZ
#endif
	lea	DI, [CTX + wi]
	lea	SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
	shl	BLKOFF, 2		// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// Update the loop variables and stop if we've finished.  The source
	// walks backwards through the encryption keys while the destination
	// walks forwards through the decryption keys.
0:	add	DI, WHOLE(BLKOFF)
	sub	SI, WHOLE(BLKOFF)
	sub	NR, 1
	jbe	9f

	// Do another middle round's keys...
	movdqu	xmm0, [SI]
	aesimc	xmm0, xmm0
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0b
	movdqu	xmm0, [SI + 16]
	aesimc	xmm0, xmm0
	movdqu	[DI + 16], xmm0
	jmp	0b

	// Finally do the first encryption round.
9:	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	1f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
1:	cmp	BLKOFF, 16
	je	9f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	NKW, [CTX + nr]
	add	NKW, 1
	imul	NKW, BLKSZ
#endif

	// End-swap the encryption keys.
	lea	SI, [CTX + w]
	call	endswap_block

	// And the decryption keys.
	lea	SI, [CTX + wi]
	call	endswap_block

9:	// All done.
#if CPUFAM_X86
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	popreg	rdi
	popreg	rsi
#endif
	ret

ENDFUNC
341
INTFUNC(endswap_block)
	// End-swap NKW words starting at SI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	//
	// NKW itself is left alone, so this can be called again for another
	// buffer of the same size.  On exit, SI has been advanced past the
	// chunks processed, and ecx, xmm1 and the flags are clobbered.  At
	// least one chunk is always processed, and a word count which isn't
	// a multiple of four is rounded up to a whole chunk.
	endprologue

	mov	ecx, NKW		// words still to do
0:	movdqu	xmm1, [SI]
	pshufb	xmm1, xmm5
	movdqu	[SI], xmm1
	add	SI, 16
	sub	ecx, 4
	ja	0b			// stop at zero or on borrow

	ret

ENDFUNC
358
0f23f75f
MW
// Retire the register-allocation macros used by the key-setup code, so
// that they can't leak into the code which follows.
#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef NKW
#undef NKW_NEEDS_REFRESH
#undef RCON
#undef LIM
#undef CYIX
#undef NR
#undef LRK
#undef BLKOFF
0f23f75f 369
1a0c09c4
MW
370///--------------------------------------------------------------------------
371/// Encrypting and decrypting blocks.
372
.macro	encdec	op, aes, koff
	// Define the functions `rijndael_OP_x86ish_aesni' and
	// `rijndael_OP_x86ish_aesni_avx', which process a single 16-byte
	// block.  AES is the round instruction to use (the final round uses
	// the same mnemonic with `last' appended), and KOFF is the offset of
	// the round-key table within the context.
	//
	// The functions' arguments, in C order, are: the context pointer; a
	// pointer to the source block; and a pointer to the destination
	// block.  The number of rounds is read from the context's `nr' slot
	// and must be between 10 and 14 inclusive: anything else ends up in
	// `bogus'.

	FUNC(rijndael_\op\()_x86ish_aesni_avx)
	vzeroupper			// avoid XMM penalties
	endprologue
	// and drop through...
	ENDFUNC

	FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
	// Arguments come in on the stack, and need to be collected.  We
	// don't have a shortage of registers.  (SRC and DST share a
	// register: the destination pointer isn't fetched until we've
	// finished with the source.)

# define K eax
# define SRC edx
# define DST edx
# define NR ecx

	mov	K, [esp + 4]
	mov	SRC, [esp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments come in registers.  All is good.

# define K rdi
# define SRC rsi
# define DST rdx
# define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in different registers.

# define K rcx
# define SRC rdx
# define DST r8
# define NR eax
#endif

	endprologue

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Initial setup.
	movdqu	xmm0, [SRC]
	pshufb	xmm0, xmm5
	mov	NR, [K + nr]
	add	K, \koff

	// Initial whitening.
	movdqu	xmm1, [K]
	add	K, 16
	pxor	xmm0, xmm1
#if CPUFAM_X86
	mov	DST, [esp + 12]
#endif

	// Dispatch to the correct code.  The rounds are unrolled below, so
	// all we have to do is join the sequence at the right place.
	cmp	NR, 10
	je	10f
	jb	bogus
	cmp	NR, 14
	je	14f
	ja	bogus
	cmp	NR, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// 14 rounds...
14:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 10 rounds...  (From here on, K stays put and the remaining round
	// keys are addressed by fixed offsets from it.)
10:	movdqu	xmm1, [K]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [K + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [K + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [K + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [K + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [K + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [K + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [K + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [K + 128]
	\aes	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [K + 144]
	\aes\()last xmm0, xmm1

	// Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm5
	movdqu	[DST], xmm0

	// And we're done.
	ret

#undef K
#undef SRC
#undef DST
#undef NR

	ENDFUNC
.endm

	// Encryption uses the key words at `w'; decryption those at `wi'.
	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi
1a0c09c4
MW
524
525///--------------------------------------------------------------------------
526/// Random utilities.
527
INTFUNC(bogus)
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// This is reached by a branch from the block functions when the
	// context holds a number of rounds they can't handle.
	//
	// NOTE(review): nothing here adjusts the stack pointer before the
	// call, so the stack is however the function which branched here
	// left it.  Confirm that this suits `abort' on every target ABI.
	endprologue

	callext	F(abort)
0:	hlt
	jmp	0b

ENDFUNC
542
1a0c09c4
MW
543///--------------------------------------------------------------------------
544/// Data tables.
545
645fcce0
MW
/// Byte-shuffle mask for `pshufb': reverses the order of the bytes within
/// each of the four 32-bit words of a 16-byte vector.  It's aligned so that
/// it can be fetched with `movdqa'.
	RODATA

	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12
554
555///----- That's all, folks --------------------------------------------------