/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"
        .globl  F(abort)
        .globl  F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

        .arch   .aes
        .text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

        // Useful constants.
        .equ    maxrounds, 16           // maximum number of rounds
        .equ    maxblksz, 32            // maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

        // Context structure.
        .equ    nr, 0                   // number of rounds
        .equ    w, nr + 4               // encryption key words
        .equ    wi, w + kbufsz          // decryption key words
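
        // In C terms, the layout implied by these offsets might be sketched
        // roughly as follows (illustrative only -- the real context type is
        // defined elsewhere in Catacomb):
        //
        //      #include <stdint.h>
        //
        //      #define MAXROUNDS 16
        //      #define MAXBLKSZ 32                     /* bytes */
        //      #define KBUFSZ (MAXBLKSZ*(MAXROUNDS + 1)) /* bytes */
        //
        //      struct aesni_keysched {
        //              uint32_t nr;                    /* number of rounds */
        //              uint32_t w[KBUFSZ/4];           /* encryption key words */
        //              uint32_t wi[KBUFSZ/4];          /* decryption key words */
        //      };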

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86ish_aesni)

#define SI WHOLE(si)
#define DI WHOLE(di)

#if CPUFAM_X86
        // Arguments are on the stack.  We'll need to stack the caller's
        // register variables, but we'll manage.

# define CTX ebp                        // context pointer
# define BLKSZ [esp + 24]               // block size

# define KSZ ebx                        // key size
# define NKW edx                        // total number of key words
# define NKW_NEEDS_REFRESH 1            // ... needs recalculating
# define RCON ecx                       // round constants table
# define LIM edx                        // limit pointer
# define CYIX edi                       // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF edx                     // block size in bytes

        // Stack the caller's registers.
        push    ebp
        push    ebx
        push    esi
        push    edi

        // Set up our own variables.
        mov     CTX, [esp + 20]         // context base pointer
        mov     SI, [esp + 28]          // key material
        mov     KSZ, [esp + 32]         // key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // Arguments are in registers.  We have plenty, but, to be honest,
        // the initial register allocation is a bit annoying.

# define CTX r8                         // context pointer
# define BLKSZ r9d                      // block size

# define KSZ edx                        // key size
# define NKW r10d                       // total number of key words
# define RCON rdi                       // round constants table
# define LIM rcx                        // limit pointer
# define CYIX r11d                      // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF r9d                     // block size in bytes

        // Move arguments to more useful places.
        mov     CTX, rdi                // context base pointer
        mov     BLKSZ, esi              // block size in words
        mov     SI, rdx                 // key material
        mov     KSZ, ecx                // key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments are in different registers, and they're a little tight.

# define CTX r8                         // context pointer
# define BLKSZ edx                      // block size

# define KSZ r9d                        // key size
# define NKW r10d                       // total number of key words
# define RCON rdi                       // round constants table
# define LIM rcx                        // limit pointer
# define CYIX r11d                      // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF edx                     // block size in bytes

        // We'll need the index registers, which belong to the caller in this
        // ABI.
        push    rsi
          .seh_pushreg rsi
        push    rdi
          .seh_pushreg rdi
        .seh_endprologue

        // Move arguments to more useful places.
        mov     rsi, r8                 // key material
        mov     CTX, rcx                // context base pointer
#endif

        // The initial round key material is taken directly from the input
        // key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
        // We've been lucky.  We already have a copy of the context pointer
        // in rdi, and the key size in ecx.
        add     rdi, w
#else
        lea     DI, [CTX + w]
        mov     ecx, KSZ
#endif
        rep     movsd
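        // (In C terms this is just a word-wise copy of the caller's key into
        // the start of the schedule: roughly `memcpy(ctx->w, key, 4*ksz)',
        // in terms of the illustrative structure sketched above.)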

        // Find out other useful things.
        mov     NKW, [CTX + nr]         // number of rounds
        add     NKW, 1
        imul    NKW, BLKSZ              // total key size in words
#if !NKW_NEEDS_REFRESH
        // If we can't keep NKW for later, then we use the same register for
        // it and LIM, so this move is unnecessary.
        mov     DWORD(LIM), NKW
#endif
        sub     DWORD(LIM), KSZ         // offset by the key size
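        // (As a purely illustrative example: AES-256 has an eight-word key
        // and a four-word block, and uses 14 rounds, so NKW = (14 + 1)*4 =
        // 60 words and the loop below must produce 60 - 8 = 52 more words.)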

        // Find the round constants.
        ldgot   WHOLE(c)
        leaext  RCON, F(rijndael_rcon), WHOLE(c)

        // Prepare for the main loop.
        lea     SI, [CTX + w]
        mov     eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
        lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
        xor     CYIX, CYIX              // start of new cycle

        // Main key expansion loop.  The first word of each key-length chunk
        // needs special treatment.
        //
        // This is rather tedious because the Intel `AESKEYGENASSIST'
        // instruction is very strangely shaped.  Firstly, it wants to
        // operate on vast SSE registers, even though we're data-blocked from
        // doing more than one operation at a time unless we're doing two key
        // schedules simultaneously -- and even then we can't do more than
        // two, because the instruction ignores two of its input words
        // entirely, and produces two different outputs for each of the other
        // two.  And secondly it insists on taking the magic round constant
        // as an immediate, so it's kind of annoying if you're not
        // open-coding the whole thing.  It's much easier to leave that as
        // zero and XOR in the round constant by hand.
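        //
        // For reference, the word recurrence being computed is the standard
        // Rijndael key expansion.  As an illustrative C sketch only (leaving
        // aside the byte-order details dealt with below), with `SUBW' and
        // `ROTW' standing for the usual SubWord and RotWord operations and
        // `rcon[]' for the round-constant table:
        //
        //      for (i = ksz; i < nkw; i++) {
        //              t = w[i - 1];
        //              if (i%ksz == 0) t = SUBW(ROTW(t)) ^ rcon[i/ksz - 1];
        //              else if (ksz > 6 && i%ksz == 4) t = SUBW(t);
        //              w[i] = w[i - ksz] ^ t;
        //      }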
0:      cmp     CYIX, 0                 // first word of the cycle?
        je      1f
        cmp     CYIX, 4                 // fourth word of the cycle?
        jne     2f
        cmp     KSZ, 7                  // and a large key?
        jb      2f

        // Fourth word of the cycle, and seven or eight words of key.  Do a
        // byte substitution.
        movd    xmm0, eax
        pshufd  xmm0, xmm0, SHUF(2, 1, 0, 3)
        aeskeygenassist xmm1, xmm0, 0
        movd    eax, xmm1
        jmp     2f

        // First word of the cycle.  This is the complicated piece.
1:      movd    xmm0, eax
        pshufd  xmm0, xmm0, SHUF(0, 3, 2, 1)
        aeskeygenassist xmm1, xmm0, 0
        pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
        movd    eax, xmm1
        xor     al, [RCON]
        inc     RCON

        // Common tail.  Mix in the corresponding word from the previous
        // cycle and prepare for the next loop.
2:      xor     eax, [SI]
        mov     [SI + 4*WHOLE(KSZ)], eax
        add     SI, 4
        inc     CYIX
        cmp     SI, LIM
        jae     9f
        cmp     CYIX, KSZ
        jb      0b
        xor     CYIX, CYIX
        jmp     0b

        // Next job is to construct the decryption keys.  The keys for the
        // first and last rounds don't need to be mangled, but the remaining
        // ones do -- and they all need to be reordered too.
        //
        // The plan of action, then, is to copy the final encryption round's
        // keys into place first, then to do each of the intermediate rounds
        // in reverse order, and finally do the first round.
        //
        // Do all of the heavy lifting with SSE registers.  The order we're
        // doing this in means that it's OK if we read or write too much, and
        // there's easily enough buffer space for the over-enthusiastic reads
        // and writes because the context has space for 32-byte blocks, which
        // is our maximum and an exact fit for two SSE registers.
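        //
        // As a rough C model of the whole transformation -- an illustrative
        // sketch only, reusing the hypothetical field names from above --
        // this is the standard `equivalent inverse cipher' key preparation:
        //
        //      /* blk = block size in words; nr = number of rounds */
        //      memcpy(wi, w + nr*blk, 4*blk);          /* final round key */
        //      for (i = 1; i < nr; i++)                /* middle rounds */
        //              inv_mix_columns(wi + i*blk, w + (nr - i)*blk, blk);
        //      memcpy(wi + nr*blk, w, 4*blk);          /* first round key */
        //
        // where `inv_mix_columns' (a hypothetical helper) applies the same
        // transformation as the `aesimc' instruction used below to each
        // four-word column of its input.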
9:      mov     NR, [CTX + nr]          // number of rounds
#if NKW_NEEDS_REFRESH
        mov     BLKOFF, BLKSZ
        mov     LRK, NR
        imul    LRK, BLKOFF
#else
        // If we retain NKW, then BLKSZ and BLKOFF are the same register
        // because we won't need the former again.
        mov     LRK, NKW
        sub     LRK, BLKSZ
#endif
        lea     DI, [CTX + wi]
        lea     SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
        shl     BLKOFF, 2               // block size (in bytes now)

        // Copy the last encryption round's keys.
        movdqu  xmm0, [SI]
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     0f
        movdqu  xmm0, [SI + 16]
        movdqu  [DI + 16], xmm0

        // Update the loop variables and stop if we've finished.
0:      add     DI, WHOLE(BLKOFF)
        sub     SI, WHOLE(BLKOFF)
        sub     NR, 1
        jbe     9f

        // Do another middle round's keys...
        movdqu  xmm0, [SI]
        aesimc  xmm0, xmm0
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     0b
        movdqu  xmm0, [SI + 16]
        aesimc  xmm0, xmm0
        movdqu  [DI + 16], xmm0
        jmp     0b

        // Finally do the first encryption round.
9:      movdqu  xmm0, [SI]
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     1f
        movdqu  xmm0, [SI + 16]
        movdqu  [DI + 16], xmm0

        // If the block size is not exactly four words then we must end-swap
        // everything.  We can use fancy SSE toys for this.
1:      cmp     BLKOFF, 16
        je      9f

        // Find the byte-reordering table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
        // Calculate the number of subkey words again.  (It's a good job
        // we've got a fast multiplier.)
        mov     NKW, [CTX + nr]
        add     NKW, 1
        imul    NKW, BLKSZ
#endif

        // End-swap the encryption keys.
        lea     SI, [CTX + w]
        call    endswap_block

        // And the decryption keys.
        lea     SI, [CTX + wi]
        call    endswap_block

9:      // All done.
#if CPUFAM_X86
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
        pop     rdi
        pop     rsi
#endif
        ret

ENDFUNC

INTFUNC(endswap_block)
        // End-swap NKW words starting at SI.  The end-swapping table is
        // already loaded into XMM5; and it's OK to work in 16-byte chunks.
#if CPUFAM_AMD64 && ABI_WIN
        .seh_endprologue
#endif

        mov     ecx, NKW
0:      movdqu  xmm1, [SI]
        pshufb  xmm1, xmm5
        movdqu  [SI], xmm1
        add     SI, 16
        sub     ecx, 4
        ja      0b

        ret

ENDFUNC

#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef RCON
#undef LIM
#undef NR
#undef LRK
#undef BLKOFF

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

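/// The `encdec' macro below emits one function for encryption and one for
/// decryption.  As a rough C model of what each emitted function does to a
/// single 16-byte block -- an illustrative sketch only, using Intel
/// intrinsics to stand in for the instructions used below; the decryption
/// variant has the same shape, with `aesdec'/`aesdeclast' and the `wi' key
/// schedule --
///
///     #include <emmintrin.h>
///     #include <tmmintrin.h>
///     #include <wmmintrin.h>
///
///     static __m128i encrypt_block(const __m128i *rk, unsigned nr,
///                                  __m128i swap, __m128i x)
///     {
///             unsigned i;
///
///             x = _mm_shuffle_epi8(x, swap);  /* convert to little-endian */
///             x = _mm_xor_si128(x, rk[0]);    /* initial whitening */
///             for (i = 1; i < nr; i++)        /* middle rounds */
///                     x = _mm_aesenc_si128(x, rk[i]);
///             x = _mm_aesenclast_si128(x, rk[nr]); /* final round */
///             return _mm_shuffle_epi8(x, swap); /* back to big-endian */
///     }
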
.macro  encdec  op, aes, koff
        FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
        // Arguments come in on the stack, and need to be collected.  We
        // don't have a shortage of registers.

# define K eax
# define SRC edx
# define DST edx
# define NR ecx

        mov     K, [esp + 4]
        mov     SRC, [esp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // Arguments come in registers.  All is good.

# define K rdi
# define SRC rsi
# define DST rdx
# define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in different registers.

# define K rcx
# define SRC rdx
# define DST r8
# define NR eax
        .seh_endprologue
#endif

        // Find the magic endianness-swapping table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]

        // Initial setup.
        movdqu  xmm0, [SRC]
        pshufb  xmm0, xmm5
        mov     NR, [K + nr]
        add     K, \koff

        // Initial whitening.
        movdqu  xmm1, [K]
        add     K, 16
        pxor    xmm0, xmm1
#if CPUFAM_X86
        mov     DST, [esp + 12]
#endif

        // Dispatch to the correct code.
        cmp     NR, 10
        je      10f
        jb      bogus
        cmp     NR, 14
        je      14f
        ja      bogus
        cmp     NR, 12
        je      12f
        jb      11f
        jmp     13f

        .align  2

        // 14 rounds...
14:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 13 rounds...
13:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 12 rounds...
12:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 11 rounds...
11:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 10 rounds...
10:     movdqu  xmm1, [K]
        \aes    xmm0, xmm1

        // 9 rounds...
        movdqu  xmm1, [K + 16]
        \aes    xmm0, xmm1

        // 8 rounds...
        movdqu  xmm1, [K + 32]
        \aes    xmm0, xmm1

        // 7 rounds...
        movdqu  xmm1, [K + 48]
        \aes    xmm0, xmm1

        // 6 rounds...
        movdqu  xmm1, [K + 64]
        \aes    xmm0, xmm1

        // 5 rounds...
        movdqu  xmm1, [K + 80]
        \aes    xmm0, xmm1

        // 4 rounds...
        movdqu  xmm1, [K + 96]
        \aes    xmm0, xmm1

        // 3 rounds...
        movdqu  xmm1, [K + 112]
        \aes    xmm0, xmm1

        // 2 rounds...
        movdqu  xmm1, [K + 128]
        \aes    xmm0, xmm1

        // Final round...
        movdqu  xmm1, [K + 144]
        \aes\()last xmm0, xmm1

        // Unpermute the ciphertext block and store it.
        pshufb  xmm0, xmm5
        movdqu  [DST], xmm0

        // And we're done.
        ret

#undef K
#undef SRC
#undef DST
#undef NR

        ENDFUNC
.endm

        encdec  eblk, aesenc, w
        encdec  dblk, aesdec, wi
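
/// The two instantiations above define `rijndael_eblk_x86ish_aesni' and
/// `rijndael_dblk_x86ish_aesni'.  Each takes the prepared context, a source
/// block and a destination block; a plausible C declaration (the
/// authoritative prototypes live in Catacomb's headers, not here) would be
/// something like
///
///     void rijndael_eblk_x86ish_aesni(const void *ctx,
///                                     const uint32_t *src, uint32_t *dst);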

///--------------------------------------------------------------------------
/// Random utilities.

INTFUNC(bogus)
        // Abort the process because of a programming error.  Indirecting
        // through this point serves several purposes: (a) by CALLing, rather
        // than branching to, `abort', we can save the return address, which
        // might at least provide a hint as to what went wrong; (b) we don't
        // have conditional CALLs (and they'd be big anyway); and (c) we can
        // write a HLT here as a backstop against `abort' being mad.
#if CPUFAM_AMD64 && ABI_WIN
        .seh_endprologue
#endif

        callext F(abort)
0:      hlt
        jmp     0b

ENDFUNC

///--------------------------------------------------------------------------
/// Data tables.

        RODATA

        .align  16
endswap_tab:
        .byte    3,  2,  1,  0
        .byte    7,  6,  5,  4
        .byte   11, 10,  9,  8
        .byte   15, 14, 13, 12
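
        // Used with `pshufb', this table reverses the bytes within each
        // 32-bit word of an XMM register, i.e. it converts four words at a
        // time between big- and little-endian representations.  A rough
        // scalar C sketch of the effect on a single word (illustrative
        // only):
        //
        //      static uint32_t endswap32(uint32_t x)
        //      {
        //              return ((x & 0x000000ff) << 24 | (x & 0x0000ff00) << 8 |
        //                      (x & 0x00ff0000) >> 8 | (x & 0xff000000) >> 24);
        //      }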

///----- That's all, folks --------------------------------------------------