base/asm-common.h: Factor out `deposit fake literal pool' macro.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
1a0c09c4
MW
33 .globl F(abort)
34 .globl F(rijndael_rcon)
35
36///--------------------------------------------------------------------------
47103664
MW
37/// Local utilities.
38
39// Magic constants for shuffling.
// Each is a PSHUFD immediate: reading two-bit fields from the least
// significant end, field i names the source word copied to destination
// word i. ROTL and ROTR are used that way in the key-setup code below;
// ROT2 is not referenced in the code visible here.
// ROTL: destination words 0, 1, 2, 3 come from source words 3, 0, 1, 2.
40#define ROTL 0x93
// ROT2: destination words 0, 1, 2, 3 come from source words 2, 3, 0, 1.
41#define ROT2 0x4e
// ROTR: destination words 0, 1, 2, 3 come from source words 1, 2, 3, 0.
42#define ROTR 0x39
43
44///--------------------------------------------------------------------------
1a0c09c4
MW
45/// Main code.
46
47 .arch .aes
bc9ac7eb 48 .text
1a0c09c4
MW
49
50/// The AESNI instructions implement a little-endian version of AES, but
51/// Catacomb's internal interface presents as big-endian so as to work better
52/// with things like GCM. We therefore maintain the round keys in
53/// little-endian form, and have to end-swap blocks in and out.
54///
55/// For added amusement, the AESNI instructions don't implement the
56/// larger-block versions of Rijndael, so we have to end-swap the keys if
57/// we're preparing for one of those.
58
59 // Useful constants.
60 .equ maxrounds, 16 // maximum number of rounds
61 .equ maxblksz, 32 // maximum block size, in bytes
 // One block-sized key per round plus one more: 32*(16 + 1) = 544 bytes.
62 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
63
64 // Context structure.
 // With the values above, the offsets are nr = 0, w = 4 and wi = 548.
 // NOTE(review): these presumably mirror a C structure declared elsewhere
 // -- confirm the two stay in step.
65 .equ nr, 0 // number of rounds
66 .equ w, nr + 4 // encryption key words
67 .equ wi, w + kbufsz // decryption key words
68
69///--------------------------------------------------------------------------
70/// Key setup.
71
0f23f75f 72FUNC(rijndael_setup_x86ish_aesni)
 // Build the encryption and decryption key schedules in a context.
 //
 // The arguments, in order, are: the context base pointer; the block
 // size, in words; the address of the key material; and the key size,
 // in words. The number of rounds is read from the context's `nr'
 // slot and never written here, so it must already be in place.
 //
 // On exit, the expanded encryption keys are at offset `w' in the
 // context, and the decryption keys are at offset `wi': the same round
 // keys in reverse order, with all but the first and last passed
 // through AESIMC. If the block size isn't exactly four words then
 // both schedules are end-swapped, word by word.
1a0c09c4 73
0f23f75f
MW
74#if CPUFAM_X86
75 // Arguments are on the stack. We'll need to stack the caller's
76 // register variables, but we'll manage.
1a0c09c4 77
0f23f75f
MW
78# define CTX ebp // context pointer
79# define BLKSZ [esp + 24] // block size
80
81# define SI esi // source pointer
82# define DI edi // destination pointer
83
84# define KSZ ebx // key size
85# define KSZo ebx // ... as address offset
86# define NKW edx // total number of key words
87# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
88# define RCON ecx // round constants table
89# define LIM edx // limit pointer
90# define LIMn edx // ... as integer offset from base
91
92# define NR ecx // number of rounds
93# define LRK eax // distance to last key
94# define LRKo eax // ... as address offset
95# define BLKOFF edx // block size in bytes
96# define BLKOFFo edx // ... as address offset
97
98 // Stack the caller's registers.
1a0c09c4
MW
99 push ebp
100 push ebx
101 push esi
102 push edi
103
0f23f75f
MW
104 // Set up our own variables.
 // Four registers were just pushed, so the return address is at
 // [esp + 16] and the arguments start at [esp + 20].
105 mov CTX, [esp + 20] // context base pointer
106 mov SI, [esp + 28] // key material
107 mov KSZ, [esp + 32] // key size, in words
108#endif
109
110#if CPUFAM_AMD64 && ABI_SYSV
111 // Arguments are in registers. We have plenty, but, to be honest,
112 // the initial register allocation is a bit annoying.
113
114# define CTX r8 // context pointer
115# define BLKSZ r9d // block size
116
117# define SI rsi // source pointer
118# define DI rdi // destination pointer
119
120# define KSZ edx // key size
121# define KSZo rdx // ... as address offset
122# define NKW r10d // total number of key words
123# define RCON rdi // round constants table
124# define LIMn ecx // limit, as integer offset from base
125# define LIM rcx // ... as pointer
126
127# define NR ecx // number of rounds
128# define LRK eax // distance to last key
129# define LRKo rax // ... as address offset
130# define BLKOFF r9d // block size in bytes
131# define BLKOFFo r9 // ... as address offset
132
133 // Move arguments to more useful places.
 // The order matters: esi and rdx are each read before they're
 // overwritten by the following move.
134 mov CTX, rdi // context base pointer
135 mov BLKSZ, esi // block size in words
136 mov SI, rdx // key material
137 mov KSZ, ecx // key size, in words
138#endif
139
140#if CPUFAM_AMD64 && ABI_WIN
141 // Arguments are in different registers, and they're a little tight.
142
143# define CTX r8 // context pointer
144# define BLKSZ edx // block size
145
146# define SI rsi // source pointer
147# define DI rdi // destination pointer
148
149# define KSZ r9d // key size
150# define KSZo r9 // ... as address offset
151# define NKW r10d // total number of key words
152# define RCON rdi // round constants table
153# define LIMn ecx // limit, as integer offset from base
154# define LIM rcx // ... as pointer
155
156# define NR ecx // number of rounds
157# define LRK eax // distance to last key
158# define LRKo rax // ... as address offset
159# define BLKOFF edx // block size in bytes
160# define BLKOFFo rdx // ... as address offset
161
162 // We'll need the index registers, which belong to the caller in this
163 // ABI.
164 push rsi
165 push rdi
166
167 // Move arguments to more useful places.
 // The key pointer must leave r8 before r8 is reused as CTX.
168 mov SI, r8 // key material
169 mov CTX, rcx // context base pointer
170#endif
171
1a0c09c4
MW
172 // The initial round key material is taken directly from the input
173 // key, so copy it over.
0f23f75f
MW
174#if CPUFAM_AMD64 && ABI_SYSV
175 // We've been lucky. We already have a copy of the context pointer
176 // in rdi, and the key size in ecx.
 // NOTE(review): REP MOVSD counts in the whole of rcx here, so this
 // relies on the caller having left the upper half of rcx clear --
 // confirm against the C prototype and the ABI.
177 add DI, w
178#else
179 lea DI, [CTX + w]
180 mov ecx, KSZ
181#endif
1a0c09c4
MW
182 rep movsd
183
184 // Find out other useful things.
0f23f75f
MW
185 mov NKW, [CTX + nr] // number of rounds
186 add NKW, 1
187 imul NKW, BLKSZ // total key size in words
188#if !NKW_NEEDS_REFRESH
189 // If we can't keep NKW for later, then we use the same register for
190 // it and LIM, so this move is unnecessary.
191 mov LIMn, NKW
192#endif
193 sub LIMn, KSZ // offset by the key size
1a0c09c4
MW
194
195 // Find the round constants.
 // NOTE(review): on AMD64, LIMn lives in ecx at this point, so this
 // relies on `ldgot ecx' leaving ecx alone there -- confirm in
 // asm-common.h.
196 ldgot ecx
811a896f 197 leaext RCON, F(rijndael_rcon), ecx
1a0c09c4
MW
198
199 // Prepare for the main loop.
 // From here on, SI walks the key words already written; each new
 // word is stored KSZ words further on, at [SI + 4*KSZo].
0f23f75f
MW
200 lea SI, [CTX + w]
201 mov eax, [SI + 4*KSZo - 4] // most recent key word
202 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
1a0c09c4
MW
203
204 // Main key expansion loop. The first word of each key-length chunk
205 // needs special treatment.
206 //
207 // This is rather tedious because the Intel `AESKEYGENASSIST'
208 // instruction is very strangely shaped. Firstly, it wants to
209 // operate on vast SSE registers, even though we're data-blocked from
210 // doing more than one operation at a time unless we're doing two key
211 // schedules simultaneously -- and even then we can't do more than
212 // two, because the instruction ignores two of its input words
213 // entirely, and produces two different outputs for each of the other
214 // two. And secondly it insists on taking the magic round constant
215 // as an immediate, so it's kind of annoying if you're not
216 // open-coding the whole thing. It's much easier to leave that as
217 // zero and XOR in the round constant by hand.
 //
 // The shuffles here put the word into lane 3, where the instruction
 // produces the substituted and rotated word, and then bring that
 // result back down to lane 0 so that MOVD can fetch it.
2189: movd xmm0, eax
47103664 219 pshufd xmm0, xmm0, ROTR
1a0c09c4 220 aeskeygenassist xmm1, xmm0, 0
47103664 221 pshufd xmm1, xmm1, ROTL
1a0c09c4 222 movd eax, xmm1
0f23f75f
MW
223 xor eax, [SI]
224 xor al, [RCON]
225 inc RCON
226 mov [SI + 4*KSZo], eax
227 add SI, 4
228 cmp SI, LIM
1a0c09c4
MW
229 jae 8f
230
231 // The next three words are simple...
0f23f75f
MW
232 xor eax, [SI]
233 mov [SI + 4*KSZo], eax
234 add SI, 4
235 cmp SI, LIM
1a0c09c4
MW
236 jae 8f
237
238 // (Word 2...)
0f23f75f
MW
239 xor eax, [SI]
240 mov [SI + 4*KSZo], eax
241 add SI, 4
242 cmp SI, LIM
1a0c09c4
MW
243 jae 8f
244
245 // (Word 3...)
0f23f75f
MW
246 xor eax, [SI]
247 mov [SI + 4*KSZo], eax
248 add SI, 4
249 cmp SI, LIM
1a0c09c4
MW
250 jae 8f
251
252 // Word 4. If the key is /more/ than 6 words long, then we must
253 // apply a substitution here.
 // This time the word goes into lane 1, and lane 0 of the result is
 // the substituted word without the rotation. A four-word key has no
 // word 4 at all, so it starts the next chunk instead.
0f23f75f 254 cmp KSZ, 5
1a0c09c4 255 jb 9b
0f23f75f 256 cmp KSZ, 7
1a0c09c4
MW
257 jb 0f
258 movd xmm0, eax
47103664 259 pshufd xmm0, xmm0, ROTL
1a0c09c4
MW
260 aeskeygenassist xmm1, xmm0, 0
261 movd eax, xmm1
0f23f75f
MW
2620: xor eax, [SI]
263 mov [SI + 4*KSZo], eax
264 add SI, 4
265 cmp SI, LIM
1a0c09c4
MW
266 jae 8f
267
268 // (Word 5...)
0f23f75f 269 cmp KSZ, 6
1a0c09c4 270 jb 9b
0f23f75f
MW
271 xor eax, [SI]
272 mov [SI + 4*KSZo], eax
273 add SI, 4
274 cmp SI, LIM
1a0c09c4
MW
275 jae 8f
276
277 // (Word 6...)
0f23f75f 278 cmp KSZ, 7
1a0c09c4 279 jb 9b
0f23f75f
MW
280 xor eax, [SI]
281 mov [SI + 4*KSZo], eax
282 add SI, 4
283 cmp SI, LIM
1a0c09c4
MW
284 jae 8f
285
286 // (Word 7...)
0f23f75f 287 cmp KSZ, 8
1a0c09c4 288 jb 9b
0f23f75f
MW
289 xor eax, [SI]
290 mov [SI + 4*KSZo], eax
291 add SI, 4
292 cmp SI, LIM
1a0c09c4
MW
293 jae 8f
294
295 // Must be done by now.
296 jmp 9b
297
298 // Next job is to construct the decryption keys. The keys for the
299 // first and last rounds don't need to be mangled, but the remaining
300 // ones do -- and they all need to be reordered too.
301 //
302 // The plan of action, then, is to copy the final encryption round's
303 // keys into place first, then to do each of the intermediate rounds
304 // in reverse order, and finally do the first round.
305 //
306 // Do all of the heavy lifting with SSE registers. The order we're
307 // doing this in means that it's OK if we read or write too much, and
308 // there's easily enough buffer space for the over-enthusiastic reads
309 // and writes because the context has space for 32-byte blocks, which
310 // is our maximum and an exact fit for two SSE registers.
0f23f75f
MW
3118: mov NR, [CTX + nr] // number of rounds
312#if NKW_NEEDS_REFRESH
313 mov BLKOFF, BLKSZ
314 mov LRK, NR
315 imul LRK, BLKOFF
316#else
317 // If we retain NKW, then BLKSZ and BLKOFF are the same register
318 // because we won't need the former again.
319 mov LRK, NKW
320 sub LRK, BLKSZ
321#endif
 // Either way, LRK is now the number of rounds times the block size,
 // in words: the offset of the last round's keys within the schedule.
322 lea DI, [CTX + wi]
323 lea SI, [CTX + w + 4*LRKo] // last round's keys
324 shl BLKOFF, 2 // block size (in bytes now)
1a0c09c4
MW
325
326 // Copy the last encryption round's keys.
0f23f75f
MW
327 movdqu xmm0, [SI]
328 movdqu [DI], xmm0
329 cmp BLKOFF, 16
1a0c09c4 330 jbe 9f
0f23f75f
MW
331 movdqu xmm0, [SI + 16]
332 movdqu [DI + 16], xmm0
1a0c09c4
MW
333
334 // Update the loop variables and stop if we've finished.
 // DI advances through the decryption keys while SI retreats through
 // the encryption keys; NR counts the rounds still to be copied.
0f23f75f
MW
3359: add DI, BLKOFFo
336 sub SI, BLKOFFo
337 sub NR, 1
1a0c09c4
MW
338 jbe 0f
339
340 // Do another middle round's keys...
0f23f75f 341 movdqu xmm0, [SI]
1a0c09c4 342 aesimc xmm0, xmm0
0f23f75f
MW
343 movdqu [DI], xmm0
344 cmp BLKOFF, 16
1a0c09c4 345 jbe 9b
0f23f75f 346 movdqu xmm0, [SI + 16]
1a0c09c4 347 aesimc xmm0, xmm0
0f23f75f 348 movdqu [DI + 16], xmm0
1a0c09c4
MW
349 jmp 9b
350
351 // Finally do the first encryption round.
0f23f75f
MW
3520: movdqu xmm0, [SI]
353 movdqu [DI], xmm0
354 cmp BLKOFF, 16
1a0c09c4 355 jbe 0f
0f23f75f
MW
356 movdqu xmm0, [SI + 16]
357 movdqu [DI + 16], xmm0
1a0c09c4
MW
358
359 // If the block size is not exactly four words then we must end-swap
360 // everything. We can use fancy SSE toys for this.
0f23f75f 3610: cmp BLKOFF, 16
1a0c09c4
MW
362 je 0f
363
364 // Find the byte-reordering table.
365 ldgot ecx
8d6ca554 366 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 367
0f23f75f 368#if NKW_NEEDS_REFRESH
1a0c09c4
MW
369 // Calculate the number of subkey words again. (It's a good job
370 // we've got a fast multiplier.)
0f23f75f
MW
371 mov NKW, [CTX + nr]
372 add NKW, 1
373 imul NKW, BLKSZ
374#endif
1a0c09c4
MW
375
376 // End-swap the encryption keys.
0f23f75f
MW
377 mov ecx, NKW
378 lea SI, [CTX + w]
1a0c09c4
MW
379 call endswap_block
380
381 // And the decryption keys.
 // The helper uses up ecx and SI, so both are set up afresh.
0f23f75f
MW
382 mov ecx, NKW
383 lea SI, [CTX + wi]
1a0c09c4
MW
384 call endswap_block
385
0f23f75f
MW
3860: // All done.
387#if CPUFAM_X86
388 pop edi
1a0c09c4
MW
389 pop esi
390 pop ebx
391 pop ebp
0f23f75f
MW
392#endif
393#if CPUFAM_AMD64 && ABI_WIN
394 pop rdi
395 pop rsi
396#endif
1a0c09c4
MW
397 ret
398
399 .align 16
400endswap_block:
0f23f75f 401 // End-swap ECX words starting at SI. The end-swapping table is
8d6ca554 402 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
 // Each pass handles four words and loops while the count is still
 // positive, so a count which isn't a multiple of four is rounded up.
 // Uses XMM1 as scratch, and leaves SI and ECX changed.
0f23f75f 403 movdqu xmm1, [SI]
8d6ca554 404 pshufb xmm1, xmm5
0f23f75f
MW
405 movdqu [SI], xmm1
406 add SI, 16
1a0c09c4
MW
407 sub ecx, 4
408 ja endswap_block
409 ret
410
 // NOTE(review): NKW and NKW_NEEDS_REFRESH are defined above but are
 // not in the list below -- confirm whether that's intentional.
0f23f75f
MW
411#undef CTX
412#undef BLKSZ
413#undef SI
414#undef DI
415#undef KSZ
416#undef KSZo
417#undef RCON
418#undef LIMn
419#undef LIM
420#undef NR
421#undef LRK
422#undef LRKo
423#undef BLKOFF
424#undef BLKOFFo
425
1a0c09c4
MW
426ENDFUNC
427
428///--------------------------------------------------------------------------
429/// Encrypting and decrypting blocks.
430
8a1aa284
MW
431.macro encdec op, aes, koff
 // Define the function `rijndael_\op\()_x86ish_aesni', which processes
 // a single 16-byte block.
 //
 // Macro arguments: `op' is the piece of the function name; `aes' is
 // the instruction to use for every round but the last, with
 // `\aes\()last' used for the final round; and `koff' is the offset
 // within the context of the key schedule to use.
 //
 // The function's arguments are, in order: the context, the address
 // of the source block, and the address of the destination block. The
 // context's round count must be between 10 and 14 inclusive;
 // anything else is sent to `bogus'.
432 FUNC(rijndael_\op\()_x86ish_aesni)
1a0c09c4
MW
433
434 // Find the magic endianness-swapping table.
 // On x86, ecx is reused as K below, but only after XMM5 has been
 // loaded.
435 ldgot ecx
8d6ca554 436 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 437
0f23f75f
MW
438#if CPUFAM_X86
439 // Arguments come in on the stack, and need to be collected. We
440 // don't have a shortage of registers.
441
442# define K ecx
443# define SRC edx
 // DST shares edx with SRC: it isn't loaded until the source block has
 // been read.
444# define DST edx
445# define NR eax
446
447 mov K, [esp + 4]
448 mov SRC, [esp + 8]
449#endif
450
451#if CPUFAM_AMD64 && ABI_SYSV
452 // Arguments come in registers. All is good.
453
454# define K rdi
455# define SRC rsi
456# define DST rdx
457# define NR eax
458#endif
459
460#if CPUFAM_AMD64 && ABI_WIN
461 // Arguments come in different registers.
462
463# define K rcx
464# define SRC rdx
465# define DST r8
466# define NR eax
467#endif
468
469 // Initial setup.
 // Load the block and end-swap each of its words; fetch the round
 // count; and point K at the chosen key schedule.
470 movdqu xmm0, [SRC]
8d6ca554 471 pshufb xmm0, xmm5
0f23f75f
MW
472 mov NR, [K + nr]
473 add K, \koff
1a0c09c4
MW
474
475 // Initial whitening.
0f23f75f
MW
476 movdqu xmm1, [K]
477 add K, 16
1a0c09c4
MW
478 pxor xmm0, xmm1
479
480 // Dispatch to the correct code.
 // The round code below is one long fall-through chain: entering at
 // label N performs N rounds in total.
0f23f75f 481 cmp NR, 10
e297526c 482 je 10f
1a0c09c4 483 jb bogus
0f23f75f 484 cmp NR, 14
e297526c 485 je 14f
1a0c09c4 486 ja bogus
0f23f75f 487 cmp NR, 12
e297526c
MW
488 je 12f
489 jb 11f
490 jmp 13f
1a0c09c4
MW
491
492 .align 2
493
494 // 14 rounds...
0f23f75f
MW
49514: movdqu xmm1, [K]
496 add K, 16
e297526c 497 \aes xmm0, xmm1
1a0c09c4
MW
498
499 // 13 rounds...
0f23f75f
MW
50013: movdqu xmm1, [K]
501 add K, 16
e297526c 502 \aes xmm0, xmm1
1a0c09c4
MW
503
504 // 12 rounds...
0f23f75f
MW
50512: movdqu xmm1, [K]
506 add K, 16
e297526c 507 \aes xmm0, xmm1
1a0c09c4
MW
508
509 // 11 rounds...
0f23f75f
MW
51011: movdqu xmm1, [K]
511 add K, 16
e297526c 512 \aes xmm0, xmm1
1a0c09c4
MW
513
514 // 10 rounds...
 // From here on K stays put, and the remaining round keys are
 // addressed at fixed offsets from it.
0f23f75f 51510: movdqu xmm1, [K]
e297526c 516 \aes xmm0, xmm1
1a0c09c4
MW
517
518 // 9 rounds...
0f23f75f 519 movdqu xmm1, [K + 16]
e297526c 520 \aes xmm0, xmm1
1a0c09c4
MW
521
522 // 8 rounds...
0f23f75f 523 movdqu xmm1, [K + 32]
e297526c 524 \aes xmm0, xmm1
1a0c09c4
MW
525
526 // 7 rounds...
0f23f75f 527 movdqu xmm1, [K + 48]
e297526c 528 \aes xmm0, xmm1
1a0c09c4
MW
529
530 // 6 rounds...
0f23f75f 531 movdqu xmm1, [K + 64]
e297526c 532 \aes xmm0, xmm1
1a0c09c4
MW
533
534 // 5 rounds...
0f23f75f 535 movdqu xmm1, [K + 80]
e297526c 536 \aes xmm0, xmm1
1a0c09c4
MW
537
538 // 4 rounds...
0f23f75f 539 movdqu xmm1, [K + 96]
e297526c 540 \aes xmm0, xmm1
1a0c09c4
MW
541
542 // 3 rounds...
0f23f75f 543 movdqu xmm1, [K + 112]
e297526c 544 \aes xmm0, xmm1
1a0c09c4
MW
545
546 // 2 rounds...
0f23f75f 547 movdqu xmm1, [K + 128]
e297526c 548 \aes xmm0, xmm1
1a0c09c4
MW
549
550 // Final round...
0f23f75f 551 movdqu xmm1, [K + 144]
e297526c 552 \aes\()last xmm0, xmm1
1a0c09c4
MW
553
554 // Unpermute the ciphertext block and store it.
8d6ca554 555 pshufb xmm0, xmm5
0f23f75f
MW
556#if CPUFAM_X86
557 mov DST, [esp + 12]
558#endif
559 movdqu [DST], xmm0
1a0c09c4
MW
560
561 // And we're done.
562 ret
563
0f23f75f
MW
564#undef K
565#undef SRC
566#undef DST
567#undef NR
568
8a1aa284
MW
569 ENDFUNC
570.endm
1a0c09c4 571
e297526c
MW
 // Instantiate the two functions: `eblk' uses AESENC with the keys at
 // `w', and `dblk' uses AESDEC with the keys at `wi'.
572 encdec eblk, aesenc, w
573 encdec dblk, aesdec, wi
1a0c09c4
MW
574
575///--------------------------------------------------------------------------
576/// Random utilities.
577
578 .align 16
579 // Abort the process because of a programming error. Indirecting
580 // through this point serves several purposes: (a) by CALLing, rather
581 // than branching to, `abort', we can save the return address, which
582 // might at least provide a hint as to what went wrong; (b) we don't
583 // have conditional CALLs (and they'd be big anyway); and (c) we can
584 // write a HLT here as a backstop against `abort' being mad.
 //
 // The block functions branch here when the context's round count is
 // outside the range they handle.
585bogus: callext F(abort)
 // Only reached if `abort' returns: halt, and keep on halting.
5860: hlt
587 jmp 0b
588
 // NOTE(review): `gotaux' isn't defined in this file; presumably it
 // emits whatever support code the `ldgot ecx' invocations above need
 // -- confirm in asm-common.h.
589 gotaux ecx
590
591///--------------------------------------------------------------------------
592/// Data tables.
593
594 .align 16
595endswap_tab:
 // PSHUFB control mask: output byte i is taken from the input byte
 // whose index is entry i of this table, so this reverses the order
 // of the bytes within each 32-bit word. It's fetched with MOVDQA,
 // which is why it's aligned to 16 bytes.
596 .byte 3, 2, 1, 0
597 .byte 7, 6, 5, 4
598 .byte 11, 10, 9, 8
599 .byte 15, 14, 13, 12
600
601///----- That's all, folks --------------------------------------------------