symm/rijndael-x86ish-aesni.S (rijndael_setup_x86ish_aesni): Label numbering.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
1a0c09c4
MW
33 .globl F(abort)
34 .globl F(rijndael_rcon)
35
36///--------------------------------------------------------------------------
47103664
MW
37/// Local utilities.
38
39// Magic constants for shuffling.
//
// These are PSHUFD immediates which rotate the four 32-bit words of an
// SSE register.
40#define ROTL 0x93 // each word moves up one place; word 3 wraps to 0
41#define ROT2 0x4e // swap the low and high pairs of words
42#define ROTR 0x39 // each word moves down one place; word 0 wraps to 3
43
44///--------------------------------------------------------------------------
1a0c09c4
MW
45/// Main code.
46
47 .arch .aes
bc9ac7eb 48 .text
1a0c09c4
MW
49
50/// The AESNI instructions implement a little-endian version of AES, but
51/// Catacomb's internal interface presents as big-endian so as to work better
52/// with things like GCM. We therefore maintain the round keys in
53/// little-endian form, and have to end-swap blocks in and out.
54///
55/// For added amusement, the AESNI instructions don't implement the
56/// larger-block versions of Rijndael, so we have to end-swap the keys if
57/// we're preparing for one of those.
58
59 // Useful constants.
60 .equ maxrounds, 16 // maximum number of rounds
61 .equ maxblksz, 32 // maximum block size, in bytes
62 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes
63
64 // Context structure (byte offsets from the context base pointer).
65 .equ nr, 0 // number of rounds (a 32-bit word)
66 .equ w, nr + 4 // encryption key words
67 .equ wi, w + kbufsz // decryption key words
68
69///--------------------------------------------------------------------------
70/// Key setup.
71
0f23f75f 72FUNC(rijndael_setup_x86ish_aesni)
// Build the key schedules for a Rijndael context.
//
// Arguments, in order (see the per-ABI sections below for where each one
// arrives): the context base pointer; the block size, in words; a pointer
// to the key material; and the key size, in words.
//
// The number of rounds is read from the context at offset `nr' and is
// not written here, so it must already be in place on entry. The
// encryption round keys are written at offset `w' and the decryption
// round keys at offset `wi'. If the block size is not exactly four
// words, both schedules are end-swapped before returning.
1a0c09c4 73
0f23f75f
MW
74#if CPUFAM_X86
75 // Arguments are on the stack. We'll need to stack the caller's
76 // register variables, but we'll manage.
1a0c09c4 77
0f23f75f
MW
78# define CTX ebp // context pointer
79# define BLKSZ [esp + 24] // block size
80
81# define SI esi // source pointer
82# define DI edi // destination pointer
83
84# define KSZ ebx // key size
85# define KSZo ebx // ... as address offset
86# define NKW edx // total number of key words
87# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
88# define RCON ecx // round constants table
89# define LIM edx // limit pointer
90# define LIMn edx // ... as integer offset from base
91
92# define NR ecx // number of rounds
93# define LRK eax // distance to last key
94# define LRKo eax // ... as address offset
95# define BLKOFF edx // block size in bytes
96# define BLKOFFo edx // ... as address offset
97
98 // Stack the caller's registers.
1a0c09c4
MW
99 push ebp
100 push ebx
101 push esi
102 push edi
103
0f23f75f
MW
104 // Set up our own variables.
105 mov CTX, [esp + 20] // context base pointer
106 mov SI, [esp + 28] // key material
107 mov KSZ, [esp + 32] // key size, in words
108#endif
109
110#if CPUFAM_AMD64 && ABI_SYSV
111 // Arguments are in registers. We have plenty, but, to be honest,
112 // the initial register allocation is a bit annoying.
113
114# define CTX r8 // context pointer
115# define BLKSZ r9d // block size
116
117# define SI rsi // source pointer
118# define DI rdi // destination pointer
119
120# define KSZ edx // key size
121# define KSZo rdx // ... as address offset
122# define NKW r10d // total number of key words
123# define RCON rdi // round constants table
124# define LIMn ecx // limit, as integer offset from base
125# define LIM rcx // limit pointer
126
127# define NR ecx // number of rounds
128# define LRK eax // distance to last key
129# define LRKo rax // ... as address offset
130# define BLKOFF r9d // block size in bytes
131# define BLKOFFo r9 // ... as address offset
132
133 // Move arguments to more useful places.
134 mov CTX, rdi // context base pointer
135 mov BLKSZ, esi // block size in words
136 mov SI, rdx // key material
137 mov KSZ, ecx // key size, in words
138#endif
139
140#if CPUFAM_AMD64 && ABI_WIN
141 // Arguments are in different registers, and they're a little tight.
142
143# define CTX r8 // context pointer
144# define BLKSZ edx // block size
145
146# define SI rsi // source pointer
147# define DI rdi // destination pointer
148
149# define KSZ r9d // key size
150# define KSZo r9 // ... as address offset
151# define NKW r10d // total number of key words
152# define RCON rdi // round constants table
153# define LIMn ecx // limit, as integer offset from base
154# define LIM rcx // limit pointer
155
156# define NR ecx // number of rounds
157# define LRK eax // distance to last key
158# define LRKo rax // ... as address offset
159# define BLKOFF edx // block size in bytes
160# define BLKOFFo rdx // ... as address offset
161
162 // We'll need the index registers, which belong to the caller in this
163 // ABI.
164 push rsi
165 push rdi
166
167 // Move arguments to more useful places.
168 mov SI, r8 // key material
169 mov CTX, rcx // context base pointer
170#endif
171
1a0c09c4
MW
172 // The initial round key material is taken directly from the input
173 // key, so copy it over.
0f23f75f
MW
174#if CPUFAM_AMD64 && ABI_SYSV
175 // We've been lucky. We already have a copy of the context pointer
176 // in rdi, and the key size in ecx.
177 add DI, w
178#else
179 lea DI, [CTX + w]
180 mov ecx, KSZ
181#endif
1a0c09c4
MW
182 rep movsd // copy ecx words from [SI] to [DI]
183
184 // Find out other useful things.
0f23f75f
MW
185 mov NKW, [CTX + nr] // number of rounds
186 add NKW, 1
187 imul NKW, BLKSZ // total key size in words
188#if !NKW_NEEDS_REFRESH
189 // If we can't keep NKW for later, then we use the same register for
190 // it and LIM, so this move is unnecessary.
191 mov LIMn, NKW
192#endif
193 sub LIMn, KSZ // offset by the key size
1a0c09c4
MW
194
195 // Find the round constants.
196 ldgot ecx
811a896f 197 leaext RCON, F(rijndael_rcon), ecx
1a0c09c4
MW
198
199 // Prepare for the main loop.
0f23f75f
MW
200 lea SI, [CTX + w]
201 mov eax, [SI + 4*KSZo - 4] // most recent key word
202 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
1a0c09c4
MW
203
204 // Main key expansion loop. The first word of each key-length chunk
205 // needs special treatment.
206 //
207 // This is rather tedious because the Intel `AESKEYGENASSIST'
208 // instruction is very strangely shaped. Firstly, it wants to
209 // operate on vast SSE registers, even though we're data-blocked from
210 // doing more than one operation at a time unless we're doing two key
211 // schedules simultaneously -- and even then we can't do more than
212 // two, because the instruction ignores two of its input words
213 // entirely, and produces two different outputs for each of the other
214 // two. And secondly it insists on taking the magic round constant
215 // as an immediate, so it's kind of annoying if you're not
216 // open-coding the whole thing. It's much easier to leave that as
217 // zero and XOR in the round constant by hand.
//
// Throughout the loop, eax holds the most recently generated key
// word, SI points at the word one key-length behind the one being
// written, and we leave for label 9 as soon as SI reaches LIM.
89b34050 2180: movd xmm0, eax
47103664 219 pshufd xmm0, xmm0, ROTR
1a0c09c4 220 aeskeygenassist xmm1, xmm0, 0
47103664 221 pshufd xmm1, xmm1, ROTL
1a0c09c4 222 movd eax, xmm1
0f23f75f
MW
223 xor eax, [SI]
224 xor al, [RCON] // mix in the round constant, a single byte
225 inc RCON
226 mov [SI + 4*KSZo], eax
227 add SI, 4
228 cmp SI, LIM
89b34050 229 jae 9f
1a0c09c4
MW
230
231 // The next three words are simple...
0f23f75f
MW
232 xor eax, [SI]
233 mov [SI + 4*KSZo], eax
234 add SI, 4
235 cmp SI, LIM
89b34050 236 jae 9f
1a0c09c4
MW
237
238 // (Word 2...)
0f23f75f
MW
239 xor eax, [SI]
240 mov [SI + 4*KSZo], eax
241 add SI, 4
242 cmp SI, LIM
89b34050 243 jae 9f
1a0c09c4
MW
244
245 // (Word 3...)
0f23f75f
MW
246 xor eax, [SI]
247 mov [SI + 4*KSZo], eax
248 add SI, 4
249 cmp SI, LIM
89b34050 250 jae 9f
1a0c09c4
MW
251
252 // Word 4. If the key is /more/ than 6 words long, then we must
253 // apply a substitution here.
0f23f75f 254 cmp KSZ, 5
89b34050 255 jb 0b // four-word key: this chunk is complete
0f23f75f 256 cmp KSZ, 7
89b34050 257 jb 1f // five or six words: no substitution
1a0c09c4 258 movd xmm0, eax
47103664 259 pshufd xmm0, xmm0, ROTL
1a0c09c4
MW
260 aeskeygenassist xmm1, xmm0, 0
261 movd eax, xmm1
89b34050 2621: xor eax, [SI]
0f23f75f
MW
263 mov [SI + 4*KSZo], eax
264 add SI, 4
265 cmp SI, LIM
89b34050 266 jae 9f
1a0c09c4
MW
267
268 // (Word 5...)
0f23f75f 269 cmp KSZ, 6
89b34050 270 jb 0b
0f23f75f
MW
271 xor eax, [SI]
272 mov [SI + 4*KSZo], eax
273 add SI, 4
274 cmp SI, LIM
89b34050 275 jae 9f
1a0c09c4
MW
276
277 // (Word 6...)
0f23f75f 278 cmp KSZ, 7
89b34050 279 jb 0b
0f23f75f
MW
280 xor eax, [SI]
281 mov [SI + 4*KSZo], eax
282 add SI, 4
283 cmp SI, LIM
89b34050 284 jae 9f
1a0c09c4
MW
285
286 // (Word 7...)
0f23f75f 287 cmp KSZ, 8
89b34050 288 jb 0b
0f23f75f
MW
289 xor eax, [SI]
290 mov [SI + 4*KSZo], eax
291 add SI, 4
292 cmp SI, LIM
89b34050 293 jae 9f
1a0c09c4
MW
294
295 // Must be done by now.
89b34050 296 jmp 0b
1a0c09c4
MW
297
298 // Next job is to construct the decryption keys. The keys for the
299 // first and last rounds don't need to be mangled, but the remaining
300 // ones do -- and they all need to be reordered too.
301 //
302 // The plan of action, then, is to copy the final encryption round's
303 // keys into place first, then to do each of the intermediate rounds
304 // in reverse order, and finally do the first round.
305 //
306 // Do all of the heavy lifting with SSE registers. The order we're
307 // doing this in means that it's OK if we read or write too much, and
308 // there's easily enough buffer space for the over-enthusiastic reads
309 // and writes because the context has space for 32-byte blocks, which
310 // is our maximum and an exact fit for two SSE registers.
89b34050 3119: mov NR, [CTX + nr] // number of rounds
0f23f75f
MW
312#if NKW_NEEDS_REFRESH
313 mov BLKOFF, BLKSZ
314 mov LRK, NR
315 imul LRK, BLKOFF
316#else
317 // If we retain NKW, then BLKSZ and BLKOFF are the same register
318 // because we won't need the former again.
319 mov LRK, NKW
320 sub LRK, BLKSZ
321#endif
322 lea DI, [CTX + wi]
323 lea SI, [CTX + w + 4*LRKo] // last round's keys
324 shl BLKOFF, 2 // block size (in bytes now)
1a0c09c4
MW
325
326 // Copy the last encryption round's keys.
0f23f75f
MW
327 movdqu xmm0, [SI]
328 movdqu [DI], xmm0
329 cmp BLKOFF, 16
89b34050 330 jbe 0f
0f23f75f
MW
331 movdqu xmm0, [SI + 16]
332 movdqu [DI + 16], xmm0
1a0c09c4
MW
333
334 // Update the loop variables and stop if we've finished.
89b34050 3350: add DI, BLKOFFo
0f23f75f
MW
336 sub SI, BLKOFFo
337 sub NR, 1
89b34050 338 jbe 9f
1a0c09c4
MW
339
340 // Do another middle round's keys...
0f23f75f 341 movdqu xmm0, [SI]
1a0c09c4 342 aesimc xmm0, xmm0
0f23f75f
MW
343 movdqu [DI], xmm0
344 cmp BLKOFF, 16
89b34050 345 jbe 0b
0f23f75f 346 movdqu xmm0, [SI + 16]
1a0c09c4 347 aesimc xmm0, xmm0
0f23f75f 348 movdqu [DI + 16], xmm0
89b34050 349 jmp 0b
1a0c09c4
MW
350
351 // Finally do the first encryption round.
89b34050 3529: movdqu xmm0, [SI]
0f23f75f
MW
353 movdqu [DI], xmm0
354 cmp BLKOFF, 16
89b34050 355 jbe 1f
0f23f75f
MW
356 movdqu xmm0, [SI + 16]
357 movdqu [DI + 16], xmm0
1a0c09c4
MW
358
359 // If the block size is not exactly four words then we must end-swap
360 // everything. We can use fancy SSE toys for this.
89b34050
MW
3611: cmp BLKOFF, 16
362 je 9f
1a0c09c4
MW
363
364 // Find the byte-reordering table.
365 ldgot ecx
8d6ca554 366 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 367
0f23f75f 368#if NKW_NEEDS_REFRESH
1a0c09c4
MW
369 // Calculate the number of subkey words again. (It's a good job
370 // we've got a fast multiplier.)
0f23f75f
MW
371 mov NKW, [CTX + nr]
372 add NKW, 1
373 imul NKW, BLKSZ
374#endif
1a0c09c4
MW
375
376 // End-swap the encryption keys.
0f23f75f 377 lea SI, [CTX + w]
1a0c09c4
MW
378 call endswap_block
379
380 // And the decryption keys.
0f23f75f 381 lea SI, [CTX + wi]
1a0c09c4
MW
382 call endswap_block
383
89b34050 3849: // All done.
0f23f75f
MW
385#if CPUFAM_X86
386 pop edi
1a0c09c4
MW
387 pop esi
388 pop ebx
389 pop ebp
0f23f75f
MW
390#endif
391#if CPUFAM_AMD64 && ABI_WIN
392 pop rdi
393 pop rsi
394#endif
1a0c09c4
MW
395 ret
396
397 .align 16
398endswap_block:
1a384903 399 // End-swap NKW words starting at SI. The end-swapping table is
8d6ca554 400 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
//
// Clobbers ecx and xmm1, and leaves SI pointing just past the last
// chunk processed. Because whole 16-byte chunks are handled, up to
// three words beyond the NKW requested may be swapped too.
1a384903
MW
401 mov ecx, NKW // ecx = words still to do
4020: movdqu xmm1, [SI]
8d6ca554 403 pshufb xmm1, xmm5
0f23f75f
MW
404 movdqu [SI], xmm1
405 add SI, 16
1a0c09c4 406 sub ecx, 4
1a384903 407 ja 0b // loop while words remain (unsigned)
1a0c09c4
MW
408 ret
409
// NOTE(review): NKW and NKW_NEEDS_REFRESH are not undefined below, so
// they remain defined for the rest of the file.
0f23f75f
MW
410#undef CTX
411#undef BLKSZ
412#undef SI
413#undef DI
414#undef KSZ
415#undef KSZo
416#undef RCON
417#undef LIMn
418#undef LIM
419#undef NR
420#undef LRK
421#undef LRKo
422#undef BLKOFF
423#undef BLKOFFo
424
1a0c09c4
MW
425ENDFUNC
426
427///--------------------------------------------------------------------------
428/// Encrypting and decrypting blocks.
429
8a1aa284
MW
430.macro encdec op, aes, koff
// Define the function rijndael_OP_x86ish_aesni, which processes a single
// 16-byte block. AES is the mnemonic of the round instruction; the
// final round uses the same mnemonic with `last' appended. KOFF is the
// offset within the context of the round keys to use.
//
// The function's arguments are, in order: the context pointer, the
// source block pointer, and the destination block pointer. A round
// count outside the range 10--14 is treated as a programming error and
// sent to `bogus'.
431 FUNC(rijndael_\op\()_x86ish_aesni)
1a0c09c4
MW
432
433 // Find the magic endianness-swapping table.
434 ldgot ecx
8d6ca554 435 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 436
0f23f75f
MW
437#if CPUFAM_X86
438 // Arguments come in on the stack, and need to be collected. We
439 // don't have a shortage of registers.
440
441# define K ecx
442# define SRC edx
443# define DST edx
444# define NR eax
445
446 mov K, [esp + 4]
447 mov SRC, [esp + 8]
448#endif
449
450#if CPUFAM_AMD64 && ABI_SYSV
451 // Arguments come in registers. All is good.
452
453# define K rdi
454# define SRC rsi
455# define DST rdx
456# define NR eax
457#endif
458
459#if CPUFAM_AMD64 && ABI_WIN
460 // Arguments come in different registers.
461
462# define K rcx
463# define SRC rdx
464# define DST r8
465# define NR eax
466#endif
467
468 // Initial setup.
469 movdqu xmm0, [SRC]
8d6ca554 470 pshufb xmm0, xmm5
0f23f75f
MW
471 mov NR, [K + nr]
472 add K, \koff // K now points at the first round key
1a0c09c4
MW
473
474 // Initial whitening.
0f23f75f
MW
475 movdqu xmm1, [K]
476 add K, 16
1a0c09c4
MW
477 pxor xmm0, xmm1
478
479 // Dispatch to the correct code.
0f23f75f 480 cmp NR, 10
e297526c 481 je 10f
1a0c09c4 482 jb bogus
0f23f75f 483 cmp NR, 14
e297526c 484 je 14f
1a0c09c4 485 ja bogus
0f23f75f 486 cmp NR, 12
e297526c
MW
487 je 12f
488 jb 11f
489 jmp 13f
1a0c09c4
MW
490
491 .align 2
492
493 // 14 rounds...
0f23f75f
MW
49414: movdqu xmm1, [K]
495 add K, 16
e297526c 496 \aes xmm0, xmm1
1a0c09c4
MW
497
498 // 13 rounds...
0f23f75f
MW
49913: movdqu xmm1, [K]
500 add K, 16
e297526c 501 \aes xmm0, xmm1
1a0c09c4
MW
502
503 // 12 rounds...
0f23f75f
MW
50412: movdqu xmm1, [K]
505 add K, 16
e297526c 506 \aes xmm0, xmm1
1a0c09c4
MW
507
508 // 11 rounds...
0f23f75f
MW
50911: movdqu xmm1, [K]
510 add K, 16
e297526c 511 \aes xmm0, xmm1
1a0c09c4
MW
512
513 // 10 rounds...
// From here on K is no longer advanced: every entry point has
// converged, so the remaining keys sit at fixed offsets from K.
0f23f75f 51410: movdqu xmm1, [K]
e297526c 515 \aes xmm0, xmm1
1a0c09c4
MW
516
517 // 9 rounds...
0f23f75f 518 movdqu xmm1, [K + 16]
e297526c 519 \aes xmm0, xmm1
1a0c09c4
MW
520
521 // 8 rounds...
0f23f75f 522 movdqu xmm1, [K + 32]
e297526c 523 \aes xmm0, xmm1
1a0c09c4
MW
524
525 // 7 rounds...
0f23f75f 526 movdqu xmm1, [K + 48]
e297526c 527 \aes xmm0, xmm1
1a0c09c4
MW
528
529 // 6 rounds...
0f23f75f 530 movdqu xmm1, [K + 64]
e297526c 531 \aes xmm0, xmm1
1a0c09c4
MW
532
533 // 5 rounds...
0f23f75f 534 movdqu xmm1, [K + 80]
e297526c 535 \aes xmm0, xmm1
1a0c09c4
MW
536
537 // 4 rounds...
0f23f75f 538 movdqu xmm1, [K + 96]
e297526c 539 \aes xmm0, xmm1
1a0c09c4
MW
540
541 // 3 rounds...
0f23f75f 542 movdqu xmm1, [K + 112]
e297526c 543 \aes xmm0, xmm1
1a0c09c4
MW
544
545 // 2 rounds...
0f23f75f 546 movdqu xmm1, [K + 128]
e297526c 547 \aes xmm0, xmm1
1a0c09c4
MW
548
549 // Final round...
0f23f75f 550 movdqu xmm1, [K + 144]
e297526c 551 \aes\()last xmm0, xmm1
1a0c09c4
MW
552
553 // Unpermute the ciphertext block and store it.
8d6ca554 554 pshufb xmm0, xmm5
0f23f75f
MW
555#if CPUFAM_X86
556 mov DST, [esp + 12] // DST shares a register with SRC, so fetch it only now
557#endif
558 movdqu [DST], xmm0
1a0c09c4
MW
559
560 // And we're done.
561 ret
562
0f23f75f
MW
563#undef K
564#undef SRC
565#undef DST
566#undef NR
567
8a1aa284
MW
568 ENDFUNC
569.endm
1a0c09c4 570
e297526c
MW
571 encdec eblk, aesenc, w // rijndael_eblk_x86ish_aesni: AESENC with the keys at `w'
572 encdec dblk, aesdec, wi // rijndael_dblk_x86ish_aesni: AESDEC with the keys at `wi'
1a0c09c4
MW
573
574///--------------------------------------------------------------------------
575/// Random utilities.
576
577 .align 16
578 // Abort the process because of a programming error. Indirecting
579 // through this point serves several purposes: (a) by CALLing, rather
580 // than branching to, `abort', we can save the return address, which
581 // might at least provide a hint as to what went wrong; (b) we don't
582 // have conditional CALLs (and they'd be big anyway); and (c) we can
583 // write a HLT here as a backstop against `abort' being mad.
584bogus: callext F(abort) // CALL, not jump, so the return address is saved
5850: hlt // backstop in case `abort' comes back
586 jmp 0b // ... and keep stopping if we're resumed
587
588 gotaux ecx
589
590///--------------------------------------------------------------------------
591/// Data tables.
592
593 .align 16
594endswap_tab:
// PSHUFB control bytes which reverse the byte order within each of the
// four 32-bit words of an SSE register. The table is loaded with
// MOVDQA, so it must stay 16-byte aligned.
595 .byte 3, 2, 1, 0
596 .byte 7, 6, 5, 4
597 .byte 11, 10, 9, 8
598 .byte 15, 14, 13, 12
599
600///----- That's all, folks --------------------------------------------------