symm/rijndael-x86ish-aesni.S: Decorate `rijndael_rcon' correctly.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// Preliminaries.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
34/// External definitions.
35
36 .globl F(abort)
37 .globl F(rijndael_rcon)
38
39///--------------------------------------------------------------------------
47103664
MW
40/// Local utilities.
41
42// Magic constants for shuffling. These are PSHUFD immediates: each
// two-bit field selects the source word for one destination word, lowest
// field first.
// ROTL: dst[i] = src[(i + 3) mod 4], i.e., word 0 moves to word 1, etc.
43#define ROTL 0x93
// ROT2: dst[i] = src[(i + 2) mod 4], i.e., the two 64-bit halves swap.
44#define ROT2 0x4e
// ROTR: dst[i] = src[(i + 1) mod 4], i.e., word 0 moves to word 3, etc.
45#define ROTR 0x39
46
47///--------------------------------------------------------------------------
1a0c09c4
MW
48/// Main code.
49
50 .arch .aes
bc9ac7eb 51 .text
1a0c09c4
MW
52
53/// The AESNI instructions implement a little-endian version of AES, but
54/// Catacomb's internal interface presents as big-endian so as to work better
55/// with things like GCM. We therefore maintain the round keys in
56/// little-endian form, and have to end-swap blocks in and out.
57///
58/// For added amusement, the AESNI instructions don't implement the
59/// larger-block versions of Rijndael, so we have to end-swap the keys if
60/// we're preparing for one of those.
61
62 // Useful constants.
63 .equ maxrounds, 16 // maximum number of rounds
64 .equ maxblksz, 32 // maximum block size, in bytes
 // One maximum-size round key for each round plus the initial
 // whitening key: 32*(16 + 1) = 544 bytes.
65 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
66
67 // Context structure. These are byte offsets from the context base
 // pointer: a 4-byte round count followed by two key-schedule buffers.
68 .equ nr, 0 // number of rounds
69 .equ w, nr + 4 // encryption key words
70 .equ wi, w + kbufsz // decryption key words
71
72///--------------------------------------------------------------------------
73/// Key setup.
74
/// rijndael_setup_x86ish_aesni: build the encryption and decryption key
/// schedules in a context using the AESNI instructions.
///
/// Arguments, in order (see the per-ABI sections below): context pointer;
/// block size, in words; pointer to the key material; key size, in words.
/// The number of rounds is read from the context's `nr' field, so it must
/// already be set on entry. The encryption round keys are written at
/// offset `w' in the context, and the decryption round keys at offset `wi'.
/// Nothing is returned in a register.
0f23f75f 75FUNC(rijndael_setup_x86ish_aesni)
1a0c09c4 76
0f23f75f
MW
77#if CPUFAM_X86
78 // Arguments are on the stack. We'll need to stack the caller's
79 // register variables, but we'll manage.
1a0c09c4 80
0f23f75f
MW
 // Stack offsets here are as seen after the four pushes below: the
 // return address is then at [esp + 16], and the arguments follow at
 // [esp + 20], [esp + 24], [esp + 28] and [esp + 32]. Several names
 // share a register (edx, ecx) because their lifetimes don't overlap.
81# define CTX ebp // context pointer
82# define BLKSZ [esp + 24] // block size
83
84# define SI esi // source pointer
85# define DI edi // destination pointer
86
87# define KSZ ebx // key size
88# define KSZo ebx // ... as address offset
89# define NKW edx // total number of key words
90# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
91# define RCON ecx // round constants table
92# define LIM edx // limit pointer
93# define LIMn edx // ... as integer offset from base
94
95# define NR ecx // number of rounds
96# define LRK eax // distance to last key
97# define LRKo eax // ... as address offset
98# define BLKOFF edx // block size in bytes
99# define BLKOFFo edx // ... as address offset
100
101 // Stack the caller's registers.
1a0c09c4
MW
102 push ebp
103 push ebx
104 push esi
105 push edi
106
0f23f75f
MW
107 // Set up our own variables.
108 mov CTX, [esp + 20] // context base pointer
109 mov SI, [esp + 28] // key material
110 mov KSZ, [esp + 32] // key size, in words
111#endif
112
113#if CPUFAM_AMD64 && ABI_SYSV
114 // Arguments are in registers. We have plenty, but, to be honest,
115 // the initial register allocation is a bit annoying.
116
 // Incoming: rdi = context, esi = block size, rdx = key material,
 // ecx = key size (see the moves below).
117# define CTX r8 // context pointer
118# define BLKSZ r9d // block size
119
120# define SI rsi // source pointer
121# define DI rdi // destination pointer
122
123# define KSZ edx // key size
124# define KSZo rdx // ... as address offset
125# define NKW r10d // total number of key words
126# define RCON rdi // round constants table
127# define LIMn ecx // limit, as integer offset from base
128# define LIM rcx // limit pointer
129
130# define NR ecx // number of rounds
131# define LRK eax // distance to last key
132# define LRKo rax // ... as address offset
133# define BLKOFF r9d // block size in bytes
134# define BLKOFFo r9 // ... as address offset
135
136 // Move arguments to more useful places.
137 mov CTX, rdi // context base pointer
138 mov BLKSZ, esi // block size in words
139 mov SI, rdx // key material
140 mov KSZ, ecx // key size, in words
141#endif
142
143#if CPUFAM_AMD64 && ABI_WIN
144 // Arguments are in different registers, and they're a little tight.
145
 // Incoming: rcx = context, edx = block size, r8 = key material,
 // r9d = key size. The block size and key size stay where they are.
 // NOTE(review): KSZo (r9) is later used as a 64-bit index but is never
 // written here, so this relies on the caller having zero-extended the
 // key-size argument -- confirm against the C prototype.
146# define CTX r8 // context pointer
147# define BLKSZ edx // block size
148
149# define SI rsi // source pointer
150# define DI rdi // destination pointer
151
152# define KSZ r9d // key size
153# define KSZo r9 // ... as address offset
154# define NKW r10d // total number of key words
155# define RCON rdi // round constants table
156# define LIMn ecx // limit, as integer offset from base
157# define LIM rcx // limit pointer
158
159# define NR ecx // number of rounds
160# define LRK eax // distance to last key
161# define LRKo rax // ... as address offset
162# define BLKOFF edx // block size in bytes
163# define BLKOFFo rdx // ... as address offset
164
165 // We'll need the index registers, which belong to the caller in this
166 // ABI.
167 push rsi
168 push rdi
169
170 // Move arguments to more useful places.
 // (The key pointer must be taken out of r8 before r8 becomes CTX.)
171 mov SI, r8 // key material
172 mov CTX, rcx // context base pointer
173#endif
174
1a0c09c4
MW
175 // The initial round key material is taken directly from the input
176 // key, so copy it over.
0f23f75f
MW
177#if CPUFAM_AMD64 && ABI_SYSV
178 // We've been lucky. We already have a copy of the context pointer
179 // in rdi, and the key size in ecx.
180 add DI, w
181#else
182 lea DI, [CTX + w]
183 mov ecx, KSZ
184#endif
1a0c09c4
MW
 // Copy ecx words from [SI] to [DI].
185 rep movsd
186
187 // Find out other useful things.
0f23f75f
MW
188 mov NKW, [CTX + nr] // number of rounds
189 add NKW, 1
190 imul NKW, BLKSZ // total key size in words
 // NKW_NEEDS_REFRESH is only defined in the x86 section above; where it
 // is left undefined, the preprocessor evaluates it as zero in `#if'.
191#if !NKW_NEEDS_REFRESH
192 // If we can't keep NKW for later, then we use the same register for
193 // it and LIM, so this move is unnecessary.
194 mov LIMn, NKW
195#endif
196 sub LIMn, KSZ // offset by the key size
1a0c09c4
MW
197
198 // Find the round constants.
 // NOTE(review): on AMD64, LIM lives in rcx at this point, so this
 // relies on `ldgot' leaving ecx alone there -- confirm in asm-common.h.
199 ldgot ecx
811a896f 200 leaext RCON, F(rijndael_rcon), ecx
1a0c09c4
MW
201
202 // Prepare for the main loop.
 // From here on, SI walks along the key words already written, and each
 // new word is stored KSZ words ahead of it. We stop once SI reaches
 // LIM, i.e., once the word just stored was the last one needed.
0f23f75f
MW
203 lea SI, [CTX + w]
204 mov eax, [SI + 4*KSZo - 4] // most recent key word
205 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
1a0c09c4
MW
206
207 // Main key expansion loop. The first word of each key-length chunk
208 // needs special treatment.
209 //
210 // This is rather tedious because the Intel `AESKEYGENASSIST'
211 // instruction is very strangely shaped. Firstly, it wants to
212 // operate on vast SSE registers, even though we're data-blocked from
213 // doing more than one operation at a time unless we're doing two key
214 // schedules simultaneously -- and even then we can't do more than
215 // two, because the instruction ignores two of its input words
216 // entirely, and produces two different outputs for each of the other
217 // two. And secondly it insists on taking the magic round constant
218 // as an immediate, so it's kind of annoying if you're not
219 // open-coding the whole thing. It's much easier to leave that as
220 // zero and XOR in the round constant by hand.
 //
 // Here: ROTR moves the word from lane 0 to lane 3; AESKEYGENASSIST
 // leaves RotWord(SubWord(lane 3)) in its lane 3; and ROTL brings that
 // back down to lane 0 so that MOVD can fetch it.
2219: movd xmm0, eax
47103664 222 pshufd xmm0, xmm0, ROTR
1a0c09c4 223 aeskeygenassist xmm1, xmm0, 0
47103664 224 pshufd xmm1, xmm1, ROTL
1a0c09c4 225 movd eax, xmm1
0f23f75f
MW
226 xor eax, [SI]
227 xor al, [RCON]
228 inc RCON
229 mov [SI + 4*KSZo], eax
230 add SI, 4
231 cmp SI, LIM
1a0c09c4
MW
232 jae 8f
233
234 // The next three words are simple...
0f23f75f
MW
235 xor eax, [SI]
236 mov [SI + 4*KSZo], eax
237 add SI, 4
238 cmp SI, LIM
1a0c09c4
MW
239 jae 8f
240
241 // (Word 2...)
0f23f75f
MW
242 xor eax, [SI]
243 mov [SI + 4*KSZo], eax
244 add SI, 4
245 cmp SI, LIM
1a0c09c4
MW
246 jae 8f
247
248 // (Word 3...)
0f23f75f
MW
249 xor eax, [SI]
250 mov [SI + 4*KSZo], eax
251 add SI, 4
252 cmp SI, LIM
1a0c09c4
MW
253 jae 8f
254
255 // Word 4. If the key is /more/ than 6 words long, then we must
256 // apply a substitution here.
 // (A four-word key has no word 4, so start the next chunk instead.
 // For the substitution, ROTL moves the word from lane 0 to lane 1, and
 // AESKEYGENASSIST leaves SubWord(lane 1), unrotated, in its lane 0.)
0f23f75f 257 cmp KSZ, 5
1a0c09c4 258 jb 9b
0f23f75f 259 cmp KSZ, 7
1a0c09c4
MW
260 jb 0f
261 movd xmm0, eax
47103664 262 pshufd xmm0, xmm0, ROTL
1a0c09c4
MW
263 aeskeygenassist xmm1, xmm0, 0
264 movd eax, xmm1
0f23f75f
MW
2650: xor eax, [SI]
266 mov [SI + 4*KSZo], eax
267 add SI, 4
268 cmp SI, LIM
1a0c09c4
MW
269 jae 8f
270
271 // (Word 5...)
0f23f75f 272 cmp KSZ, 6
1a0c09c4 273 jb 9b
0f23f75f
MW
274 xor eax, [SI]
275 mov [SI + 4*KSZo], eax
276 add SI, 4
277 cmp SI, LIM
1a0c09c4
MW
278 jae 8f
279
280 // (Word 6...)
0f23f75f 281 cmp KSZ, 7
1a0c09c4 282 jb 9b
0f23f75f
MW
283 xor eax, [SI]
284 mov [SI + 4*KSZo], eax
285 add SI, 4
286 cmp SI, LIM
1a0c09c4
MW
287 jae 8f
288
289 // (Word 7...)
0f23f75f 290 cmp KSZ, 8
1a0c09c4 291 jb 9b
0f23f75f
MW
292 xor eax, [SI]
293 mov [SI + 4*KSZo], eax
294 add SI, 4
295 cmp SI, LIM
1a0c09c4
MW
296 jae 8f
297
298 // Must be done by now.
299 jmp 9b
300
301 // Next job is to construct the decryption keys. The keys for the
302 // first and last rounds don't need to be mangled, but the remaining
303 // ones do -- and they all need to be reordered too.
304 //
305 // The plan of action, then, is to copy the final encryption round's
306 // keys into place first, then to do each of the intermediate rounds
307 // in reverse order, and finally do the first round.
308 //
309 // Do all of the heavy lifting with SSE registers. The order we're
310 // doing this in means that it's OK if we read or write too much, and
311 // there's easily enough buffer space for the over-enthusiastic reads
312 // and writes because the context has space for 32-byte blocks, which
313 // is our maximum and an exact fit for two SSE registers.
0f23f75f
MW
3148: mov NR, [CTX + nr] // number of rounds
 // Either way, LRK ends up as nr times the block size in words: the
 // word offset of the last round's keys within the schedule.
315#if NKW_NEEDS_REFRESH
316 mov BLKOFF, BLKSZ
317 mov LRK, NR
318 imul LRK, BLKOFF
319#else
320 // If we retain NKW, then BLKSZ and BLKOFF are the same register
321 // because we won't need the former again.
322 mov LRK, NKW
323 sub LRK, BLKSZ
324#endif
325 lea DI, [CTX + wi]
326 lea SI, [CTX + w + 4*LRKo] // last round's keys
327 shl BLKOFF, 2 // block size (in bytes now)
1a0c09c4
MW
328
329 // Copy the last encryption round's keys.
0f23f75f
MW
330 movdqu xmm0, [SI]
331 movdqu [DI], xmm0
332 cmp BLKOFF, 16
1a0c09c4 333 jbe 9f
0f23f75f
MW
334 movdqu xmm0, [SI + 16]
335 movdqu [DI + 16], xmm0
1a0c09c4
MW
336
337 // Update the loop variables and stop if we've finished.
 // (DI moves forwards through `wi' while SI moves backwards through
 // `w'. NR counts down, and when it reaches zero SI is back at the
 // first encryption round key.)
0f23f75f
MW
3389: add DI, BLKOFFo
339 sub SI, BLKOFFo
340 sub NR, 1
1a0c09c4
MW
341 jbe 0f
342
343 // Do another middle round's keys...
0f23f75f 344 movdqu xmm0, [SI]
1a0c09c4 345 aesimc xmm0, xmm0
0f23f75f
MW
346 movdqu [DI], xmm0
347 cmp BLKOFF, 16
1a0c09c4 348 jbe 9b
0f23f75f 349 movdqu xmm0, [SI + 16]
1a0c09c4 350 aesimc xmm0, xmm0
0f23f75f 351 movdqu [DI + 16], xmm0
1a0c09c4
MW
352 jmp 9b
353
354 // Finally do the first encryption round.
0f23f75f
MW
3550: movdqu xmm0, [SI]
356 movdqu [DI], xmm0
357 cmp BLKOFF, 16
1a0c09c4 358 jbe 0f
0f23f75f
MW
359 movdqu xmm0, [SI + 16]
360 movdqu [DI + 16], xmm0
1a0c09c4
MW
361
362 // If the block size is not exactly four words then we must end-swap
363 // everything. We can use fancy SSE toys for this.
0f23f75f 3640: cmp BLKOFF, 16
1a0c09c4
MW
365 je 0f
366
367 // Find the byte-reordering table.
 // (NR is dead by now, so ecx is free to use as a scratch register.)
368 ldgot ecx
8d6ca554 369 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 370
0f23f75f 371#if NKW_NEEDS_REFRESH
1a0c09c4
MW
372 // Calculate the number of subkey words again. (It's a good job
373 // we've got a fast multiplier.)
0f23f75f
MW
374 mov NKW, [CTX + nr]
375 add NKW, 1
376 imul NKW, BLKSZ
377#endif
1a0c09c4
MW
378
379 // End-swap the encryption keys.
 // (`endswap_block' destroys ecx and SI, so both are set up afresh
 // before each call; NKW survives the call.)
0f23f75f
MW
380 mov ecx, NKW
381 lea SI, [CTX + w]
1a0c09c4
MW
382 call endswap_block
383
384 // And the decryption keys.
0f23f75f
MW
385 mov ecx, NKW
386 lea SI, [CTX + wi]
1a0c09c4
MW
387 call endswap_block
388
0f23f75f
MW
3890: // All done.
 // Restore the registers pushed on entry, in reverse order.
390#if CPUFAM_X86
391 pop edi
1a0c09c4
MW
392 pop esi
393 pop ebx
394 pop ebp
0f23f75f
MW
395#endif
396#if CPUFAM_AMD64 && ABI_WIN
397 pop rdi
398 pop rsi
399#endif
1a0c09c4
MW
400 ret
401
402 .align 16
403endswap_block:
0f23f75f 404 // End-swap ECX words starting at SI. The end-swapping table is
8d6ca554 405 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
 //
 // Clobbers ecx, SI and xmm1. Each pass handles four words, and we go
 // round again while the remaining count is still positive, so a count
 // which isn't a multiple of four is effectively rounded up.
0f23f75f 406 movdqu xmm1, [SI]
8d6ca554 407 pshufb xmm1, xmm5
0f23f75f
MW
408 movdqu [SI], xmm1
409 add SI, 16
1a0c09c4
MW
410 sub ecx, 4
411 ja endswap_block
412 ret
413
 // Retire the register names so that later code can define its own.
 // NOTE(review): NKW and NKW_NEEDS_REFRESH are not retired here.
0f23f75f
MW
414#undef CTX
415#undef BLKSZ
416#undef SI
417#undef DI
418#undef KSZ
419#undef KSZo
420#undef RCON
421#undef LIMn
422#undef LIM
423#undef NR
424#undef LRK
425#undef LRKo
426#undef BLKOFF
427#undef BLKOFFo
428
1a0c09c4
MW
429ENDFUNC
430
431///--------------------------------------------------------------------------
432/// Encrypting and decrypting blocks.
433
/// encdec OP, AES, KOFF: define the single-block function
/// `rijndael_OP_x86ish_aesni'.
///
/// AES is the round instruction to use (`aesenc' or `aesdec'); the final
/// round uses the same name with `last' appended. KOFF is the offset
/// within the context of the round keys to use (`w' or `wi').
///
/// The generated function takes, in order: the context pointer, a pointer
/// to the source block, and a pointer to the destination block. It
/// handles 16-byte blocks only, and calls `abort' (via `bogus') if the
/// context's round count is outside the range 10 to 14.
e297526c 434 .macro encdec op, aes, koff
0f23f75f 435FUNC(rijndael_\op\()_x86ish_aesni)
1a0c09c4
MW
436
437 // Find the magic endianness-swapping table.
 // NOTE(review): on AMD64/Windows, K is rcx and still holds the incoming
 // context pointer here, so this relies on `ldgot' leaving ecx alone on
 // AMD64 -- confirm in asm-common.h. (On x86, K is only loaded below.)
438 ldgot ecx
8d6ca554 439 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 440
0f23f75f
MW
441#if CPUFAM_X86
442 // Arguments come in on the stack, and need to be collected. We
443 // don't have a shortage of registers.
444
 // SRC and DST share edx: the destination pointer isn't fetched until
 // the very end, long after the source block has been read.
445# define K ecx
446# define SRC edx
447# define DST edx
448# define NR eax
449
450 mov K, [esp + 4]
451 mov SRC, [esp + 8]
452#endif
453
454#if CPUFAM_AMD64 && ABI_SYSV
455 // Arguments come in registers. All is good.
456
457# define K rdi
458# define SRC rsi
459# define DST rdx
460# define NR eax
461#endif
462
463#if CPUFAM_AMD64 && ABI_WIN
464 // Arguments come in different registers.
465
466# define K rcx
467# define SRC rdx
468# define DST r8
469# define NR eax
470#endif
471
472 // Initial setup.
 // Load the block and end-swap each of its words; fetch the round count;
 // and point K at the chosen key schedule.
473 movdqu xmm0, [SRC]
8d6ca554 474 pshufb xmm0, xmm5
0f23f75f
MW
475 mov NR, [K + nr]
476 add K, \koff
1a0c09c4
MW
477
478 // Initial whitening.
0f23f75f
MW
479 movdqu xmm1, [K]
480 add K, 16
1a0c09c4
MW
481 pxor xmm0, xmm1
482
483 // Dispatch to the correct code.
 // Round counts below 10 or above 14 are rejected. The entry points
 // below fall through into one another, each of 14 down to 11 doing one
 // extra round and stepping K on before joining the common tail.
0f23f75f 484 cmp NR, 10
e297526c 485 je 10f
1a0c09c4 486 jb bogus
0f23f75f 487 cmp NR, 14
e297526c 488 je 14f
1a0c09c4 489 ja bogus
0f23f75f 490 cmp NR, 12
e297526c
MW
491 je 12f
492 jb 11f
493 jmp 13f
1a0c09c4
MW
494
495 .align 2
496
497 // 14 rounds...
0f23f75f
MW
49814: movdqu xmm1, [K]
499 add K, 16
e297526c 500 \aes xmm0, xmm1
1a0c09c4
MW
501
502 // 13 rounds...
0f23f75f
MW
50313: movdqu xmm1, [K]
504 add K, 16
e297526c 505 \aes xmm0, xmm1
1a0c09c4
MW
506
507 // 12 rounds...
0f23f75f
MW
50812: movdqu xmm1, [K]
509 add K, 16
e297526c 510 \aes xmm0, xmm1
1a0c09c4
MW
511
512 // 11 rounds...
0f23f75f
MW
51311: movdqu xmm1, [K]
514 add K, 16
e297526c 515 \aes xmm0, xmm1
1a0c09c4
MW
516
517 // 10 rounds...
 // From here on K stays put, and the remaining ten round keys are
 // addressed at fixed offsets from it.
0f23f75f 51810: movdqu xmm1, [K]
e297526c 519 \aes xmm0, xmm1
1a0c09c4
MW
520
521 // 9 rounds...
0f23f75f 522 movdqu xmm1, [K + 16]
e297526c 523 \aes xmm0, xmm1
1a0c09c4
MW
524
525 // 8 rounds...
0f23f75f 526 movdqu xmm1, [K + 32]
e297526c 527 \aes xmm0, xmm1
1a0c09c4
MW
528
529 // 7 rounds...
0f23f75f 530 movdqu xmm1, [K + 48]
e297526c 531 \aes xmm0, xmm1
1a0c09c4
MW
532
533 // 6 rounds...
0f23f75f 534 movdqu xmm1, [K + 64]
e297526c 535 \aes xmm0, xmm1
1a0c09c4
MW
536
537 // 5 rounds...
0f23f75f 538 movdqu xmm1, [K + 80]
e297526c 539 \aes xmm0, xmm1
1a0c09c4
MW
540
541 // 4 rounds...
0f23f75f 542 movdqu xmm1, [K + 96]
e297526c 543 \aes xmm0, xmm1
1a0c09c4
MW
544
545 // 3 rounds...
0f23f75f 546 movdqu xmm1, [K + 112]
e297526c 547 \aes xmm0, xmm1
1a0c09c4
MW
548
549 // 2 rounds...
0f23f75f 550 movdqu xmm1, [K + 128]
e297526c 551 \aes xmm0, xmm1
1a0c09c4
MW
552
553 // Final round...
0f23f75f 554 movdqu xmm1, [K + 144]
e297526c 555 \aes\()last xmm0, xmm1
1a0c09c4
MW
556
557 // Unpermute the ciphertext block and store it.
 // (This is the output block, whichever direction we're going. No
 // registers were pushed, so on x86 the third argument is still at
 // [esp + 12].)
8d6ca554 558 pshufb xmm0, xmm5
0f23f75f
MW
559#if CPUFAM_X86
560 mov DST, [esp + 12]
561#endif
562 movdqu [DST], xmm0
1a0c09c4
MW
563
564 // And we're done.
565 ret
566
0f23f75f
MW
567#undef K
568#undef SRC
569#undef DST
570#undef NR
571
1a0c09c4 572ENDFUNC
e297526c 573 .endm
1a0c09c4 574
e297526c
MW
 // Instantiate the block functions: encryption works through the `w'
 // keys with AESENC, and decryption through the `wi' keys with AESDEC.
575 encdec eblk, aesenc, w
576 encdec dblk, aesdec, wi
1a0c09c4
MW
577
578///--------------------------------------------------------------------------
579/// Random utilities.
580
581 .align 16
582 // Abort the process because of a programming error. Indirecting
583 // through this point serves several purposes: (a) by CALLing, rather
584 // than branching to, `abort', we can save the return address, which
585 // might at least provide a hint as to what went wrong; (b) we don't
586 // have conditional CALLs (and they'd be big anyway); and (c) we can
587 // write a HLT here as a backstop against `abort' being mad.
 //
 // This is the target of the conditional jumps in the block functions
 // when the round count is out of range. If `abort' ever returns, we
 // sit in the HLT loop below rather than running off into whatever
 // follows.
588bogus: callext F(abort)
5890: hlt
590 jmp 0b
591
 // NOTE(review): presumably this emits whatever support code the
 // `ldgot ecx' uses above need -- confirm in asm-common.h.
592 gotaux ecx
593
594///--------------------------------------------------------------------------
595/// Data tables.
596
 // PSHUFB control mask which reverses the order of the bytes within each
 // of the four 32-bit words, leaving the words themselves where they
 // are. It's loaded with MOVDQA, so it must stay 16-byte aligned.
597 .align 16
598endswap_tab:
599 .byte 3, 2, 1, 0
600 .byte 7, 6, 5, 4
601 .byte 11, 10, 9, 8
602 .byte 15, 14, 13, 12
603
604///----- That's all, folks --------------------------------------------------