Have a small reformatting session.
[catacomb] / symm / rijndael-x86ish-aesni.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// AESNI-based implementation of Rijndael
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .globl F(abort) // called from `bogus' when a block function sees an unsupported round count
34 .globl F(rijndael_rcon) // round-constant table, read one byte at a time by the key-setup loop
35
36 ///--------------------------------------------------------------------------
37 /// Local utilities.
38
39 // Magic constants for shuffling: PSHUFD immediates, two bits per destination lane naming the source lane.
40 #define ROTL 0x93 // 32-bit lane i moves to lane i + 1 (lane 3 wraps to lane 0)
41 #define ROT2 0x4e // swaps the two 64-bit halves; not referenced in the visible code
42 #define ROTR 0x39 // 32-bit lane i moves to lane i - 1 (lane 0 wraps to lane 3)
43
44 ///--------------------------------------------------------------------------
45 /// Main code.
46
47 .arch .aes // let the assembler accept the AES-NI mnemonics used below
48 .text // everything that follows, including the data table at the end, is emitted here
49
50 /// The AESNI instructions implement a little-endian version of AES, but
51 /// Catacomb's internal interface presents as big-endian so as to work better
52 /// with things like GCM. We therefore maintain the round keys in
53 /// little-endian form, and have to end-swap blocks in and out.
54 ///
55 /// For added amusement, the AESNI instructions don't implement the
56 /// larger-block versions of Rijndael, so we have to end-swap the keys if
57 /// we're preparing for one of those.
58
59 // Useful constants.
60 .equ maxrounds, 16 // maximum number of rounds
61 .equ maxblksz, 32 // maximum block size, in bytes
62 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer: 32*17 = 544 bytes
63
64 // Context structure: byte offsets. Presumably mirrors the C-side context declaration -- confirm against it.
65 .equ nr, 0 // number of rounds (accessed as a 32-bit word)
66 .equ w, nr + 4 // encryption key words
67 .equ wi, w + kbufsz // decryption key words, directly after the encryption buffer
68
69 ///--------------------------------------------------------------------------
70 /// Key setup.
71
72 FUNC(rijndael_setup_x86ish_aesni)
73 // Key setup. Arguments, in order: context pointer, block size in words, key pointer, key size in words. Reads the round count from [CTX + nr] (never written here); fills the encryption schedule at CTX + w and the decryption schedule at CTX + wi.
74 #if CPUFAM_X86
75 // Arguments are on the stack. We'll need to stack the caller's
76 // register variables, but we'll manage.
77
78 # define CTX ebp // context pointer
79 # define BLKSZ [esp + 24] // block size, in words; read in place from the stack, so only valid while exactly the four saved registers are stacked
80
81 # define SI esi // source pointer
82 # define DI edi // destination pointer
83
84 # define KSZ ebx // key size, in words
85 # define KSZo ebx // ... as address offset
86 # define NKW edx // total number of key words
87 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating, because edx is reused as LIM and BLKOFF
88 # define RCON ecx // round constants table
89 # define LIM edx // limit pointer
90 # define LIMn edx // ... as integer offset from base
91
92 # define NR ecx // number of rounds
93 # define LRK eax // distance to last key, in words
94 # define LRKo eax // ... as address offset
95 # define BLKOFF edx // block size in bytes
96 # define BLKOFFo edx // ... as address offset
97
98 // Stack the caller's registers.
99 push ebp
100 push ebx
101 push esi
102 push edi
103
104 // Set up our own variables. Four pushes plus the return address put the first argument at esp + 20.
105 mov CTX, [esp + 20] // context base pointer
106 mov SI, [esp + 28] // key material
107 mov KSZ, [esp + 32] // key size, in words
108 #endif
109
110 #if CPUFAM_AMD64 && ABI_SYSV
111 // Arguments are in registers. We have plenty, but, to be honest,
112 // the initial register allocation is a bit annoying.
113
114 # define CTX r8 // context pointer
115 # define BLKSZ r9d // block size, in words
116
117 # define SI rsi // source pointer
118 # define DI rdi // destination pointer
119
120 # define KSZ edx // key size, in words
121 # define KSZo rdx // ... as address offset
122 # define NKW r10d // total number of key words
123 # define RCON rdi // round constants table
124 # define LIMn ecx // limit, as integer offset from base
125 # define LIM rcx // limit pointer
126
127 # define NR ecx // number of rounds
128 # define LRK eax // distance to last key, in words
129 # define LRKo rax // ... as address offset
130 # define BLKOFF r9d // block size in bytes
131 # define BLKOFFo r9 // ... as address offset
132
133 // Move arguments to more useful places. The order matters: rsi and edx are each read before they are overwritten.
134 mov CTX, rdi // context base pointer
135 mov BLKSZ, esi // block size in words
136 mov SI, rdx // key material
137 mov KSZ, ecx // key size, in words
138 #endif
139
140 #if CPUFAM_AMD64 && ABI_WIN
141 // Arguments are in different registers, and they're a little tight.
142
143 # define CTX r8 // context pointer
144 # define BLKSZ edx // block size, in words; already in its argument register
145
146 # define SI rsi // source pointer
147 # define DI rdi // destination pointer
148
149 # define KSZ r9d // key size, in words; already in its argument register
150 # define KSZo r9 // ... as address offset
151 # define NKW r10d // total number of key words
152 # define RCON rdi // round constants table
153 # define LIMn ecx // limit, as integer offset from base
154 # define LIM rcx // limit pointer
155
156 # define NR ecx // number of rounds
157 # define LRK eax // distance to last key, in words
158 # define LRKo rax // ... as address offset
159 # define BLKOFF edx // block size in bytes
160 # define BLKOFFo rdx // ... as address offset
161
162 // We'll need the index registers, which belong to the caller in this
163 // ABI.
164 push rsi
165 push rdi
166
167 // Move arguments to more useful places. The key pointer must leave r8 before r8 becomes CTX.
168 mov SI, r8 // key material
169 mov CTX, rcx // context base pointer
170 #endif
171
172 // The initial round key material is taken directly from the input
173 // key, so copy it over.
174 #if CPUFAM_AMD64 && ABI_SYSV
175 // We've been lucky. We already have a copy of the context pointer
176 // in rdi, and the key size in ecx.
177 add DI, w
178 #else
179 lea DI, [CTX + w]
180 mov ecx, KSZ
181 #endif
182 rep movsd // copy ecx words from [SI] to [DI]
183
184 // Find out other useful things.
185 mov NKW, [CTX + nr] // number of rounds
186 add NKW, 1 // one more round key than there are rounds
187 imul NKW, BLKSZ // total key size in words
188 #if !NKW_NEEDS_REFRESH
189 // If we can't keep NKW for later, then we use the same register for
190 // it and LIM, so this move is unnecessary.
191 mov LIMn, NKW
192 #endif
193 sub LIMn, KSZ // offset by the key size
194
195 // Find the round constants. NOTE(review): on AMD64 the limit is live in ecx here, so this relies on `ldgot ecx' leaving ecx alone there -- confirm in asm-common.h.
196 ldgot ecx
197 leaext RCON, F(rijndael_rcon), ecx
198
199 // Prepare for the main loop.
200 lea SI, [CTX + w]
201 mov eax, [SI + 4*KSZo - 4] // most recent key word
202 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
203
204 // Main key expansion loop. The first word of each key-length chunk
205 // needs special treatment.
206 //
207 // This is rather tedious because the Intel `AESKEYGENASSIST'
208 // instruction is very strangely shaped. Firstly, it wants to
209 // operate on vast SSE registers, even though we're data-blocked from
210 // doing more than one operation at a time unless we're doing two key
211 // schedules simultaneously -- and even then we can't do more than
212 // two, because the instruction ignores two of its input words
213 // entirely, and produces two different outputs for each of the other
214 // two. And secondly it insists on taking the magic round constant
215 // as an immediate, so it's kind of annoying if you're not
216 // open-coding the whole thing. It's much easier to leave that as
217 // zero and XOR in the round constant by hand.
218 9: movd xmm0, eax // previous key word into lane 0
219 pshufd xmm0, xmm0, ROTR // ... and round to lane 3
220 aeskeygenassist xmm1, xmm0, 0 // lane 3 of the result is the rotated, substituted lane 3
221 pshufd xmm1, xmm1, ROTL // bring lane 3 back to lane 0
222 movd eax, xmm1
223 xor eax, [SI] // mix with the word one key-length back
224 xor al, [RCON] // round constant goes into the low byte
225 inc RCON // next chunk uses the next constant
226 mov [SI + 4*KSZo], eax // store the new key word
227 add SI, 4
228 cmp SI, LIM
229 jae 8f // schedule is full
230
231 // The next three words are simple...
232 xor eax, [SI]
233 mov [SI + 4*KSZo], eax
234 add SI, 4
235 cmp SI, LIM
236 jae 8f
237
238 // (Word 2...)
239 xor eax, [SI]
240 mov [SI + 4*KSZo], eax
241 add SI, 4
242 cmp SI, LIM
243 jae 8f
244
245 // (Word 3...)
246 xor eax, [SI]
247 mov [SI + 4*KSZo], eax
248 add SI, 4
249 cmp SI, LIM
250 jae 8f
251
252 // Word 4. If the key is /more/ than 6 words long, then we must
253 // apply a substitution here.
254 cmp KSZ, 5
255 jb 9b // four-word key: this chunk is complete
256 cmp KSZ, 7
257 jb 0f // five or six words: no substitution
258 movd xmm0, eax
259 pshufd xmm0, xmm0, ROTL // lane 0 to lane 1
260 aeskeygenassist xmm1, xmm0, 0 // lane 0 of the result is the substituted lane 1, with no rotation
261 movd eax, xmm1
262 0: xor eax, [SI]
263 mov [SI + 4*KSZo], eax
264 add SI, 4
265 cmp SI, LIM
266 jae 8f
267
268 // (Word 5...)
269 cmp KSZ, 6
270 jb 9b // five-word key: this chunk is complete
271 xor eax, [SI]
272 mov [SI + 4*KSZo], eax
273 add SI, 4
274 cmp SI, LIM
275 jae 8f
276
277 // (Word 6...)
278 cmp KSZ, 7
279 jb 9b // six-word key: this chunk is complete
280 xor eax, [SI]
281 mov [SI + 4*KSZo], eax
282 add SI, 4
283 cmp SI, LIM
284 jae 8f
285
286 // (Word 7...)
287 cmp KSZ, 8
288 jb 9b // seven-word key: this chunk is complete
289 xor eax, [SI]
290 mov [SI + 4*KSZo], eax
291 add SI, 4
292 cmp SI, LIM
293 jae 8f
294
295 // Must be done by now: eight words of this chunk have been produced, so start the next chunk.
296 jmp 9b
297
298 // Next job is to construct the decryption keys. The keys for the
299 // first and last rounds don't need to be mangled, but the remaining
300 // ones do -- and they all need to be reordered too.
301 //
302 // The plan of action, then, is to copy the final encryption round's
303 // keys into place first, then to do each of the intermediate rounds
304 // in reverse order, and finally do the first round.
305 //
306 // Do all of the heavy lifting with SSE registers. The order we're
307 // doing this in means that it's OK if we read or write too much, and
308 // there's easily enough buffer space for the over-enthusiastic reads
309 // and writes because the context has space for 32-byte blocks, which
310 // is our maximum and an exact fit for two SSE registers.
311 8: mov NR, [CTX + nr] // number of rounds
312 #if NKW_NEEDS_REFRESH
313 mov BLKOFF, BLKSZ // block size, still in words here
314 mov LRK, NR
315 imul LRK, BLKOFF // word offset of the last round's keys
316 #else
317 // If we retain NKW, then BLKSZ and BLKOFF are the same register
318 // because we won't need the former again.
319 mov LRK, NKW
320 sub LRK, BLKSZ
321 #endif
322 lea DI, [CTX + wi]
323 lea SI, [CTX + w + 4*LRKo] // last round's keys
324 shl BLKOFF, 2 // block size (in bytes now)
325
326 // Copy the last encryption round's keys.
327 movdqu xmm0, [SI]
328 movdqu [DI], xmm0
329 cmp BLKOFF, 16
330 jbe 9f // 16-byte blocks need only the one register
331 movdqu xmm0, [SI + 16]
332 movdqu [DI + 16], xmm0
333
334 // Update the loop variables and stop if we've finished.
335 9: add DI, BLKOFFo // destination walks forwards...
336 sub SI, BLKOFFo // ... while the source walks backwards
337 sub NR, 1
338 jbe 0f // counter hit zero: only the first round's keys remain
339
340 // Do another middle round's keys...
341 movdqu xmm0, [SI]
342 aesimc xmm0, xmm0 // the mangling promised above
343 movdqu [DI], xmm0
344 cmp BLKOFF, 16
345 jbe 9b
346 movdqu xmm0, [SI + 16]
347 aesimc xmm0, xmm0
348 movdqu [DI + 16], xmm0
349 jmp 9b
350
351 // Finally do the first encryption round.
352 0: movdqu xmm0, [SI]
353 movdqu [DI], xmm0
354 cmp BLKOFF, 16
355 jbe 0f // branches to the next `0:' label, just below
356 movdqu xmm0, [SI + 16]
357 movdqu [DI + 16], xmm0
358
359 // If the block size is not exactly four words then we must end-swap
360 // everything. We can use fancy SSE toys for this.
361 0: cmp BLKOFF, 16
362 je 0f // four-word blocks: skip to the exit
363
364 // Find the byte-reordering table.
365 ldgot ecx
366 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
367
368 #if NKW_NEEDS_REFRESH
369 // Calculate the number of subkey words again. (It's a good job
370 // we've got a fast multiplier.)
371 mov NKW, [CTX + nr]
372 add NKW, 1
373 imul NKW, BLKSZ // last use of BLKSZ: it must be read before the calls below, which move esp
374 #endif
375
376 // End-swap the encryption keys.
377 mov ecx, NKW // word count for endswap_block
378 lea SI, [CTX + w]
379 call endswap_block
380
381 // And the decryption keys.
382 mov ecx, NKW // NKW is not touched by endswap_block, so it can be reused
383 lea SI, [CTX + wi]
384 call endswap_block
385
386 0: // All done.
387 #if CPUFAM_X86
388 pop edi
389 pop esi
390 pop ebx
391 pop ebp
392 #endif
393 #if CPUFAM_AMD64 && ABI_WIN
394 pop rdi
395 pop rsi
396 #endif
397 ret
398
399 .align 16
400 endswap_block:
401 // End-swap ECX words starting at SI. The end-swapping table is
402 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
403 movdqu xmm1, [SI]
404 pshufb xmm1, xmm5 // reverse the bytes within each word
405 movdqu [SI], xmm1
406 add SI, 16
407 sub ecx, 4
408 ja endswap_block // loop while words remain; a count that is not a multiple of 4 borrows here and stops after one over-long chunk
409 ret
410 // Retire the register aliases. NOTE(review): NKW and NKW_NEEDS_REFRESH are defined above but not #undef'd here.
411 #undef CTX
412 #undef BLKSZ
413 #undef SI
414 #undef DI
415 #undef KSZ
416 #undef KSZo
417 #undef RCON
418 #undef LIMn
419 #undef LIM
420 #undef NR
421 #undef LRK
422 #undef LRKo
423 #undef BLKOFF
424 #undef BLKOFFo
425
426 ENDFUNC
427
428 ///--------------------------------------------------------------------------
429 /// Encrypting and decrypting blocks.
430
431 .macro encdec op, aes, koff
432 FUNC(rijndael_\op\()_x86ish_aesni)
433 // Process one 16-byte block. Arguments, in order: context pointer, source block, destination block. `op' names the function, `aes' is the round instruction (with `last' appended for the final round), and `koff' is the offset of the key schedule within the context.
434 // Find the magic endianness-swapping table. NOTE(review): under the Windows ABI the context argument is still in rcx here, so this relies on `ldgot ecx' leaving ecx alone on AMD64 -- confirm in asm-common.h.
435 ldgot ecx
436 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
437
438 #if CPUFAM_X86
439 // Arguments come in on the stack, and need to be collected. We
440 // don't have a shortage of registers.
441
442 # define K ecx
443 # define SRC edx
444 # define DST edx // shares edx with SRC: only loaded once the source block has been read
445 # define NR eax
446
447 mov K, [esp + 4]
448 mov SRC, [esp + 8]
449 #endif
450
451 #if CPUFAM_AMD64 && ABI_SYSV
452 // Arguments come in registers. All is good.
453
454 # define K rdi
455 # define SRC rsi
456 # define DST rdx
457 # define NR eax
458 #endif
459
460 #if CPUFAM_AMD64 && ABI_WIN
461 // Arguments come in different registers.
462
463 # define K rcx
464 # define SRC rdx
465 # define DST r8
466 # define NR eax
467 #endif
468
469 // Initial setup.
470 movdqu xmm0, [SRC]
471 pshufb xmm0, xmm5 // end-swap the input block
472 mov NR, [K + nr]
473 add K, \koff
474
475 // Initial whitening.
476 movdqu xmm1, [K]
477 add K, 16
478 pxor xmm0, xmm1
479
480 // Dispatch to the correct code. Only round counts from 10 to 14 are accepted; anything else goes to `bogus'.
481 cmp NR, 10
482 je 10f
483 jb bogus
484 cmp NR, 14
485 je 14f
486 ja bogus
487 cmp NR, 12
488 je 12f
489 jb 11f
490 jmp 13f
491
492 .align 2
493
494 // 14 rounds... Each entry point from 14 down to 11 does one round and advances K, then falls through.
495 14: movdqu xmm1, [K]
496 add K, 16
497 \aes xmm0, xmm1
498
499 // 13 rounds...
500 13: movdqu xmm1, [K]
501 add K, 16
502 \aes xmm0, xmm1
503
504 // 12 rounds...
505 12: movdqu xmm1, [K]
506 add K, 16
507 \aes xmm0, xmm1
508
509 // 11 rounds...
510 11: movdqu xmm1, [K]
511 add K, 16
512 \aes xmm0, xmm1
513
514 // 10 rounds... From here on K stays put and the remaining keys are reached by fixed offsets.
515 10: movdqu xmm1, [K]
516 \aes xmm0, xmm1
517
518 // 9 rounds...
519 movdqu xmm1, [K + 16]
520 \aes xmm0, xmm1
521
522 // 8 rounds...
523 movdqu xmm1, [K + 32]
524 \aes xmm0, xmm1
525
526 // 7 rounds...
527 movdqu xmm1, [K + 48]
528 \aes xmm0, xmm1
529
530 // 6 rounds...
531 movdqu xmm1, [K + 64]
532 \aes xmm0, xmm1
533
534 // 5 rounds...
535 movdqu xmm1, [K + 80]
536 \aes xmm0, xmm1
537
538 // 4 rounds...
539 movdqu xmm1, [K + 96]
540 \aes xmm0, xmm1
541
542 // 3 rounds...
543 movdqu xmm1, [K + 112]
544 \aes xmm0, xmm1
545
546 // 2 rounds...
547 movdqu xmm1, [K + 128]
548 \aes xmm0, xmm1
549
550 // Final round...
551 movdqu xmm1, [K + 144]
552 \aes\()last xmm0, xmm1
553
554 // Unpermute the output block (ciphertext or plaintext, depending on the instantiation) and store it.
555 pshufb xmm0, xmm5
556 #if CPUFAM_X86
557 mov DST, [esp + 12]
558 #endif
559 movdqu [DST], xmm0
560
561 // And we're done.
562 ret
563
564 #undef K
565 #undef SRC
566 #undef DST
567 #undef NR
568
569 ENDFUNC
570 .endm
571
572 encdec eblk, aesenc, w // rijndael_eblk_x86ish_aesni: AESENC rounds over the encryption keys
573 encdec dblk, aesdec, wi // rijndael_dblk_x86ish_aesni: AESDEC rounds over the decryption keys
574
575 ///--------------------------------------------------------------------------
576 /// Random utilities.
577
578 .align 16
579 // Abort the process because of a programming error. Indirecting
580 // through this point serves several purposes: (a) by CALLing, rather
581 // than branching to, `abort', we can save the return address, which
582 // might at least provide a hint as to what went wrong; (b) we don't
583 // have conditional CALLs (and they'd be big anyway); and (c) we can
584 // write a HLT here as a backstop against `abort' being mad.
585 bogus: callext F(abort) // reached from the block functions when the round count is outside 10..14
586 0: hlt // only reached if `abort' returns
587 jmp 0b // ... and keep halting if execution resumes
588
589 gotaux ecx // presumably emits the support code that `ldgot ecx' needs -- defined in asm-common.h, confirm there
590
591 ///--------------------------------------------------------------------------
592 /// Data tables.
593
594 .align 16
595 endswap_tab: // PSHUFB mask reversing the bytes within each 32-bit word; loaded with MOVDQA, hence the 16-byte alignment
596 .byte 3, 2, 1, 0
597 .byte 7, 6, 5, 4
598 .byte 11, 10, 9, 8
599 .byte 15, 14, 13, 12
600
601 ///----- That's all, folks --------------------------------------------------