*.S: Use `.text' consistently to name the text section.
[catacomb] / symm / rijndael-x86ish-aesni.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// AESNI-based implementation of Rijndael
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 ///--------------------------------------------------------------------------
34 /// External definitions.
35
36 .globl F(abort) // called from `bogus' when the round count is unusable
37 .globl F(rijndael_rcon) // round-constant bytes, read during key setup
38
39 ///--------------------------------------------------------------------------
40 /// Local utilities.
41
42 // Magic constants for shuffling: immediate operands for `pshufd'.
43 #define ROTL 0x93 // each word moves up one lane; lane 3 wraps to lane 0
44 #define ROT2 0x4e // swap the low and high pairs of words
45 #define ROTR 0x39 // each word moves down one lane; lane 0 wraps to lane 3
46
47 ///--------------------------------------------------------------------------
48 /// Main code.
49
50 .arch .aes // let the assembler accept the AES instructions used below
51 .text
52
53 /// The AESNI instructions implement a little-endian version of AES, but
54 /// Catacomb's internal interface presents as big-endian so as to work better
55 /// with things like GCM. We therefore maintain the round keys in
56 /// little-endian form, and have to end-swap blocks in and out.
57 ///
58 /// For added amusement, the AESNI instructions don't implement the
59 /// larger-block versions of Rijndael, so we have to end-swap the keys if
60 /// we're preparing for one of those.
61
62 // Useful constants.
63 .equ maxrounds, 16 // maximum number of rounds
64 .equ maxblksz, 32 // maximum block size, in bytes
65 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes
66
67 // Context structure (byte offsets from the context base pointer).
68 .equ nr, 0 // number of rounds (a 32-bit word)
69 .equ w, nr + 4 // encryption key words
70 .equ wi, w + kbufsz // decryption key words
71
72 ///--------------------------------------------------------------------------
73 /// Key setup.
74
75 FUNC(rijndael_setup_x86ish_aesni)
76 // Expand a key into the context's encryption (w) and decryption (wi) schedules.
77 #if CPUFAM_X86
78 // Arguments are on the stack. We'll need to stack the caller's
79 // register variables, but we'll manage.
80
81 # define CTX ebp // context pointer
82 # define BLKSZ [esp + 24] // block size, in words (valid once the four pushes below are done)
83
84 # define SI esi // source pointer
85 # define DI edi // destination pointer
86
87 # define KSZ ebx // key size
88 # define KSZo ebx // ... as address offset
89 # define NKW edx // total number of key words
90 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating
91 # define RCON ecx // round constants table
92 # define LIM edx // limit pointer
93 # define LIMn edx // ... as integer offset from base
94
95 # define NR ecx // number of rounds
96 # define LRK eax // distance to last key
97 # define LRKo eax // ... as address offset
98 # define BLKOFF edx // block size in bytes
99 # define BLKOFFo edx // ... as address offset
100
101 // Stack the caller's registers.
102 push ebp
103 push ebx
104 push esi
105 push edi
106
107 // Set up our own variables. Four pushes and the return address put the first argument at esp + 20.
108 mov CTX, [esp + 20] // context base pointer
109 mov SI, [esp + 28] // key material
110 mov KSZ, [esp + 32] // key size, in words
111 #endif
112
113 #if CPUFAM_AMD64 && ABI_SYSV
114 // Arguments are in registers. We have plenty, but, to be honest,
115 // the initial register allocation is a bit annoying.
116
117 # define CTX r8 // context pointer
118 # define BLKSZ r9d // block size
119
120 # define SI rsi // source pointer
121 # define DI rdi // destination pointer
122
123 # define KSZ edx // key size
124 # define KSZo rdx // ... as address offset
125 # define NKW r10d // total number of key words
126 # define RCON rdi // round constants table
127 # define LIMn ecx // limit, as integer offset from base
128 # define LIM rcx // limit pointer
129
130 # define NR ecx // number of rounds
131 # define LRK eax // distance to last key
132 # define LRKo rax // ... as address offset
133 # define BLKOFF r9d // block size in bytes
134 # define BLKOFFo r9 // ... as address offset
135
136 // Move arguments to more useful places. The 32-bit moves also zero-extend.
137 mov CTX, rdi // context base pointer
138 mov BLKSZ, esi // block size in words
139 mov SI, rdx // key material
140 mov KSZ, ecx // key size, in words
141 #endif
142
143 #if CPUFAM_AMD64 && ABI_WIN
144 // Arguments are in different registers, and they're a little tight.
145
146 # define CTX r8 // context pointer
147 # define BLKSZ edx // block size
148
149 # define SI rsi // source pointer
150 # define DI rdi // destination pointer
151
152 # define KSZ r9d // key size
153 # define KSZo r9 // ... as address offset
154 # define NKW r10d // total number of key words
155 # define RCON rdi // round constants table
156 # define LIMn ecx // limit, as integer offset from base
157 # define LIM rcx // limit pointer
158
159 # define NR ecx // number of rounds
160 # define LRK eax // distance to last key
161 # define LRKo rax // ... as address offset
162 # define BLKOFF edx // block size in bytes
163 # define BLKOFFo rdx // ... as address offset
164
165 // We'll need the index registers, which belong to the caller in this ABI.
166 push rsi
167 push rdi
168
169 // Move arguments to more useful places.
170 mov SI, r8 // key material (must be read before CTX overwrites r8)
171 mov CTX, rcx // context base pointer
172 mov KSZ, r9d // key size, in words; clears the undefined top half of KSZo
173 #endif
174
175 // The initial round key material is taken directly from the input
176 // key, so copy it over.
177 #if CPUFAM_AMD64 && ABI_SYSV
178 // We've been lucky. We already have a copy of the context pointer
179 // in rdi, so just advance it to the key words.
180 add DI, w
181 #else
182 lea DI, [CTX + w]
183 #endif
184 mov ecx, KSZ // word count, zero-extended: `rep' counts in all of rcx on AMD64
185 rep movsd
186
187 // Find out other useful things. The round count is read from the context; nothing here stores it.
188 mov NKW, [CTX + nr] // number of rounds
189 add NKW, 1
190 imul NKW, BLKSZ // total key size in words
191 #if !NKW_NEEDS_REFRESH
192 // If we can't keep NKW for later, then we use the same register for
193 // it and LIM, so this move is unnecessary.
194 mov LIMn, NKW
195 #endif
196 sub LIMn, KSZ // offset by the key size
197
198 // Find the round constants.
199 ldgot ecx
200 leaext RCON, rijndael_rcon, ecx
201
202 // Prepare for the main loop.
203 lea SI, [CTX + w]
204 mov eax, [SI + 4*KSZo - 4] // most recent key word
205 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
206
207 // Main key expansion loop. The first word of each key-length chunk
208 // needs special treatment.
209 //
210 // This is rather tedious because the Intel `AESKEYGENASSIST'
211 // instruction is very strangely shaped. Firstly, it wants to
212 // operate on vast SSE registers, even though we're data-blocked from
213 // doing more than one operation at a time unless we're doing two key
214 // schedules simultaneously -- and even then we can't do more than
215 // two, because the instruction ignores two of its input words
216 // entirely, and produces two different outputs for each of the other
217 // two. And secondly it insists on taking the magic round constant
218 // as an immediate, so it's kind of annoying if you're not
219 // open-coding the whole thing. It's much easier to leave that as
220 // zero and XOR in the round constant by hand.
221 9: movd xmm0, eax
222 pshufd xmm0, xmm0, ROTR // put the word in lane 3
223 aeskeygenassist xmm1, xmm0, 0 // lane 3 := rotated, substituted word
224 pshufd xmm1, xmm1, ROTL // bring lane 3 back down to lane 0
225 movd eax, xmm1
226 xor eax, [SI] // combine with the word one key-length back
227 xor al, [RCON] // round constant goes into the low byte
228 inc RCON // one constant byte per key-length chunk
229 mov [SI + 4*KSZo], eax
230 add SI, 4
231 cmp SI, LIM
232 jae 8f
233
234 // The next three words are simple...
235 xor eax, [SI]
236 mov [SI + 4*KSZo], eax
237 add SI, 4
238 cmp SI, LIM
239 jae 8f
240
241 // (Word 2...)
242 xor eax, [SI]
243 mov [SI + 4*KSZo], eax
244 add SI, 4
245 cmp SI, LIM
246 jae 8f
247
248 // (Word 3...)
249 xor eax, [SI]
250 mov [SI + 4*KSZo], eax
251 add SI, 4
252 cmp SI, LIM
253 jae 8f
254
255 // Word 4. If the key is /more/ than 6 words long, then we must
256 // apply a substitution here.
257 cmp KSZ, 5
258 jb 9b // four-word key: this chunk is finished
259 cmp KSZ, 7
260 jb 0f // five or six words: no substitution
261 movd xmm0, eax
262 pshufd xmm0, xmm0, ROTL // put the word in lane 1
263 aeskeygenassist xmm1, xmm0, 0 // lane 0 := substituted word, not rotated
264 movd eax, xmm1
265 0: xor eax, [SI]
266 mov [SI + 4*KSZo], eax
267 add SI, 4
268 cmp SI, LIM
269 jae 8f
270
271 // (Word 5...)
272 cmp KSZ, 6
273 jb 9b
274 xor eax, [SI]
275 mov [SI + 4*KSZo], eax
276 add SI, 4
277 cmp SI, LIM
278 jae 8f
279
280 // (Word 6...)
281 cmp KSZ, 7
282 jb 9b
283 xor eax, [SI]
284 mov [SI + 4*KSZo], eax
285 add SI, 4
286 cmp SI, LIM
287 jae 8f
288
289 // (Word 7...)
290 cmp KSZ, 8
291 jb 9b
292 xor eax, [SI]
293 mov [SI + 4*KSZo], eax
294 add SI, 4
295 cmp SI, LIM
296 jae 8f
297
298 // Must be done by now.
299 jmp 9b
300
301 // Next job is to construct the decryption keys. The keys for the
302 // first and last rounds don't need to be mangled, but the remaining
303 // ones do -- and they all need to be reordered too.
304 //
305 // The plan of action, then, is to copy the final encryption round's
306 // keys into place first, then to do each of the intermediate rounds
307 // in reverse order, and finally do the first round.
308 //
309 // Do all of the heavy lifting with SSE registers. The order we're
310 // doing this in means that it's OK if we read or write too much, and
311 // there's easily enough buffer space for the over-enthusiastic reads
312 // and writes because the context has space for 32-byte blocks, which
313 // is our maximum and an exact fit for two SSE registers.
314 8: mov NR, [CTX + nr] // number of rounds
315 #if NKW_NEEDS_REFRESH
316 mov BLKOFF, BLKSZ
317 mov LRK, NR
318 imul LRK, BLKOFF // words before the last round's keys
319 #else
320 // If we retain NKW, then BLKSZ and BLKOFF are the same register
321 // because we won't need the former again.
322 mov LRK, NKW
323 sub LRK, BLKSZ // words before the last round's keys
324 #endif
325 lea DI, [CTX + wi]
326 lea SI, [CTX + w + 4*LRKo] // last round's keys
327 shl BLKOFF, 2 // block size (in bytes now)
328
329 // Copy the last encryption round's keys.
330 movdqu xmm0, [SI]
331 movdqu [DI], xmm0
332 cmp BLKOFF, 16
333 jbe 9f // one register's worth was the whole block
334 movdqu xmm0, [SI + 16]
335 movdqu [DI + 16], xmm0
336
337 // Update the loop variables and stop if we've finished.
338 9: add DI, BLKOFFo // decryption keys fill upwards...
339 sub SI, BLKOFFo // ... as encryption keys are read downwards
340 sub NR, 1
341 jbe 0f // only the first round's keys are left
342
343 // Do another middle round's keys...
344 movdqu xmm0, [SI]
345 aesimc xmm0, xmm0
346 movdqu [DI], xmm0
347 cmp BLKOFF, 16
348 jbe 9b
349 movdqu xmm0, [SI + 16]
350 aesimc xmm0, xmm0
351 movdqu [DI + 16], xmm0
352 jmp 9b
353
354 // Finally do the first encryption round.
355 0: movdqu xmm0, [SI]
356 movdqu [DI], xmm0
357 cmp BLKOFF, 16
358 jbe 0f
359 movdqu xmm0, [SI + 16]
360 movdqu [DI + 16], xmm0
361
362 // If the block size is not exactly four words then we must end-swap
363 // everything. We can use fancy SSE toys for this.
364 0: cmp BLKOFF, 16
365 je 0f
366
367 // Find the byte-reordering table.
368 ldgot ecx
369 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
370
371 #if NKW_NEEDS_REFRESH
372 // Calculate the number of subkey words again. (It's a good job
373 // we've got a fast multiplier.)
374 mov NKW, [CTX + nr]
375 add NKW, 1
376 imul NKW, BLKSZ
377 #endif
378
379 // End-swap the encryption keys.
380 mov ecx, NKW
381 lea SI, [CTX + w]
382 call endswap_block
383
384 // And the decryption keys.
385 mov ecx, NKW
386 lea SI, [CTX + wi]
387 call endswap_block
388
389 0: // All done.
390 #if CPUFAM_X86
391 pop edi
392 pop esi
393 pop ebx
394 pop ebp
395 #endif
396 #if CPUFAM_AMD64 && ABI_WIN
397 pop rdi
398 pop rsi
399 #endif
400 ret
401
402 .align 16
403 endswap_block:
404 // End-swap ECX words starting at SI. The end-swapping table is
405 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
406 movdqu xmm1, [SI]
407 pshufb xmm1, xmm5
408 movdqu [SI], xmm1
409 add SI, 16
410 sub ecx, 4 // four words handled per pass
411 ja endswap_block // loop while words remain
412 ret
413
414 #undef CTX
415 #undef BLKSZ
416 #undef SI
417 #undef DI
418 #undef KSZ
419 #undef KSZo
420 #undef RCON
421 #undef LIMn
422 #undef LIM
423 #undef NR
424 #undef LRK
425 #undef LRKo
426 #undef BLKOFF
427 #undef BLKOFFo
428
429 ENDFUNC
430
431 ///--------------------------------------------------------------------------
432 /// Encrypting and decrypting blocks.
433 /// encdec OP, AES, KOFF: define rijndael_OP_x86ish_aesni, applying AES (and AESlast) with the round keys at context offset KOFF.
434 .macro encdec op, aes, koff
435 FUNC(rijndael_\op\()_x86ish_aesni)
436
437 // Find the magic endianness-swapping table.
438 ldgot ecx
439 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
440
441 #if CPUFAM_X86
442 // Arguments come in on the stack, and need to be collected. We
443 // don't have a shortage of registers.
444
445 # define K ecx // context pointer, then round-key cursor
446 # define SRC edx // input block
447 # define DST edx // output block; shares SRC's register, loaded just before the store
448 # define NR eax // number of rounds
449
450 mov K, [esp + 4]
451 mov SRC, [esp + 8]
452 #endif
453
454 #if CPUFAM_AMD64 && ABI_SYSV
455 // Arguments come in registers. All is good.
456
457 # define K rdi // context pointer, then round-key cursor
458 # define SRC rsi // input block
459 # define DST rdx // output block
460 # define NR eax // number of rounds
461 #endif
462
463 #if CPUFAM_AMD64 && ABI_WIN
464 // Arguments come in different registers.
465
466 # define K rcx // context pointer, then round-key cursor
467 # define SRC rdx // input block
468 # define DST r8 // output block
469 # define NR eax // number of rounds
470 #endif
471
472 // Initial setup.
473 movdqu xmm0, [SRC] // fetch the input block
474 pshufb xmm0, xmm5 // end-swap each 32-bit word
475 mov NR, [K + nr] // round count from the context
476 add K, \koff
477
478 // Initial whitening.
479 movdqu xmm1, [K]
480 add K, 16
481 pxor xmm0, xmm1
482
483 // Dispatch to the correct code.
484 cmp NR, 10
485 je 10f
486 jb bogus // fewer than 10 rounds is not handled
487 cmp NR, 14
488 je 14f
489 ja bogus // more than 14 rounds is not handled
490 cmp NR, 12
491 je 12f
492 jb 11f // only 11 is left below 12
493 jmp 13f // only 13 is left above 12
494
495 .align 2
496
497 // 14 rounds...
498 14: movdqu xmm1, [K]
499 add K, 16
500 \aes xmm0, xmm1
501
502 // 13 rounds...
503 13: movdqu xmm1, [K]
504 add K, 16
505 \aes xmm0, xmm1
506
507 // 12 rounds...
508 12: movdqu xmm1, [K]
509 add K, 16
510 \aes xmm0, xmm1
511
512 // 11 rounds...
513 11: movdqu xmm1, [K]
514 add K, 16
515 \aes xmm0, xmm1
516
517 // 10 rounds... (from here on K stays put and the keys are at fixed offsets)
518 10: movdqu xmm1, [K]
519 \aes xmm0, xmm1
520
521 // 9 rounds...
522 movdqu xmm1, [K + 16]
523 \aes xmm0, xmm1
524
525 // 8 rounds...
526 movdqu xmm1, [K + 32]
527 \aes xmm0, xmm1
528
529 // 7 rounds...
530 movdqu xmm1, [K + 48]
531 \aes xmm0, xmm1
532
533 // 6 rounds...
534 movdqu xmm1, [K + 64]
535 \aes xmm0, xmm1
536
537 // 5 rounds...
538 movdqu xmm1, [K + 80]
539 \aes xmm0, xmm1
540
541 // 4 rounds...
542 movdqu xmm1, [K + 96]
543 \aes xmm0, xmm1
544
545 // 3 rounds...
546 movdqu xmm1, [K + 112]
547 \aes xmm0, xmm1
548
549 // 2 rounds...
550 movdqu xmm1, [K + 128]
551 \aes xmm0, xmm1
552
553 // Final round...
554 movdqu xmm1, [K + 144]
555 \aes\()last xmm0, xmm1
556
557 // Unpermute the output block and store it.
558 pshufb xmm0, xmm5
559 #if CPUFAM_X86
560 mov DST, [esp + 12] // third stack argument; SRC is no longer needed
561 #endif
562 movdqu [DST], xmm0
563
564 // And we're done.
565 ret
566
567 #undef K
568 #undef SRC
569 #undef DST
570 #undef NR
571
572 ENDFUNC
573 .endm
574
575 encdec eblk, aesenc, w // rijndael_eblk_x86ish_aesni, using the encryption keys
576 encdec dblk, aesdec, wi // rijndael_dblk_x86ish_aesni, using the decryption keys
577
578 ///--------------------------------------------------------------------------
579 /// Random utilities.
580
581 .align 16
582 // Abort the process because of a programming error. Indirecting
583 // through this point serves several purposes: (a) by CALLing, rather
584 // than branching to, `abort', we can save the return address, which
585 // might at least provide a hint as to what went wrong; (b) we don't
586 // have conditional CALLs (and they'd be big anyway); and (c) we can
587 // write a HLT here as a backstop against `abort' being mad.
588 bogus: callext F(abort) // reached from the round-count dispatch in `encdec'
589 0: hlt // backstop in case `abort' returns
590 jmp 0b // ... and halt again if execution ever resumes
591
592 gotaux ecx // NOTE(review): presumably the helper behind `ldgot ecx' (asm-common.h) -- confirm
593
594 ///--------------------------------------------------------------------------
595 /// Data tables.
596
597 .align 16 // the table is fetched with `movdqa', which needs 16-byte alignment
598 endswap_tab: // `pshufb' control: reverse the bytes within each 32-bit word
599 .byte 3, 2, 1, 0
600 .byte 7, 6, 5, 4
601 .byte 11, 10, 9, 8
602 .byte 15, 14, 13, 12
603
604 ///----- That's all, folks --------------------------------------------------