symm/rijndael-x86ish-aesni.S: Fix conflict in 32-bit register allocation.
[catacomb] / symm / rijndael-x86ish-aesni.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// AESNI-based implementation of Rijndael
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .globl F(abort) // called on fatal internal error (see `bogus' below)
34 .globl F(rijndael_rcon) // round-constant table, defined elsewhere
35
36 ///--------------------------------------------------------------------------
37 /// Main code.
38
39 .arch .aes // permit the AES-NI instructions
40 .text
41
42 /// The AESNI instructions implement a little-endian version of AES, but
43 /// Catacomb's internal interface presents as big-endian so as to work better
44 /// with things like GCM. We therefore maintain the round keys in
45 /// little-endian form, and have to end-swap blocks in and out.
46 ///
47 /// For added amusement, the AESNI instructions don't implement the
48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
49 /// we're preparing for one of those.
50
51 // Useful constants.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
55
56 // Context structure: byte offsets of the fields within the context.
57 .equ nr, 0 // number of rounds
58 .equ w, nr + 4 // encryption key words
59 .equ wi, w + kbufsz // decryption key words
60
61 ///--------------------------------------------------------------------------
62 /// Key setup.
63
64 FUNC(rijndael_setup_x86ish_aesni)
65
66 #if CPUFAM_X86
67 // Arguments are on the stack. We'll need to stack the caller's
68 // register variables, but we'll manage.
69
70 # define CTX ebp // context pointer
71 # define BLKSZ [esp + 24] // block size
72
73 # define SI esi // source pointer
74 # define DI edi // destination pointer
75
76 # define KSZ ebx // key size
77 # define KSZo ebx // ... as address offset
78 # define NKW edx // total number of key words
79 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating
80 # define RCON ecx // round constants table
81 # define LIM edx // limit pointer
82 # define LIMn edx // ... as integer offset from base
83 # define CYIX edi // index in shift-register cycle
84
85 # define NR ecx // number of rounds
86 # define LRK eax // distance to last key
87 # define LRKo eax // ... as address offset
88 # define BLKOFF edx // block size in bytes
89 # define BLKOFFo edx // ... as address offset
90
91 // Stack the caller's registers.
92 push ebp
93 push ebx
94 push esi
95 push edi
96
97 // Set up our own variables.
98 mov CTX, [esp + 20] // context base pointer
99 mov SI, [esp + 28] // key material
100 mov KSZ, [esp + 32] // key size, in words
101 #endif
102
103 #if CPUFAM_AMD64 && ABI_SYSV
104 // Arguments are in registers. We have plenty, but, to be honest,
105 // the initial register allocation is a bit annoying.
106
107 # define CTX r8 // context pointer
108 # define BLKSZ r9d // block size
109
110 # define SI rsi // source pointer
111 # define DI rdi // destination pointer
112
113 # define KSZ edx // key size
114 # define KSZo rdx // ... as address offset
115 # define NKW r10d // total number of key words
116 # define RCON rdi // round constants table
117 # define LIMn ecx // limit, as integer offset from base
118 # define LIM rcx // ... as pointer
119 # define CYIX r11d // index in shift-register cycle
120
121 # define NR ecx // number of rounds
122 # define LRK eax // distance to last key
123 # define LRKo rax // ... as address offset
124 # define BLKOFF r9d // block size in bytes
125 # define BLKOFFo r9 // ... as address offset
126
127 // Move arguments to more useful places.
128 mov CTX, rdi // context base pointer
129 mov BLKSZ, esi // block size in words
130 mov SI, rdx // key material
131 mov KSZ, ecx // key size, in words
132 #endif
133
134 #if CPUFAM_AMD64 && ABI_WIN
135 // Arguments are in different registers, and they're a little tight.
136
137 # define CTX r8 // context pointer
138 # define BLKSZ edx // block size
139
140 # define SI rsi // source pointer
141 # define DI rdi // destination pointer
142
143 # define KSZ r9d // key size
144 # define KSZo r9 // ... as address offset
145 # define NKW r10d // total number of key words
146 # define RCON rdi // round constants table
147 # define LIMn ecx // limit, as integer offset from base
148 # define LIM rcx // ... as pointer
149 # define CYIX r11d // index in shift-register cycle
150
151 # define NR ecx // number of rounds
152 # define LRK eax // distance to last key
153 # define LRKo rax // ... as address offset
154 # define BLKOFF edx // block size in bytes
155 # define BLKOFFo rdx // ... as address offset
156
157 // We'll need the index registers, which belong to the caller in this
158 // ABI.
159 push rsi
160 .seh_pushreg rsi
161 push rdi
162 .seh_pushreg rdi
163 .seh_endprologue
164
165 // Move arguments to more useful places.
166 mov SI, r8 // key material
167 mov CTX, rcx // context base pointer
168 #endif
169
170 // The initial round key material is taken directly from the input
171 // key, so copy it over.
172 #if CPUFAM_AMD64 && ABI_SYSV
173 // We've been lucky. We already have a copy of the context pointer
174 // in rdi, and the key size in ecx.
175 add DI, w
176 #else
177 lea DI, [CTX + w]
178 mov ecx, KSZ
179 #endif
180 rep movsd
181
182 // Find out other useful things.
183 mov NKW, [CTX + nr] // number of rounds
184 add NKW, 1
185 imul NKW, BLKSZ // total key size in words
186 #if !NKW_NEEDS_REFRESH
187 // If we can't keep NKW for later, then we use the same register for
188 // it and LIM, so this move is unnecessary.
189 mov LIMn, NKW
190 #endif
191 sub LIMn, KSZ // offset by the key size
192
193 // Find the round constants.
194 ldgot ecx
195 leaext RCON, F(rijndael_rcon), ecx
196
197 // Prepare for the main loop.
198 lea SI, [CTX + w]
199 mov eax, [SI + 4*KSZo - 4] // most recent key word
200 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
201 xor CYIX, CYIX // start of new cycle
202
203 // Main key expansion loop. The first word of each key-length chunk
204 // needs special treatment.
205 //
206 // This is rather tedious because the Intel `AESKEYGENASSIST'
207 // instruction is very strangely shaped. Firstly, it wants to
208 // operate on vast SSE registers, even though we're data-blocked from
209 // doing more than one operation at a time unless we're doing two key
210 // schedules simultaneously -- and even then we can't do more than
211 // two, because the instruction ignores two of its input words
212 // entirely, and produces two different outputs for each of the other
213 // two. And secondly it insists on taking the magic round constant
214 // as an immediate, so it's kind of annoying if you're not
215 // open-coding the whole thing. It's much easier to leave that as
216 // zero and XOR in the round constant by hand.
217 0: cmp CYIX, 0 // first word of the cycle?
218 je 1f
219 cmp CYIX, 4 // fourth word of the cycle?
220 jne 2f
221 cmp KSZ, 7 // and a large key?
222 jb 2f
223
224 // Fourth word of the cycle, and seven or eight words of key. Do a
225 // byte substitution.
226 movd xmm0, eax
227 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
228 aeskeygenassist xmm1, xmm0, 0
229 movd eax, xmm1
230 jmp 2f
231
232 // First word of the cycle. This is the complicated piece.
233 1: movd xmm0, eax
234 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
235 aeskeygenassist xmm1, xmm0, 0
236 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
237 movd eax, xmm1
238 xor al, [RCON]
239 inc RCON
240
241 // Common tail. Mix in the corresponding word from the previous
242 // cycle and prepare for the next loop.
243 2: xor eax, [SI]
244 mov [SI + 4*KSZo], eax
245 add SI, 4
246 inc CYIX
247 cmp SI, LIM
248 jae 9f
249 cmp CYIX, KSZ
250 jb 0b
251 xor CYIX, CYIX
252 jmp 0b
253
254 // Next job is to construct the decryption keys. The keys for the
255 // first and last rounds don't need to be mangled, but the remaining
256 // ones do -- and they all need to be reordered too.
257 //
258 // The plan of action, then, is to copy the final encryption round's
259 // keys into place first, then to do each of the intermediate rounds
260 // in reverse order, and finally do the first round.
261 //
262 // Do all of the heavy lifting with SSE registers. The order we're
263 // doing this in means that it's OK if we read or write too much, and
264 // there's easily enough buffer space for the over-enthusiastic reads
265 // and writes because the context has space for 32-byte blocks, which
266 // is our maximum and an exact fit for two SSE registers.
267 9: mov NR, [CTX + nr] // number of rounds
268 #if NKW_NEEDS_REFRESH
269 mov BLKOFF, BLKSZ
270 mov LRK, NR
271 imul LRK, BLKOFF
272 #else
273 // If we retain NKW, then BLKSZ and BLKOFF are the same register
274 // because we won't need the former again.
275 mov LRK, NKW
276 sub LRK, BLKSZ
277 #endif
278 lea DI, [CTX + wi]
279 lea SI, [CTX + w + 4*LRKo] // last round's keys
280 shl BLKOFF, 2 // block size (in bytes now)
281
282 // Copy the last encryption round's keys.
283 movdqu xmm0, [SI]
284 movdqu [DI], xmm0
285 cmp BLKOFF, 16
286 jbe 0f
287 movdqu xmm0, [SI + 16]
288 movdqu [DI + 16], xmm0
289
290 // Update the loop variables and stop if we've finished.
291 0: add DI, BLKOFFo
292 sub SI, BLKOFFo
293 sub NR, 1
294 jbe 9f
295
296 // Do another middle round's keys...
297 movdqu xmm0, [SI]
298 aesimc xmm0, xmm0
299 movdqu [DI], xmm0
300 cmp BLKOFF, 16
301 jbe 0b
302 movdqu xmm0, [SI + 16]
303 aesimc xmm0, xmm0
304 movdqu [DI + 16], xmm0
305 jmp 0b
306
307 // Finally do the first encryption round.
308 9: movdqu xmm0, [SI]
309 movdqu [DI], xmm0
310 cmp BLKOFF, 16
311 jbe 1f
312 movdqu xmm0, [SI + 16]
313 movdqu [DI + 16], xmm0
314
315 // If the block size is not exactly four words then we must end-swap
316 // everything. We can use fancy SSE toys for this.
317 1: cmp BLKOFF, 16
318 je 9f
319
320 // Find the byte-reordering table.
321 ldgot ecx
322 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
323
324 #if NKW_NEEDS_REFRESH
325 // Calculate the number of subkey words again. (It's a good job
326 // we've got a fast multiplier.)
327 mov NKW, [CTX + nr]
328 add NKW, 1
329 imul NKW, BLKSZ
330 #endif
331
332 // End-swap the encryption keys.
333 lea SI, [CTX + w]
334 call endswap_block
335
336 // And the decryption keys.
337 lea SI, [CTX + wi]
338 call endswap_block
339
340 9: // All done.
341 #if CPUFAM_X86
342 pop edi
343 pop esi
344 pop ebx
345 pop ebp
346 #endif
347 #if CPUFAM_AMD64 && ABI_WIN
348 pop rdi
349 pop rsi
350 #endif
351 ret
352
353 .align 16
354 endswap_block:
355 // End-swap NKW words starting at SI. The end-swapping table is
356 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
357 mov ecx, NKW
358 0: movdqu xmm1, [SI]
359 pshufb xmm1, xmm5
360 movdqu [SI], xmm1
361 add SI, 16
362 sub ecx, 4
363 ja 0b
364 ret
365
366 #undef CTX
367 #undef BLKSZ
368 #undef SI
369 #undef DI
370 #undef KSZ
371 #undef KSZo
372 #undef RCON
373 #undef LIMn
374 #undef LIM
375 #undef NR
376 #undef LRK
377 #undef LRKo
378 #undef BLKOFF
379 #undef BLKOFFo
380
381 ENDFUNC
382
383 ///--------------------------------------------------------------------------
384 /// Encrypting and decrypting blocks.
385
386 .macro encdec op, aes, koff // op: entry suffix; aes: round insn; koff: key offset
387 FUNC(rijndael_\op\()_x86ish_aesni)
388
389 #if CPUFAM_X86
390 // Arguments come in on the stack, and need to be collected. Note
391 // that SRC and DST share edx: DST is loaded only once SRC is dead.
392
393 # define K eax
394 # define SRC edx // shares edx with DST
395 # define DST edx // ... loaded just before the final store
396 # define NR ecx
397
398 mov K, [esp + 4]
399 mov SRC, [esp + 8]
400 #endif
401
402 #if CPUFAM_AMD64 && ABI_SYSV
403 // Arguments come in registers. All is good.
404
405 # define K rdi
406 # define SRC rsi
407 # define DST rdx
408 # define NR eax
409 #endif
410
411 #if CPUFAM_AMD64 && ABI_WIN
412 // Arguments come in different registers.
413
414 # define K rcx
415 # define SRC rdx
416 # define DST r8
417 # define NR eax
418 .seh_endprologue
419 #endif
420
421 // Find the magic endianness-swapping table.
422 ldgot ecx
423 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
424
425 // Initial setup.
426 movdqu xmm0, [SRC]
427 pshufb xmm0, xmm5
428 mov NR, [K + nr]
429 add K, \koff
430
431 // Initial whitening.
432 movdqu xmm1, [K]
433 add K, 16
434 pxor xmm0, xmm1
435
436 // Dispatch to the correct code.
437 cmp NR, 10
438 je 10f
439 jb bogus
440 cmp NR, 14
441 je 14f
442 ja bogus
443 cmp NR, 12
444 je 12f
445 jb 11f
446 jmp 13f
447
448 .align 2
449
450 // 14 rounds...
451 14: movdqu xmm1, [K]
452 add K, 16
453 \aes xmm0, xmm1
454
455 // 13 rounds...
456 13: movdqu xmm1, [K]
457 add K, 16
458 \aes xmm0, xmm1
459
460 // 12 rounds...
461 12: movdqu xmm1, [K]
462 add K, 16
463 \aes xmm0, xmm1
464
465 // 11 rounds...
466 11: movdqu xmm1, [K]
467 add K, 16
468 \aes xmm0, xmm1
469
470 // 10 rounds...
471 10: movdqu xmm1, [K]
472 \aes xmm0, xmm1
473
474 // 9 rounds...
475 movdqu xmm1, [K + 16]
476 \aes xmm0, xmm1
477
478 // 8 rounds...
479 movdqu xmm1, [K + 32]
480 \aes xmm0, xmm1
481
482 // 7 rounds...
483 movdqu xmm1, [K + 48]
484 \aes xmm0, xmm1
485
486 // 6 rounds...
487 movdqu xmm1, [K + 64]
488 \aes xmm0, xmm1
489
490 // 5 rounds...
491 movdqu xmm1, [K + 80]
492 \aes xmm0, xmm1
493
494 // 4 rounds...
495 movdqu xmm1, [K + 96]
496 \aes xmm0, xmm1
497
498 // 3 rounds...
499 movdqu xmm1, [K + 112]
500 \aes xmm0, xmm1
501
502 // 2 rounds...
503 movdqu xmm1, [K + 128]
504 \aes xmm0, xmm1
505
506 // Final round...
507 movdqu xmm1, [K + 144]
508 \aes\()last xmm0, xmm1
509
510 // Unpermute the output block and store it.
511 pshufb xmm0, xmm5
512 #if CPUFAM_X86
513 mov DST, [esp + 12] // recover DST now that SRC is no longer needed
514 #endif
515 movdqu [DST], xmm0
516
517 // And we're done.
518 ret
519
520 #undef K
521 #undef SRC
522 #undef DST
523 #undef NR
524
525 ENDFUNC
526 .endm
527
528 encdec eblk, aesenc, w // encryption: forward round keys at `w'
529 encdec dblk, aesdec, wi // decryption: inverse round keys at `wi'
530
531 ///--------------------------------------------------------------------------
532 /// Random utilities.
533
534 .align 16
535 // Abort the process because of a programming error. Indirecting
536 // through this point serves several purposes: (a) by CALLing, rather
537 // than branching to, `abort', we can save the return address, which
538 // might at least provide a hint as to what went wrong; (b) we don't
539 // have conditional CALLs (and they'd be big anyway); and (c) we can
540 // write a HLT here as a backstop against `abort' being mad.
541 bogus: callext F(abort) // should not return
542 0: hlt // ... but trap hard if it somehow does
543 jmp 0b // ... and keep trapping
544
545 ///--------------------------------------------------------------------------
546 /// Data tables.
547
548 .align 16
549 endswap_tab: // PSHUFB mask: reverse the bytes within each 32-bit word
550 .byte 3, 2, 1, 0 // word 0
551 .byte 7, 6, 5, 4 // word 1
552 .byte 11, 10, 9, 8 // word 2
553 .byte 15, 14, 13, 12 // word 3
554
555 ///----- That's all, folks --------------------------------------------------