base/asm-common.h, symm/*.S: New macros for register name decoration.
[catacomb] / symm / rijndael-x86ish-aesni.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// AESNI-based implementation of Rijndael
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .globl F(abort)
34 .globl F(rijndael_rcon)
35
36 ///--------------------------------------------------------------------------
37 /// Main code.
38
// Let the assembler accept the AES instructions used below, and assemble
// what follows into the code section.
39 .arch .aes
40 .text
41
42 /// The AESNI instructions implement a little-endian version of AES, but
43 /// Catacomb's internal interface presents as big-endian so as to work better
44 /// with things like GCM. We therefore maintain the round keys in
45 /// little-endian form, and have to end-swap blocks in and out.
46 ///
47 /// For added amusement, the AESNI instructions don't implement the
48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
49 /// we're preparing for one of those.
50
51 // Useful constants.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // bytes in a key-schedule buffer: maxrounds + 1 round keys of maxblksz bytes
55
56 // Context structure: byte offsets of the fields used by this file.
57 .equ nr, 0 // number of rounds (32-bit word)
58 .equ w, nr + 4 // encryption key words (kbufsz bytes)
59 .equ wi, w + kbufsz // decryption key words (follow the encryption keys)
60
61 ///--------------------------------------------------------------------------
62 /// Key setup.
63
64 FUNC(rijndael_setup_x86ish_aesni)
65
// Build the encryption and decryption key schedules in a context.
//
// Arguments, in C order: the context pointer; the block size, in 32-bit
// words; a pointer to the key material; and the key size, in 32-bit
// words. The number of rounds is read from the context's `nr' slot, so
// the caller must have stored it there already. On exit, the `w' table
// holds the encryption round keys and the `wi' table holds the decryption
// round keys; if the block size isn't exactly four words then both tables
// are end-swapped, word by word.
//
// The register assignments differ for each supported ABI, so they're
// hidden behind the macros defined below, all of which are undefined again
// at the end of the function.

66 #define SI WHOLE(si)
67 #define DI WHOLE(di)
68
69 #if CPUFAM_X86
70 // Arguments are on the stack. We'll need to stack the caller's
71 // register variables, but we'll manage.
72
73 # define CTX ebp // context pointer
74 # define BLKSZ [esp + 24] // block size (stack slot, valid after the four pushes below)
75
76 # define KSZ ebx // key size
77 # define NKW edx // total number of key words
78 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating
79 # define RCON ecx // round constants table
80 # define LIM edx // limit pointer
81 # define CYIX edi // index in shift-register cycle
82
83 # define NR ecx // number of rounds
84 # define LRK eax // distance to last key
85 # define BLKOFF edx // block size in bytes
86
87 // Stack the caller's registers.
88 push ebp
89 push ebx
90 push esi
91 push edi
92
93 // Set up our own variables.
94 mov CTX, [esp + 20] // context base pointer
95 mov SI, [esp + 28] // key material
96 mov KSZ, [esp + 32] // key size, in words
97 #endif
98
99 #if CPUFAM_AMD64 && ABI_SYSV
100 // Arguments are in registers. We have plenty, but, to be honest,
101 // the initial register allocation is a bit annoying.
102
103 # define CTX r8 // context pointer
104 # define BLKSZ r9d // block size
105
106 # define KSZ edx // key size
107 # define NKW r10d // total number of key words
108 # define RCON rdi // round constants table
109 # define LIM rcx // limit pointer
110 # define CYIX r11d // index in shift-register cycle
111
112 # define NR ecx // number of rounds
113 # define LRK eax // distance to last key
114 # define BLKOFF r9d // block size in bytes
115
116 // Move arguments to more useful places.
117 mov CTX, rdi // context base pointer
118 mov BLKSZ, esi // block size in words
119 mov SI, rdx // key material
120 mov KSZ, ecx // key size, in words
121 #endif
122
123 #if CPUFAM_AMD64 && ABI_WIN
124 // Arguments are in different registers, and they're a little tight.
125
126 # define CTX r8 // context pointer
127 # define BLKSZ edx // block size
128
129 # define KSZ r9d // key size
130 # define NKW r10d // total number of key words
131 # define RCON rdi // round constants table
132 # define LIM rcx // limit pointer
133 # define CYIX r11d // index in shift-register cycle
134
135 # define NR ecx // number of rounds
136 # define LRK eax // distance to last key
137 # define BLKOFF edx // block size in bytes
138
139 // We'll need the index registers, which belong to the caller in this
140 // ABI.
141 push rsi
142 .seh_pushreg rsi
143 push rdi
144 .seh_pushreg rdi
145 .seh_endprologue
146
147 // Move arguments to more useful places.
148 mov rsi, r8 // key material
149 mov CTX, rcx // context base pointer
150 #endif
151
152 // The initial round key material is taken directly from the input
153 // key, so copy it over.
154 #if CPUFAM_AMD64 && ABI_SYSV
155 // We've been lucky. We already have a copy of the context pointer
156 // in rdi, and the key size in ecx.
157 add rdi, w
158 #else
159 lea DI, [CTX + w]
160 mov ecx, KSZ
161 #endif
162 rep movsd
163
164 // Find out other useful things.
165 mov NKW, [CTX + nr] // number of rounds
166 add NKW, 1
167 imul NKW, BLKSZ // total key size in words
168 #if !NKW_NEEDS_REFRESH
169 // If we can't keep NKW for later, then we use the same register for
170 // it and LIM, so this move is unnecessary.
171 mov DWORD(LIM), NKW
172 #endif
173 sub DWORD(LIM), KSZ // offset by the key size
174
175 // Find the round constants.
176 ldgot WHOLE(c)
177 leaext RCON, F(rijndael_rcon), WHOLE(c)
178
179 // Prepare for the main loop.
180 lea SI, [CTX + w]
181 mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
182 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
183 xor CYIX, CYIX // start of new cycle
184
185 // Main key expansion loop. The first word of each key-length chunk
186 // needs special treatment.
187 //
188 // This is rather tedious because the Intel `AESKEYGENASSIST'
189 // instruction is very strangely shaped. Firstly, it wants to
190 // operate on vast SSE registers, even though we're data-blocked from
191 // doing more than one operation at a time unless we're doing two key
192 // schedules simultaneously -- and even then we can't do more than
193 // two, because the instruction ignores two of its input words
194 // entirely, and produces two different outputs for each of the other
195 // two. And secondly it insists on taking the magic round constant
196 // as an immediate, so it's kind of annoying if you're not
197 // open-coding the whole thing. It's much easier to leave that as
198 // zero and XOR in the round constant by hand.
//
// Throughout the loop, eax holds the most recently generated key word,
// SI points KSZ words behind the word about to be written, and CYIX
// counts words within the current key-length chunk.
199 0: cmp CYIX, 0 // first word of the cycle?
200 je 1f
201 cmp CYIX, 4 // fourth word of the cycle?
202 jne 2f
203 cmp KSZ, 7 // and a large key?
204 jb 2f
205
206 // Fourth word of the cycle, and seven or eight words of key. Do a
207 // byte substitution.
208 movd xmm0, eax
209 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
210 aeskeygenassist xmm1, xmm0, 0
211 movd eax, xmm1
212 jmp 2f
213
214 // First word of the cycle. This is the complicated piece.
215 1: movd xmm0, eax
216 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
217 aeskeygenassist xmm1, xmm0, 0
218 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
219 movd eax, xmm1
220 xor al, [RCON]
221 inc RCON
222
223 // Common tail. Mix in the corresponding word from the previous
224 // cycle and prepare for the next loop.
225 2: xor eax, [SI]
226 mov [SI + 4*WHOLE(KSZ)], eax
227 add SI, 4
228 inc CYIX
229 cmp SI, LIM
230 jae 9f
231 cmp CYIX, KSZ
232 jb 0b
233 xor CYIX, CYIX
234 jmp 0b
235
236 // Next job is to construct the decryption keys. The keys for the
237 // first and last rounds don't need to be mangled, but the remaining
238 // ones do -- and they all need to be reordered too.
239 //
240 // The plan of action, then, is to copy the final encryption round's
241 // keys into place first, then to do each of the intermediate rounds
242 // in reverse order, and finally do the first round.
243 //
244 // Do all of the heavy lifting with SSE registers. The order we're
245 // doing this in means that it's OK if we read or write too much, and
246 // there's easily enough buffer space for the over-enthusiastic reads
247 // and writes because the context has space for 32-byte blocks, which
248 // is our maximum and an exact fit for two SSE registers.
249 9: mov NR, [CTX + nr] // number of rounds
250 #if NKW_NEEDS_REFRESH
251 mov BLKOFF, BLKSZ
252 mov LRK, NR
253 imul LRK, BLKOFF
254 #else
255 // If we retain NKW, then BLKSZ and BLKOFF are the same register
256 // because we won't need the former again.
257 mov LRK, NKW
258 sub LRK, BLKSZ
259 #endif
260 lea DI, [CTX + wi]
261 lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
262 shl BLKOFF, 2 // block size (in bytes now)
263
264 // Copy the last encryption round's keys.
265 movdqu xmm0, [SI]
266 movdqu [DI], xmm0
267 cmp BLKOFF, 16
268 jbe 0f
269 movdqu xmm0, [SI + 16]
270 movdqu [DI + 16], xmm0
271
272 // Update the loop variables and stop if we've finished.
273 0: add DI, WHOLE(BLKOFF)
274 sub SI, WHOLE(BLKOFF)
275 sub NR, 1
276 jbe 9f
277
278 // Do another middle round's keys...
279 movdqu xmm0, [SI]
280 aesimc xmm0, xmm0
281 movdqu [DI], xmm0
282 cmp BLKOFF, 16
283 jbe 0b
284 movdqu xmm0, [SI + 16]
285 aesimc xmm0, xmm0
286 movdqu [DI + 16], xmm0
287 jmp 0b
288
289 // Finally do the first encryption round.
290 9: movdqu xmm0, [SI]
291 movdqu [DI], xmm0
292 cmp BLKOFF, 16
293 jbe 1f
294 movdqu xmm0, [SI + 16]
295 movdqu [DI + 16], xmm0
296
297 // If the block size is not exactly four words then we must end-swap
298 // everything. We can use fancy SSE toys for this.
299 1: cmp BLKOFF, 16
300 je 9f
301
302 // Find the byte-reordering table.
303 ldgot ecx
304 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
305
306 #if NKW_NEEDS_REFRESH
307 // Calculate the number of subkey words again. (It's a good job
308 // we've got a fast multiplier.)
309 mov NKW, [CTX + nr]
310 add NKW, 1
311 imul NKW, BLKSZ
312 #endif
313
314 // End-swap the encryption keys.
315 lea SI, [CTX + w]
316 call endswap_block
317
318 // And the decryption keys.
319 lea SI, [CTX + wi]
320 call endswap_block
321
322 9: // All done.
323 #if CPUFAM_X86
324 pop edi
325 pop esi
326 pop ebx
327 pop ebp
328 #endif
329 #if CPUFAM_AMD64 && ABI_WIN
330 pop rdi
331 pop rsi
332 #endif
333 ret
334
335 .align 16
336 endswap_block:
337 // End-swap NKW words starting at SI. The end-swapping table is
338 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
// Clobbers ecx and xmm1, and leaves SI pointing past the last chunk
// processed.
339 mov ecx, NKW
340 0: movdqu xmm1, [SI]
341 pshufb xmm1, xmm5
342 movdqu [SI], xmm1
343 add SI, 16
344 sub ecx, 4
345 ja 0b // loop while words remain (unsigned, so a short final chunk ends it)
346 ret
347
// Tidy away every macro defined for this function, so that none of
// them can leak into the code which follows.
348 #undef CTX
349 #undef BLKSZ
350 #undef SI
351 #undef DI
352 #undef KSZ
#undef NKW
#undef NKW_NEEDS_REFRESH
353 #undef RCON
354 #undef LIM
#undef CYIX
355 #undef NR
356 #undef LRK
357 #undef BLKOFF
358
359 ENDFUNC
360
361 ///--------------------------------------------------------------------------
362 /// Encrypting and decrypting blocks.
363
364 .macro encdec op, aes, koff
// Define the function `rijndael_\op\()_x86ish_aesni', which processes a
// single 16-byte block.
//
// Macro parameters: `op' is spliced into the function name; `aes' is the
// instruction used for each ordinary round (and `\aes\()last' for the
// final one); and `koff' is the offset within the context of the round-key
// table to use.
//
// The function's arguments, in C order, are the context pointer (K), the
// source block (SRC) and the destination block (DST). The round count
// is read from the context, and must be between 10 and 14 inclusive:
// anything else branches to `bogus'.
365 FUNC(rijndael_\op\()_x86ish_aesni)
366
367 #if CPUFAM_X86
368 // Arguments come in on the stack, and need to be collected. We
369 // don't have a shortage of registers.
370
371 # define K eax
372 # define SRC edx
373 # define DST edx // shares edx with SRC, so it's loaded only after the block is read
374 # define NR ecx
375
376 mov K, [esp + 4]
377 mov SRC, [esp + 8]
378 #endif
379
380 #if CPUFAM_AMD64 && ABI_SYSV
381 // Arguments come in registers. All is good.
382
383 # define K rdi
384 # define SRC rsi
385 # define DST rdx
386 # define NR eax
387 #endif
388
389 #if CPUFAM_AMD64 && ABI_WIN
390 // Arguments come in different registers.
391
392 # define K rcx
393 # define SRC rdx
394 # define DST r8
395 # define NR eax
396 .seh_endprologue
397 #endif
398
399 // Find the magic endianness-swapping table.
// (On x86, ecx is also NR, but that isn't loaded until afterwards.)
400 ldgot ecx
401 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
402
403 // Initial setup.
404 movdqu xmm0, [SRC] // fetch the input block
405 pshufb xmm0, xmm5 // end-swap each of its words
406 mov NR, [K + nr] // number of rounds
407 add K, \koff // K now points at the first round key
408
409 // Initial whitening.
410 movdqu xmm1, [K]
411 add K, 16
412 pxor xmm0, xmm1
413 #if CPUFAM_X86
414 mov DST, [esp + 12]
415 #endif
416
417 // Dispatch to the correct code.
// Each of the entry points 14, 13, 12 and 11 does one round, advances K
// past the key it used, and falls through into the next; from 10 onwards
// the remaining keys are addressed at fixed offsets from K.
418 cmp NR, 10
419 je 10f
420 jb bogus
421 cmp NR, 14
422 je 14f
423 ja bogus
424 cmp NR, 12
425 je 12f
426 jb 11f
427 jmp 13f
428
429 .align 2
430
431 // 14 rounds...
432 14: movdqu xmm1, [K]
433 add K, 16
434 \aes xmm0, xmm1
435
436 // 13 rounds...
437 13: movdqu xmm1, [K]
438 add K, 16
439 \aes xmm0, xmm1
440
441 // 12 rounds...
442 12: movdqu xmm1, [K]
443 add K, 16
444 \aes xmm0, xmm1
445
446 // 11 rounds...
447 11: movdqu xmm1, [K]
448 add K, 16
449 \aes xmm0, xmm1
450
451 // 10 rounds...
452 10: movdqu xmm1, [K]
453 \aes xmm0, xmm1
454
455 // 9 rounds...
456 movdqu xmm1, [K + 16]
457 \aes xmm0, xmm1
458
459 // 8 rounds...
460 movdqu xmm1, [K + 32]
461 \aes xmm0, xmm1
462
463 // 7 rounds...
464 movdqu xmm1, [K + 48]
465 \aes xmm0, xmm1
466
467 // 6 rounds...
468 movdqu xmm1, [K + 64]
469 \aes xmm0, xmm1
470
471 // 5 rounds...
472 movdqu xmm1, [K + 80]
473 \aes xmm0, xmm1
474
475 // 4 rounds...
476 movdqu xmm1, [K + 96]
477 \aes xmm0, xmm1
478
479 // 3 rounds...
480 movdqu xmm1, [K + 112]
481 \aes xmm0, xmm1
482
483 // 2 rounds...
484 movdqu xmm1, [K + 128]
485 \aes xmm0, xmm1
486
487 // Final round...
488 movdqu xmm1, [K + 144]
489 \aes\()last xmm0, xmm1
490
491 // Unpermute the output block and store it.
492 pshufb xmm0, xmm5
493 movdqu [DST], xmm0
494
495 // And we're done.
496 ret
497
498 #undef K
499 #undef SRC
500 #undef DST
501 #undef NR
502
503 ENDFUNC
504 .endm
505
// Instantiate the two block functions: `rijndael_eblk_x86ish_aesni' uses
// AESENC with the round keys at offset `w' in the context, and
// `rijndael_dblk_x86ish_aesni' uses AESDEC with the round keys at `wi'.
506 encdec eblk, aesenc, w
507 encdec dblk, aesdec, wi
508
509 ///--------------------------------------------------------------------------
510 /// Random utilities.
511
512 .align 16
513 // Abort the process because of a programming error. Indirecting
514 // through this point serves several purposes: (a) by CALLing, rather
515 // than branching to, `abort', we can save the return address, which
516 // might at least provide a hint as to what went wrong; (b) we don't
517 // have conditional CALLs (and they'd be big anyway); and (c) we can
518 // write a HLT here as a backstop against `abort' being mad.
// The block functions branch here if the context's round count is outside
// the range they support.
519 bogus: callext F(abort)
520 0: hlt // only reached if `abort' returns...
521 jmp 0b // ... in which case keep trying to stop
522
523 ///--------------------------------------------------------------------------
524 /// Data tables.
525
526 RODATA
527
// Byte-shuffling mask for PSHUFB, which reverses the order of the bytes
// within each of the four 32-bit words of a 128-bit register. It's
// fetched with MOVDQA, hence the 16-byte alignment.
528 .align 16
529 endswap_tab:
530 .byte 3, 2, 1, 0
531 .byte 7, 6, 5, 4
532 .byte 11, 10, 9, 8
533 .byte 15, 14, 13, 12
534
535 ///----- That's all, folks --------------------------------------------------