/// [repository-viewer residue, kept as a comment:]
/// commit log: math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
/// path: [catacomb] / symm / rijndael-x86ish-aesni.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// AESNI-based implementation of Rijndael
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .extern F(abort)
34 .extern F(rijndael_rcon)
35
36 ///--------------------------------------------------------------------------
37 /// Main code.
38
39 .arch .aes
40 .text
41
42 /// The AESNI instructions implement a little-endian version of AES, but
43 /// Catacomb's internal interface presents as big-endian so as to work better
44 /// with things like GCM. We therefore maintain the round keys in
45 /// little-endian form, and have to end-swap blocks in and out.
46 ///
47 /// For added amusement, the AESNI instructions don't implement the
48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
49 /// we're preparing for one of those.
50
51 // Useful constants.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
55
56 // Context structure.
57 .equ nr, 0 // number of rounds
58 .equ w, nr + 4 // encryption key words
59 .equ wi, w + kbufsz // decryption key words
60
61 ///--------------------------------------------------------------------------
62 /// Key setup.
63
FUNC(rijndael_setup_x86ish_aesni)
	// Prepare a Rijndael key schedule in the context at CTX: copy the
	// caller's key into the encryption subkeys `w', expand it, derive
	// the decryption subkeys `wi' (via AESIMC), and finally end-swap
	// both schedules if the block size is not exactly four words.
	//
	// Arguments (decoded per-ABI below): context pointer; block size,
	// in 32-bit words; pointer to the key material; key size, in
	// 32-bit words.  FUNC/endprologue/pushreg/ldgot/leaext are macros
	// from `asm-common.h'.

#define SI WHOLE(si)
#define DI WHOLE(di)

#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.

#  define CTX ebp			// context pointer
#  define BLKSZ [esp + 24]		// block size

#  define KSZ ebx			// key size
#  define NKW edx			// total number of key words
#  define NKW_NEEDS_REFRESH 1		// ... needs recalculating
#  define RCON ecx			// round constants table
#  define LIM edx			// limit pointer
#  define CYIX edi			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define BLKOFF edx			// block size in bytes

	// Stack the caller's registers.
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi

	// Set up our own variables.
	mov	CTX, [esp + 20]		// context base pointer
	mov	SI, [esp + 28]		// key material
	mov	KSZ, [esp + 32]		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments are in registers.  We have plenty, but, to be honest,
	// the initial register allocation is a bit annoying.

#  define CTX r8			// context pointer
#  define BLKSZ r9d			// block size

#  define KSZ edx			// key size
#  define NKW r10d			// total number of key words
#  define RCON rdi			// round constants table
#  define LIM rcx			// limit pointer
#  define CYIX r11d			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define BLKOFF r9d			// block size in bytes

	// Move arguments to more useful places.
	mov	CTX, rdi		// context base pointer
	mov	BLKSZ, esi		// block size in words
	mov	SI, rdx			// key material
	mov	KSZ, ecx		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments are in different registers, and they're a little tight.

#  define CTX r8			// context pointer
#  define BLKSZ edx			// block size

#  define KSZ r9d			// key size
#  define NKW r10d			// total number of key words
#  define RCON rdi			// round constants table
#  define LIM rcx			// limit pointer
#  define CYIX r11d			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define BLKOFF edx			// block size in bytes

	// We'll need the index registers, which belong to the caller in this
	// ABI.
	pushreg	rsi
	pushreg	rdi

	// Move arguments to more useful places.
	mov	rsi, r8			// key material
	mov	CTX, rcx		// context base pointer
#endif

	endprologue

	// The initial round key material is taken directly from the input
	// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
	// We've been lucky.  We already have a copy of the context pointer
	// in rdi, and the key size in ecx.
	add	rdi, w
#else
	lea	DI, [CTX + w]
	mov	ecx, KSZ
#endif
	rep	movsd			// copy KSZ words; relies on DF being
					// clear, which the ABIs guarantee at
					// function entry

	// Find out other useful things.
	mov	NKW, [CTX + nr]		// number of rounds
	add	NKW, 1
	imul	NKW, BLKSZ		// total key size in words
#if !NKW_NEEDS_REFRESH
	// If we can't keep NKW for later, then we use the same register for
	// it and LIM, so this move is unnecessary.
	mov	DWORD(LIM), NKW
#endif
	sub	DWORD(LIM), KSZ		// offset by the key size

	// Find the round constants.
	ldgot	WHOLE(c)
	leaext	RCON, F(rijndael_rcon), WHOLE(c)

	// Prepare for the main loop.
	lea	SI, [CTX + w]
	mov	eax, [SI + 4*WHOLE(KSZ) - 4]	// most recent key word
	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
	xor	CYIX, CYIX		// start of new cycle

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// Loop invariants: eax = previous key word; SI -> word KSZ places
	// before the one we're about to write; CYIX = position within the
	// current key-length cycle.
0:	cmp	CYIX, 0			// first word of the cycle?
	je	1f
	cmp	CYIX, 4			// fourth word of the cycle?
	jne	2f
	cmp	KSZ, 7			// and a large key?
	jb	2f

	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
	aeskeygenassist xmm1, xmm0, 0	// rcon = 0: we add it by hand below
	movd	eax, xmm1
	jmp	2f

	// First word of the cycle.  This is the complicated piece.
1:	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(0, 3, 2, 1)
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movd	eax, xmm1
	xor	al, [RCON]		// mix in the next round constant ...
	inc	RCON			// ... and advance along the table

	// Common tail.  Mix in the corresponding word from the previous
	// cycle and prepare for the next loop.
2:	xor	eax, [SI]
	mov	[SI + 4*WHOLE(KSZ)], eax
	add	SI, 4
	inc	CYIX
	cmp	SI, LIM
	jae	9f			// whole schedule filled: on to the
					// decryption keys
	cmp	CYIX, KSZ
	jb	0b
	xor	CYIX, CYIX
	jmp	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
9:	mov	NR, [CTX + nr]		// number of rounds
#if NKW_NEEDS_REFRESH
	mov	BLKOFF, BLKSZ
	mov	LRK, NR
	imul	LRK, BLKOFF
#else
	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	// because we won't need the former again.
	mov	LRK, NKW
	sub	LRK, BLKSZ
#endif
	lea	DI, [CTX + wi]
	lea	SI, [CTX + w + 4*WHOLE(LRK)]	// last round's keys
	shl	BLKOFF, 2		// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0f
	movdqu	xmm0, [SI + 16]		// second half of a wide block
	movdqu	[DI + 16], xmm0

	// Update the loop variables and stop if we've finished.
0:	add	DI, WHOLE(BLKOFF)
	sub	SI, WHOLE(BLKOFF)	// SI walks backwards through `w'
	sub	NR, 1
	jbe	9f

	// Do another middle round's keys...
	movdqu	xmm0, [SI]
	aesimc	xmm0, xmm0		// apply InvMixColumns for use in the
					// equivalent-inverse-cipher structure
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0b
	movdqu	xmm0, [SI + 16]
	aesimc	xmm0, xmm0
	movdqu	[DI + 16], xmm0
	jmp	0b

	// Finally do the first encryption round.
9:	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	1f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
1:	cmp	BLKOFF, 16
	je	9f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	NKW, [CTX + nr]
	add	NKW, 1
	imul	NKW, BLKSZ
#endif

	// End-swap the encryption keys.
	lea	SI, [CTX + w]
	call	endswap_block

	// And the decryption keys.
	lea	SI, [CTX + wi]
	call	endswap_block

9:	// All done.  Restore the caller's registers and return.
#if CPUFAM_X86
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	popreg	rdi
	popreg	rsi
#endif
	ret

ENDFUNC
335
INTFUNC(endswap_block)
	// End-swap NKW words starting at SI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	//
	// Internal subroutine, not a C function: NKW and SI here are the
	// register aliases #defined for the setup function above.
	// Clobbers ecx, xmm1, and flags; advances SI past the swapped
	// words (rounded up to a 16-byte chunk).
	endprologue

	mov	ecx, NKW		// ecx = words still to swap
0:	movdqu	xmm1, [SI]
	pshufb	xmm1, xmm5		// reverse the bytes within each word
	movdqu	[SI], xmm1
	add	SI, 16
	sub	ecx, 4			// four 32-bit words per chunk
	ja	0b

	ret

ENDFUNC
352
#undef CTX				// release the names #defined for key
#undef BLKSZ				// setup and endswap_block, so later
#undef SI				// code can't pick them up by accident
#undef DI
#undef KSZ
#undef NKW
#undef NKW_NEEDS_REFRESH
#undef RCON
#undef LIM
#undef CYIX
#undef NR
#undef LRK
#undef BLKOFF
363
364 ///--------------------------------------------------------------------------
365 /// Encrypting and decrypting blocks.
366
	// Generate a single-block encryption or decryption function.
	//	op	infix for the function name (`eblk' or `dblk')
	//	aes	round instruction to use (`aesenc' or `aesdec');
	//		`\aes\()last' names the matching final-round form
	//	koff	offset of the key schedule in the context (`w' for
	//		encryption, `wi' for decryption)
	// Arguments: key-schedule/context pointer, source block pointer,
	// destination block pointer (decoded per-ABI below).  Only the
	// 128-bit block size is handled; uses xmm0, xmm1, xmm5.
.macro	encdec	op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
	// Arguments come in on the stack, and need to be collected.  We
	// don't have a shortage of registers.

#  define K eax
#  define SRC edx
#  define DST edx			// NB: shares edx with SRC -- see below
#  define NR ecx

	mov	K, [esp + 4]
	mov	SRC, [esp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments come in registers.  All is good.

#  define K rdi
#  define SRC rsi
#  define DST rdx
#  define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in different registers.

#  define K rcx
#  define SRC rdx
#  define DST r8
#  define NR eax
#endif

	endprologue

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Initial setup.
	movdqu	xmm0, [SRC]
	pshufb	xmm0, xmm5		// big-endian block -> AESNI order
	mov	NR, [K + nr]
	add	K, \koff

	// Initial whitening.
	movdqu	xmm1, [K]
	add	K, 16
	pxor	xmm0, xmm1
#if CPUFAM_X86
	mov	DST, [esp + 12]		// DST shares edx with SRC, so fetch
					// it only now that SRC is dead
#endif

	// Dispatch to the correct code.  Any round count outside 10..14 is
	// a programming error and aborts via `bogus'.
	cmp	NR, 10
	je	10f
	jb	bogus
	cmp	NR, 14
	je	14f
	ja	bogus
	cmp	NR, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// The round ladder: enter at the label matching the round count and
	// fall through the remaining rounds.

	// 14 rounds...
14:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 10 rounds...  (K is stable from here on; use displacements)
10:	movdqu	xmm1, [K]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [K + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [K + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [K + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [K + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [K + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [K + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [K + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [K + 128]
	\aes	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [K + 144]
	\aes\()last xmm0, xmm1		// aesenclast/aesdeclast: no MixColumns

	// Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm5
	movdqu	[DST], xmm0

	// And we're done.
	ret

#undef K
#undef SRC
#undef DST
#undef NR

ENDFUNC
.endm

	// Instantiate the encryption and decryption entry points.
	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi
512
513 ///--------------------------------------------------------------------------
514 /// Random utilities.
515
INTFUNC(bogus)
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	endprologue

	callext	F(abort)
0:	hlt				// should never execute; spin if the
	jmp	0b			// call somehow returns

ENDFUNC
530
531 ///--------------------------------------------------------------------------
532 /// Data tables.
533
RODATA

	// PSHUFB control mask: each row maps a 32-bit word to its
	// byte-reversed self, i.e. a 4-word big-endian <-> little-endian
	// swap.  Loaded into xmm5 by the functions above; must be 16-byte
	// aligned for the MOVDQA loads.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12
542
543 ///----- That's all, folks --------------------------------------------------