math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
81bc2bb8
MW
33 .extern F(abort)
34 .extern F(rijndael_rcon)
1a0c09c4
MW
35
36///--------------------------------------------------------------------------
37/// Main code.
38
// Let the assembler accept the AES instruction-set extension mnemonics
// used below (AESKEYGENASSIST, AESIMC, AESENC, AESDEC, ...), and assemble
// what follows into the code section.
39 .arch .aes
bc9ac7eb 40 .text
1a0c09c4
MW
41
42/// The AESNI instructions implement a little-endian version of AES, but
43/// Catacomb's internal interface presents as big-endian so as to work better
44/// with things like GCM. We therefore maintain the round keys in
45/// little-endian form, and have to end-swap blocks in and out.
46///
47/// For added amusement, the AESNI instructions don't implement the
48/// larger-block versions of Rijndael, so we have to end-swap the keys if
49/// we're preparing for one of those.
50
51 // Useful constants.
 // With the values below, kbufsz works out as 32*(16 + 1) = 544 bytes.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
55
56 // Context structure.
 // These are byte offsets from the context base pointer. The round
 // count occupies the first four bytes, so `w' starts at offset 4 and
 // `wi' at offset 4 + kbufsz; neither key buffer is 16-byte aligned
 // relative to the context base, and the code below only ever uses
 // unaligned moves (MOVDQU) to access them.
57 .equ nr, 0 // number of rounds
58 .equ w, nr + 4 // encryption key words
59 .equ wi, w + kbufsz // decryption key words
60
61///--------------------------------------------------------------------------
62/// Key setup.
63
/// rijndael_setup_x86ish_aesni: fill in the encryption (`w') and decryption
/// (`wi') key schedules of a context.
///
/// Arguments, in order, as collected by the per-ABI code below:
///
///   1. context base pointer;
///   2. block size, in 32-bit words;
///   3. pointer to the key material;
///   4. key size, in 32-bit words.
///
/// The number of rounds is only ever read from the context (offset `nr'),
/// never written, so it must already be stored there on entry.
///
/// The work is done in three phases: expand the key into `w'; build `wi'
/// from `w' with the rounds in reverse order; and, if the block size is not
/// four words, end-swap every word of both schedules.
0f23f75f 64FUNC(rijndael_setup_x86ish_aesni)
1a0c09c4 65
43ea7558
MW
66#define SI WHOLE(si)
67#define DI WHOLE(di)
68
0f23f75f
MW
69#if CPUFAM_X86
70 // Arguments are on the stack. We'll need to stack the caller's
71 // register variables, but we'll manage.
1a0c09c4 72
0f23f75f
MW
 // The stack offsets used here allow for the four registers pushed
 // below plus the return address, i.e., 20 bytes before the first
 // argument.
73# define CTX ebp // context pointer
74# define BLKSZ [esp + 24] // block size
75
0f23f75f 76# define KSZ ebx // key size
0f23f75f
MW
 // NKW, LIM and BLKOFF all share edx here, which is why NKW has to be
 // recomputed later (NKW_NEEDS_REFRESH).
77# define NKW edx // total number of key words
78# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
79# define RCON ecx // round constants table
80# define LIM edx // limit pointer
16021451 81# define CYIX edi // index in shift-register cycle
0f23f75f
MW
82
83# define NR ecx // number of rounds
84# define LRK eax // distance to last key
0f23f75f 85# define BLKOFF edx // block size in bytes
0f23f75f
MW
86
87 // Stack the caller's registers.
0923a413
MW
88 pushreg ebp
89 pushreg ebx
90 pushreg esi
91 pushreg edi
1a0c09c4 92
0f23f75f
MW
93 // Set up our own variables.
94 mov CTX, [esp + 20] // context base pointer
95 mov SI, [esp + 28] // key material
96 mov KSZ, [esp + 32] // key size, in words
97#endif
98
99#if CPUFAM_AMD64 && ABI_SYSV
100 // Arguments are in registers. We have plenty, but, to be honest,
101 // the initial register allocation is a bit annoying.
102
103# define CTX r8 // context pointer
104# define BLKSZ r9d // block size
105
0f23f75f 106# define KSZ edx // key size
0f23f75f
MW
107# define NKW r10d // total number of key words
108# define RCON rdi // round constants table
43ea7558 109# define LIM rcx // limit pointer
16021451 110# define CYIX r11d // index in shift-register cycle
0f23f75f
MW
111
112# define NR ecx // number of rounds
113# define LRK eax // distance to last key
0f23f75f 114# define BLKOFF r9d // block size in bytes
0f23f75f
MW
115
116 // Move arguments to more useful places.
 // The order matters: the key pointer must leave rdx before KSZ (edx)
 // is overwritten with the key size from ecx.
117 mov CTX, rdi // context base pointer
118 mov BLKSZ, esi // block size in words
119 mov SI, rdx // key material
120 mov KSZ, ecx // key size, in words
121#endif
122
123#if CPUFAM_AMD64 && ABI_WIN
124 // Arguments are in different registers, and they're a little tight.
125
126# define CTX r8 // context pointer
127# define BLKSZ edx // block size
128
0f23f75f 129# define KSZ r9d // key size
0f23f75f
MW
130# define NKW r10d // total number of key words
131# define RCON rdi // round constants table
43ea7558 132# define LIM rcx // limit pointer
16021451 133# define CYIX r11d // index in shift-register cycle
0f23f75f
MW
134
135# define NR ecx // number of rounds
136# define LRK eax // distance to last key
0f23f75f 137# define BLKOFF edx // block size in bytes
0f23f75f
MW
138
139 // We'll need the index registers, which belong to the caller in this
140 // ABI.
0923a413
MW
141 pushreg rsi
142 pushreg rdi
0f23f75f
MW
143
144 // Move arguments to more useful places.
 // The key pointer must be copied out of r8 before r8 is reused as
 // CTX.
43ea7558 145 mov rsi, r8 // key material
0f23f75f
MW
146 mov CTX, rcx // context base pointer
147#endif
148
0923a413
MW
149 endprologue
150
1a0c09c4
MW
151 // The initial round key material is taken directly from the input
152 // key, so copy it over.
0f23f75f
MW
153#if CPUFAM_AMD64 && ABI_SYSV
154 // We've been lucky. We already have a copy of the context pointer
155 // in rdi, and the key size in ecx.
43ea7558 156 add rdi, w
0f23f75f
MW
157#else
158 lea DI, [CTX + w]
159 mov ecx, KSZ
160#endif
 // Copy ecx 32-bit words from the key material at SI to DI.
1a0c09c4
MW
161 rep movsd
162
163 // Find out other useful things.
0f23f75f
MW
164 mov NKW, [CTX + nr] // number of rounds
165 add NKW, 1
166 imul NKW, BLKSZ // total key size in words
167#if !NKW_NEEDS_REFRESH
168 // If we can't keep NKW for later, then we use the same register for
169 // it and LIM, so this move is unnecessary.
43ea7558 170 mov DWORD(LIM), NKW
0f23f75f 171#endif
43ea7558 172 sub DWORD(LIM), KSZ // offset by the key size
1a0c09c4
MW
173
174 // Find the round constants.
43ea7558
MW
175 ldgot WHOLE(c)
176 leaext RCON, F(rijndael_rcon), WHOLE(c)
1a0c09c4
MW
177
178 // Prepare for the main loop.
 // From here on, SI points at the word one key-length behind the one
 // about to be written, and eax holds the most recently produced word.
0f23f75f 179 lea SI, [CTX + w]
43ea7558 180 mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
0f23f75f 181 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
16021451 182 xor CYIX, CYIX // start of new cycle
1a0c09c4
MW
183
184 // Main key expansion loop. The first word of each key-length chunk
185 // needs special treatment.
186 //
187 // This is rather tedious because the Intel `AESKEYGENASSIST'
188 // instruction is very strangely shaped. Firstly, it wants to
189 // operate on vast SSE registers, even though we're data-blocked from
190 // doing more than one operation at a time unless we're doing two key
191 // schedules simultaneously -- and even then we can't do more than
192 // two, because the instruction ignores two of its input words
193 // entirely, and produces two different outputs for each of the other
194 // two. And secondly it insists on taking the magic round constant
195 // as an immediate, so it's kind of annoying if you're not
196 // open-coding the whole thing. It's much easier to leave that as
197 // zero and XOR in the round constant by hand.
16021451
MW
1980: cmp CYIX, 0 // first word of the cycle?
199 je 1f
200 cmp CYIX, 4 // fourth word of the cycle?
201 jne 2f
202 cmp KSZ, 7 // and a large key?
203 jb 2f
204
205 // Fourth word of the cycle, and seven or eight words of key. Do a
206 // byte substitution.
207 movd xmm0, eax
a13b5730 208 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
16021451
MW
209 aeskeygenassist xmm1, xmm0, 0
210 movd eax, xmm1
211 jmp 2f
212
213 // First word of the cycle. This is the complicated piece.
 // The round constant is one byte, XORed into the low byte of the
 // word; RCON then steps on to the next table entry.
2141: movd xmm0, eax
a13b5730 215 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
1a0c09c4 216 aeskeygenassist xmm1, xmm0, 0
a13b5730 217 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
1a0c09c4 218 movd eax, xmm1
0f23f75f
MW
219 xor al, [RCON]
220 inc RCON
1a0c09c4 221
16021451
MW
222 // Common tail. Mix in the corresponding word from the previous
223 // cycle and prepare for the next loop.
 // The loop ends once SI reaches LIM; otherwise CYIX wraps back to
 // zero after every KSZ words.
2242: xor eax, [SI]
43ea7558 225 mov [SI + 4*WHOLE(KSZ)], eax
0f23f75f 226 add SI, 4
16021451 227 inc CYIX
0f23f75f 228 cmp SI, LIM
89b34050 229 jae 9f
16021451 230 cmp CYIX, KSZ
89b34050 231 jb 0b
16021451 232 xor CYIX, CYIX
89b34050 233 jmp 0b
1a0c09c4
MW
234
235 // Next job is to construct the decryption keys. The keys for the
236 // first and last rounds don't need to be mangled, but the remaining
237 // ones do -- and they all need to be reordered too.
238 //
239 // The plan of action, then, is to copy the final encryption round's
240 // keys into place first, then to do each of the intermediate rounds
241 // in reverse order, and finally do the first round.
242 //
243 // Do all of the heavy lifting with SSE registers. The order we're
244 // doing this in means that it's OK if we read or write too much, and
245 // there's easily enough buffer space for the over-enthusiastic reads
246 // and writes because the context has space for 32-byte blocks, which
247 // is our maximum and an exact fit for two SSE registers.
89b34050 2489: mov NR, [CTX + nr] // number of rounds
0f23f75f
MW
 // Either way, LRK ends up as the word offset of the last round's
 // keys, i.e., (number of rounds) times (block size in words).
249#if NKW_NEEDS_REFRESH
250 mov BLKOFF, BLKSZ
251 mov LRK, NR
252 imul LRK, BLKOFF
253#else
254 // If we retain NKW, then BLKSZ and BLKOFF are the same register
255 // because we won't need the former again.
256 mov LRK, NKW
257 sub LRK, BLKSZ
258#endif
259 lea DI, [CTX + wi]
43ea7558 260 lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
0f23f75f 261 shl BLKOFF, 2 // block size (in bytes now)
1a0c09c4
MW
262
263 // Copy the last encryption round's keys.
0f23f75f
MW
264 movdqu xmm0, [SI]
265 movdqu [DI], xmm0
266 cmp BLKOFF, 16
89b34050 267 jbe 0f
0f23f75f
MW
268 movdqu xmm0, [SI + 16]
269 movdqu [DI + 16], xmm0
1a0c09c4
MW
270
271 // Update the loop variables and stop if we've finished.
 // DI walks forwards through `wi' while SI walks backwards through
 // `w', one block at a time.
43ea7558
MW
2720: add DI, WHOLE(BLKOFF)
273 sub SI, WHOLE(BLKOFF)
0f23f75f 274 sub NR, 1
89b34050 275 jbe 9f
1a0c09c4
MW
276
277 // Do another middle round's keys...
0f23f75f 278 movdqu xmm0, [SI]
1a0c09c4 279 aesimc xmm0, xmm0
0f23f75f
MW
280 movdqu [DI], xmm0
281 cmp BLKOFF, 16
89b34050 282 jbe 0b
0f23f75f 283 movdqu xmm0, [SI + 16]
1a0c09c4 284 aesimc xmm0, xmm0
0f23f75f 285 movdqu [DI + 16], xmm0
89b34050 286 jmp 0b
1a0c09c4
MW
287
288 // Finally do the first encryption round.
89b34050 2899: movdqu xmm0, [SI]
0f23f75f
MW
290 movdqu [DI], xmm0
291 cmp BLKOFF, 16
89b34050 292 jbe 1f
0f23f75f
MW
293 movdqu xmm0, [SI + 16]
294 movdqu [DI + 16], xmm0
1a0c09c4
MW
295
296 // If the block size is not exactly four words then we must end-swap
297 // everything. We can use fancy SSE toys for this.
89b34050
MW
2981: cmp BLKOFF, 16
299 je 9f
1a0c09c4
MW
300
301 // Find the byte-reordering table.
302 ldgot ecx
8d6ca554 303 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 304
0f23f75f 305#if NKW_NEEDS_REFRESH
1a0c09c4
MW
306 // Calculate the number of subkey words again. (It's a good job
307 // we've got a fast multiplier.)
0f23f75f
MW
308 mov NKW, [CTX + nr]
309 add NKW, 1
310 imul NKW, BLKSZ
311#endif
1a0c09c4
MW
312
313 // End-swap the encryption keys.
0f23f75f 314 lea SI, [CTX + w]
1a0c09c4
MW
315 call endswap_block
316
317 // And the decryption keys.
0f23f75f 318 lea SI, [CTX + wi]
1a0c09c4
MW
319 call endswap_block
320
89b34050 3219: // All done.
 // Restore whatever the prologue for this ABI pushed, in reverse
 // order.
0f23f75f 322#if CPUFAM_X86
0923a413
MW
323 popreg edi
324 popreg esi
325 popreg ebx
326 popreg ebp
0f23f75f
MW
327#endif
328#if CPUFAM_AMD64 && ABI_WIN
0923a413
MW
329 popreg rdi
330 popreg rsi
0f23f75f 331#endif
1a0c09c4
MW
332 ret
333
1a517bb3
MW
334ENDFUNC
335
/// endswap_block: internal helper for the key-setup code above; it relies
/// on that code's SI and NKW register aliases still being defined.
///
/// On entry: SI points at the first word to swap, NKW holds the word count
/// and XMM5 holds the byte-shuffle mask. On exit: SI has been advanced past
/// the words processed; ecx, xmm1 and the flags are clobbered; NKW itself
/// is left alone (the count is copied into ecx first).
336INTFUNC(endswap_block)
1a384903 337 // End-swap NKW words starting at SI. The end-swapping table is
8d6ca554 338 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
0923a413 339 endprologue
1a517bb3 340
1a384903
MW
 // Each pass shuffles four words in place. The loop repeats while
 // the remaining count is still above zero, so a count which is not a
 // multiple of four is rounded up to the next whole 16-byte chunk.
341 mov ecx, NKW
3420: movdqu xmm1, [SI]
8d6ca554 343 pshufb xmm1, xmm5
0f23f75f
MW
344 movdqu [SI], xmm1
345 add SI, 16
1a0c09c4 346 sub ecx, 4
1a384903 347 ja 0b
1a517bb3 348
1a0c09c4
MW
349 ret
350
1a517bb3
MW
351ENDFUNC
352
0f23f75f
MW
// Retire every register and argument alias introduced for the key-setup
// code (including its internal end-swapping helper), so that the code which
// follows can define its own names without clashing. NKW_NEEDS_REFRESH is
// only defined for some targets; undefining an undefined macro is harmless.
353#undef CTX
354#undef BLKSZ
355#undef SI
356#undef DI
357#undef KSZ
#undef NKW
#undef NKW_NEEDS_REFRESH
0f23f75f 358#undef RCON
0f23f75f
MW
359#undef LIM
#undef CYIX
360#undef NR
361#undef LRK
0f23f75f 362#undef BLKOFF
0f23f75f 363
1a0c09c4
MW
364///--------------------------------------------------------------------------
365/// Encrypting and decrypting blocks.
366
8a1aa284
MW
/// encdec OP, AES, KOFF: define a single-block function whose name is
/// `rijndael_', then OP, then `_x86ish_aesni'.
///
///   * AES is the mnemonic of the round instruction; the final round uses
///     the same mnemonic with `last' appended.
///   * KOFF is the offset within the context of the key schedule to use.
///
/// The generated function takes, in order: the context pointer, the source
/// block pointer and the destination block pointer. It processes exactly
/// one 16-byte block, stepping through the key schedule 16 bytes per round.
/// Round counts from 10 to 14 are handled; any other value read from the
/// context transfers control to `bogus'.
367.macro encdec op, aes, koff
368 FUNC(rijndael_\op\()_x86ish_aesni)
1a0c09c4 369
0f23f75f
MW
370#if CPUFAM_X86
371 // Arguments come in on the stack, and need to be collected. We
372 // don't have a shortage of registers.
373
 // SRC and DST share edx: the destination pointer is fetched from the
 // stack only after the source block has been loaded.
c410f911 374# define K eax
0f23f75f
MW
375# define SRC edx
376# define DST edx
c410f911 377# define NR ecx
0f23f75f
MW
378
379 mov K, [esp + 4]
380 mov SRC, [esp + 8]
381#endif
382
383#if CPUFAM_AMD64 && ABI_SYSV
384 // Arguments come in registers. All is good.
385
386# define K rdi
387# define SRC rsi
388# define DST rdx
389# define NR eax
390#endif
391
392#if CPUFAM_AMD64 && ABI_WIN
393 // Arguments come in different registers.
394
395# define K rcx
396# define SRC rdx
397# define DST r8
398# define NR eax
399#endif
400
0923a413
MW
401 endprologue
402
28321c96
MW
403 // Find the magic endianness-swapping table.
 // On x86, ecx is free at this point because NR is not loaded until
 // after XMM5 has been filled in.
 // NOTE(review): with the Windows ABI, K lives in rcx, so this relies
 // on `ldgot' leaving ecx untouched on AMD64 -- presumably it expands
 // to nothing there; confirm against asm-common.h.
404 ldgot ecx
405 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
406
0f23f75f
MW
407 // Initial setup.
 // Load the block and end-swap it; fetch the round count; then point
 // K at the start of the selected key schedule.
408 movdqu xmm0, [SRC]
8d6ca554 409 pshufb xmm0, xmm5
0f23f75f
MW
410 mov NR, [K + nr]
411 add K, \koff
1a0c09c4
MW
412
413 // Initial whitening.
0f23f75f
MW
414 movdqu xmm1, [K]
415 add K, 16
1a0c09c4 416 pxor xmm0, xmm1
1d63fee4
MW
417#if CPUFAM_X86
418 mov DST, [esp + 12]
419#endif
1a0c09c4
MW
420
421 // Dispatch to the correct code.
 // Entry points 14 down to 11 each do one round and advance K by one
 // round key before falling through, so that by label 10 the
 // remaining ten round keys always sit at fixed offsets from K.
0f23f75f 422 cmp NR, 10
e297526c 423 je 10f
1a0c09c4 424 jb bogus
0f23f75f 425 cmp NR, 14
e297526c 426 je 14f
1a0c09c4 427 ja bogus
0f23f75f 428 cmp NR, 12
e297526c
MW
429 je 12f
430 jb 11f
431 jmp 13f
1a0c09c4
MW
432
433 .align 2
434
435 // 14 rounds...
0f23f75f
MW
43614: movdqu xmm1, [K]
437 add K, 16
e297526c 438 \aes xmm0, xmm1
1a0c09c4
MW
439
440 // 13 rounds...
0f23f75f
MW
44113: movdqu xmm1, [K]
442 add K, 16
e297526c 443 \aes xmm0, xmm1
1a0c09c4
MW
444
445 // 12 rounds...
0f23f75f
MW
44612: movdqu xmm1, [K]
447 add K, 16
e297526c 448 \aes xmm0, xmm1
1a0c09c4
MW
449
450 // 11 rounds...
0f23f75f
MW
45111: movdqu xmm1, [K]
452 add K, 16
e297526c 453 \aes xmm0, xmm1
1a0c09c4
MW
454
455 // 10 rounds...
0f23f75f 45610: movdqu xmm1, [K]
e297526c 457 \aes xmm0, xmm1
1a0c09c4
MW
458
459 // 9 rounds...
0f23f75f 460 movdqu xmm1, [K + 16]
e297526c 461 \aes xmm0, xmm1
1a0c09c4
MW
462
463 // 8 rounds...
0f23f75f 464 movdqu xmm1, [K + 32]
e297526c 465 \aes xmm0, xmm1
1a0c09c4
MW
466
467 // 7 rounds...
0f23f75f 468 movdqu xmm1, [K + 48]
e297526c 469 \aes xmm0, xmm1
1a0c09c4
MW
470
471 // 6 rounds...
0f23f75f 472 movdqu xmm1, [K + 64]
e297526c 473 \aes xmm0, xmm1
1a0c09c4
MW
474
475 // 5 rounds...
0f23f75f 476 movdqu xmm1, [K + 80]
e297526c 477 \aes xmm0, xmm1
1a0c09c4
MW
478
479 // 4 rounds...
0f23f75f 480 movdqu xmm1, [K + 96]
e297526c 481 \aes xmm0, xmm1
1a0c09c4
MW
482
483 // 3 rounds...
0f23f75f 484 movdqu xmm1, [K + 112]
e297526c 485 \aes xmm0, xmm1
1a0c09c4
MW
486
487 // 2 rounds...
0f23f75f 488 movdqu xmm1, [K + 128]
e297526c 489 \aes xmm0, xmm1
1a0c09c4
MW
490
491 // Final round...
0f23f75f 492 movdqu xmm1, [K + 144]
e297526c 493 \aes\()last xmm0, xmm1
1a0c09c4
MW
494
495 // Unpermute the ciphertext block and store it.
8d6ca554 496 pshufb xmm0, xmm5
0f23f75f 497 movdqu [DST], xmm0
1a0c09c4
MW
498
499 // And we're done.
500 ret
501
0f23f75f
MW
502#undef K
503#undef SRC
504#undef DST
505#undef NR
506
8a1aa284
MW
507 ENDFUNC
508.endm
1a0c09c4 509
e297526c
MW
// Instantiate the two block functions: rijndael_eblk_x86ish_aesni uses
// AESENC/AESENCLAST with the encryption keys at `w', and
// rijndael_dblk_x86ish_aesni uses AESDEC/AESDECLAST with the decryption
// keys at `wi'.
510 encdec eblk, aesenc, w
511 encdec dblk, aesdec, wi
1a0c09c4
MW
512
513///--------------------------------------------------------------------------
514/// Random utilities.
515
/// bogus: common failure point, reached by a conditional jump from the
/// block functions when the round count in the context is outside the
/// range they handle. Calls `abort' and never returns.
1a517bb3 516INTFUNC(bogus)
1a0c09c4
MW
517 // Abort the process because of a programming error. Indirecting
518 // through this point serves several purposes: (a) by CALLing, rather
519 // than branching to, `abort', we can save the return address, which
520 // might at least provide a hint as to what went wrong; (b) we don't
521 // have conditional CALLs (and they'd be big anyway); and (c) we can
522 // write a HLT here as a backstop against `abort' being mad.
0923a413 523 endprologue
1a517bb3
MW
524
 // NOTE(review): control arrives here by a jump, so the stack is
 // exactly as it was inside the block function; nothing here adjusts
 // it before the call. Presumably that is acceptable because `abort'
 // does not return -- confirm what `callext' expands to.
525 callext F(abort)
1a0c09c4
MW
 // Backstop: should `abort' ever return, halt, and keep halting.
5260: hlt
527 jmp 0b
528
1a517bb3
MW
529ENDFUNC
530
1a0c09c4
MW
531///--------------------------------------------------------------------------
532/// Data tables.
533
645fcce0
MW
// Byte-shuffle mask for PSHUFB which reverses the order of the bytes
// within each of the four 32-bit words of an SSE register. It is aligned
// to 16 bytes because the code loads it with MOVDQA.
534 RODATA
535
1a0c09c4
MW
536 .align 16
537endswap_tab:
538 .byte 3, 2, 1, 0
539 .byte 7, 6, 5, 4
540 .byte 11, 10, 9, 8
541 .byte 15, 14, 13, 12
542
543///----- That's all, folks --------------------------------------------------