base/asm-common.h: Include `.note.GNU-stack' section on ELF targets.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
1a0c09c4
MW
33 .globl F(abort)
34 .globl F(rijndael_rcon)
35
36///--------------------------------------------------------------------------
37/// Main code.
38
39 .arch .aes
bc9ac7eb 40 .text
1a0c09c4
MW
41
42/// The AESNI instructions implement a little-endian version of AES, but
43/// Catacomb's internal interface presents as big-endian so as to work better
44/// with things like GCM. We therefore maintain the round keys in
45/// little-endian form, and have to end-swap blocks in and out.
46///
47/// For added amusement, the AESNI instructions don't implement the
48/// larger-block versions of Rijndael, so we have to end-swap the keys if
49/// we're preparing for one of those.
50
51 // Useful constants.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes: room for one maximum-size block of key per round, plus one more
55
56 // Context structure: byte offsets of the fields from the context base.
57 .equ nr, 0 // number of rounds
58 .equ w, nr + 4 // encryption key words (`kbufsz' bytes, starting 4 bytes after `nr')
59 .equ wi, w + kbufsz // decryption key words (immediately after the encryption keys)
60
61///--------------------------------------------------------------------------
62/// Key setup.
63
0f23f75f 64FUNC(rijndael_setup_x86ish_aesni)
 // Expand a key into encryption and decryption round-key schedules.
 //
 // Arguments, in order: context pointer; block size, in words; pointer
 // to the key material; key size, in words. The round count is read
 // from the context's `nr' slot (this code never writes it); the
 // encryption schedule is written at `w' and the decryption schedule
 // at `wi'.
1a0c09c4 65
0f23f75f
MW
66#if CPUFAM_X86
67 // Arguments are on the stack. We'll need to stack the caller's
68 // register variables, but we'll manage.
1a0c09c4 69
0f23f75f
MW
70# define CTX ebp // context pointer
71# define BLKSZ [esp + 24] // block size
72
73# define SI esi // source pointer
74# define DI edi // destination pointer
75
76# define KSZ ebx // key size
77# define KSZo ebx // ... as address offset
78# define NKW edx // total number of key words
79# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
80# define RCON ecx // round constants table
81# define LIM edx // limit pointer
82# define LIMn edx // ... as integer offset from base
16021451 83# define CYIX edi // index in shift-register cycle
0f23f75f
MW
84
85# define NR ecx // number of rounds
86# define LRK eax // distance to last key
87# define LRKo eax // ... as address offset
88# define BLKOFF edx // block size in bytes
89# define BLKOFFo edx // ... as address offset
90
91 // Stack the caller's registers.
1a0c09c4
MW
92 push ebp
93 push ebx
94 push esi
95 push edi
96
0f23f75f
MW
97 // Set up our own variables. (The four pushes above moved the
 // arguments 16 bytes further from the stack pointer.)
98 mov CTX, [esp + 20] // context base pointer
99 mov SI, [esp + 28] // key material
100 mov KSZ, [esp + 32] // key size, in words
101#endif
102
103#if CPUFAM_AMD64 && ABI_SYSV
104 // Arguments are in registers. We have plenty, but, to be honest,
105 // the initial register allocation is a bit annoying.
106
107# define CTX r8 // context pointer
108# define BLKSZ r9d // block size
109
110# define SI rsi // source pointer
111# define DI rdi // destination pointer
112
113# define KSZ edx // key size
114# define KSZo rdx // ... as address offset
115# define NKW r10d // total number of key words
116# define RCON rdi // round constants table
117# define LIMn ecx // limit, as integer offset from base
118# define LIM rcx // ... as pointer
16021451 119# define CYIX r11d // index in shift-register cycle
0f23f75f
MW
120
121# define NR ecx // number of rounds
122# define LRK eax // distance to last key
123# define LRKo rax // ... as address offset
124# define BLKOFF r9d // block size in bytes
125# define BLKOFFo r9 // ... as address offset
126
127 // Move arguments to more useful places.
128 mov CTX, rdi // context base pointer
129 mov BLKSZ, esi // block size in words
130 mov SI, rdx // key material
131 mov KSZ, ecx // key size, in words
132#endif
133
134#if CPUFAM_AMD64 && ABI_WIN
135 // Arguments are in different registers, and they're a little tight.
136
137# define CTX r8 // context pointer
138# define BLKSZ edx // block size
139
140# define SI rsi // source pointer
141# define DI rdi // destination pointer
142
143# define KSZ r9d // key size
144# define KSZo r9 // ... as address offset
145# define NKW r10d // total number of key words
146# define RCON rdi // round constants table
147# define LIMn ecx // limit, as integer offset from base
148# define LIM rcx // ... as pointer
16021451 149# define CYIX r11d // index in shift-register cycle
0f23f75f
MW
150
151# define NR ecx // number of rounds
152# define LRK eax // distance to last key
153# define LRKo rax // ... as address offset
154# define BLKOFF edx // block size in bytes
155# define BLKOFFo rdx // ... as address offset
156
157 // We'll need the index registers, which belong to the caller in this
158 // ABI.
159 push rsi
f71dd54d 160 .seh_pushreg rsi
0f23f75f 161 push rdi
f71dd54d
MW
162 .seh_pushreg rdi
163 .seh_endprologue
0f23f75f
MW
164
165 // Move arguments to more useful places. The key pointer must be
 // taken out of r8 first, because CTX lives in r8.
166 mov SI, r8 // key material
167 mov CTX, rcx // context base pointer
168#endif
169
1a0c09c4
MW
170 // The initial round key material is taken directly from the input
171 // key, so copy it over.
0f23f75f
MW
172#if CPUFAM_AMD64 && ABI_SYSV
173 // We've been lucky. We already have a copy of the context pointer
174 // in rdi, and the key size in ecx.
175 add DI, w
176#else
177 lea DI, [CTX + w]
178 mov ecx, KSZ
179#endif
1a0c09c4
MW
180 rep movsd
181
182 // Find out other useful things.
0f23f75f
MW
183 mov NKW, [CTX + nr] // number of rounds
184 add NKW, 1
185 imul NKW, BLKSZ // total key size in words
186#if !NKW_NEEDS_REFRESH
187 // If we can't keep NKW for later, then we use the same register for
188 // it and LIM, so this move is unnecessary.
189 mov LIMn, NKW
190#endif
191 sub LIMn, KSZ // offset by the key size
1a0c09c4
MW
192
193 // Find the round constants.
194 ldgot ecx
811a896f 195 leaext RCON, F(rijndael_rcon), ecx
1a0c09c4
MW
196
197 // Prepare for the main loop.
0f23f75f
MW
198 lea SI, [CTX + w]
199 mov eax, [SI + 4*KSZo - 4] // most recent key word
200 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
16021451 201 xor CYIX, CYIX // start of new cycle
1a0c09c4
MW
202
203 // Main key expansion loop. The first word of each key-length chunk
204 // needs special treatment.
205 //
206 // This is rather tedious because the Intel `AESKEYGENASSIST'
207 // instruction is very strangely shaped. Firstly, it wants to
208 // operate on vast SSE registers, even though we're data-blocked from
209 // doing more than one operation at a time unless we're doing two key
210 // schedules simultaneously -- and even then we can't do more than
211 // two, because the instruction ignores two of its input words
212 // entirely, and produces two different outputs for each of the other
213 // two. And secondly it insists on taking the magic round constant
214 // as an immediate, so it's kind of annoying if you're not
215 // open-coding the whole thing. It's much easier to leave that as
216 // zero and XOR in the round constant by hand.
16021451
MW
2170: cmp CYIX, 0 // first word of the cycle?
218 je 1f
219 cmp CYIX, 4 // fourth word of the cycle?
220 jne 2f
221 cmp KSZ, 7 // and a large key?
222 jb 2f
223
224 // Fourth word of the cycle, and seven or eight words of key. Do a
225 // byte substitution.
226 movd xmm0, eax
a13b5730 227 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
16021451
MW
228 aeskeygenassist xmm1, xmm0, 0
229 movd eax, xmm1
230 jmp 2f
231
232 // First word of the cycle. This is the complicated piece.
2331: movd xmm0, eax
a13b5730 234 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
1a0c09c4 235 aeskeygenassist xmm1, xmm0, 0
a13b5730 236 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
1a0c09c4 237 movd eax, xmm1
0f23f75f
MW
238 xor al, [RCON] // mix in this cycle's round constant by hand
239 inc RCON // ... and step on to the next one
1a0c09c4 240
16021451
MW
241 // Common tail. Mix in the corresponding word from the previous
242 // cycle and prepare for the next loop.
2432: xor eax, [SI]
0f23f75f
MW
244 mov [SI + 4*KSZo], eax
245 add SI, 4
16021451 246 inc CYIX
0f23f75f 247 cmp SI, LIM
89b34050 248 jae 9f // reached the limit: expansion is complete
16021451 249 cmp CYIX, KSZ
89b34050 250 jb 0b
16021451 251 xor CYIX, CYIX // wrapped: start a new cycle
89b34050 252 jmp 0b
1a0c09c4
MW
253
254 // Next job is to construct the decryption keys. The keys for the
255 // first and last rounds don't need to be mangled, but the remaining
256 // ones do -- and they all need to be reordered too.
257 //
258 // The plan of action, then, is to copy the final encryption round's
259 // keys into place first, then to do each of the intermediate rounds
260 // in reverse order, and finally do the first round.
261 //
262 // Do all of the heavy lifting with SSE registers. The order we're
263 // doing this in means that it's OK if we read or write too much, and
264 // there's easily enough buffer space for the over-enthusiastic reads
265 // and writes because the context has space for 32-byte blocks, which
266 // is our maximum and an exact fit for two SSE registers.
89b34050 2679: mov NR, [CTX + nr] // number of rounds
0f23f75f
MW
268#if NKW_NEEDS_REFRESH
269 mov BLKOFF, BLKSZ
270 mov LRK, NR
271 imul LRK, BLKOFF
272#else
273 // If we retain NKW, then BLKSZ and BLKOFF are the same register
274 // because we won't need the former again.
275 mov LRK, NKW
276 sub LRK, BLKSZ
277#endif
278 lea DI, [CTX + wi]
279 lea SI, [CTX + w + 4*LRKo] // last round's keys
280 shl BLKOFF, 2 // block size (in bytes now)
1a0c09c4
MW
281
282 // Copy the last encryption round's keys.
0f23f75f
MW
283 movdqu xmm0, [SI]
284 movdqu [DI], xmm0
285 cmp BLKOFF, 16
89b34050 286 jbe 0f
0f23f75f
MW
287 movdqu xmm0, [SI + 16]
288 movdqu [DI + 16], xmm0
1a0c09c4
MW
289
290 // Update the loop variables and stop if we've finished. The
 // destination walks forwards while the source walks backwards.
89b34050 2910: add DI, BLKOFFo
0f23f75f
MW
292 sub SI, BLKOFFo
293 sub NR, 1
89b34050 294 jbe 9f // no middle rounds left: go and do the first round
1a0c09c4
MW
295
296 // Do another middle round's keys...
0f23f75f 297 movdqu xmm0, [SI]
1a0c09c4 298 aesimc xmm0, xmm0
0f23f75f
MW
299 movdqu [DI], xmm0
300 cmp BLKOFF, 16
89b34050 301 jbe 0b
0f23f75f 302 movdqu xmm0, [SI + 16]
1a0c09c4 303 aesimc xmm0, xmm0
0f23f75f 304 movdqu [DI + 16], xmm0
89b34050 305 jmp 0b
1a0c09c4
MW
306
307 // Finally do the first encryption round.
89b34050 3089: movdqu xmm0, [SI]
0f23f75f
MW
309 movdqu [DI], xmm0
310 cmp BLKOFF, 16
89b34050 311 jbe 1f
0f23f75f
MW
312 movdqu xmm0, [SI + 16]
313 movdqu [DI + 16], xmm0
1a0c09c4
MW
314
315 // If the block size is not exactly four words then we must end-swap
316 // everything. We can use fancy SSE toys for this.
89b34050
MW
3171: cmp BLKOFF, 16
318 je 9f
1a0c09c4
MW
319
320 // Find the byte-reordering table.
321 ldgot ecx
8d6ca554 322 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 323
0f23f75f 324#if NKW_NEEDS_REFRESH
1a0c09c4
MW
325 // Calculate the number of subkey words again. (It's a good job
326 // we've got a fast multiplier.)
0f23f75f
MW
327 mov NKW, [CTX + nr]
328 add NKW, 1
329 imul NKW, BLKSZ
330#endif
1a0c09c4
MW
331
332 // End-swap the encryption keys.
0f23f75f 333 lea SI, [CTX + w]
1a0c09c4
MW
334 call endswap_block
335
336 // And the decryption keys.
0f23f75f 337 lea SI, [CTX + wi]
1a0c09c4
MW
338 call endswap_block
339
89b34050 3409: // All done. Restore whatever caller registers we stacked on entry.
0f23f75f
MW
341#if CPUFAM_X86
342 pop edi
1a0c09c4
MW
343 pop esi
344 pop ebx
345 pop ebp
0f23f75f
MW
346#endif
347#if CPUFAM_AMD64 && ABI_WIN
348 pop rdi
349 pop rsi
350#endif
1a0c09c4
MW
351 ret
352
353 .align 16
354endswap_block:
1a384903 355 // End-swap NKW words starting at SI. The end-swapping table is
8d6ca554 356 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
 //
 // Clobbers ecx, SI and XMM1. This is a local subroutine of the
 // key-setup function and uses that function's register names.
1a384903
MW
357 mov ecx, NKW
3580: movdqu xmm1, [SI]
8d6ca554 359 pshufb xmm1, xmm5
0f23f75f
MW
360 movdqu [SI], xmm1
361 add SI, 16
1a0c09c4 362 sub ecx, 4 // four words done
1a384903 363 ja 0b // loop while some remain; a count which isn't a multiple of four borrows here, which also ends the loop
1a0c09c4
MW
364 ret
365
0f23f75f
MW
366#undef CTX
367#undef BLKSZ
368#undef SI
369#undef DI
370#undef KSZ
371#undef KSZo
372#undef RCON
373#undef LIMn
374#undef LIM
375#undef NR
376#undef LRK
377#undef LRKo
378#undef BLKOFF
379#undef BLKOFFo
 // The key-setup code also defines these three (the second of them
 // only on 32-bit x86). Drop them as well, so that none of the
 // register aliases leaks into the code which follows.
#undef NKW
#undef NKW_NEEDS_REFRESH
#undef CYIX
380
1a0c09c4
MW
381ENDFUNC
382
383///--------------------------------------------------------------------------
384/// Encrypting and decrypting blocks.
385
8a1aa284
MW
386.macro encdec op, aes, koff
 // Define `rijndael_OP_x86ish_aesni', which processes one 16-byte
 // block. OP is the function-name infix; AES is the round
 // instruction (`last' is appended to it for the final round); KOFF
 // is the offset within the context of the round keys to use.
 //
 // Arguments, in order: context pointer; source block pointer;
 // destination block pointer. Only round counts from 10 to 14
 // inclusive are handled: anything else goes to `bogus'.
387 FUNC(rijndael_\op\()_x86ish_aesni)
1a0c09c4 388
0f23f75f
MW
389#if CPUFAM_X86
390 // Arguments come in on the stack, and need to be collected. We
391 // don't have a shortage of registers.
 //
 // SRC and DST share a register: DST is only fetched once the source
 // block has been loaded.
392
c410f911 393# define K eax
0f23f75f
MW
394# define SRC edx
395# define DST edx
c410f911 396# define NR ecx
0f23f75f
MW
397
398 mov K, [esp + 4]
399 mov SRC, [esp + 8]
400#endif
401
402#if CPUFAM_AMD64 && ABI_SYSV
403 // Arguments come in registers. All is good.
404
405# define K rdi
406# define SRC rsi
407# define DST rdx
408# define NR eax
409#endif
410
411#if CPUFAM_AMD64 && ABI_WIN
412 // Arguments come in different registers.
413
414# define K rcx
415# define SRC rdx
416# define DST r8
417# define NR eax
f71dd54d 418 .seh_endprologue
0f23f75f
MW
419#endif
420
28321c96
MW
421 // Find the magic endianness-swapping table.
422 ldgot ecx
423 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
424
0f23f75f
MW
425 // Initial setup.
426 movdqu xmm0, [SRC]
8d6ca554 427 pshufb xmm0, xmm5
0f23f75f
MW
428 mov NR, [K + nr]
429 add K, \koff
1a0c09c4
MW
430
431 // Initial whitening.
0f23f75f
MW
432 movdqu xmm1, [K]
433 add K, 16
1a0c09c4 434 pxor xmm0, xmm1
1d63fee4
MW
435#if CPUFAM_X86
436 mov DST, [esp + 12]
437#endif
1a0c09c4
MW
438
439 // Dispatch to the correct code. K now points just past the
 // whitening key. Each of the entry points for 14 down to 11 rounds
 // uses one key and advances K, falling through to the next, so that
 // from label 10 onwards the last ten keys are at fixed offsets from
 // K.
0f23f75f 440 cmp NR, 10
e297526c 441 je 10f
1a0c09c4 442 jb bogus
0f23f75f 443 cmp NR, 14
e297526c 444 je 14f
1a0c09c4 445 ja bogus
0f23f75f 446 cmp NR, 12
e297526c
MW
447 je 12f
448 jb 11f
449 jmp 13f
1a0c09c4
MW
450
451 .align 2
452
453 // 14 rounds...
0f23f75f
MW
45414: movdqu xmm1, [K]
455 add K, 16
e297526c 456 \aes xmm0, xmm1
1a0c09c4
MW
457
458 // 13 rounds...
0f23f75f
MW
45913: movdqu xmm1, [K]
460 add K, 16
e297526c 461 \aes xmm0, xmm1
1a0c09c4
MW
462
463 // 12 rounds...
0f23f75f
MW
46412: movdqu xmm1, [K]
465 add K, 16
e297526c 466 \aes xmm0, xmm1
1a0c09c4
MW
467
468 // 11 rounds...
0f23f75f
MW
46911: movdqu xmm1, [K]
470 add K, 16
e297526c 471 \aes xmm0, xmm1
1a0c09c4
MW
472
473 // 10 rounds...
0f23f75f 47410: movdqu xmm1, [K]
e297526c 475 \aes xmm0, xmm1
1a0c09c4
MW
476
477 // 9 rounds...
0f23f75f 478 movdqu xmm1, [K + 16]
e297526c 479 \aes xmm0, xmm1
1a0c09c4
MW
480
481 // 8 rounds...
0f23f75f 482 movdqu xmm1, [K + 32]
e297526c 483 \aes xmm0, xmm1
1a0c09c4
MW
484
485 // 7 rounds...
0f23f75f 486 movdqu xmm1, [K + 48]
e297526c 487 \aes xmm0, xmm1
1a0c09c4
MW
488
489 // 6 rounds...
0f23f75f 490 movdqu xmm1, [K + 64]
e297526c 491 \aes xmm0, xmm1
1a0c09c4
MW
492
493 // 5 rounds...
0f23f75f 494 movdqu xmm1, [K + 80]
e297526c 495 \aes xmm0, xmm1
1a0c09c4
MW
496
497 // 4 rounds...
0f23f75f 498 movdqu xmm1, [K + 96]
e297526c 499 \aes xmm0, xmm1
1a0c09c4
MW
500
501 // 3 rounds...
0f23f75f 502 movdqu xmm1, [K + 112]
e297526c 503 \aes xmm0, xmm1
1a0c09c4
MW
504
505 // 2 rounds...
0f23f75f 506 movdqu xmm1, [K + 128]
e297526c 507 \aes xmm0, xmm1
1a0c09c4
MW
508
509 // Final round...
0f23f75f 510 movdqu xmm1, [K + 144]
e297526c 511 \aes\()last xmm0, xmm1
1a0c09c4
MW
512
513 // Unpermute the output block and store it.
8d6ca554 514 pshufb xmm0, xmm5
0f23f75f 515 movdqu [DST], xmm0
1a0c09c4
MW
516
517 // And we're done.
518 ret
519
0f23f75f
MW
520#undef K
521#undef SRC
522#undef DST
523#undef NR
524
8a1aa284
MW
525 ENDFUNC
526.endm
1a0c09c4 527
e297526c
MW
 // Instantiate the two block functions: `eblk' uses AESENC with the
 // keys at `w', and `dblk' uses AESDEC with the keys at `wi'.
528 encdec eblk, aesenc, w
529 encdec dblk, aesdec, wi
1a0c09c4
MW
530
531///--------------------------------------------------------------------------
532/// Random utilities.
533
534 .align 16
535 // Abort the process because of a programming error. Indirecting
536 // through this point serves several purposes: (a) by CALLing, rather
537 // than branching to, `abort', we can save the return address, which
538 // might at least provide a hint as to what went wrong; (b) we don't
539 // have conditional CALLs (and they'd be big anyway); and (c) we can
540 // write a HLT here as a backstop against `abort' being mad.
 //
 // The block functions above branch here when the context's round
 // count is outside the range they handle.
541bogus: callext F(abort)
5420: hlt // backstop, in case `abort' returns
543 jmp 0b // ... and keep coming back to it
544
1a0c09c4
MW
545///--------------------------------------------------------------------------
546/// Data tables.
547
548 .align 16
 // Byte-shuffle control for PSHUFB: entry i names the source byte
 // which lands in position i, so this reverses the bytes within each
 // of the four 32-bit words. It's loaded with MOVDQA, hence the
 // 16-byte alignment.
549endswap_tab:
550 .byte 3, 2, 1, 0
551 .byte 7, 6, 5, 4
552 .byte 11, 10, 9, 8
553 .byte 15, 14, 13, 12
554
555///----- That's all, folks --------------------------------------------------