base/asm-common.h, *.S: Introduce `AUXFN'/`ENDAUXFN'; abolish `gotaux'.
[catacomb] / symm / rijndael-x86ish-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
1a0c09c4
MW
33 .globl F(abort)
34 .globl F(rijndael_rcon)
35
36///--------------------------------------------------------------------------
47103664
MW
/// Local utilities.

// Magic constants for shuffling.
//
// These are immediate operands for PSHUFD.  Each two-bit field selects the
// source word which lands in the corresponding destination word, with the
// lowest field controlling word 0.
#define ROTL 0x93			// dst[i] = src[i - 1 (mod 4)]
#define ROT2 0x4e			// dst[i] = src[i + 2 (mod 4)]
#define ROTR 0x39			// dst[i] = src[i + 1 (mod 4)]
43
44///--------------------------------------------------------------------------
1a0c09c4
MW
/// Main code.

	.arch	.aes
	.text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure.  These are byte offsets from the context base
	// pointer: a four-byte round count, followed by two key-schedule
	// buffers of `kbufsz' bytes each.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words
68
69///--------------------------------------------------------------------------
70/// Key setup.
71
FUNC(rijndael_setup_x86ish_aesni)

	// Fill in the encryption and decryption key schedules of a context.
	//
	// Arguments, in order:
	//
	//   * the context base pointer;
	//   * the block size, in words;
	//   * a pointer to the key material; and
	//   * the key size, in words.
	//
	// The round count at offset `nr' in the context is read but never
	// written here, so it must have been stored before we are called.
	// The encryption round keys are written at offset `w' and the
	// decryption round keys at offset `wi'.  If the block size is not
	// exactly four words then both schedules are end-swapped at the end.
	// Nothing is returned.

#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.

#  define CTX ebp			// context pointer
#  define BLKSZ [esp + 24]		// block size

#  define SI esi			// source pointer
#  define DI edi			// destination pointer

#  define KSZ ebx			// key size
#  define KSZo ebx			// ... as address offset
#  define NKW edx			// total number of key words
#  define NKW_NEEDS_REFRESH 1		// ... needs recalculating
#  define RCON ecx			// round constants table
#  define LIM edx			// limit pointer
#  define LIMn edx			// ... as integer offset from base
#  define CYIX edi			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define LRKo eax			// ... as address offset
#  define BLKOFF edx			// block size in bytes
#  define BLKOFFo edx			// ... as address offset

	// Stack the caller's registers.
	push	ebp
	push	ebx
	push	esi
	push	edi

	// Set up our own variables.  The four pushes above, plus the return
	// address, put the first argument at [esp + 20].
	mov	CTX, [esp + 20]		// context base pointer
	mov	SI, [esp + 28]		// key material
	mov	KSZ, [esp + 32]		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments are in registers.  We have plenty, but, to be honest,
	// the initial register allocation is a bit annoying.

#  define CTX r8			// context pointer
#  define BLKSZ r9d			// block size

#  define SI rsi			// source pointer
#  define DI rdi			// destination pointer

#  define KSZ edx			// key size
#  define KSZo rdx			// ... as address offset
#  define NKW r10d			// total number of key words
#  define RCON rdi			// round constants table
#  define LIM rcx			// limit pointer
#  define LIMn ecx			// ... as integer offset from base
#  define CYIX r11d			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define LRKo rax			// ... as address offset
#  define BLKOFF r9d			// block size in bytes
#  define BLKOFFo r9			// ... as address offset

	// Move arguments to more useful places.  The order matters here:
	// each source register is read before it is overwritten.
	mov	CTX, rdi		// context base pointer
	mov	BLKSZ, esi		// block size in words
	mov	SI, rdx			// key material
	mov	KSZ, ecx		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments are in different registers, and they're a little tight.

#  define CTX r8			// context pointer
#  define BLKSZ edx			// block size

#  define SI rsi			// source pointer
#  define DI rdi			// destination pointer

#  define KSZ r9d			// key size
#  define KSZo r9			// ... as address offset
#  define NKW r10d			// total number of key words
#  define RCON rdi			// round constants table
#  define LIM rcx			// limit pointer
#  define LIMn ecx			// ... as integer offset from base
#  define CYIX r11d			// index in shift-register cycle

#  define NR ecx			// number of rounds
#  define LRK eax			// distance to last key
#  define LRKo rax			// ... as address offset
#  define BLKOFF edx			// block size in bytes
#  define BLKOFFo rdx			// ... as address offset

	// We'll need the index registers, which belong to the caller in this
	// ABI.
	push	rsi
	push	rdi

	// Move arguments to more useful places.  The key pointer must be
	// rescued from r8 before the context pointer lands there.
	mov	SI, r8			// key material
	mov	CTX, rcx		// context base pointer
#endif

	// The initial round key material is taken directly from the input
	// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
	// We've been lucky.  We already have a copy of the context pointer
	// in rdi, and the key size in ecx.
	add	DI, w
#else
	lea	DI, [CTX + w]
	mov	ecx, KSZ
#endif
	rep	movsd

	// Find out other useful things.
	mov	NKW, [CTX + nr]		// number of rounds
	add	NKW, 1
	imul	NKW, BLKSZ		// total key size in words
#if !NKW_NEEDS_REFRESH
	// If we can't keep NKW for later, then we use the same register for
	// it and LIM, so this move is unnecessary.
	mov	LIMn, NKW
#endif
	sub	LIMn, KSZ		// offset by the key size

	// Find the round constants.
	ldgot	ecx
	leaext	RCON, F(rijndael_rcon), ecx

	// Prepare for the main loop.
	lea	SI, [CTX + w]
	mov	eax, [SI + 4*KSZo - 4]	// most recent key word
	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
	xor	CYIX, CYIX		// start of new cycle

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// Throughout the loop, eax holds the most recently generated key
	// word, and SI points at the word one key-length behind the one
	// about to be written.
0:	cmp	CYIX, 0			// first word of the cycle?
	je	1f
	cmp	CYIX, 4			// fourth word of the cycle?
	jne	2f
	cmp	KSZ, 7			// and a large key?
	jb	2f

	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.  The shuffle parks the word in lane 1, and the
	// plain substituted copy comes back in lane 0.
	movd	xmm0, eax
	pshufd	xmm0, xmm0, ROTL
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f

	// First word of the cycle.  This is the complicated piece.  The
	// shuffle parks the word in lane 3; the substituted-and-rotated copy
	// comes back in lane 3 and has to be brought down to lane 0.
1:	movd	xmm0, eax
	pshufd	xmm0, xmm0, ROTR
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, ROTL
	movd	eax, xmm1
	xor	al, [RCON]		// mix in the next round constant
	inc	RCON

	// Common tail.  Mix in the corresponding word from the previous
	// cycle and prepare for the next loop.
2:	xor	eax, [SI]
	mov	[SI + 4*KSZo], eax
	add	SI, 4
	inc	CYIX
	cmp	SI, LIM
	jae	9f
	cmp	CYIX, KSZ
	jb	0b
	xor	CYIX, CYIX
	jmp	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
9:	mov	NR, [CTX + nr]		// number of rounds
#if NKW_NEEDS_REFRESH
	mov	BLKOFF, BLKSZ
	mov	LRK, NR
	imul	LRK, BLKOFF
#else
	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	// because we won't need the former again.
	mov	LRK, NKW
	sub	LRK, BLKSZ
#endif
	lea	DI, [CTX + wi]
	lea	SI, [CTX + w + 4*LRKo]	// last round's keys
	shl	BLKOFF, 2		// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// Update the loop variables and stop if we've finished.  The
	// destination walks forwards while the source walks backwards.
0:	add	DI, BLKOFFo
	sub	SI, BLKOFFo
	sub	NR, 1
	jbe	9f

	// Do another middle round's keys...
	movdqu	xmm0, [SI]
	aesimc	xmm0, xmm0
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0b
	movdqu	xmm0, [SI + 16]
	aesimc	xmm0, xmm0
	movdqu	[DI + 16], xmm0
	jmp	0b

	// Finally do the first encryption round.
9:	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	1f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
1:	cmp	BLKOFF, 16
	je	9f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	NKW, [CTX + nr]
	add	NKW, 1
	imul	NKW, BLKSZ
#endif

	// End-swap the encryption keys.
	lea	SI, [CTX + w]
	call	endswap_block

	// And the decryption keys.
	lea	SI, [CTX + wi]
	call	endswap_block

9:	// All done.
#if CPUFAM_X86
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	pop	rdi
	pop	rsi
#endif
	ret

	.align	16
endswap_block:
	// End-swap NKW words starting at SI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	//
	// On exit, SI has been advanced past the words processed, and ecx
	// and XMM1 have been clobbered.
	mov	ecx, NKW
0:	movdqu	xmm1, [SI]
	pshufb	xmm1, xmm5
	movdqu	[SI], xmm1
	add	SI, 16
	sub	ecx, 4
	ja	0b
	ret

#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef KSZo
#undef NKW
#undef NKW_NEEDS_REFRESH
#undef RCON
#undef LIM
#undef LIMn
#undef CYIX
#undef NR
#undef LRK
#undef LRKo
#undef BLKOFF
#undef BLKOFFo

ENDFUNC
387
388///--------------------------------------------------------------------------
389/// Encrypting and decrypting blocks.
390
8a1aa284
MW
// Define a single-block encryption or decryption function.
//
// Macro parameters:
//
//   * `op' is spliced into the function name, between `rijndael_' and
//     `_x86ish_aesni';
//   * `aes' is the mnemonic of the middle-round instruction, and the final
//     round uses the same mnemonic with `last' appended; and
//   * `koff' is the byte offset of the round keys within the context.
//
// The generated function takes, in order, a context pointer, a source
// block pointer and a destination block pointer, and processes one 16-byte
// block.  Round counts from 10 to 14 inclusive are handled; any other
// value found in the context is a programming error and ends up at
// `bogus'.
.macro	encdec	op, aes, koff
  FUNC(rijndael_\op\()_x86ish_aesni)

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if CPUFAM_X86
	// Arguments come in on the stack, and need to be collected.  We
	// don't have a shortage of registers.

#  define K ecx
#  define SRC edx
#  define DST edx
#  define NR eax

	mov	K, [esp + 4]
	mov	SRC, [esp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments come in registers.  All is good.

#  define K rdi
#  define SRC rsi
#  define DST rdx
#  define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in different registers.

#  define K rcx
#  define SRC rdx
#  define DST r8
#  define NR eax
#endif

	// Initial setup.
	movdqu	xmm0, [SRC]
	pshufb	xmm0, xmm5
	mov	NR, [K + nr]
	add	K, \koff

	// Initial whitening.
	movdqu	xmm1, [K]
	add	K, 16
	pxor	xmm0, xmm1

	// Dispatch to the correct code.  Each entry point below performs
	// the extra rounds needed beyond ten and then falls through.
	cmp	NR, 10
	je	10f
	jb	bogus
	cmp	NR, 14
	je	14f
	ja	bogus
	cmp	NR, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// 14 rounds...
14:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 10 rounds...  From here on K stays put and the remaining round
	// keys are addressed by fixed displacement.
10:	movdqu	xmm1, [K]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [K + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [K + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [K + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [K + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [K + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [K + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [K + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [K + 128]
	\aes	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [K + 144]
	\aes\()last xmm0, xmm1

	// Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm5
#if CPUFAM_X86
	// SRC and DST share a register on this target, so the destination
	// pointer is only fetched now that the source is finished with.
	mov	DST, [esp + 12]
#endif
	movdqu	[DST], xmm0

	// And we're done.
	ret

#undef K
#undef SRC
#undef DST
#undef NR

  ENDFUNC
.endm

	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi
1a0c09c4
MW
534
535///--------------------------------------------------------------------------
/// Random utilities.

	.align	16
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// This is reached by conditional branch from the block functions
	// when the round count in the context is out of range.  If `abort'
	// should ever return, we spin on HLT rather than run off the end.
bogus:	callext	F(abort)
0:	hlt
	jmp	0b
548
1a0c09c4
MW
549///--------------------------------------------------------------------------
/// Data tables.

	// Byte-selection mask for PSHUFB which reverses the order of the
	// bytes within each of the four 32-bit words of a vector.  The table
	// is loaded with MOVDQA, so it must stay 16-byte aligned.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12
558
559///----- That's all, folks --------------------------------------------------