Commit | Line | Data |
---|---|---|
1a0c09c4 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// AESNI-based implementation of Rijndael | |
4 | /// | |
5 | /// (c) 2015 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software; you can redistribute it and/or modify | |
13 | /// it under the terms of the GNU Library General Public License as | |
14 | /// published by the Free Software Foundation; either version 2 of the | |
15 | /// License, or (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, | |
18 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | /// GNU Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb; if not, write to the Free | |
24 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | /// MA 02111-1307, USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// External definitions. | |
29 | ||
30 | #include "config.h" | |
31 | #include "asm-common.h" | |
32 | ||
1a0c09c4 MW |
33 | .globl F(abort) |
34 | .globl F(rijndael_rcon) | |
35 | ||
36 | ///-------------------------------------------------------------------------- | |
47103664 MW |
37 | /// Local utilities. |
38 | ||
39 | // Magic constants for shuffling: `pshufd' immediates which rotate the four 32-bit words of an SSE register. | |
40 | #define ROTL 0x93 // word i moves up to position i + 1 (mod 4) | |
41 | #define ROT2 0x4e // swap the two 64-bit halves | |
42 | #define ROTR 0x39 // word i moves down to position i - 1 (mod 4) | |
43 | ||
44 | ///-------------------------------------------------------------------------- | |
1a0c09c4 MW |
45 | /// Main code. |
46 | ||
47 | .arch .aes | |
bc9ac7eb | 48 | .text |
1a0c09c4 MW |
49 | |
50 | /// The AESNI instructions implement a little-endian version of AES, but | |
51 | /// Catacomb's internal interface presents as big-endian so as to work better | |
52 | /// with things like GCM. We therefore maintain the round keys in | |
53 | /// little-endian form, and have to end-swap blocks in and out. | |
54 | /// | |
55 | /// For added amusement, the AESNI instructions don't implement the | |
56 | /// larger-block versions of Rijndael, so we have to end-swap the keys if | |
57 | /// we're preparing for one of those. | |
58 | ||
59 | // Useful constants. | |
60 | .equ maxrounds, 16 // maximum number of rounds | |
61 | .equ maxblksz, 32 // maximum block size, in bytes | |
62 | .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes: one maximum-size block per round key | |
63 | ||
64 | // Context structure: byte offsets of the fields. | |
65 | .equ nr, 0 // number of rounds (a 32-bit word) | |
66 | .equ w, nr + 4 // encryption key words | |
67 | .equ wi, w + kbufsz // decryption key words | |
68 | ||
69 | ///-------------------------------------------------------------------------- | |
70 | /// Key setup. | |
71 | ||
0f23f75f | 72 | FUNC(rijndael_setup_x86ish_aesni) |
1a0c09c4 | 73 | // Arguments, in order: context pointer, block size in words, key material pointer, key size in words. |
0f23f75f MW |
74 | #if CPUFAM_X86 |
75 | // Arguments are on the stack. We'll need to stack the caller's | |
76 | // register variables, but we'll manage. | |
1a0c09c4 | 77 | |
0f23f75f MW |
78 | # define CTX ebp // context pointer |
79 | # define BLKSZ [esp + 24] // block size | |
80 | ||
81 | # define SI esi // source pointer | |
82 | # define DI edi // destination pointer | |
83 | ||
84 | # define KSZ ebx // key size | |
85 | # define KSZo ebx // ... as address offset | |
86 | # define NKW edx // total number of key words | |
87 | # define NKW_NEEDS_REFRESH 1 // ... needs recalculating | |
88 | # define RCON ecx // round constants table | |
89 | # define LIM edx // limit pointer | |
90 | # define LIMn edx // ... as integer offset from base | |
91 | ||
92 | # define NR ecx // number of rounds | |
93 | # define LRK eax // distance to last key | |
94 | # define LRKo eax // ... as address offset | |
95 | # define BLKOFF edx // block size in bytes | |
96 | # define BLKOFFo edx // ... as address offset | |
97 | ||
98 | // Stack the caller's registers. | |
1a0c09c4 MW |
99 | push ebp |
100 | push ebx | |
101 | push esi | |
102 | push edi | |
103 | ||
0f23f75f MW |
104 | // Set up our own variables. |
105 | mov CTX, [esp + 20] // context base pointer | |
106 | mov SI, [esp + 28] // key material | |
107 | mov KSZ, [esp + 32] // key size, in words | |
108 | #endif | |
109 | ||
110 | #if CPUFAM_AMD64 && ABI_SYSV | |
111 | // Arguments are in registers. We have plenty, but, to be honest, | |
112 | // the initial register allocation is a bit annoying. | |
113 | ||
114 | # define CTX r8 // context pointer | |
115 | # define BLKSZ r9d // block size | |
116 | ||
117 | # define SI rsi // source pointer | |
118 | # define DI rdi // destination pointer | |
119 | ||
120 | # define KSZ edx // key size | |
121 | # define KSZo rdx // ... as address offset | |
122 | # define NKW r10d // total number of key words | |
123 | # define RCON rdi // round constants table | |
124 | # define LIMn ecx // limit, as integer offset from base | |
125 | # define LIM rcx // ... as pointer | |
126 | ||
127 | # define NR ecx // number of rounds | |
128 | # define LRK eax // distance to last key | |
129 | # define LRKo rax // ... as address offset | |
130 | # define BLKOFF r9d // block size in bytes | |
131 | # define BLKOFFo r9 // ... as address offset | |
132 | ||
133 | // Move arguments to more useful places. | |
134 | mov CTX, rdi // context base pointer | |
135 | mov BLKSZ, esi // block size in words | |
136 | mov SI, rdx // key material | |
137 | mov KSZ, ecx // key size, in words | |
138 | #endif | |
139 | ||
140 | #if CPUFAM_AMD64 && ABI_WIN | |
141 | // Arguments are in different registers, and they're a little tight. | |
142 | ||
143 | # define CTX r8 // context pointer | |
144 | # define BLKSZ edx // block size | |
145 | ||
146 | # define SI rsi // source pointer | |
147 | # define DI rdi // destination pointer | |
148 | ||
149 | # define KSZ r9d // key size | |
150 | # define KSZo r9 // ... as address offset; NOTE(review): r9 is used as passed in, so this assumes its upper half is clear -- confirm | |
151 | # define NKW r10d // total number of key words | |
152 | # define RCON rdi // round constants table | |
153 | # define LIMn ecx // limit, as integer offset from base | |
154 | # define LIM rcx // ... as pointer | |
155 | ||
156 | # define NR ecx // number of rounds | |
157 | # define LRK eax // distance to last key | |
158 | # define LRKo rax // ... as address offset | |
159 | # define BLKOFF edx // block size in bytes | |
160 | # define BLKOFFo rdx // ... as address offset | |
161 | ||
162 | // We'll need the index registers, which belong to the caller in this | |
163 | // ABI. | |
164 | push rsi | |
165 | push rdi | |
166 | ||
167 | // Move arguments to more useful places. | |
168 | mov SI, r8 // key material | |
169 | mov CTX, rcx // context base pointer | |
170 | #endif | |
171 | ||
1a0c09c4 MW |
172 | // The initial round key material is taken directly from the input |
173 | // key, so copy it over. | |
0f23f75f MW |
174 | #if CPUFAM_AMD64 && ABI_SYSV |
175 | // We've been lucky. We already have a copy of the context pointer | |
176 | // in rdi, and the key size in ecx. | |
177 | add DI, w | |
178 | #else | |
179 | lea DI, [CTX + w] | |
180 | mov ecx, KSZ | |
181 | #endif | |
1a0c09c4 MW |
182 | rep movsd |
183 | ||
184 | // Find out other useful things. | |
0f23f75f MW |
185 | mov NKW, [CTX + nr] // number of rounds |
186 | add NKW, 1 | |
187 | imul NKW, BLKSZ // total key size in words | |
188 | #if !NKW_NEEDS_REFRESH | |
189 | // If we can't keep NKW for later, then we use the same register for | |
190 | // it and LIM, so this move is unnecessary. | |
191 | mov LIMn, NKW | |
192 | #endif | |
193 | sub LIMn, KSZ // offset by the key size | |
1a0c09c4 MW |
194 | |
195 | // Find the round constants. | |
196 | ldgot ecx | |
811a896f | 197 | leaext RCON, F(rijndael_rcon), ecx |
1a0c09c4 MW |
198 | |
199 | // Prepare for the main loop. | |
0f23f75f MW |
200 | lea SI, [CTX + w] |
201 | mov eax, [SI + 4*KSZo - 4] // most recent key word | |
202 | lea LIM, [SI + 4*LIM] // limit, offset by one key expansion | |
1a0c09c4 MW |
203 | |
204 | // Main key expansion loop. The first word of each key-length chunk | |
205 | // needs special treatment. | |
206 | // | |
207 | // This is rather tedious because the Intel `AESKEYGENASSIST' | |
208 | // instruction is very strangely shaped. Firstly, it wants to | |
209 | // operate on vast SSE registers, even though we're data-blocked from | |
210 | // doing more than one operation at a time unless we're doing two key | |
211 | // schedules simultaneously -- and even then we can't do more than | |
212 | // two, because the instruction ignores two of its input words | |
213 | // entirely, and produces two different outputs for each of the other | |
214 | // two. And secondly it insists on taking the magic round constant | |
215 | // as an immediate, so it's kind of annoying if you're not | |
216 | // open-coding the whole thing. It's much easier to leave that as | |
217 | // zero and XOR in the round constant by hand. | |
89b34050 | 218 | 0: movd xmm0, eax |
47103664 | 219 | pshufd xmm0, xmm0, ROTR |
1a0c09c4 | 220 | aeskeygenassist xmm1, xmm0, 0 |
47103664 | 221 | pshufd xmm1, xmm1, ROTL |
1a0c09c4 | 222 | movd eax, xmm1 |
0f23f75f MW |
223 | xor eax, [SI] |
224 | xor al, [RCON] | |
225 | inc RCON | |
226 | mov [SI + 4*KSZo], eax | |
227 | add SI, 4 | |
228 | cmp SI, LIM | |
89b34050 | 229 | jae 9f |
1a0c09c4 MW |
230 | |
231 | // The next three words are simple... | |
0f23f75f MW |
232 | xor eax, [SI] |
233 | mov [SI + 4*KSZo], eax | |
234 | add SI, 4 | |
235 | cmp SI, LIM | |
89b34050 | 236 | jae 9f |
1a0c09c4 MW |
237 | |
238 | // (Word 2...) | |
0f23f75f MW |
239 | xor eax, [SI] |
240 | mov [SI + 4*KSZo], eax | |
241 | add SI, 4 | |
242 | cmp SI, LIM | |
89b34050 | 243 | jae 9f |
1a0c09c4 MW |
244 | |
245 | // (Word 3...) | |
0f23f75f MW |
246 | xor eax, [SI] |
247 | mov [SI + 4*KSZo], eax | |
248 | add SI, 4 | |
249 | cmp SI, LIM | |
89b34050 | 250 | jae 9f |
1a0c09c4 MW |
251 | |
252 | // Word 4. If the key is /more/ than 6 words long, then we must | |
253 | // apply a substitution here. | |
0f23f75f | 254 | cmp KSZ, 5 |
89b34050 | 255 | jb 0b |
0f23f75f | 256 | cmp KSZ, 7 |
89b34050 | 257 | jb 1f |
1a0c09c4 | 258 | movd xmm0, eax |
47103664 | 259 | pshufd xmm0, xmm0, ROTL |
1a0c09c4 MW |
260 | aeskeygenassist xmm1, xmm0, 0 |
261 | movd eax, xmm1 | |
89b34050 | 262 | 1: xor eax, [SI] |
0f23f75f MW |
263 | mov [SI + 4*KSZo], eax |
264 | add SI, 4 | |
265 | cmp SI, LIM | |
89b34050 | 266 | jae 9f |
1a0c09c4 MW |
267 | |
268 | // (Word 5...) | |
0f23f75f | 269 | cmp KSZ, 6 |
89b34050 | 270 | jb 0b |
0f23f75f MW |
271 | xor eax, [SI] |
272 | mov [SI + 4*KSZo], eax | |
273 | add SI, 4 | |
274 | cmp SI, LIM | |
89b34050 | 275 | jae 9f |
1a0c09c4 MW |
276 | |
277 | // (Word 6...) | |
0f23f75f | 278 | cmp KSZ, 7 |
89b34050 | 279 | jb 0b |
0f23f75f MW |
280 | xor eax, [SI] |
281 | mov [SI + 4*KSZo], eax | |
282 | add SI, 4 | |
283 | cmp SI, LIM | |
89b34050 | 284 | jae 9f |
1a0c09c4 MW |
285 | |
286 | // (Word 7...) | |
0f23f75f | 287 | cmp KSZ, 8 |
89b34050 | 288 | jb 0b |
0f23f75f MW |
289 | xor eax, [SI] |
290 | mov [SI + 4*KSZo], eax | |
291 | add SI, 4 | |
292 | cmp SI, LIM | |
89b34050 | 293 | jae 9f |
1a0c09c4 MW |
294 | |
295 | // All eight words of a maximum-length chunk are done, so start the next chunk. | |
89b34050 | 296 | jmp 0b |
1a0c09c4 MW |
297 | |
298 | // Next job is to construct the decryption keys. The keys for the | |
299 | // first and last rounds don't need to be mangled, but the remaining | |
300 | // ones do -- and they all need to be reordered too. | |
301 | // | |
302 | // The plan of action, then, is to copy the final encryption round's | |
303 | // keys into place first, then to do each of the intermediate rounds | |
304 | // in reverse order, and finally do the first round. | |
305 | // | |
306 | // Do all of the heavy lifting with SSE registers. The order we're | |
307 | // doing this in means that it's OK if we read or write too much, and | |
308 | // there's easily enough buffer space for the over-enthusiastic reads | |
309 | // and writes because the context has space for 32-byte blocks, which | |
310 | // is our maximum and an exact fit for two SSE registers. | |
89b34050 | 311 | 9: mov NR, [CTX + nr] // number of rounds |
0f23f75f MW |
312 | #if NKW_NEEDS_REFRESH |
313 | mov BLKOFF, BLKSZ | |
314 | mov LRK, NR | |
315 | imul LRK, BLKOFF | |
316 | #else | |
317 | // If we retain NKW, then BLKSZ and BLKOFF are the same register | |
318 | // because we won't need the former again. | |
319 | mov LRK, NKW | |
320 | sub LRK, BLKSZ | |
321 | #endif | |
322 | lea DI, [CTX + wi] | |
323 | lea SI, [CTX + w + 4*LRKo] // last round's keys | |
324 | shl BLKOFF, 2 // block size (in bytes now) | |
1a0c09c4 MW |
325 | |
326 | // Copy the last encryption round's keys. | |
0f23f75f MW |
327 | movdqu xmm0, [SI] |
328 | movdqu [DI], xmm0 | |
329 | cmp BLKOFF, 16 | |
89b34050 | 330 | jbe 0f |
0f23f75f MW |
331 | movdqu xmm0, [SI + 16] |
332 | movdqu [DI + 16], xmm0 | |
1a0c09c4 MW |
333 | |
334 | // Update the loop variables and stop if we've finished. | |
89b34050 | 335 | 0: add DI, BLKOFFo |
0f23f75f MW |
336 | sub SI, BLKOFFo |
337 | sub NR, 1 | |
89b34050 | 338 | jbe 9f |
1a0c09c4 MW |
339 | |
340 | // Do another middle round's keys... | |
0f23f75f | 341 | movdqu xmm0, [SI] |
1a0c09c4 | 342 | aesimc xmm0, xmm0 |
0f23f75f MW |
343 | movdqu [DI], xmm0 |
344 | cmp BLKOFF, 16 | |
89b34050 | 345 | jbe 0b |
0f23f75f | 346 | movdqu xmm0, [SI + 16] |
1a0c09c4 | 347 | aesimc xmm0, xmm0 |
0f23f75f | 348 | movdqu [DI + 16], xmm0 |
89b34050 | 349 | jmp 0b |
1a0c09c4 MW |
350 | |
351 | // Finally do the first encryption round. | |
89b34050 | 352 | 9: movdqu xmm0, [SI] |
0f23f75f MW |
353 | movdqu [DI], xmm0 |
354 | cmp BLKOFF, 16 | |
89b34050 | 355 | jbe 1f |
0f23f75f MW |
356 | movdqu xmm0, [SI + 16] |
357 | movdqu [DI + 16], xmm0 | |
1a0c09c4 MW |
358 | |
359 | // If the block size is not exactly four words then we must end-swap | |
360 | // everything. We can use fancy SSE toys for this. | |
89b34050 MW |
361 | 1: cmp BLKOFF, 16 |
362 | je 9f | |
1a0c09c4 MW |
363 | |
364 | // Find the byte-reordering table. | |
365 | ldgot ecx | |
8d6ca554 | 366 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] |
1a0c09c4 | 367 | |
0f23f75f | 368 | #if NKW_NEEDS_REFRESH |
1a0c09c4 MW |
369 | // Calculate the number of subkey words again. (It's a good job |
370 | // we've got a fast multiplier.) | |
0f23f75f MW |
371 | mov NKW, [CTX + nr] |
372 | add NKW, 1 | |
373 | imul NKW, BLKSZ | |
374 | #endif | |
1a0c09c4 MW |
375 | |
376 | // End-swap the encryption keys. | |
0f23f75f | 377 | lea SI, [CTX + w] |
1a0c09c4 MW |
378 | call endswap_block |
379 | ||
380 | // And the decryption keys. | |
0f23f75f | 381 | lea SI, [CTX + wi] |
1a0c09c4 MW |
382 | call endswap_block |
383 | ||
89b34050 | 384 | 9: // All done. |
0f23f75f MW |
385 | #if CPUFAM_X86 |
386 | pop edi | |
1a0c09c4 MW |
387 | pop esi |
388 | pop ebx | |
389 | pop ebp | |
0f23f75f MW |
390 | #endif |
391 | #if CPUFAM_AMD64 && ABI_WIN | |
392 | pop rdi | |
393 | pop rsi | |
394 | #endif | |
1a0c09c4 MW |
395 | ret |
396 | ||
397 | .align 16 | |
398 | endswap_block: | |
1a384903 | 399 | // End-swap NKW words starting at SI. The end-swapping table is |
8d6ca554 | 400 | // already loaded into XMM5; and it's OK to work in 16-byte chunks. Clobbers ecx and xmm1, and advances SI. |
1a384903 MW |
401 | mov ecx, NKW |
402 | 0: movdqu xmm1, [SI] | |
8d6ca554 | 403 | pshufb xmm1, xmm5 |
0f23f75f MW |
404 | movdqu [SI], xmm1 |
405 | add SI, 16 | |
1a0c09c4 | 406 | sub ecx, 4 |
1a384903 | 407 | ja 0b |
1a0c09c4 MW |
408 | ret |
409 | ||
0f23f75f MW |
410 | #undef CTX |
411 | #undef BLKSZ | |
412 | #undef SI | |
413 | #undef DI | |
414 | #undef KSZ | |
415 | #undef KSZo | |
416 | #undef RCON | |
417 | #undef LIMn | |
418 | #undef LIM | |
419 | #undef NR | |
420 | #undef LRK | |
421 | #undef LRKo | |
422 | #undef BLKOFF | |
423 | #undef BLKOFFo | |
424 | ||
1a0c09c4 MW |
425 | ENDFUNC |
426 | ||
427 | ///-------------------------------------------------------------------------- | |
428 | /// Encrypting and decrypting blocks. | |
429 | ||
8a1aa284 MW |
430 | .macro encdec op, aes, koff |
431 | FUNC(rijndael_\op\()_x86ish_aesni) | |
1a0c09c4 MW |
432 | // Block function for one direction: `aes' names the round instruction and `koff' is the context offset of the round keys to use. Arguments, in order: context, source block, destination block. |
433 | // Find the magic endianness-swapping table. | |
434 | ldgot ecx | |
8d6ca554 | 435 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] |
1a0c09c4 | 436 | |
0f23f75f MW |
437 | #if CPUFAM_X86 |
438 | // Arguments come in on the stack, and need to be collected. We | |
439 | // don't have a shortage of registers. | |
440 | ||
441 | # define K ecx | |
442 | # define SRC edx | |
443 | # define DST edx | |
444 | # define NR eax | |
445 | ||
446 | mov K, [esp + 4] | |
447 | mov SRC, [esp + 8] | |
448 | #endif | |
449 | ||
450 | #if CPUFAM_AMD64 && ABI_SYSV | |
451 | // Arguments come in registers. All is good. | |
452 | ||
453 | # define K rdi | |
454 | # define SRC rsi | |
455 | # define DST rdx | |
456 | # define NR eax | |
457 | #endif | |
458 | ||
459 | #if CPUFAM_AMD64 && ABI_WIN | |
460 | // Arguments come in different registers. | |
461 | ||
462 | # define K rcx | |
463 | # define SRC rdx | |
464 | # define DST r8 | |
465 | # define NR eax | |
466 | #endif | |
467 | ||
468 | // Initial setup. | |
469 | movdqu xmm0, [SRC] | |
8d6ca554 | 470 | pshufb xmm0, xmm5 |
0f23f75f MW |
471 | mov NR, [K + nr] |
472 | add K, \koff | |
1a0c09c4 MW |
473 | |
474 | // Initial whitening. | |
0f23f75f MW |
475 | movdqu xmm1, [K] |
476 | add K, 16 | |
1a0c09c4 MW |
477 | pxor xmm0, xmm1 |
478 | ||
479 | // Dispatch to the correct code. | |
0f23f75f | 480 | cmp NR, 10 |
e297526c | 481 | je 10f |
1a0c09c4 | 482 | jb bogus |
0f23f75f | 483 | cmp NR, 14 |
e297526c | 484 | je 14f |
1a0c09c4 | 485 | ja bogus |
0f23f75f | 486 | cmp NR, 12 |
e297526c MW |
487 | je 12f |
488 | jb 11f | |
489 | jmp 13f | |
1a0c09c4 MW |
490 | |
491 | .align 2 | |
492 | ||
493 | // 14 rounds... | |
0f23f75f MW |
494 | 14: movdqu xmm1, [K] |
495 | add K, 16 | |
e297526c | 496 | \aes xmm0, xmm1 |
1a0c09c4 MW |
497 | |
498 | // 13 rounds... | |
0f23f75f MW |
499 | 13: movdqu xmm1, [K] |
500 | add K, 16 | |
e297526c | 501 | \aes xmm0, xmm1 |
1a0c09c4 MW |
502 | |
503 | // 12 rounds... | |
0f23f75f MW |
504 | 12: movdqu xmm1, [K] |
505 | add K, 16 | |
e297526c | 506 | \aes xmm0, xmm1 |
1a0c09c4 MW |
507 | |
508 | // 11 rounds... | |
0f23f75f MW |
509 | 11: movdqu xmm1, [K] |
510 | add K, 16 | |
e297526c | 511 | \aes xmm0, xmm1 |
1a0c09c4 MW |
512 | |
513 | // 10 rounds... | |
0f23f75f | 514 | 10: movdqu xmm1, [K] |
e297526c | 515 | \aes xmm0, xmm1 |
1a0c09c4 MW |
516 | |
517 | // 9 rounds... | |
0f23f75f | 518 | movdqu xmm1, [K + 16] |
e297526c | 519 | \aes xmm0, xmm1 |
1a0c09c4 MW |
520 | |
521 | // 8 rounds... | |
0f23f75f | 522 | movdqu xmm1, [K + 32] |
e297526c | 523 | \aes xmm0, xmm1 |
1a0c09c4 MW |
524 | |
525 | // 7 rounds... | |
0f23f75f | 526 | movdqu xmm1, [K + 48] |
e297526c | 527 | \aes xmm0, xmm1 |
1a0c09c4 MW |
528 | |
529 | // 6 rounds... | |
0f23f75f | 530 | movdqu xmm1, [K + 64] |
e297526c | 531 | \aes xmm0, xmm1 |
1a0c09c4 MW |
532 | |
533 | // 5 rounds... | |
0f23f75f | 534 | movdqu xmm1, [K + 80] |
e297526c | 535 | \aes xmm0, xmm1 |
1a0c09c4 MW |
536 | |
537 | // 4 rounds... | |
0f23f75f | 538 | movdqu xmm1, [K + 96] |
e297526c | 539 | \aes xmm0, xmm1 |
1a0c09c4 MW |
540 | |
541 | // 3 rounds... | |
0f23f75f | 542 | movdqu xmm1, [K + 112] |
e297526c | 543 | \aes xmm0, xmm1 |
1a0c09c4 MW |
544 | |
545 | // 2 rounds... | |
0f23f75f | 546 | movdqu xmm1, [K + 128] |
e297526c | 547 | \aes xmm0, xmm1 |
1a0c09c4 MW |
548 | |
549 | // Final round... | |
0f23f75f | 550 | movdqu xmm1, [K + 144] |
e297526c | 551 | \aes\()last xmm0, xmm1 |
1a0c09c4 MW |
552 | |
553 | // Unpermute the output block and store it. | |
8d6ca554 | 554 | pshufb xmm0, xmm5 |
0f23f75f MW |
555 | #if CPUFAM_X86 |
556 | mov DST, [esp + 12] | |
557 | #endif | |
558 | movdqu [DST], xmm0 | |
1a0c09c4 MW |
559 | |
560 | // And we're done. | |
561 | ret | |
562 | ||
0f23f75f MW |
563 | #undef K |
564 | #undef SRC | |
565 | #undef DST | |
566 | #undef NR | |
567 | ||
8a1aa284 MW |
568 | ENDFUNC |
569 | .endm | |
1a0c09c4 | 570 | |
e297526c MW |
571 | encdec eblk, aesenc, w |
572 | encdec dblk, aesdec, wi | |
1a0c09c4 MW |
573 | |
574 | ///-------------------------------------------------------------------------- | |
575 | /// Random utilities. | |
576 | ||
577 | .align 16 | |
578 | // Abort the process because of a programming error (the block functions branch here when the round count is outside 10--14). Indirecting | |
579 | // through this point serves several purposes: (a) by CALLing, rather | |
580 | // than branching to, `abort', we can save the return address, which | |
581 | // might at least provide a hint as to what went wrong; (b) we don't | |
582 | // have conditional CALLs (and they'd be big anyway); and (c) we can | |
583 | // write a HLT loop here as a backstop in case `abort' is mad enough to return. | |
584 | bogus: callext F(abort) | |
585 | 0: hlt | |
586 | jmp 0b | |
587 | ||
588 | gotaux ecx | |
589 | ||
590 | ///-------------------------------------------------------------------------- | |
591 | /// Data tables. | |
592 | ||
593 | .align 16 // the table is loaded with `movdqa', which needs 16-byte alignment | |
594 | endswap_tab: // `pshufb' control mask: reverses the bytes within each 32-bit word | |
595 | .byte 3, 2, 1, 0 | |
596 | .byte 7, 6, 5, 4 | |
597 | .byte 11, 10, 9, 8 | |
598 | .byte 15, 14, 13, 12 | |
599 | ||
600 | ///----- That's all, folks -------------------------------------------------- |