.globl F(rijndael_rcon)
///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
-
-///--------------------------------------------------------------------------
/// Main code.
.arch .aes
# define RCON ecx // round constants table
# define LIM edx // limit pointer
# define LIMn edx // ... as integer offset from base
+# define CYIX edi // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
# define RCON rdi // round constants table
# define LIMn ecx // limit, as integer offset from base
# define LIM rcx // limit pointer
+# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
# define RCON rdi // round constants table
# define LIMn ecx // limit, as integer offset from base
# define LIM rcx // limit pointer
+# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
// We'll need the index registers, which belong to the caller in this
// ABI.
push rsi
+ .seh_pushreg rsi
push rdi
+ .seh_pushreg rdi
+ .seh_endprologue
// Move arguments to more useful places.
mov SI, r8 // key material
lea SI, [CTX + w]
mov eax, [SI + 4*KSZo - 4] // most recent key word
lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
+ xor CYIX, CYIX // start of new cycle
// Main key expansion loop. The first word of each key-length chunk
// needs special treatment.
// as an immediate, so it's kind of annoying if you're not
// open-coding the whole thing. It's much easier to leave that as
// zero and XOR in the round constant by hand.
-9: movd xmm0, eax
- pshufd xmm0, xmm0, ROTR
+0: cmp CYIX, 0 // first word of the cycle?
+ je 1f
+ cmp CYIX, 4 // fourth word of the cycle?
+ jne 2f
+ cmp KSZ, 7 // and a large key?
+ jb 2f
+
+ // Fourth word of the cycle, and seven or eight words of key. Do a
+ // byte substitution.
+ movd xmm0, eax
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, ROTL
movd eax, xmm1
- xor eax, [SI]
- xor al, [RCON]
- inc RCON
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // The next three words are simple...
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
+ jmp 2f
- // (Word 2...)
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // (Word 3...)
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // Word 4. If the key is /more/ than 6 words long, then we must
- // apply a substitution here.
- cmp KSZ, 5
- jb 9b
- cmp KSZ, 7
- jb 0f
- movd xmm0, eax
- pshufd xmm0, xmm0, ROTL
+ // First word of the cycle. This is the complicated piece.
+1: movd xmm0, eax
+ pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
aeskeygenassist xmm1, xmm0, 0
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movd eax, xmm1
-0: xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // (Word 5...)
- cmp KSZ, 6
- jb 9b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // (Word 6...)
- cmp KSZ, 7
- jb 9b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
+ xor al, [RCON]
+ inc RCON
- // (Word 7...)
- cmp KSZ, 8
- jb 9b
- xor eax, [SI]
+ // Common tail. Mix in the corresponding word from the previous
+ // cycle and prepare for the next loop.
+2: xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
+ inc CYIX
cmp SI, LIM
- jae 8f
-
- // Must be done by now.
- jmp 9b
+ jae 9f
+ cmp CYIX, KSZ
+ jb 0b
+ xor CYIX, CYIX
+ jmp 0b
// Next job is to construct the decryption keys. The keys for the
// first and last rounds don't need to be mangled, but the remaining
// there's easily enough buffer space for the over-enthusiastic reads
// and writes because the context has space for 32-byte blocks, which
// is our maximum and an exact fit for two SSE registers.
-8: mov NR, [CTX + nr] // number of rounds
+9: mov NR, [CTX + nr] // number of rounds
#if NKW_NEEDS_REFRESH
mov BLKOFF, BLKSZ
mov LRK, NR
movdqu xmm0, [SI]
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 9f
+ jbe 0f
movdqu xmm0, [SI + 16]
movdqu [DI + 16], xmm0
// Update the loop variables and stop if we've finished.
-9: add DI, BLKOFFo
+0: add DI, BLKOFFo
sub SI, BLKOFFo
sub NR, 1
- jbe 0f
+ jbe 9f
// Do another middle round's keys...
movdqu xmm0, [SI]
aesimc xmm0, xmm0
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 9b
+ jbe 0b
movdqu xmm0, [SI + 16]
aesimc xmm0, xmm0
movdqu [DI + 16], xmm0
- jmp 9b
+ jmp 0b
// Finally do the first encryption round.
-0: movdqu xmm0, [SI]
+9: movdqu xmm0, [SI]
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 0f
+ jbe 1f
movdqu xmm0, [SI + 16]
movdqu [DI + 16], xmm0
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy SSE toys for this.
-0: cmp BLKOFF, 16
- je 0f
+1: cmp BLKOFF, 16
+ je 9f
// Find the byte-reordering table.
ldgot ecx
#endif
// End-swap the encryption keys.
- mov ecx, NKW
lea SI, [CTX + w]
call endswap_block
// And the decryption keys.
- mov ecx, NKW
lea SI, [CTX + wi]
call endswap_block
-0: // All done.
+9: // All done.
#if CPUFAM_X86
pop edi
pop esi
.align 16
endswap_block:
- // End-swap ECX words starting at SI. The end-swapping table is
+ // End-swap NKW words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
//
// On exit, SI has been advanced past the last chunk processed; ECX,
// XMM1 and the flags are clobbered. The loop body runs at least
// once, and a word count which isn't a multiple of four is
// effectively rounded up to a whole number of 16-byte chunks.
- movdqu xmm1, [SI]
+ mov ecx, NKW // ECX = number of words still to swap
+0: movdqu xmm1, [SI] // fetch the next 16-byte chunk (unaligned load)
pshufb xmm1, xmm5 // permute its bytes as directed by the table in XMM5
movdqu [SI], xmm1 // store the result back in place
add SI, 16 // step on to the next chunk
sub ecx, 4 // that's four more words dealt with
- ja endswap_block
+ ja 0b // go round again while words remain (unsigned test)
ret
#undef CTX
.macro encdec op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni)
- // Find the magic endianness-swapping table.
- ldgot ecx
- movdqa xmm5, [INTADDR(endswap_tab, ecx)]
-
#if CPUFAM_X86
// Arguments come in on the stack, and need to be collected. We
// don't have a shortage of registers.
-# define K ecx
+# define K eax
# define SRC edx
# define DST edx
-# define NR eax
+# define NR ecx
mov K, [esp + 4]
mov SRC, [esp + 8]
# define SRC rdx
# define DST r8
# define NR eax
+ .seh_endprologue
#endif
+ // Find the magic endianness-swapping table.
+ ldgot ecx
+ movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+
// Initial setup.
movdqu xmm0, [SRC]
pshufb xmm0, xmm5
movdqu xmm1, [K]
add K, 16
pxor xmm0, xmm1
+#if CPUFAM_X86
+ mov DST, [esp + 12]
+#endif
// Dispatch to the correct code.
cmp NR, 10
// Unpermute the ciphertext block and store it.
pshufb xmm0, xmm5
-#if CPUFAM_X86
- mov DST, [esp + 12]
-#endif
movdqu [DST], xmm0
// And we're done.
0: hlt
jmp 0b
- gotaux ecx
-
///--------------------------------------------------------------------------
/// Data tables.
+ RODATA
+
.align 16
endswap_tab:
.byte 3, 2, 1, 0