X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e297526c6cfe427a9d70204966745651eac50fdb:/symm/rijndael-x86-aesni.S..0f23f75ff53acadf80e9d3dfd2dfd14cb526074f:/symm/rijndael-x86ish-aesni.S diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86ish-aesni.S similarity index 53% rename from symm/rijndael-x86-aesni.S rename to symm/rijndael-x86ish-aesni.S index c0cd437a..91fcc352 100644 --- a/symm/rijndael-x86-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -72,45 +72,137 @@ ///-------------------------------------------------------------------------- /// Key setup. -FUNC(rijndael_setup_x86_aesni) +FUNC(rijndael_setup_x86ish_aesni) - // Initial state. We have four arguments: - // [esp + 20] is the context pointer - // [esp + 24] is the block size, in 32-bit words (4, 6, or 8) - // [esp + 28] points to the key material, unaligned - // [esp + 32] is the size of the key, in words - // The key size has already been checked for validity, and the number - // of rounds has been computed. Our job is only to fill in the `w' - // and `wi' vectors. +#if CPUFAM_X86 + // Arguments are on the stack. We'll need to stack the caller's + // register variables, but we'll manage. +# define CTX ebp // context pointer +# define BLKSZ [esp + 24] // block size + +# define SI esi // source pointer +# define DI edi // destination pointer + +# define KSZ ebx // key size +# define KSZo ebx // ... as address offset +# define NKW edx // total number of key words +# define NKW_NEEDS_REFRESH 1 // ... needs recalculating +# define RCON ecx // round constants table +# define LIM edx // limit pointer +# define LIMn edx // ... as integer offset from base + +# define NR ecx // number of rounds +# define LRK eax // distance to last key +# define LRKo eax // ... as address offset +# define BLKOFF edx // block size in bytes +# define BLKOFFo edx // ... as address offset + + // Stack the caller's registers. push ebp push ebx push esi push edi + // Set up our own variables. 
+ mov CTX, [esp + 20] // context base pointer + mov SI, [esp + 28] // key material + mov KSZ, [esp + 32] // key size, in words +#endif + +#if CPUFAM_AMD64 && ABI_SYSV + // Arguments are in registers. We have plenty, but, to be honest, + // the initial register allocation is a bit annoying. + +# define CTX r8 // context pointer +# define BLKSZ r9d // block size + +# define SI rsi // source pointer +# define DI rdi // destination pointer + +# define KSZ edx // key size +# define KSZo rdx // ... as address offset +# define NKW r10d // total number of key words +# define RCON rdi // round constants table +# define LIMn ecx // limit, as integer offset from base +# define LIM rcx // limit pointer + +# define NR ecx // number of rounds +# define LRK eax // distance to last key +# define LRKo rax // ... as address offset +# define BLKOFF r9d // block size in bytes +# define BLKOFFo r9 // ... as address offset + + // Move arguments to more useful places. + mov CTX, rdi // context base pointer + mov BLKSZ, esi // block size in words + mov SI, rdx // key material + mov KSZ, ecx // key size, in words +#endif + +#if CPUFAM_AMD64 && ABI_WIN + // Arguments are in different registers, and they're a little tight. + +# define CTX r8 // context pointer +# define BLKSZ edx // block size + +# define SI rsi // source pointer +# define DI rdi // destination pointer + +# define KSZ r9d // key size +# define KSZo r9 // ... as address offset +# define NKW r10d // total number of key words +# define RCON rdi // round constants table +# define LIMn ecx // limit, as integer offset from base +# define LIM rcx // limit pointer + +# define NR ecx // number of rounds +# define LRK eax // distance to last key +# define LRKo rax // ... as address offset +# define BLKOFF edx // block size in bytes +# define BLKOFFo rdx // ... as address offset + + // We'll need the index registers, which belong to the caller in this + // ABI. + push rsi + push rdi + + // Move arguments to more useful places. 
+ mov SI, r8 // key material + mov CTX, rcx // context base pointer +#endif + // The initial round key material is taken directly from the input // key, so copy it over. - mov ebp, [esp + 20] // context base pointer - mov ebx, [esp + 32] // key size, in words - mov ecx, ebx - mov esi, [esp + 28] - lea edi, [ebp + w] +#if CPUFAM_AMD64 && ABI_SYSV + // We've been lucky. We already have a copy of the context pointer + // in rdi, and the key size in ecx. + add DI, w +#else + lea DI, [CTX + w] + mov ecx, KSZ +#endif rep movsd // Find out other useful things. - mov edx, [ebp + nr] // number of rounds - add edx, 1 - imul edx, [esp + 24] // total key size in words - sub edx, ebx // offset by the key size + mov NKW, [CTX + nr] // number of rounds + add NKW, 1 + imul NKW, BLKSZ // total key size in words +#if !NKW_NEEDS_REFRESH + // If we can't keep NKW for later, then we use the same register for + // it and LIM, so this move is unnecessary. + mov LIMn, NKW +#endif + sub LIMn, KSZ // offset by the key size // Find the round constants. ldgot ecx - leaext ecx, rijndael_rcon, ecx + leaext RCON, rijndael_rcon, ecx // Prepare for the main loop. - lea esi, [ebp + w] - mov eax, [esi + 4*ebx - 4] // most recent key word - lea edx, [esi + 4*edx] // limit, offset by one key expansion + lea SI, [CTX + w] + mov eax, [SI + 4*KSZo - 4] // most recent key word + lea LIM, [SI + 4*LIM] // limit, offset by one key expansion // Main key expansion loop. The first word of each key-length chunk // needs special treatment. @@ -131,76 +223,76 @@ FUNC(rijndael_setup_x86_aesni) aeskeygenassist xmm1, xmm0, 0 pshufd xmm1, xmm1, ROTL movd eax, xmm1 - xor eax, [esi] - xor al, [ecx] - inc ecx - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + xor al, [RCON] + inc RCON + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // The next three words are simple... 
- xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 2...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 3...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // Word 4. If the key is /more/ than 6 words long, then we must // apply a substitution here. - cmp ebx, 5 + cmp KSZ, 5 jb 9b - cmp ebx, 7 + cmp KSZ, 7 jb 0f movd xmm0, eax pshufd xmm0, xmm0, ROTL aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 -0: xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx +0: xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 5...) - cmp ebx, 6 + cmp KSZ, 6 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 6...) - cmp ebx, 7 + cmp KSZ, 7 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 7...) - cmp ebx, 8 + cmp KSZ, 8 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // Must be done by now. @@ -219,130 +311,183 @@ FUNC(rijndael_setup_x86_aesni) // there's easily enough buffer space for the over-enthusiastic reads // and writes because the context has space for 32-byte blocks, which // is our maximum and an exact fit for two SSE registers. 
-8: mov ecx, [ebp + nr] // number of rounds - mov ebx, [esp + 24] // block size (in words) - mov edx, ecx - imul edx, ebx - lea edi, [ebp + wi] - lea esi, [ebp + 4*edx + w] // last round's keys - shl ebx, 2 // block size (in bytes now) +8: mov NR, [CTX + nr] // number of rounds +#if NKW_NEEDS_REFRESH + mov BLKOFF, BLKSZ + mov LRK, NR + imul LRK, BLKOFF +#else + // If we retain NKW, then BLKSZ and BLKOFF are the same register + // because we won't need the former again. + mov LRK, NKW + sub LRK, BLKSZ +#endif + lea DI, [CTX + wi] + lea SI, [CTX + w + 4*LRKo] // last round's keys + shl BLKOFF, 2 // block size (in bytes now) // Copy the last encryption round's keys. - movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 + movdqu xmm0, [SI] + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 9f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 + movdqu xmm0, [SI + 16] + movdqu [DI + 16], xmm0 // Update the loop variables and stop if we've finished. -9: add edi, ebx - sub esi, ebx - sub ecx, 1 +9: add DI, BLKOFFo + sub SI, BLKOFFo + sub NR, 1 jbe 0f // Do another middle round's keys... - movdqu xmm0, [esi] + movdqu xmm0, [SI] aesimc xmm0, xmm0 - movdqu [edi], xmm0 - cmp ebx, 16 + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 9b - movdqu xmm0, [esi + 16] + movdqu xmm0, [SI + 16] aesimc xmm0, xmm0 - movdqu [edi + 16], xmm0 + movdqu [DI + 16], xmm0 jmp 9b // Finally do the first encryption round. -0: movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 +0: movdqu xmm0, [SI] + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 0f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 + movdqu xmm0, [SI + 16] + movdqu [DI + 16], xmm0 // If the block size is not exactly four words then we must end-swap // everything. We can use fancy SSE toys for this. -0: cmp ebx, 16 +0: cmp BLKOFF, 16 je 0f // Find the byte-reordering table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] +#if NKW_NEEDS_REFRESH // Calculate the number of subkey words again. (It's a good job // we've got a fast multiplier.) 
- mov ecx, [ebp + nr] - add ecx, 1 - imul ecx, [esp + 24] // total keys in words + mov NKW, [CTX + nr] + add NKW, 1 + imul NKW, BLKSZ +#endif // End-swap the encryption keys. - mov eax, ecx - lea esi, [ebp + w] + mov ecx, NKW + lea SI, [CTX + w] call endswap_block // And the decryption keys. - mov ecx, eax - lea esi, [ebp + wi] + mov ecx, NKW + lea SI, [CTX + wi] call endswap_block - // All done. -0: pop edi +0: // All done. +#if CPUFAM_X86 + pop edi pop esi pop ebx pop ebp +#endif +#if CPUFAM_AMD64 && ABI_WIN + pop rdi + pop rsi +#endif ret .align 16 endswap_block: - // End-swap ECX words starting at ESI. The end-swapping table is + // End-swap ECX words starting at SI. The end-swapping table is // already loaded into XMM5; and it's OK to work in 16-byte chunks. - movdqu xmm1, [esi] + movdqu xmm1, [SI] pshufb xmm1, xmm5 - movdqu [esi], xmm1 - add esi, 16 + movdqu [SI], xmm1 + add SI, 16 sub ecx, 4 ja endswap_block ret +#undef CTX +#undef BLKSZ +#undef SI +#undef DI +#undef KSZ +#undef KSZo +#undef RCON +#undef LIMn +#undef LIM +#undef NR +#undef LRK +#undef LRKo +#undef BLKOFF +#undef BLKOFFo + ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. .macro encdec op, aes, koff -FUNC(rijndael_\op\()_x86_aesni) - - // On entry, we have: - // [esp + 4] points to the context block - // [esp + 8] points to the input data block - // [esp + 12] points to the output buffer +FUNC(rijndael_\op\()_x86ish_aesni) // Find the magic endianness-swapping table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] - // Load the input block and end-swap it. Also, start loading the - // keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] +#if CPUFAM_X86 + // Arguments come in on the stack, and need to be collected. We + // don't have a shortage of registers. 
+ +# define K ecx +# define SRC edx +# define DST edx +# define NR eax + + mov K, [esp + 4] + mov SRC, [esp + 8] +#endif + +#if CPUFAM_AMD64 && ABI_SYSV + // Arguments come in registers. All is good. + +# define K rdi +# define SRC rsi +# define DST rdx +# define NR eax +#endif + +#if CPUFAM_AMD64 && ABI_WIN + // Arguments come in different registers. + +# define K rcx +# define SRC rdx +# define DST r8 +# define NR eax +#endif + + // Initial setup. + movdqu xmm0, [SRC] pshufb xmm0, xmm5 - mov eax, [esp + 4] - lea edx, [eax + \koff] - mov eax, [eax + nr] + mov NR, [K + nr] + add K, \koff // Initial whitening. - movdqu xmm1, [edx] - add edx, 16 + movdqu xmm1, [K] + add K, 16 pxor xmm0, xmm1 // Dispatch to the correct code. - cmp eax, 10 + cmp NR, 10 je 10f jb bogus - cmp eax, 14 + cmp NR, 14 je 14f ja bogus - cmp eax, 12 + cmp NR, 12 je 12f jb 11f jmp 13f @@ -350,73 +495,80 @@ FUNC(rijndael_\op\()_x86_aesni) .align 2 // 14 rounds... -14: movdqu xmm1, [edx] - add edx, 16 +14: movdqu xmm1, [K] + add K, 16 \aes xmm0, xmm1 // 13 rounds... -13: movdqu xmm1, [edx] - add edx, 16 +13: movdqu xmm1, [K] + add K, 16 \aes xmm0, xmm1 // 12 rounds... -12: movdqu xmm1, [edx] - add edx, 16 +12: movdqu xmm1, [K] + add K, 16 \aes xmm0, xmm1 // 11 rounds... -11: movdqu xmm1, [edx] - add edx, 16 +11: movdqu xmm1, [K] + add K, 16 \aes xmm0, xmm1 // 10 rounds... -10: movdqu xmm1, [edx] +10: movdqu xmm1, [K] \aes xmm0, xmm1 // 9 rounds... - movdqu xmm1, [edx + 16] + movdqu xmm1, [K + 16] \aes xmm0, xmm1 // 8 rounds... - movdqu xmm1, [edx + 32] + movdqu xmm1, [K + 32] \aes xmm0, xmm1 // 7 rounds... - movdqu xmm1, [edx + 48] + movdqu xmm1, [K + 48] \aes xmm0, xmm1 // 6 rounds... - movdqu xmm1, [edx + 64] + movdqu xmm1, [K + 64] \aes xmm0, xmm1 // 5 rounds... - movdqu xmm1, [edx + 80] + movdqu xmm1, [K + 80] \aes xmm0, xmm1 // 4 rounds... - movdqu xmm1, [edx + 96] + movdqu xmm1, [K + 96] \aes xmm0, xmm1 // 3 rounds... 
- movdqu xmm1, [edx + 112] + movdqu xmm1, [K + 112] \aes xmm0, xmm1 // 2 rounds... - movdqu xmm1, [edx + 128] + movdqu xmm1, [K + 128] \aes xmm0, xmm1 // Final round... - movdqu xmm1, [edx + 144] + movdqu xmm1, [K + 144] \aes\()last xmm0, xmm1 // Unpermute the ciphertext block and store it. pshufb xmm0, xmm5 - mov eax, [esp + 12] - movdqu [eax], xmm0 +#if CPUFAM_X86 + mov DST, [esp + 12] +#endif + movdqu [DST], xmm0 // And we're done. ret +#undef K +#undef SRC +#undef DST +#undef NR + ENDFUNC .endm