/// MA 02111-1307, USA.
///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
#include "config.h"
#include "asm-common.h"
- .globl F(abort)
- .globl F(rijndael_rcon)
+ .arch .aes
-///--------------------------------------------------------------------------
-/// Main code.
+ .extern F(abort)
+ .extern F(rijndael_rcon)
- .arch .aes
.text
+///--------------------------------------------------------------------------
+/// Main code.
+
/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM. We therefore maintain the round keys in
///--------------------------------------------------------------------------
/// Key setup.
-FUNC(rijndael_setup_x86ish_aesni)
+FUNC(rijndael_setup_x86ish_aesni_avx)
+ vzeroupper // avoid penalty on `legacy' XMM access
+ endprologue
+ // and drop through...
+ENDFUNC
-#define SI WHOLE(si)
-#define DI WHOLE(di)
+FUNC(rijndael_setup_x86ish_aesni)
#if CPUFAM_X86
// Arguments are on the stack. We'll need to stack the caller's
// register variables, but we'll manage.
-# define CTX ebp // context pointer
-# define BLKSZ [esp + 24] // block size
+# define CTX BP // context pointer
+# define BLKSZ [SP + 24] // block size
# define KSZ ebx // key size
# define NKW edx // total number of key words
# define BLKOFF edx // block size in bytes
// Stack the caller's registers.
- push ebp
- push ebx
- push esi
- push edi
+ pushreg BP
+ pushreg ebx
+ pushreg esi
+ pushreg edi
// Set up our own variables.
- mov CTX, [esp + 20] // context base pointer
- mov SI, [esp + 28] // key material
- mov KSZ, [esp + 32] // key size, in words
+ mov CTX, [SP + 20] // context base pointer
+ mov SI, [SP + 28] // key material
+ mov KSZ, [SP + 32] // key size, in words
#endif
#if CPUFAM_AMD64 && ABI_SYSV
// We'll need the index registers, which belong to the caller in this
// ABI.
- push rsi
- .seh_pushreg rsi
- push rdi
- .seh_pushreg rdi
- .seh_endprologue
+ pushreg rsi
+ pushreg rdi
// Move arguments to more useful places.
mov rsi, r8 // key material
mov CTX, rcx // context base pointer
#endif
+ endprologue
+
// The initial round key material is taken directly from the input
// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
+ pshufd xmm0, xmm0, SHUF(3, 0, 1, 2)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
+ pshufd xmm0, xmm0, SHUF(1, 2, 3, 0)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
movd eax, xmm1
xor al, [RCON]
inc RCON
9: // All done.
#if CPUFAM_X86
- pop edi
- pop esi
- pop ebx
- pop ebp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
- pop rdi
- pop rsi
+ popreg rdi
+ popreg rsi
#endif
ret
- .align 16
-endswap_block:
+ENDFUNC
+
+INTFUNC(endswap_block)
// End-swap NKW words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
+ endprologue
+
mov ecx, NKW
0: movdqu xmm1, [SI]
pshufb xmm1, xmm5
add SI, 16
sub ecx, 4
ja 0b
+
ret
+ENDFUNC
+
#undef CTX
#undef BLKSZ
#undef SI
#undef LRK
#undef BLKOFF
-ENDFUNC
-
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
.macro encdec op, aes, koff
+ FUNC(rijndael_\op\()_x86ish_aesni_avx)
+ vzeroupper // avoid XMM penalties
+ endprologue
+ // and drop through...
+ ENDFUNC
+
FUNC(rijndael_\op\()_x86ish_aesni)
#if CPUFAM_X86
# define DST edx
# define NR ecx
- mov K, [esp + 4]
- mov SRC, [esp + 8]
+ mov K, [SP + 4]
+ mov SRC, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_SYSV
# define SRC rdx
# define DST r8
# define NR eax
- .seh_endprologue
#endif
+ endprologue
+
// Find the magic endianness-swapping table.
ldgot ecx
movdqa xmm5, [INTADDR(endswap_tab, ecx)]
add K, 16
pxor xmm0, xmm1
#if CPUFAM_X86
- mov DST, [esp + 12]
+ mov DST, [SP + 12]
#endif
// Dispatch to the correct code.
///--------------------------------------------------------------------------
/// Random utilities.
- .align 16
+INTFUNC(bogus)
// Abort the process because of a programming error. Indirecting
// through this point serves several purposes: (a) by CALLing, rather
// than branching to, `abort', we can save the return address, which
// might at least provide a hint as to what went wrong; (b) we don't
// have conditional CALLs (and they'd be big anyway); and (c) we can
// write a HLT here as a backstop against `abort' being mad.
-bogus: callext F(abort)
+ endprologue
+
+ callext F(abort)
0: hlt
jmp 0b
+ENDFUNC
+
///--------------------------------------------------------------------------
/// Data tables.