#endif
///--------------------------------------------------------------------------
-/// x86-specific hacking.
+/// Windows-specific hacking.
+
+#if ABI_WIN
#if CPUFAM_X86
+# define F(name) _##name
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86- and amd64-specific hacking.
+///
+/// It's (slightly) easier to deal with both of these in one go.
+
+#if CPUFAM_X86 || CPUFAM_AMD64
// Set the function hooks.
#define FUNC_PREHOOK(_) .balign 16
// Maybe load GOT address into GOT.
.macro ldgot got=GOTREG
-#if WANT_PIC
+#if WANT_PIC && CPUFAM_X86
call _where_am_i.\got
add \got, offset _GLOBAL_OFFSET_TABLE_
#endif
// Maybe build a helper subroutine for `ldgot GOT'.
.macro gotaux got=GOTREG
-#if WANT_PIC
+#if WANT_PIC && CPUFAM_X86
.align 16
_where_am_i.\got :
mov \got, [esp]
// Load address of external symbol ADDR into REG, maybe using GOT.
.macro leaext reg, addr, got=GOTREG
#if WANT_PIC
+# if CPUFAM_X86
mov \reg, [\got + \addr@GOT]
+# endif
+# if CPUFAM_AMD64
+ mov \reg, \addr@GOTPCREL[rip]
+# endif
#else
+# if CPUFAM_X86
mov \reg, offset \addr
+# endif
+# if CPUFAM_AMD64
+ lea \reg, \addr[rip]
+# endif
#endif
.endm
// referring to ADDR, which is within our module, maybe using GOT.
#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
-#if WANT_PIC
+#if CPUFAM_AMD64
+# define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
# define INTADDR__1(addr, got) got + addr@GOTOFF
#else
# define INTADDR__1(addr, got) addr
/*----- Intel x86/AMD64 feature probing -----------------------------------*/
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
# define EFLAGS_ID (1u << 21)
# define CPUID1D_SSE2 (1u << 26)
*/
#ifdef __GNUC__
+# if CPUFAM_X86
static __inline__ unsigned getflags(void)
{ unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
static __inline__ unsigned setflags(unsigned f)
: "g" (f));
return (ff);
}
+# else
+static __inline__ unsigned long getflags(void)
+ { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long setflags(unsigned long f)
+{
+ unsigned long ff;
+ __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+ : "=g" (ff)
+ : "g" (f));
+ return (ff);
+}
+# endif
#endif
static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
/* Alas, EBX is magical in PIC code, so abuse ESI instead. This isn't
* pretty, but it works.
*/
+# if CPUFAM_X86
__asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
: "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
: "a" (a) , "c" (c));
+# elif CPUFAM_AMD64
+ __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+ : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+ : "a" (a) , "c" (c));
+# else
+# error "I'm confused."
+# endif
+ dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+ a, c, cc->a, cc->b, cc->c, cc->d);
#else
dispatch_debug("GNU inline assembler not available; can't CPUID");
#endif
* XMM registers are actually alive.
*/
if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+# if CPUFAM_X86
__asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
"fxsave (%%esp)\n"
"movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
: "=a" (f)
: /* no inputs */
: "%ecx", "%edx");
+# elif CPUFAM_AMD64
+ __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+ "fxsave (%%rsp)\n"
+ "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+ "fxrstor (%%rsp); fxsave (%%rsp)\n"
+ "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+ "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+ "xorl %%ecx, %%eax"
+ : "=a" (f)
+ : /* no inputs */
+ : "%ecx", "%rdx");
+# else
+# error "I'm confused."
+# endif
+ dispatch_debug("XMM registers %savailable", f ? "" : "not ");
return (f);
#else
dispatch_debug("GNU inline assembler not available; can't check for XMM");
return (feat_debug(ftok, "runtime probe", cond));
switch (feat) {
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
CASE_CPUFEAT(X86_SSE2, "x86:sse2",
xmm_registers_available_p() &&
cpuid_features_p(CPUID1D_SSE2, 0));
dnl not uniform: each dispatched function might or might not have an
dnl implementation for any particular CPU/ABI combination.
AC_DEFUN([catacomb_CPU_FAMILIES],
- [$1([i[[3-6]]86,*], [x86], [sysv])])
+ [$1([i[[3-6]]86,cygwin], [x86], [win])
+ $1([i[[3-6]]86,*], [x86], [sysv])
+ $1([x86_64,cygwin], [amd64], [win])
+ $1([x86_64,*], [amd64], [sysv])])
dnl A utility to clear the `seen' flags, used so as to process each CPU or
dnl ABI once.
BLKCS += rijndael rijndael192 rijndael256
libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
if CPUFAM_X86
-libsymm_la_SOURCES += rijndael-x86-aesni.S
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
endif
libsymm_la_SOURCES += $(precomp)/rijndael-tab.c
PRECOMPS += $(precomp)/rijndael-tab.c
pkginclude_HEADERS += salsa20.h salsa20-core.h
libsymm_la_SOURCES += salsa20.c
if CPUFAM_X86
-libsymm_la_SOURCES += salsa20-x86-sse2.S
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
endif
TESTS += salsa20.$t
ALL_CIPHERS += salsa20 salsa2012 salsa208
pkginclude_HEADERS += chacha.h chacha-core.h
libsymm_la_SOURCES += chacha.c
if CPUFAM_X86
-libsymm_la_SOURCES += chacha-x86-sse2.S
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
endif
TESTS += chacha.$t
EXTRA_DIST += t/chacha
.arch pentium4
.section .text
-FUNC(chacha_core_x86_sse2)
+FUNC(chacha_core_x86ish_sse2)
+
+ // Initial setup.
+
+#if CPUFAM_X86
+ // Arguments come in on the stack, and will need to be collected. We
+	// can get away with just the scratch registers for integer work,
+ // but we'll run out of XMM registers and will need some properly
+ // aligned space which we'll steal from the stack. I don't trust the
+ // stack pointer's alignment, so I'll have to mask the stack pointer,
+ // which in turn means I'll need to keep track of the old value.
+ // Hence I'm making a full i386-style stack frame here.
+ //
+ // The Windows and SysV ABIs are sufficiently similar that we don't
+ // need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 [esp]
- // Initial state. We have three arguments:
- // [ebp + 8] is the number of rounds to do
- // [ebp + 12] points to the input matrix
- // [ebp + 16] points to the output matrix
push ebp
mov ebp, esp
sub esp, 16
- mov edx, [ebp + 12]
+ mov IN, [ebp + 12]
+ mov OUT, [ebp + 16]
and esp, ~15
+ mov NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // This is nice. We have plenty of XMM registers, and the arguments
+ // are in useful places. There's no need to spill anything and we
+ // can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in registers, but they're different between Windows
+ // and everyone else (and everyone else is saner).
+ //
+ // The Windows ABI insists that we preserve some of the XMM
+ // registers, but we want more than we can use as scratch space. We
+ // only need to save a copy of the input for the feedforward at the
+ // end, so we might as well use memory rather than spill extra
+ // registers. (We need an extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm5
+# define SAVE1 [rsp + 0]
+# define SAVE2 [rsp + 16]
+# define SAVE3 [rsp + 32]
+
+ sub rsp, 48 + 8
+#endif
// First job is to slurp the matrix into XMM registers. Be careful:
// the input matrix isn't likely to be properly aligned.
// [ 4 5 6 7] (b, xmm1)
// [ 8 9 10 11] (c, xmm2)
// [12 13 14 15] (d, xmm3)
- movdqu xmm0, [edx + 0]
- movdqu xmm1, [edx + 16]
- movdqu xmm2, [edx + 32]
- movdqu xmm3, [edx + 48]
-
- // Prepare for the main loop.
- mov ecx, [ebp + 8]
+ movdqu xmm0, [IN + 0]
+ movdqu xmm1, [IN + 16]
+ movdqu xmm2, [IN + 32]
+ movdqu xmm3, [IN + 48]
// Take a copy for later. This one is aligned properly, by
// construction.
- movdqa [esp], xmm0
- movdqa xmm5, xmm1
- movdqa xmm6, xmm2
- movdqa xmm7, xmm3
+ movdqa SAVE0, xmm0
+ movdqa SAVE1, xmm1
+ movdqa SAVE2, xmm2
+ movdqa SAVE3, xmm3
loop:
// Apply a column quarterround to each of the columns simultaneously.
pshufd xmm1, xmm1, ROTL
// Decrement the loop counter and see if we should go round again.
- sub ecx, 2
+ sub NR, 2
ja loop
// Almost there. Firstly, the feedforward addition.
- mov edx, [ebp + 16]
- paddd xmm0, [esp]
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
+ paddd xmm0, SAVE0
+ paddd xmm1, SAVE1
+ paddd xmm2, SAVE2
+ paddd xmm3, SAVE3
// And now we write out the result. This one won't be aligned
// either.
- movdqu [edx + 0], xmm0
- movdqu [edx + 16], xmm1
- movdqu [edx + 32], xmm2
- movdqu [edx + 48], xmm3
+ movdqu [OUT + 0], xmm0
+ movdqu [OUT + 16], xmm1
+ movdqu [OUT + 32], xmm2
+ movdqu [OUT + 48], xmm3
// Tidy things up.
+#if CPUFAM_X86
mov esp, ebp
pop ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ add rsp, 48 + 8
+#endif
// And with that, we're done.
ret
chacha_matrix dest)
{ CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
-#ifdef CPUFAM_X86
-extern core__functype chacha_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype chacha_core_x86ish_sse2;
#endif
static core__functype *pick_core(void)
{
-#ifdef CPUFAM_X86
- DISPATCH_PICK_COND(chacha_core, chacha_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
const void *buf, unsigned nk),
(k, nb, buf, nk), pick_setup, simple_setup)
-#ifdef CPUFAM_X86
-extern setup__functype rijndael_setup_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern setup__functype rijndael_setup_x86ish_aesni;
#endif
static setup__functype *pick_setup(void)
{
-#ifdef CPUFAM_X86
- DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup);
///--------------------------------------------------------------------------
/// Key setup.
-FUNC(rijndael_setup_x86_aesni)
+FUNC(rijndael_setup_x86ish_aesni)
- // Initial state. We have four arguments:
- // [esp + 20] is the context pointer
- // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
- // [esp + 28] points to the key material, unaligned
- // [esp + 32] is the size of the key, in words
- // The key size has already been checked for validity, and the number
- // of rounds has been computed. Our job is only to fill in the `w'
- // and `wi' vectors.
+#if CPUFAM_X86
+ // Arguments are on the stack. We'll need to stack the caller's
+	// register variables, but we'll manage.
+# define CTX ebp // context pointer
+# define BLKSZ [esp + 24] // block size
+
+# define SI esi // source pointer
+# define DI edi // destination pointer
+
+# define KSZ ebx // key size
+# define KSZo ebx // ... as address offset
+# define NKW edx // total number of key words
+# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
+# define RCON ecx // round constants table
+# define LIM edx // limit pointer
+# define LIMn edx // ... as integer offset from base
+
+# define NR ecx // number of rounds
+# define LRK eax // distance to last key
+# define LRKo eax // ... as address offset
+# define BLKOFF edx // block size in bytes
+# define BLKOFFo edx // ... as address offset
+
+ // Stack the caller's registers.
push ebp
push ebx
push esi
push edi
+ // Set up our own variables.
+ mov CTX, [esp + 20] // context base pointer
+ mov SI, [esp + 28] // key material
+ mov KSZ, [esp + 32] // key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // Arguments are in registers. We have plenty, but, to be honest,
+ // the initial register allocation is a bit annoying.
+
+# define CTX r8 // context pointer
+# define BLKSZ r9d // block size
+
+# define SI rsi // source pointer
+# define DI rdi // destination pointer
+
+# define KSZ edx // key size
+# define KSZo rdx // ... as address offset
+# define NKW r10d // total number of key words
+# define RCON rdi // round constants table
+# define LIMn ecx // limit, as integer offset from base
+# define LIM rcx // ... as limit pointer
+
+# define NR ecx // number of rounds
+# define LRK eax // distance to last key
+# define LRKo rax // ... as address offset
+# define BLKOFF r9d // block size in bytes
+# define BLKOFFo r9 // ... as address offset
+
+ // Move arguments to more useful places.
+ mov CTX, rdi // context base pointer
+ mov BLKSZ, esi // block size in words
+ mov SI, rdx // key material
+ mov KSZ, ecx // key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments are in different registers, and they're a little tight.
+
+# define CTX r8 // context pointer
+# define BLKSZ edx // block size
+
+# define SI rsi // source pointer
+# define DI rdi // destination pointer
+
+# define KSZ r9d // key size
+# define KSZo r9 // ... as address offset
+# define NKW r10d // total number of key words
+# define RCON rdi // round constants table
+# define LIMn ecx // limit, as integer offset from base
+# define LIM rcx // ... as limit pointer
+
+# define NR ecx // number of rounds
+# define LRK eax // distance to last key
+# define LRKo rax // ... as address offset
+# define BLKOFF edx // block size in bytes
+# define BLKOFFo rdx // ... as address offset
+
+ // We'll need the index registers, which belong to the caller in this
+ // ABI.
+ push rsi
+ push rdi
+
+ // Move arguments to more useful places.
+ mov SI, r8 // key material
+ mov CTX, rcx // context base pointer
+#endif
+
// The initial round key material is taken directly from the input
// key, so copy it over.
- mov ebp, [esp + 20] // context base pointer
- mov ebx, [esp + 32] // key size, in words
- mov ecx, ebx
- mov esi, [esp + 28]
- lea edi, [ebp + w]
+#if CPUFAM_AMD64 && ABI_SYSV
+ // We've been lucky. We already have a copy of the context pointer
+ // in rdi, and the key size in ecx.
+ add DI, w
+#else
+ lea DI, [CTX + w]
+ mov ecx, KSZ
+#endif
rep movsd
// Find out other useful things.
- mov edx, [ebp + nr] // number of rounds
- add edx, 1
- imul edx, [esp + 24] // total key size in words
- sub edx, ebx // offset by the key size
+ mov NKW, [CTX + nr] // number of rounds
+ add NKW, 1
+ imul NKW, BLKSZ // total key size in words
+#if !NKW_NEEDS_REFRESH
+ // If we can't keep NKW for later, then we use the same register for
+ // it and LIM, so this move is unnecessary.
+ mov LIMn, NKW
+#endif
+ sub LIMn, KSZ // offset by the key size
// Find the round constants.
ldgot ecx
- leaext ecx, rijndael_rcon, ecx
+ leaext RCON, rijndael_rcon, ecx
// Prepare for the main loop.
- lea esi, [ebp + w]
- mov eax, [esi + 4*ebx - 4] // most recent key word
- lea edx, [esi + 4*edx] // limit, offset by one key expansion
+ lea SI, [CTX + w]
+ mov eax, [SI + 4*KSZo - 4] // most recent key word
+ lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
// Main key expansion loop. The first word of each key-length chunk
// needs special treatment.
aeskeygenassist xmm1, xmm0, 0
pshufd xmm1, xmm1, ROTL
movd eax, xmm1
- xor eax, [esi]
- xor al, [ecx]
- inc ecx
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ xor al, [RCON]
+ inc RCON
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// The next three words are simple...
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// (Word 2...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// (Word 3...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// Word 4. If the key is /more/ than 6 words long, then we must
// apply a substitution here.
- cmp ebx, 5
+ cmp KSZ, 5
jb 9b
- cmp ebx, 7
+ cmp KSZ, 7
jb 0f
movd xmm0, eax
pshufd xmm0, xmm0, ROTL
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
-0: xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+0: xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// (Word 5...)
- cmp ebx, 6
+ cmp KSZ, 6
jb 9b
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// (Word 6...)
- cmp ebx, 7
+ cmp KSZ, 7
jb 9b
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// (Word 7...)
- cmp ebx, 8
+ cmp KSZ, 8
jb 9b
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
jae 8f
// Must be done by now.
// there's easily enough buffer space for the over-enthusiastic reads
// and writes because the context has space for 32-byte blocks, which
// is our maximum and an exact fit for two SSE registers.
-8: mov ecx, [ebp + nr] // number of rounds
- mov ebx, [esp + 24] // block size (in words)
- mov edx, ecx
- imul edx, ebx
- lea edi, [ebp + wi]
- lea esi, [ebp + 4*edx + w] // last round's keys
- shl ebx, 2 // block size (in bytes now)
+8: mov NR, [CTX + nr] // number of rounds
+#if NKW_NEEDS_REFRESH
+ mov BLKOFF, BLKSZ
+ mov LRK, NR
+ imul LRK, BLKOFF
+#else
+ // If we retain NKW, then BLKSZ and BLKOFF are the same register
+ // because we won't need the former again.
+ mov LRK, NKW
+ sub LRK, BLKSZ
+#endif
+ lea DI, [CTX + wi]
+ lea SI, [CTX + w + 4*LRKo] // last round's keys
+ shl BLKOFF, 2 // block size (in bytes now)
// Copy the last encryption round's keys.
- movdqu xmm0, [esi]
- movdqu [edi], xmm0
- cmp ebx, 16
+ movdqu xmm0, [SI]
+ movdqu [DI], xmm0
+ cmp BLKOFF, 16
jbe 9f
- movdqu xmm0, [esi + 16]
- movdqu [edi + 16], xmm0
+ movdqu xmm0, [SI + 16]
+ movdqu [DI + 16], xmm0
// Update the loop variables and stop if we've finished.
-9: add edi, ebx
- sub esi, ebx
- sub ecx, 1
+9: add DI, BLKOFFo
+ sub SI, BLKOFFo
+ sub NR, 1
jbe 0f
// Do another middle round's keys...
- movdqu xmm0, [esi]
+ movdqu xmm0, [SI]
aesimc xmm0, xmm0
- movdqu [edi], xmm0
- cmp ebx, 16
+ movdqu [DI], xmm0
+ cmp BLKOFF, 16
jbe 9b
- movdqu xmm0, [esi + 16]
+ movdqu xmm0, [SI + 16]
aesimc xmm0, xmm0
- movdqu [edi + 16], xmm0
+ movdqu [DI + 16], xmm0
jmp 9b
// Finally do the first encryption round.
-0: movdqu xmm0, [esi]
- movdqu [edi], xmm0
- cmp ebx, 16
+0: movdqu xmm0, [SI]
+ movdqu [DI], xmm0
+ cmp BLKOFF, 16
jbe 0f
- movdqu xmm0, [esi + 16]
- movdqu [edi + 16], xmm0
+ movdqu xmm0, [SI + 16]
+ movdqu [DI + 16], xmm0
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy SSE toys for this.
-0: cmp ebx, 16
+0: cmp BLKOFF, 16
je 0f
// Find the byte-reordering table.
ldgot ecx
movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+#if NKW_NEEDS_REFRESH
// Calculate the number of subkey words again. (It's a good job
// we've got a fast multiplier.)
- mov ecx, [ebp + nr]
- add ecx, 1
- imul ecx, [esp + 24] // total keys in words
+ mov NKW, [CTX + nr]
+ add NKW, 1
+ imul NKW, BLKSZ
+#endif
// End-swap the encryption keys.
- mov eax, ecx
- lea esi, [ebp + w]
+ mov ecx, NKW
+ lea SI, [CTX + w]
call endswap_block
// And the decryption keys.
- mov ecx, eax
- lea esi, [ebp + wi]
+ mov ecx, NKW
+ lea SI, [CTX + wi]
call endswap_block
- // All done.
-0: pop edi
+0: // All done.
+#if CPUFAM_X86
+ pop edi
pop esi
pop ebx
pop ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ pop rdi
+ pop rsi
+#endif
ret
.align 16
endswap_block:
- // End-swap ECX words starting at ESI. The end-swapping table is
+ // End-swap ECX words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
- movdqu xmm1, [esi]
+ movdqu xmm1, [SI]
pshufb xmm1, xmm5
- movdqu [esi], xmm1
- add esi, 16
+ movdqu [SI], xmm1
+ add SI, 16
sub ecx, 4
ja endswap_block
ret
+#undef CTX
+#undef BLKSZ
+#undef SI
+#undef DI
+#undef KSZ
+#undef KSZo
+#undef RCON
+#undef LIMn
+#undef LIM
+#undef NR
+#undef LRK
+#undef LRKo
+#undef BLKOFF
+#undef BLKOFFo
+
ENDFUNC
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
.macro encdec op, aes, koff
-FUNC(rijndael_\op\()_x86_aesni)
-
- // On entry, we have:
- // [esp + 4] points to the context block
- // [esp + 8] points to the input data block
- // [esp + 12] points to the output buffer
+FUNC(rijndael_\op\()_x86ish_aesni)
// Find the magic endianness-swapping table.
ldgot ecx
movdqa xmm5, [INTADDR(endswap_tab, ecx)]
- // Load the input block and end-swap it. Also, start loading the
- // keys.
- mov eax, [esp + 8]
- movdqu xmm0, [eax]
+#if CPUFAM_X86
+ // Arguments come in on the stack, and need to be collected. We
+ // don't have a shortage of registers.
+
+# define K ecx
+# define SRC edx
+# define DST edx
+# define NR eax
+
+ mov K, [esp + 4]
+ mov SRC, [esp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // Arguments come in registers. All is good.
+
+# define K rdi
+# define SRC rsi
+# define DST rdx
+# define NR eax
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in different registers.
+
+# define K rcx
+# define SRC rdx
+# define DST r8
+# define NR eax
+#endif
+
+ // Initial setup.
+ movdqu xmm0, [SRC]
pshufb xmm0, xmm5
- mov eax, [esp + 4]
- lea edx, [eax + \koff]
- mov eax, [eax + nr]
+ mov NR, [K + nr]
+ add K, \koff
// Initial whitening.
- movdqu xmm1, [edx]
- add edx, 16
+ movdqu xmm1, [K]
+ add K, 16
pxor xmm0, xmm1
// Dispatch to the correct code.
- cmp eax, 10
+ cmp NR, 10
je 10f
jb bogus
- cmp eax, 14
+ cmp NR, 14
je 14f
ja bogus
- cmp eax, 12
+ cmp NR, 12
je 12f
jb 11f
jmp 13f
.align 2
// 14 rounds...
-14: movdqu xmm1, [edx]
- add edx, 16
+14: movdqu xmm1, [K]
+ add K, 16
\aes xmm0, xmm1
// 13 rounds...
-13: movdqu xmm1, [edx]
- add edx, 16
+13: movdqu xmm1, [K]
+ add K, 16
\aes xmm0, xmm1
// 12 rounds...
-12: movdqu xmm1, [edx]
- add edx, 16
+12: movdqu xmm1, [K]
+ add K, 16
\aes xmm0, xmm1
// 11 rounds...
-11: movdqu xmm1, [edx]
- add edx, 16
+11: movdqu xmm1, [K]
+ add K, 16
\aes xmm0, xmm1
// 10 rounds...
-10: movdqu xmm1, [edx]
+10: movdqu xmm1, [K]
\aes xmm0, xmm1
// 9 rounds...
- movdqu xmm1, [edx + 16]
+ movdqu xmm1, [K + 16]
\aes xmm0, xmm1
// 8 rounds...
- movdqu xmm1, [edx + 32]
+ movdqu xmm1, [K + 32]
\aes xmm0, xmm1
// 7 rounds...
- movdqu xmm1, [edx + 48]
+ movdqu xmm1, [K + 48]
\aes xmm0, xmm1
// 6 rounds...
- movdqu xmm1, [edx + 64]
+ movdqu xmm1, [K + 64]
\aes xmm0, xmm1
// 5 rounds...
- movdqu xmm1, [edx + 80]
+ movdqu xmm1, [K + 80]
\aes xmm0, xmm1
// 4 rounds...
- movdqu xmm1, [edx + 96]
+ movdqu xmm1, [K + 96]
\aes xmm0, xmm1
// 3 rounds...
- movdqu xmm1, [edx + 112]
+ movdqu xmm1, [K + 112]
\aes xmm0, xmm1
// 2 rounds...
- movdqu xmm1, [edx + 128]
+ movdqu xmm1, [K + 128]
\aes xmm0, xmm1
// Final round...
- movdqu xmm1, [edx + 144]
+ movdqu xmm1, [K + 144]
\aes\()last xmm0, xmm1
// Unpermute the ciphertext block and store it.
pshufb xmm0, xmm5
- mov eax, [esp + 12]
- movdqu [eax], xmm0
+#if CPUFAM_X86
+ mov DST, [esp + 12]
+#endif
+ movdqu [DST], xmm0
// And we're done.
ret
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
ENDFUNC
.endm
uint32 d[4]),
(k, s, d), pick_dblk, simple_dblk)
-#ifdef CPUFAM_X86
-extern rijndael_eblk__functype rijndael_eblk_x86_aesni;
-extern rijndael_dblk__functype rijndael_dblk_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
#endif
static rijndael_eblk__functype *pick_eblk(void)
{
-#ifdef CPUFAM_X86
- DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk);
static rijndael_dblk__functype *pick_dblk(void)
{
-#ifdef CPUFAM_X86
- DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk);
.arch pentium4
.section .text
-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+ // Initial setup.
+
+#if CPUFAM_X86
+ // Arguments come in on the stack, and will need to be collected. We
+	// can get away with just the scratch registers for integer work,
+ // but we'll run out of XMM registers and will need some properly
+ // aligned space which we'll steal from the stack. I don't trust the
+ // stack pointer's alignment, so I'll have to mask the stack pointer,
+ // which in turn means I'll need to keep track of the old value.
+ // Hence I'm making a full i386-style stack frame here.
+ //
+ // The Windows and SysV ABIs are sufficiently similar that we don't
+ // need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]
- // Initial state. We have three arguments:
- // [ebp + 8] is the number of rounds to do
- // [ebp + 12] points to the input matrix
- // [ebp + 16] points to the output matrix
push ebp
mov ebp, esp
sub esp, 32
- mov edx, [ebp + 12]
+ mov IN, [ebp + 12]
+ mov OUT, [ebp + 16]
and esp, ~15
-
- // Prepare for the main loop.
- mov ecx, [ebp + 8]
+ mov NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // This is nice. We have plenty of XMM registers, and the arguments
+ // are in useful places. There's no need to spill anything and we
+ // can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in registers, but they're different between Windows
+ // and everyone else (and everyone else is saner).
+ //
+ // The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  For
+	// two of them, we only need to save a copy of the input for the
+	// feedforward at the end; but the other two we want for the final
+	// permutation, so save the old values on the stack.  (We need an
+	// extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+ sub rsp, 64 + 8
+ movdqa [rsp + 0], xmm6
+ movdqa [rsp + 16], xmm7
+#endif
// First job is to slurp the matrix into XMM registers. The words
// have already been permuted conveniently to make them line up
// [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
// [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
// [12 13 14 15] [12 1 6 11] (d, xmm3)
- movdqu xmm0, [edx + 0]
- movdqu xmm1, [edx + 16]
- movdqu xmm2, [edx + 32]
- movdqu xmm3, [edx + 48]
+ movdqu xmm0, [IN + 0]
+ movdqu xmm1, [IN + 16]
+ movdqu xmm2, [IN + 32]
+ movdqu xmm3, [IN + 48]
- // Take a copy for later.
- movdqa [esp + 0], xmm0
- movdqa [esp + 16], xmm1
- movdqa xmm6, xmm2
- movdqa xmm7, xmm3
+	// Take a copy for later.
+ movdqa SAVE0, xmm0
+ movdqa SAVE1, xmm1
+ movdqa SAVE2, xmm2
+ movdqa SAVE3, xmm3
loop:
-
// Apply a column quarterround to each of the columns simultaneously.
// Alas, there doesn't seem to be a packed doubleword rotate, so we
// have to synthesize it.
// involve any movement of elements between rows.
//
// [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
- // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
- // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
- // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
+ // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
+ // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
+ // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
//
// The shuffles have quite high latency, so they've been pushed
// backwards into the main instruction list.
// back the shuffles because they take a long time coming through.
// Decrement the loop counter and see if we should go round again.
// Later processors fuse this pair into a single uop.
- sub ecx, 2
+ sub NR, 2
ja loop
// Almost there. Firstly, the feedforward addition, and then we have
// which was already applied to the input. Shuffling has quite high
// latency, so arrange to start a new shuffle into a temporary as
// soon as we've written out the old value.
- mov edx, [ebp + 16]
-
- paddd xmm0, [esp + 0]
- pshufd xmm4, xmm0, ROTR
- movd [edx + 0], xmm0
+ paddd xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+ movd [OUT + 0], xmm0
- paddd xmm1, [esp + 16]
+ paddd xmm1, SAVE1
pshufd xmm5, xmm1, ROTL
- movd [edx + 16], xmm1
+ movd [OUT + 16], xmm1
- paddd xmm2, xmm6
+ paddd xmm2, SAVE2
pshufd xmm6, xmm2, ROT2
- movd [edx + 32], xmm2
+ movd [OUT + 32], xmm2
- paddd xmm3, xmm7
+ paddd xmm3, SAVE3
pshufd xmm7, xmm3, ROTR
- movd [edx + 48], xmm3
+ movd [OUT + 48], xmm3
- movd [edx + 4], xmm7
+ movd [OUT + 4], xmm7
pshufd xmm7, xmm3, ROT2
- movd [edx + 24], xmm7
+ movd [OUT + 24], xmm7
pshufd xmm3, xmm3, ROTL
- movd [edx + 44], xmm3
+ movd [OUT + 44], xmm3
- movd [edx + 8], xmm6
+ movd [OUT + 8], xmm6
pshufd xmm6, xmm2, ROTL
- movd [edx + 28], xmm6
+ movd [OUT + 28], xmm6
pshufd xmm2, xmm2, ROTR
- movd [edx + 52], xmm2
+ movd [OUT + 52], xmm2
- movd [edx + 12], xmm5
+ movd [OUT + 12], xmm5
pshufd xmm5, xmm1, ROTR
- movd [edx + 36], xmm5
+ movd [OUT + 36], xmm5
pshufd xmm1, xmm1, ROT2
- movd [edx + 56], xmm1
+ movd [OUT + 56], xmm1
- movd [edx + 20], xmm4
+ movd [OUT + 20], xmm4
pshufd xmm4, xmm0, ROT2
- movd [edx + 40], xmm4
+ movd [OUT + 40], xmm4
pshufd xmm0, xmm0, ROTL
- movd [edx + 60], xmm0
+ movd [OUT + 60], xmm0
// Tidy things up.
+
+#if CPUFAM_X86
mov esp, ebp
pop ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ movdqa xmm6, [rsp + 0]
+ movdqa xmm7, [rsp + 16]
+ add rsp, 64 + 8
+#endif
// And with that, we're done.
ret
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
ENDFUNC
///----- That's all, folks --------------------------------------------------
salsa20_matrix dest)
{ SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
-#ifdef CPUFAM_X86
-extern core__functype salsa20_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
#endif
static core__functype *pick_core(void)
{
-#ifdef CPUFAM_X86
- DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);