From 0f23f75ff53acadf80e9d3dfd2dfd14cb526074f Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Sat, 21 May 2016 14:33:28 +0100 Subject: [PATCH] Add support for AMD64 processors and Microsoft Windows. * Slightly modify CPU-feature-probing code in `base/dispatch.c', mostly to use wider registers for the stack operations since there are no 32-bit `push' instructions. The feature codes are the same for both, so there's no corresponding header-file change. * Add appropriate macros to `base/asm-common.h' for dealing with PIC on AMD64. It's refreshingly straightforward. * Modify the existing assembler code to support the new environments. This is mostly tuning register allocation and prologue/epilogue sequences. * Use the fancy code on the new platforms. --- base/asm-common.h | 33 +- base/dispatch.c | 43 ++- configure.ac | 5 +- symm/Makefile.am | 15 +- symm/{chacha-x86-sse2.S => chacha-x86ish-sse2.S} | 111 ++++-- symm/chacha.c | 8 +- symm/rijndael-base.c | 8 +- ...ijndael-x86-aesni.S => rijndael-x86ish-aesni.S} | 428 ++++++++++++++------- symm/rijndael.c | 14 +- symm/{salsa20-x86-sse2.S => salsa20-x86ish-sse2.S} | 161 +++++--- symm/salsa20.c | 8 +- 11 files changed, 594 insertions(+), 240 deletions(-) rename symm/{chacha-x86-sse2.S => chacha-x86ish-sse2.S} (65%) rename symm/{rijndael-x86-aesni.S => rijndael-x86ish-aesni.S} (53%) rename symm/{salsa20-x86-sse2.S => salsa20-x86ish-sse2.S} (63%) diff --git a/base/asm-common.h b/base/asm-common.h index 7e62eb54..8745ea43 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -58,9 +58,22 @@ F(name): \ #endif ///-------------------------------------------------------------------------- -/// x86-specific hacking. +/// Windows-specific hacking. + +#if ABI_WIN #if CPUFAM_X86 +# define F(name) _##name +#endif + +#endif + +///-------------------------------------------------------------------------- +/// x86- and amd64-specific hacking. +/// +/// It's (slightly) easier to deal with both of these in one go. + +#if CPUFAM_X86 || CPUFAM_AMD64 // Set the function hooks. #define FUNC_PREHOOK(_) .balign 16 @@ -86,7 +99,7 @@ F(name): \ // Maybe load GOT address into GOT. .macro ldgot got=GOTREG -#if WANT_PIC +#if WANT_PIC && CPUFAM_X86 call _where_am_i.\got add \got, offset _GLOBAL_OFFSET_TABLE_ #endif @@ -94,7 +107,7 @@ F(name): \ // Maybe build a helper subroutine for `ldgot GOT'. .macro gotaux got=GOTREG -#if WANT_PIC +#if WANT_PIC && CPUFAM_X86 .align 16 _where_am_i.\got : mov \got, [esp] @@ -105,9 +118,19 @@ _where_am_i.\got : // Load address of external symbol ADDR into REG, maybe using GOT. .macro leaext reg, addr, got=GOTREG #if WANT_PIC +# if CPUFAM_X86 mov \reg, [\got + \addr@GOT] +# endif +# if CPUFAM_AMD64 + mov \reg, \addr@GOTPCREL[rip] +# endif #else +# if CPUFAM_X86 mov \reg, offset \addr +# endif +# if CPUFAM_AMD64 + lea \reg, \addr[rip] +# endif #endif .endm @@ -115,7 +138,9 @@ _where_am_i.\got : // referring to ADDR, which is within our module, maybe using GOT. #define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy) #define INTADDR__0(addr, got, ...) 
 	INTADDR__1(addr, got)
-#if WANT_PIC
+#if CPUFAM_AMD64
+# define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
 # define INTADDR__1(addr, got) got + addr@GOTOFF
 #else
 # define INTADDR__1(addr, got) addr
diff --git a/base/dispatch.c b/base/dispatch.c
index 61c45fa7..8936ea4a 100644
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -41,7 +41,7 @@
 
 /*----- Intel x86/AMD64 feature probing -----------------------------------*/
 
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
 
 # define EFLAGS_ID (1u << 21)
 # define CPUID1D_SSE2 (1u << 26)
@@ -64,6 +64,7 @@ struct cpuid { unsigned a, b, c, d; };
  */
 
 #ifdef __GNUC__
+# if CPUFAM_X86
 static __inline__ unsigned getflags(void)
   { unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
 static __inline__ unsigned setflags(unsigned f)
@@ -74,6 +75,18 @@ static __inline__ unsigned setflags(unsigned f)
 	   : "g" (f));
   return (ff);
 }
+# else
+static __inline__ unsigned long getflags(void)
+  { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long setflags(unsigned long f)
+{
+  unsigned long ff;
+  __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+	   : "=g" (ff)
+	   : "g" (f));
+  return (ff);
+}
+# endif
 #endif
 
 static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
@@ -97,9 +110,19 @@ static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
   /* Alas, EBX is magical in PIC code, so abuse ESI instead.  This isn't
    * pretty, but it works.
    */
+# if CPUFAM_X86
   __asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
 	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
 	   : "a" (a) , "c" (c));
+# elif CPUFAM_AMD64
+  __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+	   : "a" (a) , "c" (c));
+# else
+# error "I'm confused."
+# endif
+  dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+		 a, c, cc->a, cc->b, cc->c, cc->d);
 #else
   dispatch_debug("GNU inline assembler not available; can't CPUID");
 #endif
@@ -141,6 +164,7 @@ static int xmm_registers_available_p(void)
    * XMM registers are actually alive.
    */
   if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+# if CPUFAM_X86
   __asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
 	   "fxsave (%%esp)\n"
 	   "movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
 	   "fxrstor (%%esp); fxsave (%%esp)\n"
 	   "movl 160(%%esp), %%ecx; movl %%eax, 160(%%esp)\n"
 	   "fxrstor (%%esp); movl %%edx, %%esp\n"
 	   "xorl %%ecx, %%eax"
 	   : "=a" (f)
 	   : /* no inputs */
 	   : "%ecx", "%edx");
+# elif CPUFAM_AMD64
+  __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+	   "fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+	   "xorl %%ecx, %%eax"
+	   : "=a" (f)
+	   : /* no inputs */
+	   : "%ecx", "%rdx");
+# else
+# error "I'm confused."
+# endif
+  dispatch_debug("XMM registers %savailable", f ? "" : "not ");
   return (f);
 #else
   dispatch_debug("GNU inline assembler not available; can't check for XMM");
 #endif
@@ -257,7 +296,7 @@ int cpu_feature_p(int feat)
     return (feat_debug(ftok, "runtime probe", cond));
 
   switch (feat) {
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
     CASE_CPUFEAT(X86_SSE2, "x86:sse2",
 		 xmm_registers_available_p() &&
 		 cpuid_features_p(CPUID1D_SSE2, 0));
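[Aside -- not part of the patch.  For orientation: `getflags' and
`setflags' exist so that a caller can test whether `cpuid' is available at
all, by toggling the ID bit in EFLAGS and seeing whether the change
sticks.  A minimal sketch of that probe, assuming the helpers above; the
function name here is hypothetical, and the real caller lives elsewhere in
`dispatch.c'.]

    /* Sketch only: report whether the `cpuid' instruction exists.  (On
     * a 32-bit build the flags words are plain `unsigned'.)
     */
    static int cpuid_available_p(void)
    {
      unsigned long f = getflags();
      unsigned long g = setflags(f ^ EFLAGS_ID);

      /* If the ID bit stayed toggled then `cpuid' is supported. */
      return ((f ^ g) & EFLAGS_ID ? 1 : 0);
    }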
diff --git a/configure.ac b/configure.ac
index b76c5619..8a58d782 100644
--- a/configure.ac
+++ b/configure.ac
@@ -55,7 +55,10 @@ dnl The table of CPU families and ABIs which we might support.  Support is
 dnl not uniform: each dispatched function might or might not have an
 dnl implementation for any particular CPU/ABI combination.
 AC_DEFUN([catacomb_CPU_FAMILIES],
-  [$1([i[[3-6]]86,*], [x86], [sysv])])
+  [$1([i[[3-6]]86,cygwin], [x86], [win])
+   $1([i[[3-6]]86,*], [x86], [sysv])
+   $1([x86_64,cygwin], [amd64], [win])
+   $1([x86_64,*], [amd64], [sysv])])
 
 dnl A utility to clear the `seen' flags, used so as to process each CPU or
 dnl ABI once.
diff --git a/symm/Makefile.am b/symm/Makefile.am
index ba037cd5..e78277b7 100644
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -181,7 +181,10 @@ BLKCS += rc5
 BLKCS += rijndael rijndael192 rijndael256
 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
 if CPUFAM_X86
-libsymm_la_SOURCES += rijndael-x86-aesni.S
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
 endif
 libsymm_la_SOURCES += $(precomp)/rijndael-tab.c
 PRECOMPS += $(precomp)/rijndael-tab.c
@@ -382,7 +385,10 @@ EXTRA_DIST += salsa20-tvconv
 pkginclude_HEADERS += salsa20.h salsa20-core.h
 libsymm_la_SOURCES += salsa20.c
 if CPUFAM_X86
-libsymm_la_SOURCES += salsa20-x86-sse2.S
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
 endif
 TESTS += salsa20.$t
 ALL_CIPHERS += salsa20 salsa2012 salsa208
@@ -411,7 +417,10 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV)
 pkginclude_HEADERS += chacha.h chacha-core.h
 libsymm_la_SOURCES += chacha.c
 if CPUFAM_X86
-libsymm_la_SOURCES += chacha-x86-sse2.S
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
 endif
 TESTS += chacha.$t
 EXTRA_DIST += t/chacha
diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86ish-sse2.S
similarity index 65%
rename from symm/chacha-x86-sse2.S
rename to symm/chacha-x86ish-sse2.S
index ccdfa538..f36bf90f 100644
--- a/symm/chacha-x86-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -44,17 +44,73 @@
 	.arch	pentium4
 	.section .text
 
-FUNC(chacha_core_x86_sse2)
+FUNC(chacha_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.
+	// We can get away with just the scratch registers for integer
+	// work, but we'll run out of XMM registers and will need some
+	// properly aligned space which we'll steal from the stack.  I
+	// don't trust the stack pointer's alignment, so I'll have to mask
+	// the stack pointer, which in turn means I'll need to keep track
+	// of the old value.  Hence I'm making a full i386-style stack
+	// frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 [esp]
 
-	// Initial state.  We have three arguments:
-	// [ebp +  8] is the number of rounds to do
-	// [ebp + 12] points to the input matrix
-	// [ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 16
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+ +# define NR edi +# define IN rsi +# define OUT rdx +# define SAVE0 xmm5 +# define SAVE1 xmm6 +# define SAVE2 xmm7 +# define SAVE3 xmm8 +#endif + +#if CPUFAM_AMD64 && ABI_WIN + // Arguments come in registers, but they're different between Windows + // and everyone else (and everyone else is saner). + // + // The Windows ABI insists that we preserve some of the XMM + // registers, but we want more than we can use as scratch space. We + // only need to save a copy of the input for the feedforward at the + // end, so we might as well use memory rather than spill extra + // registers. (We need an extra 8 bytes to align the stack.) + +# define NR ecx +# define IN rdx +# define OUT r8 +# define SAVE0 xmm5 +# define SAVE1 [rsp + 0] +# define SAVE2 [rsp + 16] +# define SAVE3 [rsp + 32] + + sub rsp, 48 + 8 +#endif // First job is to slurp the matrix into XMM registers. Be careful: // the input matrix isn't likely to be properly aligned. @@ -63,20 +119,17 @@ FUNC(chacha_core_x86_sse2) // [ 4 5 6 7] (b, xmm1) // [ 8 9 10 11] (c, xmm2) // [12 13 14 15] (d, xmm3) - movdqu xmm0, [edx + 0] - movdqu xmm1, [edx + 16] - movdqu xmm2, [edx + 32] - movdqu xmm3, [edx + 48] - - // Prepare for the main loop. - mov ecx, [ebp + 8] + movdqu xmm0, [IN + 0] + movdqu xmm1, [IN + 16] + movdqu xmm2, [IN + 32] + movdqu xmm3, [IN + 48] // Take a copy for later. This one is aligned properly, by // construction. - movdqa [esp], xmm0 - movdqa xmm5, xmm1 - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 + movdqa SAVE0, xmm0 + movdqa SAVE1, xmm1 + movdqa SAVE2, xmm2 + movdqa SAVE3, xmm3 loop: // Apply a column quarterround to each of the columns simultaneously. @@ -174,26 +227,30 @@ loop: pshufd xmm1, xmm1, ROTL // Decrement the loop counter and see if we should go round again. - sub ecx, 2 + sub NR, 2 ja loop // Almost there. Firstly, the feedforward addition. - mov edx, [ebp + 16] - paddd xmm0, [esp] - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 + paddd xmm0, SAVE0 + paddd xmm1, SAVE1 + paddd xmm2, SAVE2 + paddd xmm3, SAVE3 // And now we write out the result. This one won't be aligned // either. - movdqu [edx + 0], xmm0 - movdqu [edx + 16], xmm1 - movdqu [edx + 32], xmm2 - movdqu [edx + 48], xmm3 + movdqu [OUT + 0], xmm0 + movdqu [OUT + 16], xmm1 + movdqu [OUT + 32], xmm2 + movdqu [OUT + 48], xmm3 // Tidy things up. +#if CPUFAM_X86 mov esp, ebp pop ebp +#endif +#if CPUFAM_AMD64 && ABI_WIN + add rsp, 48 + 8 +#endif // And with that, we're done. 
 	ret
diff --git a/symm/chacha.c b/symm/chacha.c
index 5683c8e9..80a84c17 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -72,14 +72,14 @@ static void simple_core(unsigned r, const chacha_matrix src,
 			chacha_matrix dest)
   { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
 
-#ifdef CPUFAM_X86
-extern core__functype chacha_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype chacha_core_x86ish_sse2;
 #endif
 
 static core__functype *pick_core(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(chacha_core, chacha_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
   DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
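[Aside -- not part of the patch.  The `CPU_DISPATCH' machinery in
`dispatch.h' is what connects `pick_core' to the function the rest of the
library calls: in outline, it generates an indirection that consults
`pick_core' once, the first time through, and caches the winner.  The
sketch below is just the shape of the idea, not the real macro expansion.]

    /* Rough shape of what CPU_DISPATCH arranges for the ChaCha core. */
    static core__functype *core__ptr = 0;

    static void core(unsigned r, const chacha_matrix src,
                     chacha_matrix dest)
    {
      if (!core__ptr) core__ptr = pick_core();
      core__ptr(r, src, dest);
    }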
diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c
index 3d2bb8ef..b0505a66 100644
--- a/symm/rijndael-base.c
+++ b/symm/rijndael-base.c
@@ -116,14 +116,14 @@ CPU_DISPATCH(static, EMPTY, void, setup,
 	     (rijndael_ctx *k, unsigned nb, const void *buf, unsigned nk),
 	     (k, nb, buf, nk),
 	     pick_setup, simple_setup)
 
-#ifdef CPUFAM_X86
-extern setup__functype rijndael_setup_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern setup__functype rijndael_setup_x86ish_aesni;
 #endif
 
 static setup__functype *pick_setup(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
   DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup);
diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86ish-aesni.S
similarity index 53%
rename from symm/rijndael-x86-aesni.S
rename to symm/rijndael-x86ish-aesni.S
index c0cd437a..91fcc352 100644
--- a/symm/rijndael-x86-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -72,45 +72,137 @@
 
 ///--------------------------------------------------------------------------
 /// Key setup.
 
-FUNC(rijndael_setup_x86_aesni)
+FUNC(rijndael_setup_x86ish_aesni)
 
-	// Initial state.  We have four arguments:
-	// [esp + 20] is the context pointer
-	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
-	// [esp + 28] points to the key material, unaligned
-	// [esp + 32] is the size of the key, in words
-	// The key size has already been checked for validity, and the number
-	// of rounds has been computed.  Our job is only to fill in the `w'
-	// and `wi' vectors.
+#if CPUFAM_X86
+	// Arguments are on the stack.  We'll need to stack the caller's
+	// register variables, but we'll manage.
+
+# define CTX ebp			// context pointer
+# define BLKSZ [esp + 24]		// block size
+
+# define SI esi				// source pointer
+# define DI edi				// destination pointer
+
+# define KSZ ebx			// key size
+# define KSZo ebx			// ... as address offset
+# define NKW edx			// total number of key words
+# define NKW_NEEDS_REFRESH 1		// ... needs recalculating
+# define RCON ecx			// round constants table
+# define LIM edx			// limit pointer
+# define LIMn edx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo eax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo edx			// ... as address offset
+
+	// Stack the caller's registers.
 	push	ebp
 	push	ebx
 	push	esi
 	push	edi
 
+	// Set up our own variables.
+	mov	CTX, [esp + 20]		// context base pointer
+	mov	SI, [esp + 28]		// key material
+	mov	KSZ, [esp + 32]		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// Arguments are in registers.  We have plenty, but, to be honest,
+	// the initial register allocation is a bit annoying.
+
+# define CTX r8				// context pointer
+# define BLKSZ r9d			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ edx			// key size
+# define KSZo rdx			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF r9d			// block size in bytes
+# define BLKOFFo r9			// ... as address offset
+
+	// Move arguments to more useful places.
+	mov	CTX, rdi		// context base pointer
+	mov	BLKSZ, esi		// block size in words
+	mov	SI, rdx			// key material
+	mov	KSZ, ecx		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments are in different registers, and they're a little tight.
+
+# define CTX r8				// context pointer
+# define BLKSZ edx			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ r9d			// key size
+# define KSZo r9			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo rdx			// ... as address offset
+
+	// We'll need the index registers, which belong to the caller in
+	// this ABI.
+	push	rsi
+	push	rdi
+
+	// Move arguments to more useful places.
+	mov	SI, r8			// key material
+	mov	CTX, rcx		// context base pointer
+#endif
+
 	// The initial round key material is taken directly from the input
 	// key, so copy it over.
-	mov	ebp, [esp + 20]		// context base pointer
-	mov	ebx, [esp + 32]		// key size, in words
-	mov	ecx, ebx
-	mov	esi, [esp + 28]
-	lea	edi, [ebp + w]
+#if CPUFAM_AMD64 && ABI_SYSV
+	// We've been lucky.  We already have a copy of the context pointer
+	// in rdi, and the key size in ecx.
+	add	DI, w
+#else
+	lea	DI, [CTX + w]
+	mov	ecx, KSZ
+#endif
 	rep	movsd
 
 	// Find out other useful things.
-	mov	edx, [ebp + nr]		// number of rounds
-	add	edx, 1
-	imul	edx, [esp + 24]		// total key size in words
-	sub	edx, ebx		// offset by the key size
+	mov	NKW, [CTX + nr]		// number of rounds
+	add	NKW, 1
+	imul	NKW, BLKSZ		// total key size in words
+#if !NKW_NEEDS_REFRESH
+	// If we can't keep NKW for later, then we use the same register for
+	// it and LIM, so this move is unnecessary.
+	mov	LIMn, NKW
+#endif
+	sub	LIMn, KSZ		// offset by the key size
 
 	// Find the round constants.
 	ldgot	ecx
-	leaext	ecx, rijndael_rcon, ecx
+	leaext	RCON, rijndael_rcon, ecx
 
 	// Prepare for the main loop.
-	lea	esi, [ebp + w]
-	mov	eax, [esi + 4*ebx - 4]	// most recent key word
-	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion
+	lea	SI, [CTX + w]
+	mov	eax, [SI + 4*KSZo - 4]	// most recent key word
+	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
 
 	// Main key expansion loop.  The first word of each key-length chunk
 	// needs special treatment.
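[Aside -- not part of the patch.  The hunk below is the key-expansion loop
itself.  For reference, this is the textbook AES recurrence it implements,
as a plain-C sketch: `subword' (S-box each byte) and `rotword' (rotate
left one byte) are hypothetical helpers standing in for what
`aeskeygenassist' computes, `rcon' is the round-constant table found by
`leaext' above, and byte-order details are glossed over.]

    /* Reference sketch of the expansion loop below; not library code. */
    static void expand(uint32_t *w, unsigned ksz, unsigned nkw,
                       const uint8_t *rcon)
    {
      unsigned i;
      uint32_t t;

      for (i = ksz; i < nkw; i++) {
        t = w[i - 1];
        if (i%ksz == 0) t = subword(rotword(t)) ^ rcon[i/ksz - 1];
        else if (ksz > 6 && i%ksz == 4) t = subword(t);
        w[i] = w[i - ksz] ^ t;
      }
    }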
@@ -131,76 +223,76 @@ FUNC(rijndael_setup_x86_aesni) aeskeygenassist xmm1, xmm0, 0 pshufd xmm1, xmm1, ROTL movd eax, xmm1 - xor eax, [esi] - xor al, [ecx] - inc ecx - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + xor al, [RCON] + inc RCON + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // The next three words are simple... - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 2...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 3...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // Word 4. If the key is /more/ than 6 words long, then we must // apply a substitution here. - cmp ebx, 5 + cmp KSZ, 5 jb 9b - cmp ebx, 7 + cmp KSZ, 7 jb 0f movd xmm0, eax pshufd xmm0, xmm0, ROTL aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 -0: xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx +0: xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 5...) - cmp ebx, 6 + cmp KSZ, 6 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 6...) - cmp ebx, 7 + cmp KSZ, 7 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 7...) - cmp ebx, 8 + cmp KSZ, 8 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // Must be done by now. @@ -219,130 +311,183 @@ FUNC(rijndael_setup_x86_aesni) // there's easily enough buffer space for the over-enthusiastic reads // and writes because the context has space for 32-byte blocks, which // is our maximum and an exact fit for two SSE registers. -8: mov ecx, [ebp + nr] // number of rounds - mov ebx, [esp + 24] // block size (in words) - mov edx, ecx - imul edx, ebx - lea edi, [ebp + wi] - lea esi, [ebp + 4*edx + w] // last round's keys - shl ebx, 2 // block size (in bytes now) +8: mov NR, [CTX + nr] // number of rounds +#if NKW_NEEDS_REFRESH + mov BLKOFF, BLKSZ + mov LRK, NR + imul LRK, BLKOFF +#else + // If we retain NKW, then BLKSZ and BLKOFF are the same register + // because we won't need the former again. + mov LRK, NKW + sub LRK, BLKSZ +#endif + lea DI, [CTX + wi] + lea SI, [CTX + w + 4*LRKo] // last round's keys + shl BLKOFF, 2 // block size (in bytes now) // Copy the last encryption round's keys. - movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 + movdqu xmm0, [SI] + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 9f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 + movdqu xmm0, [SI + 16] + movdqu [DI + 16], xmm0 // Update the loop variables and stop if we've finished. -9: add edi, ebx - sub esi, ebx - sub ecx, 1 +9: add DI, BLKOFFo + sub SI, BLKOFFo + sub NR, 1 jbe 0f // Do another middle round's keys... - movdqu xmm0, [esi] + movdqu xmm0, [SI] aesimc xmm0, xmm0 - movdqu [edi], xmm0 - cmp ebx, 16 + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 9b - movdqu xmm0, [esi + 16] + movdqu xmm0, [SI + 16] aesimc xmm0, xmm0 - movdqu [edi + 16], xmm0 + movdqu [DI + 16], xmm0 jmp 9b // Finally do the first encryption round. 
-0: movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 +0: movdqu xmm0, [SI] + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 0f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 + movdqu xmm0, [SI + 16] + movdqu [DI + 16], xmm0 // If the block size is not exactly four words then we must end-swap // everything. We can use fancy SSE toys for this. -0: cmp ebx, 16 +0: cmp BLKOFF, 16 je 0f // Find the byte-reordering table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] +#if NKW_NEEDS_REFRESH // Calculate the number of subkey words again. (It's a good job // we've got a fast multiplier.) - mov ecx, [ebp + nr] - add ecx, 1 - imul ecx, [esp + 24] // total keys in words + mov NKW, [CTX + nr] + add NKW, 1 + imul NKW, BLKSZ +#endif // End-swap the encryption keys. - mov eax, ecx - lea esi, [ebp + w] + mov ecx, NKW + lea SI, [CTX + w] call endswap_block // And the decryption keys. - mov ecx, eax - lea esi, [ebp + wi] + mov ecx, NKW + lea SI, [CTX + wi] call endswap_block - // All done. -0: pop edi +0: // All done. +#if CPUFAM_X86 + pop edi pop esi pop ebx pop ebp +#endif +#if CPUFAM_AMD64 && ABI_WIN + pop rdi + pop rsi +#endif ret .align 16 endswap_block: - // End-swap ECX words starting at ESI. The end-swapping table is + // End-swap ECX words starting at SI. The end-swapping table is // already loaded into XMM5; and it's OK to work in 16-byte chunks. - movdqu xmm1, [esi] + movdqu xmm1, [SI] pshufb xmm1, xmm5 - movdqu [esi], xmm1 - add esi, 16 + movdqu [SI], xmm1 + add SI, 16 sub ecx, 4 ja endswap_block ret +#undef CTX +#undef BLKSZ +#undef SI +#undef DI +#undef KSZ +#undef KSZo +#undef RCON +#undef LIMn +#undef LIM +#undef NR +#undef LRK +#undef LRKo +#undef BLKOFF +#undef BLKOFFo + ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. .macro encdec op, aes, koff -FUNC(rijndael_\op\()_x86_aesni) - - // On entry, we have: - // [esp + 4] points to the context block - // [esp + 8] points to the input data block - // [esp + 12] points to the output buffer +FUNC(rijndael_\op\()_x86ish_aesni) // Find the magic endianness-swapping table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] - // Load the input block and end-swap it. Also, start loading the - // keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] +#if CPUFAM_X86 + // Arguments come in on the stack, and need to be collected. We + // don't have a shortage of registers. + +# define K ecx +# define SRC edx +# define DST edx +# define NR eax + + mov K, [esp + 4] + mov SRC, [esp + 8] +#endif + +#if CPUFAM_AMD64 && ABI_SYSV + // Arguments come in registers. All is good. + +# define K rdi +# define SRC rsi +# define DST rdx +# define NR eax +#endif + +#if CPUFAM_AMD64 && ABI_WIN + // Arguments come in different registers. + +# define K rcx +# define SRC rdx +# define DST r8 +# define NR eax +#endif + + // Initial setup. + movdqu xmm0, [SRC] pshufb xmm0, xmm5 - mov eax, [esp + 4] - lea edx, [eax + \koff] - mov eax, [eax + nr] + mov NR, [K + nr] + add K, \koff // Initial whitening. - movdqu xmm1, [edx] - add edx, 16 + movdqu xmm1, [K] + add K, 16 pxor xmm0, xmm1 // Dispatch to the correct code. - cmp eax, 10 + cmp NR, 10 je 10f jb bogus - cmp eax, 14 + cmp NR, 14 je 14f ja bogus - cmp eax, 12 + cmp NR, 12 je 12f jb 11f jmp 13f @@ -350,73 +495,80 @@ FUNC(rijndael_\op\()_x86_aesni) .align 2 // 14 rounds... -14: movdqu xmm1, [edx] - add edx, 16 +14: movdqu xmm1, [K] + add K, 16 \aes xmm0, xmm1 // 13 rounds... 
-13:	movdqu	xmm1, [edx]
-	add	edx, 16
+13:	movdqu	xmm1, [K]
+	add	K, 16
 	\aes	xmm0, xmm1
 
 	// 12 rounds...
-12:	movdqu	xmm1, [edx]
-	add	edx, 16
+12:	movdqu	xmm1, [K]
+	add	K, 16
 	\aes	xmm0, xmm1
 
 	// 11 rounds...
-11:	movdqu	xmm1, [edx]
-	add	edx, 16
+11:	movdqu	xmm1, [K]
+	add	K, 16
 	\aes	xmm0, xmm1
 
 	// 10 rounds...
-10:	movdqu	xmm1, [edx]
+10:	movdqu	xmm1, [K]
 	\aes	xmm0, xmm1
 
 	// 9 rounds...
-	movdqu	xmm1, [edx + 16]
+	movdqu	xmm1, [K + 16]
 	\aes	xmm0, xmm1
 
 	// 8 rounds...
-	movdqu	xmm1, [edx + 32]
+	movdqu	xmm1, [K + 32]
 	\aes	xmm0, xmm1
 
 	// 7 rounds...
-	movdqu	xmm1, [edx + 48]
+	movdqu	xmm1, [K + 48]
 	\aes	xmm0, xmm1
 
 	// 6 rounds...
-	movdqu	xmm1, [edx + 64]
+	movdqu	xmm1, [K + 64]
 	\aes	xmm0, xmm1
 
 	// 5 rounds...
-	movdqu	xmm1, [edx + 80]
+	movdqu	xmm1, [K + 80]
 	\aes	xmm0, xmm1
 
 	// 4 rounds...
-	movdqu	xmm1, [edx + 96]
+	movdqu	xmm1, [K + 96]
 	\aes	xmm0, xmm1
 
 	// 3 rounds...
-	movdqu	xmm1, [edx + 112]
+	movdqu	xmm1, [K + 112]
 	\aes	xmm0, xmm1
 
 	// 2 rounds...
-	movdqu	xmm1, [edx + 128]
+	movdqu	xmm1, [K + 128]
 	\aes	xmm0, xmm1
 
 	// Final round...
-	movdqu	xmm1, [edx + 144]
+	movdqu	xmm1, [K + 144]
 	\aes\()last xmm0, xmm1
 
 	// Unpermute the ciphertext block and store it.
 	pshufb	xmm0, xmm5
-	mov	eax, [esp + 12]
-	movdqu	[eax], xmm0
+#if CPUFAM_X86
+	mov	DST, [esp + 12]
+#endif
+	movdqu	[DST], xmm0
 
 	// And we're done.
 	ret
 
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
 ENDFUNC
 
 .endm
diff --git a/symm/rijndael.c b/symm/rijndael.c
index dcb35e61..293f28da 100644
--- a/symm/rijndael.c
+++ b/symm/rijndael.c
@@ -82,15 +82,15 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk,
 	     (const rijndael_ctx *k, uint32 d[4]),
 	     (k, s, d),
 	     pick_dblk, simple_dblk)
 
-#ifdef CPUFAM_X86
-extern rijndael_eblk__functype rijndael_eblk_x86_aesni;
-extern rijndael_dblk__functype rijndael_dblk_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
 #endif
 
 static rijndael_eblk__functype *pick_eblk(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
   DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk);
@@ -98,8 +98,8 @@ static rijndael_dblk__functype *pick_dblk(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
   DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk);
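[Aside -- not part of the patch.  The round-count dispatch in the `encdec'
macro above -- compare, then jump into the middle of an unrolled sequence
-- is the assembler rendition of a C switch with deliberate fallthrough.
For intuition only; `block', `aes_round' and `aes_last' are hypothetical
stand-ins for the XMM state and the AESENC/AESENCLAST (or
AESDEC/AESDECLAST) instructions.]

    #include <stdlib.h>

    typedef struct { unsigned char b[16]; } block;  /* hypothetical */
    extern block aes_round(block b, block k);
    extern block aes_last(block b, block k);

    static block do_rounds(block b, const block *k, unsigned nr)
    {
      unsigned i;

      switch (nr) {
        case 14: b = aes_round(b, *k++);        /* fall through */
        case 13: b = aes_round(b, *k++);        /* fall through */
        case 12: b = aes_round(b, *k++);        /* fall through */
        case 11: b = aes_round(b, *k++);        /* fall through */
        case 10:
          for (i = 0; i < 9; i++) b = aes_round(b, k[i]);
          return (aes_last(b, k[9]));
        default: abort();                       /* bogus round count */
      }
    }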
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86ish-sse2.S
similarity index 63%
rename from symm/salsa20-x86-sse2.S
rename to symm/salsa20-x86ish-sse2.S
index 7a5bd2a3..a168d79a 100644
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -44,20 +44,76 @@
 	.arch	pentium4
 	.section .text
 
-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.
+	// We can get away with just the scratch registers for integer
+	// work, but we'll run out of XMM registers and will need some
+	// properly aligned space which we'll steal from the stack.  I
+	// don't trust the stack pointer's alignment, so I'll have to mask
+	// the stack pointer, which in turn means I'll need to keep track
+	// of the old value.  Hence I'm making a full i386-style stack
+	// frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]
 
-	// Initial state.  We have three arguments:
-	// [ebp +  8] is the number of rounds to do
-	// [ebp + 12] points to the input matrix
-	// [ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 32
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
-
-	// Prepare for the main loop.
-	mov	ecx, [ebp + 8]
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between
+	// Windows and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.
+	// For two of them, we only need to save a copy of the input for
+	// the feedforward at the end; but the other two we want for the
+	// final permutation, so save the old values on the stack.  (We
+	// need an extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+	sub	rsp, 64 + 8
+	movdqa	[rsp +  0], xmm6
+	movdqa	[rsp + 16], xmm7
+#endif
 
 	// First job is to slurp the matrix into XMM registers.  The words
 	// have already been permuted conveniently to make them line up
@@ -85,19 +141,18 @@ FUNC(salsa20_core_x86_sse2)
 	// [ 0  1  2  3]     [ 0  5 10 15] (a, xmm0)
 	// [ 4  5  6  7] --> [ 4  9 14  3] (b, xmm1)
 	// [ 8  9 10 11]     [ 8 13  2  7] (c, xmm2)
 	// [12 13 14 15]     [12  1  6 11] (d, xmm3)
-	movdqu	xmm0, [edx +  0]
-	movdqu	xmm1, [edx + 16]
-	movdqu	xmm2, [edx + 32]
-	movdqu	xmm3, [edx + 48]
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
 
-	// Take a copy for later.
-	movdqa	[esp +  0], xmm0
-	movdqa	[esp + 16], xmm1
-	movdqa	xmm6, xmm2
-	movdqa	xmm7, xmm3
+	// Take a copy for later.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
 
 loop:
-
 	// Apply a column quarterround to each of the columns simultaneously.
 	// Alas, there doesn't seem to be a packed doubleword rotate, so we
 	// have to synthesize it.
@@ -147,9 +202,9 @@ loop:
 	// involve any movement of elements between rows.
 	//
 	// [ 0  5 10 15]     [ 0  5 10 15] (a, xmm0)
-	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
-	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
-	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
+	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
+	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
+	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
 	//
 	// The shuffles have quite high latency, so they've been pushed
 	// backwards into the main instruction list.
@@ -200,7 +255,7 @@ loop:
 	// back the shuffles because they take a long time coming through.
 	// Decrement the loop counter and see if we should go round again.
 	// Later processors fuse this pair into a single uop.
-	sub	ecx, 2
+	sub	NR, 2
 	ja	loop
 
 	// Almost there.  Firstly, the feedforward addition, and then we have
@@ -208,55 +263,69 @@ loop:
 	// which was already applied to the input.  Shuffling has quite high
 	// latency, so arrange to start a new shuffle into a temporary as
 	// soon as we've written out the old value.
-	mov	edx, [ebp + 16]
-
-	paddd	xmm0, [esp +  0]
-	pshufd	xmm4, xmm0, ROTR
-	movd	[edx +  0], xmm0
+	paddd	xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+	movd	[OUT +  0], xmm0
 
-	paddd	xmm1, [esp + 16]
+	paddd	xmm1, SAVE1
 	pshufd	xmm5, xmm1, ROTL
-	movd	[edx + 16], xmm1
+	movd	[OUT + 16], xmm1
 
-	paddd	xmm2, xmm6
+	paddd	xmm2, SAVE2
 	pshufd	xmm6, xmm2, ROT2
-	movd	[edx + 32], xmm2
+	movd	[OUT + 32], xmm2
 
-	paddd	xmm3, xmm7
+	paddd	xmm3, SAVE3
 	pshufd	xmm7, xmm3, ROTR
-	movd	[edx + 48], xmm3
+	movd	[OUT + 48], xmm3
 
-	movd	[edx +  4], xmm7
+	movd	[OUT +  4], xmm7
 	pshufd	xmm7, xmm3, ROT2
-	movd	[edx + 24], xmm7
+	movd	[OUT + 24], xmm7
 	pshufd	xmm3, xmm3, ROTL
-	movd	[edx + 44], xmm3
+	movd	[OUT + 44], xmm3
 
-	movd	[edx +  8], xmm6
+	movd	[OUT +  8], xmm6
 	pshufd	xmm6, xmm2, ROTL
-	movd	[edx + 28], xmm6
+	movd	[OUT + 28], xmm6
 	pshufd	xmm2, xmm2, ROTR
-	movd	[edx + 52], xmm2
+	movd	[OUT + 52], xmm2
 
-	movd	[edx + 12], xmm5
+	movd	[OUT + 12], xmm5
 	pshufd	xmm5, xmm1, ROTR
-	movd	[edx + 36], xmm5
+	movd	[OUT + 36], xmm5
 	pshufd	xmm1, xmm1, ROT2
-	movd	[edx + 56], xmm1
+	movd	[OUT + 56], xmm1
 
-	movd	[edx + 20], xmm4
+	movd	[OUT + 20], xmm4
 	pshufd	xmm4, xmm0, ROT2
-	movd	[edx + 40], xmm4
+	movd	[OUT + 40], xmm4
 	pshufd	xmm0, xmm0, ROTL
-	movd	[edx + 60], xmm0
+	movd	[OUT + 60], xmm0
 
 	// Tidy things up.
+
+#if CPUFAM_X86
 	mov	esp, ebp
 	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	movdqa	xmm6, [rsp +  0]
+	movdqa	xmm7, [rsp + 16]
+	add	rsp, 64 + 8
+#endif
 
 	// And with that, we're done.
 	ret
 
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
 ENDFUNC
 
 ///----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 15e4d50e..eb4e67ad 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -52,14 +52,14 @@ static void simple_core(unsigned r, const salsa20_matrix src,
 			salsa20_matrix dest)
   { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
 
-#ifdef CPUFAM_X86
-extern core__functype salsa20_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
 #endif
 
 static core__functype *pick_core(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
   DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
-- 
2.11.0
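[Aside -- not part of the patch.  For completeness, this is roughly what
the `DISPATCH_PICK_COND'/`DISPATCH_PICK_FALLBACK' pair used throughout
these files boils down to, with the debugging chatter the real macros also
emit left out.  A sketch for orientation, not the actual expansion.]

    static core__functype *pick_core(void)
    {
    #if CPUFAM_X86 || CPUFAM_AMD64
      if (cpu_feature_p(CPUFEAT_X86_SSE2))
        return (salsa20_core_x86ish_sse2);
    #endif
      return (simple_core);
    }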