Add support for AMD64 processors and Microsoft Windows.

author Mark Wooding <mdw@distorted.org.uk>

Sat, 21 May 2016 13:33:28 +0000 (14:33 +0100)

committer Mark Wooding <mdw@distorted.org.uk>

Sat, 21 May 2016 16:17:26 +0000 (17:17 +0100)
author Mark Wooding <mdw@distorted.org.uk>
Sat, 21 May 2016 13:33:28 +0000 (14:33 +0100)
committer Mark Wooding <mdw@distorted.org.uk>
Sat, 21 May 2016 16:17:26 +0000 (17:17 +0100)
diff --git a/base/asm-common.h b/base/asm-common.h

index 7e62eb5..8745ea4 100644 (file)
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -58,9 +58,22 @@ F(name):                                                             \
  #endif
  
  ///--------------------------------------------------------------------------
-/// x86-specific hacking.
+/// Windows-specific hacking.
+
+#if ABI_WIN
  
  #if CPUFAM_X86
+#  define F(name) _##name
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86- and amd64-specific hacking.
+///
+/// It's (slightly) easier to deal with both of these in one go.
+
+#if CPUFAM_X86 || CPUFAM_AMD64
  
  // Set the function hooks.
  #define FUNC_PREHOOK(_) .balign 16
@@ -86,7 +99,7 @@ F(name):                                                              \
  
  // Maybe load GOT address into GOT.
         .macro  ldgot got=GOTREG
-#if WANT_PIC
+#if WANT_PIC && CPUFAM_X86
         call    _where_am_i.\got
         add     \got, offset _GLOBAL_OFFSET_TABLE_
  #endif
@@ -94,7 +107,7 @@ F(name):                                                             \
  
  // Maybe build a helper subroutine for `ldgot GOT'.
         .macro  gotaux got=GOTREG
-#if WANT_PIC
+#if WANT_PIC && CPUFAM_X86
         .align  16
  _where_am_i.\got :
         mov     \got, [esp]
@@ -105,9 +118,19 @@ _where_am_i.\got :
  // Load address of external symbol ADDR into REG, maybe using GOT.
         .macro  leaext reg, addr, got=GOTREG
  #if WANT_PIC
+#  if CPUFAM_X86
         mov     \reg, [\got + \addr@GOT]
+#  endif
+#  if CPUFAM_AMD64
+       mov     \reg, \addr@GOTPCREL[rip]
+#  endif
  #else
+#  if CPUFAM_X86
         mov     \reg, offset \addr
+#  endif
+#  if CPUFAM_AMD64
+       lea     \reg, \addr[rip]
+#  endif
  #endif
         .endm
  
@@ -115,7 +138,9 @@ _where_am_i.\got :
  // referring to ADDR, which is within our module, maybe using GOT.
  #define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
  #define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
-#if WANT_PIC
+#if CPUFAM_AMD64
+#  define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
  #  define INTADDR__1(addr, got) got + addr@GOTOFF
  #else
  #  define INTADDR__1(addr, got) addr
diff --git a/base/dispatch.c b/base/dispatch.c

index 61c45fa..8936ea4 100644 (file)
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -41,7 +41,7 @@
  
  /*----- Intel x86/AMD64 feature probing -----------------------------------*/
  
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
  
  #  define EFLAGS_ID (1u << 21)
  #  define CPUID1D_SSE2 (1u << 26)
@@ -64,6 +64,7 @@ struct cpuid { unsigned a, b, c, d; };
   */
  
  #ifdef __GNUC__
+#  if CPUFAM_X86
  static __inline__ unsigned getflags(void)
    { unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
  static __inline__ unsigned setflags(unsigned f)
@@ -74,6 +75,18 @@ static __inline__ unsigned setflags(unsigned f)
            : "g" (f));
    return (ff);
  }
+#  else
+static __inline__ unsigned long getflags(void)
+  { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long long setflags(unsigned long f)
+{
+  unsigned long ff;
+  __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+          : "=g" (ff)
+          : "g" (f));
+  return (ff);
+}
+#  endif
  #endif
  
  static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
@@ -97,9 +110,19 @@ static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
    /* Alas, EBX is magical in PIC code, so abuse ESI instead.  This isn't
     * pretty, but it works.
     */
+#  if CPUFAM_X86
    __asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
            : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
            : "a" (a) , "c" (c));
+#  elif CPUFAM_AMD64
+  __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+          : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+          : "a" (a) , "c" (c));
+#  else
+#    error "I'm confused."
+#  endif
+  dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+                a, c, cc->a, cc->b, cc->c, cc->d);
  #else
    dispatch_debug("GNU inline assembler not available; can't CPUID");
  #endif
@@ -141,6 +164,7 @@ static int xmm_registers_available_p(void)
     * XMM registers are actually alive.
     */
    if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+#  if CPUFAM_X86
    __asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
            "fxsave (%%esp)\n"
            "movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
@@ -151,6 +175,21 @@ static int xmm_registers_available_p(void)
            : "=a" (f)
            : /* no inputs */
            : "%ecx", "%edx");
+#  elif CPUFAM_AMD64
+  __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+          "fxsave (%%rsp)\n"
+          "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+          "fxrstor (%%rsp); fxsave (%%rsp)\n"
+          "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+          "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+          "xorl %%ecx, %%eax"
+          : "=a" (f)
+          : /* no inputs */
+          : "%ecx", "%rdx");
+#  else
+#    error "I'm confused."
+#  endif
+  dispatch_debug("XMM registers %savailable", f ? "" : "not ");
    return (f);
  #else
    dispatch_debug("GNU inline assembler not available; can't check for XMM");
@@ -257,7 +296,7 @@ int cpu_feature_p(int feat)
      return (feat_debug(ftok, "runtime probe", cond));
  
    switch (feat) {
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
      CASE_CPUFEAT(X86_SSE2, "x86:sse2",
                  xmm_registers_available_p() &&
                  cpuid_features_p(CPUID1D_SSE2, 0));
diff --git a/configure.ac b/configure.ac

index b76c561..8a58d78 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -55,7 +55,10 @@ dnl The table of CPU families and ABIs which we might support.  Support is
  dnl not uniform: each dispatched function might or might not have an
  dnl implementation for any particular CPU/ABI combination.
  AC_DEFUN([catacomb_CPU_FAMILIES],
-  [$1([i[[3-6]]86,*], [x86], [sysv])])
+  [$1([i[[3-6]]86,cygwin], [x86], [win])
+   $1([i[[3-6]]86,*], [x86], [sysv])
+   $1([x86_64,cygwin], [amd64], [win])
+   $1([x86_64,*], [amd64], [sysv])])
  
  dnl A utility to clear the `seen' flags, used so as to process each CPU or
  dnl ABI once.
diff --git a/symm/Makefile.am b/symm/Makefile.am

index ba037cd..e78277b 100644 (file)
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -181,7 +181,10 @@ BLKCS                      += rc5
  BLKCS                  += rijndael rijndael192 rijndael256
  libsymm_la_SOURCES     += rijndael-base.h rijndael-base.c
  if CPUFAM_X86
-libsymm_la_SOURCES     += rijndael-x86-aesni.S
+libsymm_la_SOURCES     += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES     += rijndael-x86ish-aesni.S
  endif
  libsymm_la_SOURCES     += $(precomp)/rijndael-tab.c
  PRECOMPS               += $(precomp)/rijndael-tab.c
@@ -382,7 +385,10 @@ EXTRA_DIST         += salsa20-tvconv
  pkginclude_HEADERS     += salsa20.h salsa20-core.h
  libsymm_la_SOURCES     += salsa20.c
  if CPUFAM_X86
-libsymm_la_SOURCES     += salsa20-x86-sse2.S
+libsymm_la_SOURCES     += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES     += salsa20-x86ish-sse2.S
  endif
  TESTS                  += salsa20.$t
  ALL_CIPHERS            += salsa20 salsa2012 salsa208
@@ -411,7 +417,10 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV)
  pkginclude_HEADERS     += chacha.h chacha-core.h
  libsymm_la_SOURCES     += chacha.c
  if CPUFAM_X86
-libsymm_la_SOURCES     += chacha-x86-sse2.S
+libsymm_la_SOURCES     += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES     += chacha-x86ish-sse2.S
  endif
  TESTS                  += chacha.$t
  EXTRA_DIST             += t/chacha
diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86ish-sse2.S

similarity index 65%

rename from symm/chacha-x86-sse2.S

rename to symm/chacha-x86ish-sse2.S

index ccdfa53..f36bf90 100644 (file)
--- a/symm/chacha-x86-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -44,17 +44,73 @@
         .arch pentium4
         .section .text
  
-FUNC(chacha_core_x86_sse2)
+FUNC(chacha_core_x86ish_sse2)
+
+       // Initial setup.
+
+#if CPUFAM_X86
+       // Arguments come in on the stack, and will need to be collected.  We
+       // can get away with just the scratch registers for integer work,
+       // but we'll run out of XMM registers and will need some properly
+       // aligned space which we'll steal from the stack.  I don't trust the
+       // stack pointer's alignment, so I'll have to mask the stack pointer,
+       // which in turn means I'll need to keep track of the old value.
+       // Hence I'm making a full i386-style stack frame here.
+       //
+       // The Windows and SysV ABIs are sufficiently similar that we don't
+       // need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 [esp]
  
-       // Initial state.  We have three arguments:
-       // [ebp +  8] is the number of rounds to do
-       // [ebp + 12] points to the input matrix
-       // [ebp + 16] points to the output matrix
         push    ebp
         mov     ebp, esp
         sub     esp, 16
-       mov     edx, [ebp + 12]
+       mov     IN, [ebp + 12]
+       mov     OUT, [ebp + 16]
         and     esp, ~15
+       mov     NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+       // This is nice.  We have plenty of XMM registers, and the arguments
+       // are in useful places.  There's no need to spill anything and we
+       // can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+       // Arguments come in registers, but they're different between Windows
+       // and everyone else (and everyone else is saner).
+       //
+       // The Windows ABI insists that we preserve some of the XMM
+       // registers, but we want more than we can use as scratch space.  We
+       // only need to save a copy of the input for the feedforward at the
+       // end, so we might as well use memory rather than spill extra
+       // registers.  (We need an extra 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm5
+#  define SAVE1 [rsp +  0]
+#  define SAVE2 [rsp + 16]
+#  define SAVE3 [rsp + 32]
+
+       sub     rsp, 48 + 8
+#endif
  
         // First job is to slurp the matrix into XMM registers.  Be careful:
         // the input matrix isn't likely to be properly aligned.
@@ -63,20 +119,17 @@ FUNC(chacha_core_x86_sse2)
         //      [ 4  5  6  7] (b, xmm1)
         //      [ 8  9 10 11] (c, xmm2)
         //      [12 13 14 15] (d, xmm3)
-       movdqu  xmm0, [edx +  0]
-       movdqu  xmm1, [edx + 16]
-       movdqu  xmm2, [edx + 32]
-       movdqu  xmm3, [edx + 48]
-
-       // Prepare for the main loop.
-       mov     ecx, [ebp + 8]
+       movdqu  xmm0, [IN +  0]
+       movdqu  xmm1, [IN + 16]
+       movdqu  xmm2, [IN + 32]
+       movdqu  xmm3, [IN + 48]
  
         // Take a copy for later.  This one is aligned properly, by
         // construction.
-       movdqa  [esp], xmm0
-       movdqa  xmm5, xmm1
-       movdqa  xmm6, xmm2
-       movdqa  xmm7, xmm3
+       movdqa  SAVE0, xmm0
+       movdqa  SAVE1, xmm1
+       movdqa  SAVE2, xmm2
+       movdqa  SAVE3, xmm3
  
  loop:
         // Apply a column quarterround to each of the columns simultaneously.
@@ -174,26 +227,30 @@ loop:
         pshufd  xmm1, xmm1, ROTL
  
         // Decrement the loop counter and see if we should go round again.
-       sub     ecx, 2
+       sub     NR, 2
         ja      loop
  
         // Almost there.  Firstly, the feedforward addition.
-       mov     edx, [ebp + 16]
-       paddd   xmm0, [esp]
-       paddd   xmm1, xmm5
-       paddd   xmm2, xmm6
-       paddd   xmm3, xmm7
+       paddd   xmm0, SAVE0
+       paddd   xmm1, SAVE1
+       paddd   xmm2, SAVE2
+       paddd   xmm3, SAVE3
  
         // And now we write out the result.  This one won't be aligned
         // either.
-       movdqu  [edx +  0], xmm0
-       movdqu  [edx + 16], xmm1
-       movdqu  [edx + 32], xmm2
-       movdqu  [edx + 48], xmm3
+       movdqu  [OUT +  0], xmm0
+       movdqu  [OUT + 16], xmm1
+       movdqu  [OUT + 32], xmm2
+       movdqu  [OUT + 48], xmm3
  
         // Tidy things up.
+#if CPUFAM_X86
         mov     esp, ebp
         pop     ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       add     rsp, 48 + 8
+#endif
  
         // And with that, we're done.
         ret
diff --git a/symm/chacha.c b/symm/chacha.c

index 5683c8e..80a84c1 100644 (file)
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -72,14 +72,14 @@ static void simple_core(unsigned r, const chacha_matrix src,
                         chacha_matrix dest)
    { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
  
-#ifdef CPUFAM_X86
-extern core__functype chacha_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype chacha_core_x86ish_sse2;
  #endif
  
  static core__functype *pick_core(void)
  {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(chacha_core, chacha_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
    DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c

index 3d2bb8e..b0505a6 100644 (file)
--- a/symm/rijndael-base.c
+++ b/symm/rijndael-base.c
@@ -116,14 +116,14 @@ CPU_DISPATCH(static, EMPTY, void, setup, (rijndael_ctx *k, unsigned nb,
                                           const void *buf, unsigned nk),
              (k, nb, buf, nk), pick_setup, simple_setup)
  
-#ifdef CPUFAM_X86
-extern setup__functype rijndael_setup_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern setup__functype rijndael_setup_x86ish_aesni;
  #endif
  
  static setup__functype *pick_setup(void)
  {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
                      cpu_feature_p(CPUFEAT_X86_AESNI));
  #endif
    DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup);
diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86ish-aesni.S

similarity index 53%

rename from symm/rijndael-x86-aesni.S

rename to symm/rijndael-x86ish-aesni.S

index c0cd437..91fcc35 100644 (file)
--- a/symm/rijndael-x86-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -72,45 +72,137 @@
  ///--------------------------------------------------------------------------
  /// Key setup.
  
-FUNC(rijndael_setup_x86_aesni)
+FUNC(rijndael_setup_x86ish_aesni)
  
-       // Initial state.  We have four arguments:
-       // [esp + 20] is the context pointer
-       // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
-       // [esp + 28] points to the key material, unaligned
-       // [esp + 32] is the size of the key, in words
-       // The key size has already been checked for validity, and the number
-       // of rounds has been computed.  Our job is only to fill in the `w'
-       // and `wi' vectors.
+#if CPUFAM_X86
+       // Arguments are on the stack.  We'll need to stack the caller's
+       // register variables, but we'll manage.
  
+#  define CTX ebp                      // context pointer
+#  define BLKSZ [esp + 24]             // block size
+
+#  define SI esi                       // source pointer
+#  define DI edi                       // destination pointer
+
+#  define KSZ ebx                      // key size
+#  define KSZo ebx                     // ... as address offset
+#  define NKW edx                      // total number of key words
+#  define NKW_NEEDS_REFRESH 1          // ... needs recalculating
+#  define RCON ecx                     // round constants table
+#  define LIM edx                      // limit pointer
+#  define LIMn edx                     // ... as integer offset from base
+
+#  define NR ecx                       // number of rounds
+#  define LRK eax                      // distance to last key
+#  define LRKo eax                     // ... as address offset
+#  define BLKOFF edx                   // block size in bytes
+#  define BLKOFFo edx                  // ... as address offset
+
+       // Stack the caller's registers.
         push    ebp
         push    ebx
         push    esi
         push    edi
  
+       // Set up our own variables.
+       mov     CTX, [esp + 20]         // context base pointer
+       mov     SI, [esp + 28]          // key material
+       mov     KSZ, [esp + 32]         // key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+       // Arguments are in registers.  We have plenty, but, to be honest,
+       // the initial register allocation is a bit annoying.
+
+#  define CTX r8                       // context pointer
+#  define BLKSZ r9d                    // block size
+
+#  define SI rsi                       // source pointer
+#  define DI rdi                       // destination pointer
+
+#  define KSZ edx                      // key size
+#  define KSZo rdx                     // ... as address offset
+#  define NKW r10d                     // total number of key words
+#  define RCON rdi                     // round constants table
+#  define LIMn ecx                     // limit, as integer offset from base
+#  define LIM rcx                      // ... as limit pointer
+
+#  define NR ecx                       // number of rounds
+#  define LRK eax                      // distance to last key
+#  define LRKo rax                     // ... as address offset
+#  define BLKOFF r9d                   // block size in bytes
+#  define BLKOFFo r9                   // ... as address offset
+
+       // Move arguments to more useful places.
+       mov     CTX, rdi                // context base pointer
+       mov     BLKSZ, esi              // block size in words
+       mov     SI, rdx                 // key material
+       mov     KSZ, ecx                // key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+       // Arguments are in different registers, and they're a little tight.
+
+#  define CTX r8                       // context pointer
+#  define BLKSZ edx                    // block size
+
+#  define SI rsi                       // source pointer
+#  define DI rdi                       // destination pointer
+
+#  define KSZ r9d                      // key size
+#  define KSZo r9                      // ... as address offset
+#  define NKW r10d                     // total number of key words
+#  define RCON rdi                     // round constants table
+#  define LIMn ecx                     // limit, as integer offset from base
+#  define LIM rcx                      // ... as limit pointer
+
+#  define NR ecx                       // number of rounds
+#  define LRK eax                      // distance to last key
+#  define LRKo rax                     // ... as address offset
+#  define BLKOFF edx                   // block size in bytes
+#  define BLKOFFo rdx                  // ... as address offset
+
+       // We'll need the index registers, which belong to the caller in this
+       // ABI.
+       push    rsi
+       push    rdi
+
+       // Move arguments to more useful places.
+       mov     SI, r8                  // key material
+       mov     CTX, rcx                // context base pointer
+#endif
+
         // The initial round key material is taken directly from the input
         // key, so copy it over.
-       mov     ebp, [esp + 20]         // context base pointer
-       mov     ebx, [esp + 32]         // key size, in words
-       mov     ecx, ebx
-       mov     esi, [esp + 28]
-       lea     edi, [ebp + w]
+#if CPUFAM_AMD64 && ABI_SYSV
+       // We've been lucky.  We already have a copy of the context pointer
+       // in rdi, and the key size in ecx.
+       add     DI, w
+#else
+       lea     DI, [CTX + w]
+       mov     ecx, KSZ
+#endif
         rep     movsd
  
         // Find out other useful things.
-       mov     edx, [ebp + nr]         // number of rounds
-       add     edx, 1
-       imul    edx, [esp + 24]         // total key size in words
-       sub     edx, ebx                // offset by the key size
+       mov     NKW, [CTX + nr]         // number of rounds
+       add     NKW, 1
+       imul    NKW, BLKSZ              // total key size in words
+#if !NKW_NEEDS_REFRESH
+       // NKW is kept for later, so LIMn needs its own copy.  (If NKW had to
+       // be refreshed, it would share LIM's register and need no move.)
+       mov     LIMn, NKW
+#endif
+       sub     LIMn, KSZ               // offset by the key size
  
         // Find the round constants.
         ldgot   ecx
-       leaext  ecx, rijndael_rcon, ecx
+       leaext  RCON, rijndael_rcon, ecx
  
         // Prepare for the main loop.
-       lea     esi, [ebp + w]
-       mov     eax, [esi + 4*ebx - 4]  // most recent key word
-       lea     edx, [esi + 4*edx]      // limit, offset by one key expansion
+       lea     SI, [CTX + w]
+       mov     eax, [SI + 4*KSZo - 4]  // most recent key word
+       lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
  
         // Main key expansion loop.  The first word of each key-length chunk
         // needs special treatment.
@@ -131,76 +223,76 @@ FUNC(rijndael_setup_x86_aesni)
         aeskeygenassist xmm1, xmm0, 0
         pshufd  xmm1, xmm1, ROTL
         movd    eax, xmm1
-       xor     eax, [esi]
-       xor     al, [ecx]
-       inc     ecx
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       xor     al, [RCON]
+       inc     RCON
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // The next three words are simple...
-       xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // (Word 2...)
-       xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // (Word 3...)
-       xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // Word 4.  If the key is /more/ than 6 words long, then we must
         // apply a substitution here.
-       cmp     ebx, 5
+       cmp     KSZ, 5
         jb      9b
-       cmp     ebx, 7
+       cmp     KSZ, 7
         jb      0f
         movd    xmm0, eax
         pshufd  xmm0, xmm0, ROTL
         aeskeygenassist xmm1, xmm0, 0
         movd    eax, xmm1
-0:     xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+0:     xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // (Word 5...)
-       cmp     ebx, 6
+       cmp     KSZ, 6
         jb      9b
-       xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // (Word 6...)
-       cmp     ebx, 7
+       cmp     KSZ, 7
         jb      9b
-       xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // (Word 7...)
-       cmp     ebx, 8
+       cmp     KSZ, 8
         jb      9b
-       xor     eax, [esi]
-       mov     [esi + 4*ebx], eax
-       add     esi, 4
-       cmp     esi, edx
+       xor     eax, [SI]
+       mov     [SI + 4*KSZo], eax
+       add     SI, 4
+       cmp     SI, LIM
         jae     8f
  
         // Must be done by now.
@@ -219,130 +311,183 @@ FUNC(rijndael_setup_x86_aesni)
         // there's easily enough buffer space for the over-enthusiastic reads
         // and writes because the context has space for 32-byte blocks, which
         // is our maximum and an exact fit for two SSE registers.
-8:     mov     ecx, [ebp + nr]         // number of rounds
-       mov     ebx, [esp + 24]         // block size (in words)
-       mov     edx, ecx
-       imul    edx, ebx
-       lea     edi, [ebp + wi]
-       lea     esi, [ebp + 4*edx + w]  // last round's keys
-       shl     ebx, 2                  // block size (in bytes now)
+8:     mov     NR, [CTX + nr]          // number of rounds
+#if NKW_NEEDS_REFRESH
+       mov     BLKOFF, BLKSZ
+       mov     LRK, NR
+       imul    LRK, BLKOFF
+#else
+       // If we retain NKW, then BLKSZ and BLKOFF are the same register
+       // because we won't need the former again.
+       mov     LRK, NKW
+       sub     LRK, BLKSZ
+#endif
+       lea     DI, [CTX + wi]
+       lea     SI, [CTX + w + 4*LRKo]  // last round's keys
+       shl     BLKOFF, 2               // block size (in bytes now)
  
         // Copy the last encryption round's keys.
-       movdqu  xmm0, [esi]
-       movdqu  [edi], xmm0
-       cmp     ebx, 16
+       movdqu  xmm0, [SI]
+       movdqu  [DI], xmm0
+       cmp     BLKOFF, 16
         jbe     9f
-       movdqu  xmm0, [esi + 16]
-       movdqu  [edi + 16], xmm0
+       movdqu  xmm0, [SI + 16]
+       movdqu  [DI + 16], xmm0
  
         // Update the loop variables and stop if we've finished.
-9:     add     edi, ebx
-       sub     esi, ebx
-       sub     ecx, 1
+9:     add     DI, BLKOFFo
+       sub     SI, BLKOFFo
+       sub     NR, 1
         jbe     0f
  
         // Do another middle round's keys...
-       movdqu  xmm0, [esi]
+       movdqu  xmm0, [SI]
         aesimc  xmm0, xmm0
-       movdqu  [edi], xmm0
-       cmp     ebx, 16
+       movdqu  [DI], xmm0
+       cmp     BLKOFF, 16
         jbe     9b
-       movdqu  xmm0, [esi + 16]
+       movdqu  xmm0, [SI + 16]
         aesimc  xmm0, xmm0
-       movdqu  [edi + 16], xmm0
+       movdqu  [DI + 16], xmm0
         jmp     9b
  
         // Finally do the first encryption round.
-0:     movdqu  xmm0, [esi]
-       movdqu  [edi], xmm0
-       cmp     ebx, 16
+0:     movdqu  xmm0, [SI]
+       movdqu  [DI], xmm0
+       cmp     BLKOFF, 16
         jbe     0f
-       movdqu  xmm0, [esi + 16]
-       movdqu  [edi + 16], xmm0
+       movdqu  xmm0, [SI + 16]
+       movdqu  [DI + 16], xmm0
  
         // If the block size is not exactly four words then we must end-swap
         // everything.  We can use fancy SSE toys for this.
-0:     cmp     ebx, 16
+0:     cmp     BLKOFF, 16
         je      0f
  
         // Find the byte-reordering table.
         ldgot   ecx
         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
  
+#if NKW_NEEDS_REFRESH
         // Calculate the number of subkey words again.  (It's a good job
         // we've got a fast multiplier.)
-       mov     ecx, [ebp + nr]
-       add     ecx, 1
-       imul    ecx, [esp + 24]         // total keys in words
+       mov     NKW, [CTX + nr]
+       add     NKW, 1
+       imul    NKW, BLKSZ
+#endif
  
         // End-swap the encryption keys.
-       mov     eax, ecx
-       lea     esi, [ebp + w]
+       mov     ecx, NKW
+       lea     SI, [CTX + w]
         call    endswap_block
  
         // And the decryption keys.
-       mov     ecx, eax
-       lea     esi, [ebp + wi]
+       mov     ecx, NKW
+       lea     SI, [CTX + wi]
         call    endswap_block
  
-       // All done.
-0:     pop     edi
+0:     // All done.
+#if CPUFAM_X86
+       pop     edi
         pop     esi
         pop     ebx
         pop     ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       pop     rdi
+       pop     rsi
+#endif
         ret
  
         .align  16
  endswap_block:
-       // End-swap ECX words starting at ESI.  The end-swapping table is
+       // End-swap ECX words starting at SI.  The end-swapping table is
         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
-       movdqu  xmm1, [esi]
+       movdqu  xmm1, [SI]
         pshufb  xmm1, xmm5
-       movdqu  [esi], xmm1
-       add     esi, 16
+       movdqu  [SI], xmm1
+       add     SI, 16
         sub     ecx, 4
         ja      endswap_block
         ret
  
+#undef CTX
+#undef BLKSZ
+#undef SI
+#undef DI
+#undef KSZ
+#undef KSZo
+#undef RCON
+#undef LIMn
+#undef LIM
+#undef NR
+#undef LRK
+#undef LRKo
+#undef BLKOFF
+#undef BLKOFFo
+
  ENDFUNC
  
  ///--------------------------------------------------------------------------
  /// Encrypting and decrypting blocks.
  
         .macro  encdec op, aes, koff
-FUNC(rijndael_\op\()_x86_aesni)
-
-       // On entry, we have:
-       // [esp +  4] points to the context block
-       // [esp +  8] points to the input data block
-       // [esp + 12] points to the output buffer
+FUNC(rijndael_\op\()_x86ish_aesni)
  
         // Find the magic endianness-swapping table.
         ldgot   ecx
         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
  
-       // Load the input block and end-swap it.  Also, start loading the
-       // keys.
-       mov     eax, [esp + 8]
-       movdqu  xmm0, [eax]
+#if CPUFAM_X86
+       // Arguments come in on the stack, and need to be collected.  We
+       // don't have a shortage of registers.
+
+#  define K ecx
+#  define SRC edx
+#  define DST edx
+#  define NR eax
+
+       mov     K, [esp + 4]
+       mov     SRC, [esp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+       // Arguments come in registers.  All is good.
+
+#  define K rdi
+#  define SRC rsi
+#  define DST rdx
+#  define NR eax
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+       // Arguments come in different registers.
+
+#  define K rcx
+#  define SRC rdx
+#  define DST r8
+#  define NR eax
+#endif
+
+       // Initial setup.
+       movdqu  xmm0, [SRC]
         pshufb  xmm0, xmm5
-       mov     eax, [esp + 4]
-       lea     edx, [eax + \koff]
-       mov     eax, [eax + nr]
+       mov     NR, [K + nr]
+       add     K, \koff
  
         // Initial whitening.
-       movdqu  xmm1, [edx]
-       add     edx, 16
+       movdqu  xmm1, [K]
+       add     K, 16
         pxor    xmm0, xmm1
  
         // Dispatch to the correct code.
-       cmp     eax, 10
+       cmp     NR, 10
         je      10f
         jb      bogus
-       cmp     eax, 14
+       cmp     NR, 14
         je      14f
         ja      bogus
-       cmp     eax, 12
+       cmp     NR, 12
         je      12f
         jb      11f
         jmp     13f
@@ -350,73 +495,80 @@ FUNC(rijndael_\op\()_x86_aesni)
         .align  2
  
         // 14 rounds...
-14:    movdqu  xmm1, [edx]
-       add     edx, 16
+14:    movdqu  xmm1, [K]
+       add     K, 16
         \aes    xmm0, xmm1
  
         // 13 rounds...
-13:    movdqu  xmm1, [edx]
-       add     edx, 16
+13:    movdqu  xmm1, [K]
+       add     K, 16
         \aes    xmm0, xmm1
  
         // 12 rounds...
-12:    movdqu  xmm1, [edx]
-       add     edx, 16
+12:    movdqu  xmm1, [K]
+       add     K, 16
         \aes    xmm0, xmm1
  
         // 11 rounds...
-11:    movdqu  xmm1, [edx]
-       add     edx, 16
+11:    movdqu  xmm1, [K]
+       add     K, 16
         \aes    xmm0, xmm1
  
         // 10 rounds...
-10:    movdqu  xmm1, [edx]
+10:    movdqu  xmm1, [K]
         \aes    xmm0, xmm1
  
         // 9 rounds...
-       movdqu  xmm1, [edx + 16]
+       movdqu  xmm1, [K + 16]
         \aes    xmm0, xmm1
  
         // 8 rounds...
-       movdqu  xmm1, [edx + 32]
+       movdqu  xmm1, [K + 32]
         \aes    xmm0, xmm1
  
         // 7 rounds...
-       movdqu  xmm1, [edx + 48]
+       movdqu  xmm1, [K + 48]
         \aes    xmm0, xmm1
  
         // 6 rounds...
-       movdqu  xmm1, [edx + 64]
+       movdqu  xmm1, [K + 64]
         \aes    xmm0, xmm1
  
         // 5 rounds...
-       movdqu  xmm1, [edx + 80]
+       movdqu  xmm1, [K + 80]
         \aes    xmm0, xmm1
  
         // 4 rounds...
-       movdqu  xmm1, [edx + 96]
+       movdqu  xmm1, [K + 96]
         \aes    xmm0, xmm1
  
         // 3 rounds...
-       movdqu  xmm1, [edx + 112]
+       movdqu  xmm1, [K + 112]
         \aes    xmm0, xmm1
  
         // 2 rounds...
-       movdqu  xmm1, [edx + 128]
+       movdqu  xmm1, [K + 128]
         \aes    xmm0, xmm1
  
         // Final round...
-       movdqu  xmm1, [edx + 144]
+       movdqu  xmm1, [K + 144]
         \aes\()last xmm0, xmm1
  
         // Unpermute the ciphertext block and store it.
         pshufb  xmm0, xmm5
-       mov     eax, [esp + 12]
-       movdqu  [eax], xmm0
+#if CPUFAM_X86
+       mov     DST, [esp + 12]
+#endif
+       movdqu  [DST], xmm0
  
         // And we're done.
         ret
  
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
  ENDFUNC
         .endm
  
diff --git a/symm/rijndael.c b/symm/rijndael.c

index dcb35e6..293f28d 100644 (file)
--- a/symm/rijndael.c
+++ b/symm/rijndael.c
@@ -82,15 +82,15 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk, (const rijndael_ctx *k,
                                                  uint32 d[4]),
              (k, s, d), pick_dblk, simple_dblk)
  
-#ifdef CPUFAM_X86
-extern rijndael_eblk__functype rijndael_eblk_x86_aesni;
-extern rijndael_dblk__functype rijndael_dblk_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
  #endif
  
  static rijndael_eblk__functype *pick_eblk(void)
  {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
                      cpu_feature_p(CPUFEAT_X86_AESNI));
  #endif
    DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk);
@@ -98,8 +98,8 @@ static rijndael_eblk__functype *pick_eblk(void)
  
  static rijndael_dblk__functype *pick_dblk(void)
  {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
                      cpu_feature_p(CPUFEAT_X86_AESNI));
  #endif
    DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk);
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86ish-sse2.S

similarity index 63%

rename from symm/salsa20-x86-sse2.S

rename to symm/salsa20-x86ish-sse2.S

index 7a5bd2a..a168d79 100644 (file)
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -44,20 +44,76 @@
         .arch pentium4
         .section .text
  
-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+       // Initial setup.
+
+#if CPUFAM_X86
+       // Arguments come in on the stack, and will need to be collected.  We
+       // can get away with just the scratch registers for integer work,
+       // but we'll run out of XMM registers and will need some properly
+       // aligned space which we'll steal from the stack.  I don't trust the
+       // stack pointer's alignment, so I'll have to mask the stack pointer,
+       // which in turn means I'll need to keep track of the old value.
+       // Hence I'm making a full i386-style stack frame here.
+       //
+       // The Windows and SysV ABIs are sufficiently similar that we don't
+       // need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 [esp + 0]
+#  define SAVE3 [esp + 16]
  
-       // Initial state.  We have three arguments:
-       // [ebp +  8] is the number of rounds to do
-       // [ebp + 12] points to the input matrix
-       // [ebp + 16] points to the output matrix
         push    ebp
         mov     ebp, esp
         sub     esp, 32
-       mov     edx, [ebp + 12]
+       mov     IN, [ebp + 12]
+       mov     OUT, [ebp + 16]
         and     esp, ~15
-
-       // Prepare for the main loop.
-       mov     ecx, [ebp + 8]
+       mov     NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+       // This is nice.  We have plenty of XMM registers, and the arguments
+       // are in useful places.  There's no need to spill anything and we
+       // can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 xmm8
+#  define SAVE3 xmm9
+#endif
+
+#  if CPUFAM_AMD64 && ABI_WIN
+       // Arguments come in registers, but they're different between Windows
+       // and everyone else (and everyone else is saner).
+       //
+       // The Windows ABI insists that we preserve some of the XMM
+       // registers, but we want more than we can use as scratch space.  Two
+       // places we only need to save a copy of the input for the
+       // feedforward at the end; but the other two we want for the final
+       // permutation, so save the old values on the stack.  (We need an extra
+       // 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 [rsp + 32]
+#  define SAVE3 [rsp + 48]
+
+       sub     rsp, 64 + 8
+       movdqa  [rsp +  0], xmm6
+       movdqa  [rsp + 16], xmm7
+#endif
  
         // First job is to slurp the matrix into XMM registers.  The words
         // have already been permuted conveniently to make them line up
@@ -85,19 +141,18 @@ FUNC(salsa20_core_x86_sse2)
         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
-       movdqu  xmm0, [edx +  0]
-       movdqu  xmm1, [edx + 16]
-       movdqu  xmm2, [edx + 32]
-       movdqu  xmm3, [edx + 48]
+       movdqu  xmm0, [IN +  0]
+       movdqu  xmm1, [IN + 16]
+       movdqu  xmm2, [IN + 32]
+       movdqu  xmm3, [IN + 48]
  
-       // Take a copy for later.
-       movdqa  [esp +  0], xmm0
-       movdqa  [esp + 16], xmm1
-       movdqa  xmm6, xmm2
-       movdqa  xmm7, xmm3
+       // Take a copy for later.
+       movdqa  SAVE0, xmm0
+       movdqa  SAVE1, xmm1
+       movdqa  SAVE2, xmm2
+       movdqa  SAVE3, xmm3
  
  loop:
-
         // Apply a column quarterround to each of the columns simultaneously.
         // Alas, there doesn't seem to be a packed doubleword rotate, so we
         // have to synthesize it.
@@ -147,9 +202,9 @@ loop:
         // involve any movement of elements between rows.
         //
         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
-       //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
-       //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
-       //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
+       //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
+       //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
+       //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
         //
         // The shuffles have quite high latency, so they've been pushed
         // backwards into the main instruction list.
@@ -200,7 +255,7 @@ loop:
         // back the shuffles because they take a long time coming through.
         // Decrement the loop counter and see if we should go round again.
         // Later processors fuse this pair into a single uop.
-       sub     ecx, 2
+       sub     NR, 2
         ja      loop
  
         // Almost there.  Firstly, the feedforward addition, and then we have
@@ -208,55 +263,69 @@ loop:
         // which was already applied to the input.  Shuffling has quite high
         // latency, so arrange to start a new shuffle into a temporary as
         // soon as we've written out the old value.
-       mov     edx, [ebp + 16]
-
-       paddd   xmm0, [esp +  0]
-       pshufd  xmm4, xmm0, ROTR
-       movd    [edx +  0], xmm0
+       paddd   xmm0, SAVE0
+       pshufd  xmm4, xmm0, ROTR
+       movd    [OUT +  0], xmm0
  
-       paddd   xmm1, [esp + 16]
+       paddd   xmm1, SAVE1
         pshufd  xmm5, xmm1, ROTL
-       movd    [edx + 16], xmm1
+       movd    [OUT + 16], xmm1
  
-       paddd   xmm2, xmm6
+       paddd   xmm2, SAVE2
         pshufd  xmm6, xmm2, ROT2
-       movd    [edx + 32], xmm2
+       movd    [OUT + 32], xmm2
  
-       paddd   xmm3, xmm7
+       paddd   xmm3, SAVE3
         pshufd  xmm7, xmm3, ROTR
-       movd    [edx + 48], xmm3
+       movd    [OUT + 48], xmm3
  
-       movd    [edx +  4], xmm7
+       movd    [OUT +  4], xmm7
         pshufd  xmm7, xmm3, ROT2
-       movd    [edx + 24], xmm7
+       movd    [OUT + 24], xmm7
         pshufd  xmm3, xmm3, ROTL
-       movd    [edx + 44], xmm3
+       movd    [OUT + 44], xmm3
  
-       movd    [edx +  8], xmm6
+       movd    [OUT +  8], xmm6
         pshufd  xmm6, xmm2, ROTL
-       movd    [edx + 28], xmm6
+       movd    [OUT + 28], xmm6
         pshufd  xmm2, xmm2, ROTR
-       movd    [edx + 52], xmm2
+       movd    [OUT + 52], xmm2
  
-       movd    [edx + 12], xmm5
+       movd    [OUT + 12], xmm5
         pshufd  xmm5, xmm1, ROTR
-       movd    [edx + 36], xmm5
+       movd    [OUT + 36], xmm5
         pshufd  xmm1, xmm1, ROT2
-       movd    [edx + 56], xmm1
+       movd    [OUT + 56], xmm1
  
-       movd    [edx + 20], xmm4
+       movd    [OUT + 20], xmm4
         pshufd  xmm4, xmm0, ROT2
-       movd    [edx + 40], xmm4
+       movd    [OUT + 40], xmm4
         pshufd  xmm0, xmm0, ROTL
-       movd    [edx + 60], xmm0
+       movd    [OUT + 60], xmm0
  
         // Tidy things up.
+
+#if CPUFAM_X86
         mov     esp, ebp
         pop     ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       movdqa  xmm6, [rsp +  0]
+       movdqa  xmm7, [rsp + 16]
+       add     rsp, 64 + 8
+#endif
  
         // And with that, we're done.
         ret
  
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
  ENDFUNC
  
  ///----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20.c b/symm/salsa20.c

index 15e4d50..eb4e67a 100644 (file)
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -52,14 +52,14 @@ static void simple_core(unsigned r, const salsa20_matrix src,
                         salsa20_matrix dest)
    { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
  
-#ifdef CPUFAM_X86
-extern core__functype salsa20_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
  #endif
  
  static core__functype *pick_core(void)
  {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
    DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
author	Mark Wooding <mdw@distorted.org.uk>
	Sat, 21 May 2016 13:33:28 +0000 (14:33 +0100)
committer	Mark Wooding <mdw@distorted.org.uk>
	Sat, 21 May 2016 16:17:26 +0000 (17:17 +0100)
base/asm-common.h		patch \| blob \| blame \| history
base/dispatch.c		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
symm/Makefile.am		patch \| blob \| blame \| history
symm/chacha-x86ish-sse2.S	[moved from symm/chacha-x86-sse2.S with 65% similarity]	patch \| blob \| blame \| history
symm/chacha.c		patch \| blob \| blame \| history
symm/rijndael-base.c		patch \| blob \| blame \| history
symm/rijndael-x86ish-aesni.S	[moved from symm/rijndael-x86-aesni.S with 53% similarity]	patch \| blob \| blame \| history
symm/rijndael.c		patch \| blob \| blame \| history
symm/salsa20-x86ish-sse2.S	[moved from symm/salsa20-x86-sse2.S with 63% similarity]	patch \| blob \| blame \| history
symm/salsa20.c		patch \| blob \| blame \| history