Add support for AMD64 processors and Microsoft Windows.

[catacomb] / symm / salsa20-x86ish-sse2.S
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86ish-sse2.S

similarity index 63%

rename from symm/salsa20-x86-sse2.S

rename to symm/salsa20-x86ish-sse2.S

index 7a5bd2a..a168d79 100644 (file)
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -44,20 +44,76 @@
         .arch pentium4
         .section .text
  
-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+       // Initial setup.
+
+#if CPUFAM_X86
+       // Arguments come in on the stack, and will need to be collected.  We
+       // can get away with just the scratch registers for integer work,
+       // but we'll run out of XMM registers and will need some properly
+       // aligned space which we'll steal from the stack.  I don't trust the
+       // stack pointer's alignment, so I'll have to mask the stack pointer,
+       // which in turn means I'll need to keep track of the old value.
+       // Hence I'm making a full i386-style stack frame here.
+       //
+       // The Windows and SysV ABIs are sufficiently similar that we don't
+       // need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 [esp + 0]
+#  define SAVE3 [esp + 16]
  
-       // Initial state.  We have three arguments:
-       // [ebp +  8] is the number of rounds to do
-       // [ebp + 12] points to the input matrix
-       // [ebp + 16] points to the output matrix
         push    ebp
         mov     ebp, esp
         sub     esp, 32
-       mov     edx, [ebp + 12]
+       mov     IN, [ebp + 12]
+       mov     OUT, [ebp + 16]
         and     esp, ~15
-
-       // Prepare for the main loop.
-       mov     ecx, [ebp + 8]
+       mov     NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+       // This is nice.  We have plenty of XMM registers, and the arguments
+       // are in useful places.  There's no need to spill anything and we
+       // can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 xmm8
+#  define SAVE3 xmm9
+#endif
+
+#  if CPUFAM_AMD64 && ABI_WIN
+       // Arguments come in registers, but they're different between Windows
+       // and everyone else (and everyone else is saner).
+       //
+       // The Windows ABI insists that we preserve some of the XMM
+       // registers, but we want more than we can use as scratch space.  Two
+       // places we only need to save a copy of the input for the
+       // feedforward at the end; but the other two we want for the final
+       // permutation, so save the old values on the stack.  (We need an
+       // extra 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 [rsp + 32]
+#  define SAVE3 [rsp + 48]
+
+       sub     rsp, 64 + 8
+       movdqa  [rsp +  0], xmm6
+       movdqa  [rsp + 16], xmm7
+#endif
  
         // First job is to slurp the matrix into XMM registers.  The words
         // have already been permuted conveniently to make them line up
@@ -85,19 +141,18 @@ FUNC(salsa20_core_x86_sse2)
         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
-       movdqu  xmm0, [edx +  0]
-       movdqu  xmm1, [edx + 16]
-       movdqu  xmm2, [edx + 32]
-       movdqu  xmm3, [edx + 48]
+       movdqu  xmm0, [IN +  0]
+       movdqu  xmm1, [IN + 16]
+       movdqu  xmm2, [IN + 32]
+       movdqu  xmm3, [IN + 48]
  
-       // Take a copy for later.
-       movdqa  [esp +  0], xmm0
-       movdqa  [esp + 16], xmm1
-       movdqa  xmm6, xmm2
-       movdqa  xmm7, xmm3
+       // Take a copy for later.
+       movdqa  SAVE0, xmm0
+       movdqa  SAVE1, xmm1
+       movdqa  SAVE2, xmm2
+       movdqa  SAVE3, xmm3
  
  loop:
-
         // Apply a column quarterround to each of the columns simultaneously.
         // Alas, there doesn't seem to be a packed doubleword rotate, so we
         // have to synthesize it.
@@ -147,9 +202,9 @@ loop:
         // involve any movement of elements between rows.
         //
         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
-       //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
-       //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
-       //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
+       //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
+       //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
+       //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
         //
         // The shuffles have quite high latency, so they've been pushed
         // backwards into the main instruction list.
@@ -200,7 +255,7 @@ loop:
         // back the shuffles because they take a long time coming through.
         // Decrement the loop counter and see if we should go round again.
         // Later processors fuse this pair into a single uop.
-       sub     ecx, 2
+       sub     NR, 2
         ja      loop
  
         // Almost there.  Firstly, the feedforward addition, and then we have
@@ -208,55 +263,69 @@ loop:
         // which was already applied to the input.  Shuffling has quite high
         // latency, so arrange to start a new shuffle into a temporary as
         // soon as we've written out the old value.
-       mov     edx, [ebp + 16]
-
-       paddd   xmm0, [esp +  0]
-       pshufd  xmm4, xmm0, ROTR
-       movd    [edx +  0], xmm0
+       paddd   xmm0, SAVE0
+       pshufd  xmm4, xmm0, ROTR
+       movd    [OUT +  0], xmm0
  
-       paddd   xmm1, [esp + 16]
+       paddd   xmm1, SAVE1
         pshufd  xmm5, xmm1, ROTL
-       movd    [edx + 16], xmm1
+       movd    [OUT + 16], xmm1
  
-       paddd   xmm2, xmm6
+       paddd   xmm2, SAVE2
         pshufd  xmm6, xmm2, ROT2
-       movd    [edx + 32], xmm2
+       movd    [OUT + 32], xmm2
  
-       paddd   xmm3, xmm7
+       paddd   xmm3, SAVE3
         pshufd  xmm7, xmm3, ROTR
-       movd    [edx + 48], xmm3
+       movd    [OUT + 48], xmm3
  
-       movd    [edx +  4], xmm7
+       movd    [OUT +  4], xmm7
         pshufd  xmm7, xmm3, ROT2
-       movd    [edx + 24], xmm7
+       movd    [OUT + 24], xmm7
         pshufd  xmm3, xmm3, ROTL
-       movd    [edx + 44], xmm3
+       movd    [OUT + 44], xmm3
  
-       movd    [edx +  8], xmm6
+       movd    [OUT +  8], xmm6
         pshufd  xmm6, xmm2, ROTL
-       movd    [edx + 28], xmm6
+       movd    [OUT + 28], xmm6
         pshufd  xmm2, xmm2, ROTR
-       movd    [edx + 52], xmm2
+       movd    [OUT + 52], xmm2
  
-       movd    [edx + 12], xmm5
+       movd    [OUT + 12], xmm5
         pshufd  xmm5, xmm1, ROTR
-       movd    [edx + 36], xmm5
+       movd    [OUT + 36], xmm5
         pshufd  xmm1, xmm1, ROT2
-       movd    [edx + 56], xmm1
+       movd    [OUT + 56], xmm1
  
-       movd    [edx + 20], xmm4
+       movd    [OUT + 20], xmm4
         pshufd  xmm4, xmm0, ROT2
-       movd    [edx + 40], xmm4
+       movd    [OUT + 40], xmm4
         pshufd  xmm0, xmm0, ROTL
-       movd    [edx + 60], xmm0
+       movd    [OUT + 60], xmm0
  
         // Tidy things up.
+
+#if CPUFAM_X86
         mov     esp, ebp
         pop     ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       movdqa  xmm6, [rsp +  0]
+       movdqa  xmm7, [rsp + 16]
+       add     rsp, 64 + 8
+#endif
  
         // And with that, we're done.
         ret
  
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
  ENDFUNC
  
  ///----- That's all, folks --------------------------------------------------