base/asm-common.h (x86), and knock-on: Add macros for full-size regs.

[catacomb] / symm / salsa20-x86ish-sse2.S
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

index fbdfea7..eb346af 100644 (file)
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -25,24 +25,24 @@
  /// MA 02111-1307, USA.
  
  ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
  
  #include "config.h"
  #include "asm-common.h"
  
-///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
+       .text
  
  ///--------------------------------------------------------------------------
  /// Main code.
  
-       .arch pentium4
-       .text
+FUNC(salsa20_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through into salsa20_core_x86ish_sse2 below...
+ENDFUNC
+
+       .arch   pentium4
  
  FUNC(salsa20_core_x86ish_sse2)
  
@@ -50,12 +50,12 @@ FUNC(salsa20_core_x86ish_sse2)
  
  #if CPUFAM_X86
         // Arguments come in on the stack, and will need to be collected.  We
-       // we can get away with just the scratch registers for integer work,
-       // but we'll run out of XMM registers and will need some properly
-       // aligned space which we'll steal from the stack.  I don't trust the
-       // stack pointer's alignment, so I'll have to mask the stack pointer,
-       // which in turn means I'll need to keep track of the old value.
-       // Hence I'm making a full i386-style stack frame here.
+       // can get away with just the scratch registers for integer work, but
+       // we'll run out of XMM registers and will need some properly aligned
+       // space which we'll steal from the stack.  I don't trust the stack
+       // pointer's alignment, so I'll have to mask the stack pointer, which
+       // in turn means I'll need to keep track of the old value.  Hence I'm
+       // making a full i386-style stack frame here.
         //
         // The Windows and SysV ABIs are sufficiently similar that we don't
         // need to worry about the differences here.
@@ -65,16 +65,16 @@ FUNC(salsa20_core_x86ish_sse2)
  #  define OUT edx
  #  define SAVE0 xmm6
  #  define SAVE1 xmm7
-#  define SAVE2 [esp + 0]
-#  define SAVE3 [esp + 16]
-
-       push    ebp
-       mov     ebp, esp
-       sub     esp, 32
-       mov     IN, [ebp + 12]
-       mov     OUT, [ebp + 16]
-       and     esp, ~15
-       mov     NR, [ebp + 8]
+#  define SAVE2 [SP + 0]
+#  define SAVE3 [SP + 16]
+
+       pushreg BP
+       setfp
+       sub     SP, 32
+       mov     IN, [BP + 12]
+       mov     OUT, [BP + 16]
+       and     SP, ~15
+       mov     NR, [BP + 8]
  #endif
  
  #if CPUFAM_AMD64 && ABI_SYSV
@@ -107,18 +107,16 @@ FUNC(salsa20_core_x86ish_sse2)
  #  define OUT r8
  #  define SAVE0 xmm6
  #  define SAVE1 xmm7
-#  define SAVE2 [rsp + 32]
-#  define SAVE3 [rsp + 48]
-
-       sub     rsp, 64 + 8
-         .seh_stackalloc 64 + 8
-       movdqa  [rsp +  0], xmm6
-         .seh_savexmm xmm6, 0
-       movdqa  [rsp + 16], xmm7
-         .seh_savexmm xmm7, 16
-  .seh_endprologue
+#  define SAVE2 [SP + 32]
+#  define SAVE3 [SP + 48]
+
+       stalloc 64 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
  #endif
  
+  endprologue
+
         // First job is to slurp the matrix into XMM registers.  The words
         // have already been permuted conveniently to make them line up
         // better for SIMD processing.
@@ -182,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm1
-       pshufd  xmm1, xmm1, ROTL
+        pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -191,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2)
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm3
-       pshufd  xmm3, xmm3, ROTR
+        pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
         paddd   xmm4, xmm2
-       pshufd  xmm2, xmm2, ROT2
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -237,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm3
-       pshufd  xmm3, xmm3, ROTL
+        pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -246,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2)
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm1
-       pshufd  xmm1, xmm1, ROTR
+        pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
         paddd   xmm4, xmm2
-       pshufd  xmm2, xmm2, ROT2
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -262,60 +260,53 @@ FUNC(salsa20_core_x86ish_sse2)
         sub     NR, 2
         ja      0b
  
-       // Almost there.  Firstly, the feedforward addition, and then we have
-       // to write out the result.  Here we have to undo the permutation
-       // which was already applied to the input.  Shuffling has quite high
-       // latency, so arrange to start a new shuffle into a temporary as
-       // soon as we've written out the old value.
-       paddd   xmm0, SAVE0
-       pshufd  xmm4, xmm0, 0x39
-       movd    [OUT +  0], xmm0
-
-       paddd   xmm1, SAVE1
-       pshufd  xmm5, xmm1, ROTL
-       movd    [OUT + 16], xmm1
-
-       paddd   xmm2, SAVE2
-       pshufd  xmm6, xmm2, ROT2
-       movd    [OUT + 32], xmm2
-
-       paddd   xmm3, SAVE3
-       pshufd  xmm7, xmm3, ROTR
-       movd    [OUT + 48], xmm3
-
-       movd    [OUT +  4], xmm7
-       pshufd  xmm7, xmm3, ROT2
-       movd    [OUT + 24], xmm7
-       pshufd  xmm3, xmm3, ROTL
-       movd    [OUT + 44], xmm3
-
-       movd    [OUT +  8], xmm6
-       pshufd  xmm6, xmm2, ROTL
-       movd    [OUT + 28], xmm6
-       pshufd  xmm2, xmm2, ROTR
-       movd    [OUT + 52], xmm2
-
-       movd    [OUT + 12], xmm5
-       pshufd  xmm5, xmm1, ROTR
-       movd    [OUT + 36], xmm5
-       pshufd  xmm1, xmm1, ROT2
-       movd    [OUT + 56], xmm1
-
-       movd    [OUT + 20], xmm4
-       pshufd  xmm4, xmm0, ROT2
-       movd    [OUT + 40], xmm4
-       pshufd  xmm0, xmm0, ROTL
-       movd    [OUT + 60], xmm0
+       // Almost there.  Firstly, the feedforward addition.
+       paddd   xmm0, SAVE0                     //  0,  5, 10, 15
+       paddd   xmm1, SAVE1                     //  4,  9, 14,  3
+       paddd   xmm2, SAVE2                     //  8, 13,  2,  7
+       paddd   xmm3, SAVE3                     // 12,  1,  6, 11
+
+       // Next we must undo the permutation which was already applied to the
+       // input.  This can be done by juggling values in registers, with the
+       // following fancy footwork: some row rotations, a transpose, and
+       // some more rotations.
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  3,  4,  9, 14
+       pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)    //  2,  7,  8, 13
+       pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)    //  1,  6, 11, 12
+
+       movdqa  xmm4, xmm0
+       movdqa  xmm5, xmm3
+       punpckldq xmm0, xmm2                    //  0,  2,  5,  7
+       punpckldq xmm3, xmm1                    //  1,  3,  6,  4
+       punpckhdq xmm4, xmm2                    // 10,  8, 15, 13
+       punpckhdq xmm5, xmm1                    // 11,  9, 12, 14
+
+       movdqa  xmm1, xmm0
+       movdqa  xmm2, xmm4
+       punpckldq xmm0, xmm3                    //  0,  1,  2,  3
+       punpckldq xmm4, xmm5                    // 10, 11,  8,  9
+       punpckhdq xmm1, xmm3                    //  5,  6,  7,  4
+       punpckhdq xmm2, xmm5                    // 15, 12, 13, 14
+
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  4,  5,  6,  7
+       pshufd  xmm4, xmm4, SHUF(2, 3, 0, 1)    //  8,  9, 10, 11
+       pshufd  xmm2, xmm2, SHUF(1, 2, 3, 0)    // 12, 13, 14, 15
+
+       // Finally we have to write out the result.
+       movdqu  [OUT +  0], xmm0
+       movdqu  [OUT + 16], xmm1
+       movdqu  [OUT + 32], xmm4
+       movdqu  [OUT + 48], xmm2
  
         // Tidy things up.
  #if CPUFAM_X86
-       mov     esp, ebp
-       pop     ebp
+       dropfp
+       popreg  BP
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
-       movdqa  xmm6, [rsp +  0]
-       movdqa  xmm7, [rsp + 16]
-       add     rsp, 64 + 8
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       stfree  64 + 8
  #endif
  
         // And with that, we're done.