Add support for AMD64 processors and Microsoft Windows.

[catacomb] / symm / salsa20-x86ish-sse2.S
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

new file mode 100644 (file)

index 0000000..a168d79
--- /dev/null
+++ b/symm/salsa20-x86ish-sse2.S
@@ -0,0 +1,331 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+       .arch pentium4
+       .section .text
+
+FUNC(salsa20_core_x86ish_sse2)
+
+       // Initial setup.
+
+#if CPUFAM_X86
+       // Arguments come in on the stack, and will need to be collected.  We
+       // we can get away with just the scratch registers for integer work,
+       // but we'll run out of XMM registers and will need some properly
+       // aligned space which we'll steal from the stack.  I don't trust the
+       // stack pointer's alignment, so I'll have to mask the stack pointer,
+       // which in turn means I'll need to keep track of the old value.
+       // Hence I'm making a full i386-style stack frame here.
+       //
+       // The Windows and SysV ABIs are sufficiently similar that we don't
+       // need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 [esp + 0]
+#  define SAVE3 [esp + 16]
+
+       push    ebp
+       mov     ebp, esp
+       sub     esp, 32
+       mov     IN, [ebp + 12]
+       mov     OUT, [ebp + 16]
+       and     esp, ~15
+       mov     NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+       // This is nice.  We have plenty of XMM registers, and the arguments
+       // are in useful places.  There's no need to spill anything and we
+       // can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 xmm8
+#  define SAVE3 xmm9
+#endif
+
+#  if CPUFAM_AMD64 && ABI_WIN
+       // Arguments come in registers, but they're different between Windows
+       // and everyone else (and everyone else is saner).
+       //
+       // The Windows ABI insists that we preserve some of the XMM
+       // registers, but we want more than we can use as scratch space.  Two
+       // places we only need to save a copy of the input for the
+       // feedforward at the end; but the other two we want for the final
+       // permutation, so save the old values on the stack (We need an extra
+       // 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm6
+#  define SAVE1 xmm7
+#  define SAVE2 [rsp + 32]
+#  define SAVE3 [rsp + 48]
+
+       sub     rsp, 64 + 8
+       movdqa  [rsp +  0], xmm6
+       movdqa  [rsp + 16], xmm7
+#endif
+
+       // First job is to slurp the matrix into XMM registers.  The words
+       // have already been permuted conveniently to make them line up
+       // better for SIMD processing.
+       //
+       // The textbook arrangement of the matrix is this.
+       //
+       //      [C K K K]
+       //      [K C N N]
+       //      [T T C K]
+       //      [K K K C]
+       //
+       // But we've rotated the columns up so that the main diagonal with
+       // the constants on it end up in the first row, giving something more
+       // like
+       //
+       //      [C C C C]
+       //      [K T K K]
+       //      [T K K N]
+       //      [K K N K]
+       //
+       // so the transformation looks like this:
+       //
+       //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
+       //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
+       //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
+       //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
+       movdqu  xmm0, [IN +  0]
+       movdqu  xmm1, [IN + 16]
+       movdqu  xmm2, [IN + 32]
+       movdqu  xmm3, [IN + 48]
+
+       ## Take a copy for later.
+       movdqa  SAVE0, xmm0
+       movdqa  SAVE1, xmm1
+       movdqa  SAVE2, xmm2
+       movdqa  SAVE3, xmm3
+
+loop:
+       // Apply a column quarterround to each of the columns simultaneously.
+       // Alas, there doesn't seem to be a packed doubleword rotate, so we
+       // have to synthesize it.
+
+       // b ^= (a + d) <<<  7
+       movdqa  xmm4, xmm0
+       paddd   xmm4, xmm3
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 7
+       psrld   xmm5, 25
+       por     xmm4, xmm5
+       pxor    xmm1, xmm4
+
+       // c ^= (b + a) <<<  9
+       movdqa  xmm4, xmm1
+       paddd   xmm4, xmm0
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 9
+       psrld   xmm5, 23
+       por     xmm4, xmm5
+       pxor    xmm2, xmm4
+
+       // d ^= (c + b) <<< 13
+       movdqa  xmm4, xmm2
+       paddd   xmm4, xmm1
+       pshufd  xmm1, xmm1, ROTL
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 13
+       psrld   xmm5, 19
+       por     xmm4, xmm5
+       pxor    xmm3, xmm4
+
+       // a ^= (d + c) <<< 18
+       movdqa  xmm4, xmm3
+       pshufd  xmm3, xmm3, ROTR
+       paddd   xmm4, xmm2
+       pshufd  xmm2, xmm2, ROT2
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 18
+       psrld   xmm5, 14
+       por     xmm4, xmm5
+       pxor    xmm0, xmm4
+
+       // The transpose conveniently only involves reordering elements of
+       // individual rows, which can be done quite easily, and reordering
+       // the rows themselves, which is a trivial renaming.  It doesn't
+       // involve any movement of elements between rows.
+       //
+       //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
+       //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
+       //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
+       //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
+       //
+       // The shuffles have quite high latency, so they've been pushed
+       // backwards into the main instruction list.
+
+       // Apply the row quarterround to each of the columns (yes!)
+       // simultaneously.
+
+       // b ^= (a + d) <<<  7
+       movdqa  xmm4, xmm0
+       paddd   xmm4, xmm1
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 7
+       psrld   xmm5, 25
+       por     xmm4, xmm5
+       pxor    xmm3, xmm4
+
+       // c ^= (b + a) <<<  9
+       movdqa  xmm4, xmm3
+       paddd   xmm4, xmm0
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 9
+       psrld   xmm5, 23
+       por     xmm4, xmm5
+       pxor    xmm2, xmm4
+
+       // d ^= (c + b) <<< 13
+       movdqa  xmm4, xmm2
+       paddd   xmm4, xmm3
+       pshufd  xmm3, xmm3, ROTL
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 13
+       psrld   xmm5, 19
+       por     xmm4, xmm5
+       pxor    xmm1, xmm4
+
+       // a ^= (d + c) <<< 18
+       movdqa  xmm4, xmm1
+       pshufd  xmm1, xmm1, ROTR
+       paddd   xmm4, xmm2
+       pshufd  xmm2, xmm2, ROT2
+       movdqa  xmm5, xmm4
+       pslld   xmm4, 18
+       psrld   xmm5, 14
+       por     xmm4, xmm5
+       pxor    xmm0, xmm4
+
+       // We had to undo the transpose ready for the next loop.  Again, push
+       // back the shuffles because they take a long time coming through.
+       // Decrement the loop counter and see if we should go round again.
+       // Later processors fuse this pair into a single uop.
+       sub     NR, 2
+       ja      loop
+
+       // Almost there.  Firstly, the feedforward addition, and then we have
+       // to write out the result.  Here we have to undo the permutation
+       // which was already applied to the input.  Shuffling has quite high
+       // latency, so arrange to start a new shuffle into a temporary as
+       // soon as we've written out the old value.
+       paddd   xmm0, SAVE0
+       pshufd  xmm4, xmm0, 0x39
+       movd    [OUT +  0], xmm0
+
+       paddd   xmm1, SAVE1
+       pshufd  xmm5, xmm1, ROTL
+       movd    [OUT + 16], xmm1
+
+       paddd   xmm2, SAVE2
+       pshufd  xmm6, xmm2, ROT2
+       movd    [OUT + 32], xmm2
+
+       paddd   xmm3, SAVE3
+       pshufd  xmm7, xmm3, ROTR
+       movd    [OUT + 48], xmm3
+
+       movd    [OUT +  4], xmm7
+       pshufd  xmm7, xmm3, ROT2
+       movd    [OUT + 24], xmm7
+       pshufd  xmm3, xmm3, ROTL
+       movd    [OUT + 44], xmm3
+
+       movd    [OUT +  8], xmm6
+       pshufd  xmm6, xmm2, ROTL
+       movd    [OUT + 28], xmm6
+       pshufd  xmm2, xmm2, ROTR
+       movd    [OUT + 52], xmm2
+
+       movd    [OUT + 12], xmm5
+       pshufd  xmm5, xmm1, ROTR
+       movd    [OUT + 36], xmm5
+       pshufd  xmm1, xmm1, ROT2
+       movd    [OUT + 56], xmm1
+
+       movd    [OUT + 20], xmm4
+       pshufd  xmm4, xmm0, ROT2
+       movd    [OUT + 40], xmm4
+       pshufd  xmm0, xmm0, ROTL
+       movd    [OUT + 60], xmm0
+
+       // Tidy things up.
+
+#if CPUFAM_X86
+       mov     esp, ebp
+       pop     ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       movdqa  xmm6, [rsp +  0]
+       movdqa  xmm7, [rsp + 16]
+       add     rsp, 64 + 8
+#endif
+
+       // And with that, we're done.
+       ret
+
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------