/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.
	// We can get away with just the scratch registers for integer work,
	// but we'll run out of XMM registers and will need some properly
	// aligned space which we'll steal from the stack.  I don't trust
	// the stack pointer's alignment, so I'll have to mask the stack
	// pointer, which in turn means I'll need to keep track of the old
	// value.  Hence I'm making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

#  define NR ecx
#  define IN eax
#  define OUT edx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [esp + 0]
#  define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

#  define NR edi
#  define IN rsi
#  define OUT rdx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 xmm8
#  define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.
	// For two of them, we only need a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old register values on the stack.  (We
	// need an extra 8 bytes to align the stack.)

#  define NR ecx
#  define IN rdx
#  define OUT r8
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [rsp + 32]
#  define SAVE3 [rsp + 48]

	sub	rsp, 64 + 8
	  .seh_stackalloc 64 + 8
	movdqa	[rsp +  0], xmm6
	  .seh_savexmm xmm6, 0
	movdqa	[rsp + 16], xmm7
	  .seh_savexmm xmm7, 16
	.seh_endprologue
#endif
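	// For orientation, here is a sketch of the C-level contract this
	// function satisfies, matching the argument registers and stack
	// slots gathered above.  The `salsa20_matrix' typedef is assumed
	// here to be an array of sixteen 32-bit words; see the C headers
	// for the real definition.
	//
	//	typedef uint32_t salsa20_matrix[16];	   // assumed layout
	//	void salsa20_core_x86ish_sse2(unsigned nr, // NR: round count
	//			const salsa20_matrix src,  // IN
	//			salsa20_matrix dst);	   // OUT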
	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)

	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed
	// doubleword rotate, so we have to synthesize it from a pair of
	// shifts and an OR.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again,
	// push back the shuffles because they take a long time coming
	// through.

	// Decrement the loop counter and see if we should go round again.
	// Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b
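	// For reference, here is a plain-C sketch of the quarterround which
	// the loop above vectorizes, acting on one column (a, b, c, d) of
	// the permuted matrix; `ROL32' is a hypothetical rotate-left helper
	// standing in for the shift/shift/OR synthesis used above.
	//
	//	#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
	//
	//	b ^= ROL32(a + d,  7);
	//	c ^= ROL32(b + a,  9);
	//	d ^= ROL32(c + b, 13);
	//	a ^= ROL32(d + c, 18);
	//
	// Each trip round the loop does this for all four columns at once,
	// and then again for all four rows, i.e., one Salsa20 double round;
	// hence NR counts down by two each time.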
	// Almost there.  Firstly, the feedforward addition, and then we
	// have to write out the result.  Here we have to undo the
	// permutation which was already applied to the input.  Shuffling
	// has quite high latency, so arrange to start a new shuffle into a
	// temporary as soon as we've written out the old value.
	paddd	xmm0, SAVE0
	pshufd	xmm4, xmm0, SHUF(0, 3, 2, 1)
	movd	[OUT +  0], xmm0

	paddd	xmm1, SAVE1
	pshufd	xmm5, xmm1, SHUF(2, 1, 0, 3)
	movd	[OUT + 16], xmm1

	paddd	xmm2, SAVE2
	pshufd	xmm6, xmm2, SHUF(1, 0, 3, 2)
	movd	[OUT + 32], xmm2

	paddd	xmm3, SAVE3
	pshufd	xmm7, xmm3, SHUF(0, 3, 2, 1)
	movd	[OUT + 48], xmm3

	movd	[OUT +  4], xmm7
	pshufd	xmm7, xmm3, SHUF(1, 0, 3, 2)
	movd	[OUT + 24], xmm7
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movd	[OUT + 44], xmm3

	movd	[OUT +  8], xmm6
	pshufd	xmm6, xmm2, SHUF(2, 1, 0, 3)
	movd	[OUT + 28], xmm6
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)
	movd	[OUT + 52], xmm2

	movd	[OUT + 12], xmm5
	pshufd	xmm5, xmm1, SHUF(0, 3, 2, 1)
	movd	[OUT + 36], xmm5
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 2)
	movd	[OUT + 56], xmm1

	movd	[OUT + 20], xmm4
	pshufd	xmm4, xmm0, SHUF(1, 0, 3, 2)
	movd	[OUT + 40], xmm4
	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
	movd	[OUT + 60], xmm0

	// Tidy things up.
#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp +  0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC
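// For the curious, a plain-C sketch of the feedforward and output
// unpermutation performed by the stores above.  If `x[16]' is the working
// state and `in[16]' the caller's input, both in the permuted word order,
// then the code computes the following, where `perm' (the name is
// illustrative only) maps permuted positions back to standard matrix
// indices:
//
//	static const unsigned char perm[16] =
//		{ 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
//
//	for (unsigned i = 0; i < 16; i++) out[perm[i]] = x[i] + in[i];
//
// so OUT receives the result in the standard (unpermuted) Salsa20 matrix
// order.

///----- That's all, folks --------------------------------------------------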