X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e297526c6cfe427a9d70204966745651eac50fdb:/symm/salsa20-x86-sse2.S..0f23f75ff53acadf80e9d3dfd2dfd14cb526074f:/symm/salsa20-x86ish-sse2.S

diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86ish-sse2.S
similarity index 63%
rename from symm/salsa20-x86-sse2.S
rename to symm/salsa20-x86ish-sse2.S
index 7a5bd2a3..a168d79a 100644
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -44,20 +44,76 @@
 	.arch	pentium4
 	.section .text

-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work, but
+	// we'll run out of XMM registers and will need some properly aligned
+	// space which we'll steal from the stack.  I don't trust the stack
+	// pointer's alignment, so I'll have to mask the stack pointer, which
+	// in turn means I'll need to keep track of the old value.  Hence I'm
+	// making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]

-	// Initial state.  We have three arguments:
-	//	[ebp + 8] is the number of rounds to do
-	//	[ebp + 12] points to the input matrix
-	//	[ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 32
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
-
-	// Prepare for the main loop.
-	mov	ecx, [ebp + 8]
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  In
+	// two places we only need to save a copy of the input for the
+	// feedforward at the end; but the other two we want for the final
+	// permutation, so save the old values on the stack.  (We need an
+	// extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+	sub	rsp, 64 + 8
+	movdqa	[rsp + 0], xmm6
+	movdqa	[rsp + 16], xmm7
+#endif

 	// First job is to slurp the matrix into XMM registers.  The words
 	// have already been permuted conveniently to make them line up
@@ -85,19 +141,18 @@ FUNC(salsa20_core_x86_sse2)
 	// [ 4  5  6  7] --> [ 4  9 14  3] (b, xmm1)
 	// [ 8  9 10 11]     [ 8 13  2  7] (c, xmm2)
 	// [12 13 14 15]     [12  1  6 11] (d, xmm3)
-	movdqu	xmm0, [edx + 0]
-	movdqu	xmm1, [edx + 16]
-	movdqu	xmm2, [edx + 32]
-	movdqu	xmm3, [edx + 48]
+	movdqu	xmm0, [IN + 0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]

-	// Take a copy for later.
-	movdqa	[esp + 0], xmm0
-	movdqa	[esp + 16], xmm1
-	movdqa	xmm6, xmm2
-	movdqa	xmm7, xmm3
+	// Take a copy for later.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3

 loop:
-
 	// Apply a column quarterround to each of the columns simultaneously.
 	// Alas, there doesn't seem to be a packed doubleword rotate, so we
 	// have to synthesize it.
@@ -147,9 +202,9 @@ loop:
 	// involve any movement of elements between rows.
 	//
 	// [ 0  5 10 15]     [ 0  5 10 15] (a, xmm0)
-	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
-	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
-	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
+	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
+	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
+	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
 	//
 	// The shuffles have quite high latency, so they've been pushed
 	// backwards into the main instruction list.
@@ -200,7 +255,7 @@ loop:
 	// back the shuffles because they take a long time coming through.
 	// Decrement the loop counter and see if we should go round again.
 	// Later processors fuse this pair into a single uop.
-	sub	ecx, 2
+	sub	NR, 2
 	ja	loop

 	// Almost there.  Firstly, the feedforward addition, and then we have
 	// to write out the result, undoing the permutation
 	// which was already applied to the input.  Shuffling has quite high
 	// latency, so arrange to start a new shuffle into a temporary as
 	// soon as we've written out the old value.
-	mov	edx, [ebp + 16]
-
-	paddd	xmm0, [esp + 0]
-	pshufd	xmm4, xmm0, ROTR
-	movd	[edx + 0], xmm0
+	paddd	xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+	movd	[OUT + 0], xmm0

-	paddd	xmm1, [esp + 16]
+	paddd	xmm1, SAVE1
 	pshufd	xmm5, xmm1, ROTL
-	movd	[edx + 16], xmm1
+	movd	[OUT + 16], xmm1

-	paddd	xmm2, xmm6
+	paddd	xmm2, SAVE2
 	pshufd	xmm6, xmm2, ROT2
-	movd	[edx + 32], xmm2
+	movd	[OUT + 32], xmm2

-	paddd	xmm3, xmm7
+	paddd	xmm3, SAVE3
 	pshufd	xmm7, xmm3, ROTR
-	movd	[edx + 48], xmm3
+	movd	[OUT + 48], xmm3

-	movd	[edx + 4], xmm7
+	movd	[OUT + 4], xmm7
 	pshufd	xmm7, xmm3, ROT2
-	movd	[edx + 24], xmm7
+	movd	[OUT + 24], xmm7
 	pshufd	xmm3, xmm3, ROTL
-	movd	[edx + 44], xmm3
+	movd	[OUT + 44], xmm3

-	movd	[edx + 8], xmm6
+	movd	[OUT + 8], xmm6
 	pshufd	xmm6, xmm2, ROTL
-	movd	[edx + 28], xmm6
+	movd	[OUT + 28], xmm6
 	pshufd	xmm2, xmm2, ROTR
-	movd	[edx + 52], xmm2
+	movd	[OUT + 52], xmm2

-	movd	[edx + 12], xmm5
+	movd	[OUT + 12], xmm5
 	pshufd	xmm5, xmm1, ROTR
-	movd	[edx + 36], xmm5
+	movd	[OUT + 36], xmm5
 	pshufd	xmm1, xmm1, ROT2
-	movd	[edx + 56], xmm1
+	movd	[OUT + 56], xmm1

-	movd	[edx + 20], xmm4
+	movd	[OUT + 20], xmm4
 	pshufd	xmm4, xmm0, ROT2
-	movd	[edx + 40], xmm4
+	movd	[OUT + 40], xmm4
 	pshufd	xmm0, xmm0, ROTL
-	movd	[edx + 60], xmm0
+	movd	[OUT + 60], xmm0

 	// Tidy things up.
+
+#if CPUFAM_X86
 	mov	esp, ebp
 	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	movdqa	xmm6, [rsp + 0]
+	movdqa	xmm7, [rsp + 16]
+	add	rsp, 64 + 8
+#endif

 	// And with that, we're done.
 	ret

+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
 ENDFUNC

///----- That's all, folks --------------------------------------------------
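
A few notes on the techniques in this change, with sketches in C.  The i386 path masks the stack pointer to get the 16-byte alignment that movdqa scratch space needs, keeping the old value in ebp so it can be restored on exit.  In C terms, the masking step performed by `and esp, ~15' is just rounding an address down to a boundary; a minimal sketch (the function name is illustrative, not Catacomb's):

	#include <stdint.h>

	/* Round an address down to a 16-byte boundary, as `and esp, ~15'
	 * does to the stack pointer. */
	static void *align16_down(void *p)
	{
		return (void *)((uintptr_t)p & ~(uintptr_t)15);
	}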
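The main loop leans on two SSE2 idioms.  SSE2 has no packed doubleword rotate, so the code synthesizes one from two shifts and an OR (pslld/psrld/por); and because the matrix diagonals have been gathered into whole registers, a single add/rotate/XOR sequence performs four quarterrounds at once, one per lane.  A minimal sketch with SSE2 intrinsics, assuming the diagonal layout described in the comments above; the names ROTL32x4 and salsa20_qround4 are mine, not Catacomb's:

	#include <emmintrin.h>

	/* Rotate each 32-bit lane of X left by B bits.  SSE2 has no packed
	 * doubleword rotate, so build it from two shifts and an OR, just as
	 * the loop does with pslld/psrld/por. */
	#define ROTL32x4(x, b)						\
		_mm_or_si128(_mm_slli_epi32((x), (b)),			\
			     _mm_srli_epi32((x), 32 - (b)))

	/* Four simultaneous Salsa20 quarterrounds on the diagonal layout:
	 * a = [0 5 10 15], b = [4 9 14 3], c = [8 13 2 7], d = [12 1 6 11]. */
	static void salsa20_qround4(__m128i *a, __m128i *b, __m128i *c,
				    __m128i *d)
	{
		*b = _mm_xor_si128(*b, ROTL32x4(_mm_add_epi32(*a, *d), 7));
		*c = _mm_xor_si128(*c, ROTL32x4(_mm_add_epi32(*b, *a), 9));
		*d = _mm_xor_si128(*d, ROTL32x4(_mm_add_epi32(*c, *b), 13));
		*a = _mm_xor_si128(*a, ROTL32x4(_mm_add_epi32(*d, *c), 18));
	}

The column round and the row round are both instances of this step; between them, the b, c and d registers are rotated lane-wise so that the next set of diagonals lines up, which is what the pshufd instructions in the loop are doing.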
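The ROTL, ROT2 and ROTR immediates given to pshufd are defined earlier in the file, outside this diff.  Assuming the conventional values ROTL = 0x93, ROT2 = 0x4e and ROTR = 0x39, each is a lane-wise rotation of a 4 x 32-bit register, which is how the final write-out undoes the input permutation.  An equivalent intrinsics sketch (function names illustrative):

	#include <emmintrin.h>

	/* Viewing a register as lanes [3 2 1 0]: ROTR moves each lane one
	 * place right, ROT2 swaps the two halves, ROTL moves one place
	 * left. */
	static __m128i rotr1(__m128i x) { return _mm_shuffle_epi32(x, 0x39); }
	static __m128i rot2 (__m128i x) { return _mm_shuffle_epi32(x, 0x4e); }
	static __m128i rotl1(__m128i x) { return _mm_shuffle_epi32(x, 0x93); }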
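Finally, for checking the routine against something simpler, here is a portable reference for the Salsa20 core in the standard word order.  Note the assembler above instead expects its input pre-permuted as described in its comments and inverts that permutation on output; the feedforward loop at the end corresponds to the paddd-from-SAVEn instructions.  The name salsa20_core_ref is illustrative, not Catacomb's API:

	#include <stdint.h>

	/* Rotate a uint32_t left by B bits, 0 < B < 32. */
	#define R(x, b) (((x) << (b)) | ((x) >> (32 - (b))))

	/* The quarterround from the Salsa20 specification. */
	#define QR(a, b, c, d) do {					\
		(b) ^= R((a) + (d), 7);  (c) ^= R((b) + (a), 9);	\
		(d) ^= R((c) + (b), 13); (a) ^= R((d) + (c), 18);	\
	} while (0)

	/* NR is the round count (e.g., 20); each iteration below is one
	 * doubleround, matching the `sub NR, 2; ja loop' control flow. */
	static void salsa20_core_ref(unsigned nr, const uint32_t in[16],
				     uint32_t out[16])
	{
		uint32_t x[16];
		unsigned i;

		for (i = 0; i < 16; i++) x[i] = in[i];
		for (i = 0; i < nr; i += 2) {
			/* Column round... */
			QR(x[ 0], x[ 4], x[ 8], x[12]);
			QR(x[ 5], x[ 9], x[13], x[ 1]);
			QR(x[10], x[14], x[ 2], x[ 6]);
			QR(x[15], x[ 3], x[ 7], x[11]);
			/* ...then row round. */
			QR(x[ 0], x[ 1], x[ 2], x[ 3]);
			QR(x[ 5], x[ 6], x[ 7], x[ 4]);
			QR(x[10], x[11], x[ 8], x[ 9]);
			QR(x[15], x[12], x[13], x[14]);
		}
		/* The feedforward: add the original input back in. */
		for (i = 0; i < 16; i++) out[i] = x[i] + in[i];
	}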