X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e297526c6cfe427a9d70204966745651eac50fdb:/symm/chacha-x86-sse2.S..0f23f75ff53acadf80e9d3dfd2dfd14cb526074f:/symm/chacha-x86ish-sse2.S

diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86ish-sse2.S
similarity index 65%
rename from symm/chacha-x86-sse2.S
rename to symm/chacha-x86ish-sse2.S
index ccdfa538..f36bf90f 100644
--- a/symm/chacha-x86-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -44,17 +44,73 @@
 	.arch	pentium4
 	.section .text
 
-FUNC(chacha_core_x86_sse2)
+FUNC(chacha_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work,
+	// but we'll run out of XMM registers and will need some properly
+	// aligned space which we'll steal from the stack.  I don't trust the
+	// stack pointer's alignment, so I'll have to mask the stack pointer,
+	// which in turn means I'll need to keep track of the old value.
+	// Hence I'm making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 [esp]
 
-	// Initial state.  We have three arguments:
-	// [ebp +  8] is the number of rounds to do
-	// [ebp + 12] points to the input matrix
-	// [ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 16
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  We
+	// only need to save a copy of the input for the feedforward at the
+	// end, so we might as well use memory rather than spill extra
+	// registers.  (We need an extra 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm5
+#  define SAVE1 [rsp +  0]
+#  define SAVE2 [rsp + 16]
+#  define SAVE3 [rsp + 32]
+
+	sub	rsp, 48 + 8
+#endif
 
 	// First job is to slurp the matrix into XMM registers.  Be careful:
 	// the input matrix isn't likely to be properly aligned.
@@ -63,20 +119,17 @@ FUNC(chacha_core_x86_sse2)
 	//
 	//	[ 0  1  2  3] (a, xmm0)
 	//	[ 4  5  6  7] (b, xmm1)
 	//	[ 8  9 10 11] (c, xmm2)
 	//	[12 13 14 15] (d, xmm3)
-	movdqu	xmm0, [edx +  0]
-	movdqu	xmm1, [edx + 16]
-	movdqu	xmm2, [edx + 32]
-	movdqu	xmm3, [edx + 48]
-
-	// Prepare for the main loop.
-	mov	ecx, [ebp + 8]
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
 
 	// Take a copy for later.  This one is aligned properly, by
 	// construction.
-	movdqa	[esp], xmm0
-	movdqa	xmm5, xmm1
-	movdqa	xmm6, xmm2
-	movdqa	xmm7, xmm3
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
 
 loop:
 	// Apply a column quarterround to each of the columns simultaneously.
@@ -174,26 +227,30 @@ loop:
 	pshufd	xmm1, xmm1, ROTL
 
 	// Decrement the loop counter and see if we should go round again.
-	sub	ecx, 2
+	sub	NR, 2
 	ja	loop
 
 	// Almost there.  Firstly, the feedforward addition.
-	mov	edx, [ebp + 16]
-	paddd	xmm0, [esp]
-	paddd	xmm1, xmm5
-	paddd	xmm2, xmm6
-	paddd	xmm3, xmm7
+	paddd	xmm0, SAVE0
+	paddd	xmm1, SAVE1
+	paddd	xmm2, SAVE2
+	paddd	xmm3, SAVE3
 
 	// And now we write out the result.  This one won't be aligned
 	// either.
-	movdqu	[edx +  0], xmm0
-	movdqu	[edx + 16], xmm1
-	movdqu	[edx + 32], xmm2
-	movdqu	[edx + 48], xmm3
+	movdqu	[OUT +  0], xmm0
+	movdqu	[OUT + 16], xmm1
+	movdqu	[OUT + 32], xmm2
+	movdqu	[OUT + 48], xmm3
 
 	// Tidy things up.
+#if CPUFAM_X86
 	mov	esp, ebp
 	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	add	rsp, 48 + 8
+#endif
 
 	// And with that, we're done.
 	ret
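
A note on the interface, for readers comparing the three ABI blocks above: the
i386 comment that this patch deletes documented the arguments as the round
count, a pointer to the input matrix, and a pointer to the output matrix, and
the NR/IN/OUT macros simply pin those three arguments to the right register
for each calling convention.  In C terms the function looks roughly like the
sketch below; the argument order comes from the deleted comment, but the exact
types are an assumption here (Catacomb declares the real prototype with its
own matrix type):

    /* Illustrative prototype only: the argument order (rounds, input,
     * output) is taken from the deleted i386 comment; uint32_t[16]
     * stands in for Catacomb's own matrix type. */
    #include <stdint.h>

    extern void chacha_core_x86ish_sse2(unsigned nr, const uint32_t in[16],
                                        uint32_t out[16]);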
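
The arithmetic itself is unchanged by this patch, but it helps to see what
the XMM code computes.  Below is a minimal portable C sketch of the standard
ChaCha core, with illustrative names rather than Catacomb's: the copy the
assembler keeps in SAVE0..SAVE3 corresponds to keeping the input around for
the final addition, the `sub NR, 2'/`ja loop' pair corresponds to consuming
two rounds per trip round the loop, and the four paddd instructions at the
end are the feedforward.

    #include <stdint.h>
    #include <string.h>

    static uint32_t rotl32(uint32_t x, int n)
        { return (x << n) | (x >> (32 - n)); }

    /* One quarterround; the XMM code runs four of these in parallel,
     * one per 32-bit lane. */
    static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d = rotl32(*d ^ *a, 16);
        *c += *d; *b = rotl32(*b ^ *c, 12);
        *a += *b; *d = rotl32(*d ^ *a,  8);
        *c += *d; *b = rotl32(*b ^ *c,  7);
    }

    static void chacha_core_ref(int nr, const uint32_t in[16],
                                uint32_t out[16])
    {
        uint32_t x[16];
        int i;

        memcpy(x, in, sizeof(x));   /* the copy kept in SAVE0..SAVE3 */
        for (; nr > 0; nr -= 2) {   /* two rounds per trip: `sub NR, 2' */
            /* Column round: quarterround each column... */
            qr(&x[0], &x[4], &x[ 8], &x[12]);
            qr(&x[1], &x[5], &x[ 9], &x[13]);
            qr(&x[2], &x[6], &x[10], &x[14]);
            qr(&x[3], &x[7], &x[11], &x[15]);
            /* ...then a diagonal round. */
            qr(&x[0], &x[5], &x[10], &x[15]);
            qr(&x[1], &x[6], &x[11], &x[12]);
            qr(&x[2], &x[7], &x[ 8], &x[13]);
            qr(&x[3], &x[4], &x[ 9], &x[14]);
        }
        /* Feedforward: the four paddd instructions at the end. */
        for (i = 0; i < 16; i++) out[i] = x[i] + in[i];
    }

The column/diagonal pairing is why the assembler only needs pshufd shuffles
between the two half-rounds (the `pshufd xmm1, xmm1, ROTL' visible above):
rotating the b, c and d rows by one, two and three lanes turns the diagonals
into columns, so the same column code serves for both half-rounds.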
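
Finally, the two stack adjustments deserve a word, since they look arbitrary.
movdqa, used for the SAVE copies, faults on addresses that aren't 16-byte
aligned, while movdqu tolerates misalignment; that's why the input and output
matrices go through movdqu but the spill slots must be aligned.  On i386 the
code gets an aligned slot by masking esp, with ebp keeping the old value for
the epilogue.  The same rounding-down trick, expressed in C (a hypothetical
helper, for illustration only):

    #include <stdint.h>

    /* The `and esp, ~15' trick: round an address down to a 16-byte
     * boundary, as movdqa requires. */
    static inline void *align_down_16(void *p)
        { return (void *)((uintptr_t)p & ~(uintptr_t)15); }

On Windows AMD64 the arithmetic in `sub rsp, 48 + 8' is: 48 bytes for the
three 16-byte SAVE1..SAVE3 slots, plus 8 more because rsp is only congruent
to 8 (mod 16) at function entry, the call having pushed an 8-byte return
address, so the extra word restores 16-byte alignment for the slots.  And
xmm6-xmm15 are callee-saved in that ABI, which is why SAVE0 can live in the
volatile xmm5 while the other three copies go to memory rather than to more
registers.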